author    Dmitry Torokhov <dmitry.torokhov@gmail.com>  2009-09-14 00:16:56 -0400
committer Dmitry Torokhov <dmitry.torokhov@gmail.com>  2009-09-14 00:16:56 -0400
commit    fc8e1ead9314cf0e0f1922e661428b93d3a50d88 (patch)
tree      f3cb97c4769b74f6627a59769f1ed5c92a13c58a /fs
parent    2bcaa6a4238094c5695d5b1943078388d82d3004 (diff)
parent    9de48cc300fb10f7d9faa978670becf5e352462a (diff)
Merge branch 'next' into for-linus
Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/v9fs.c | 21
-rw-r--r-- fs/9p/v9fs.h | 1
-rw-r--r-- fs/9p/vfs_addr.c | 2
-rw-r--r-- fs/9p/vfs_inode.c | 126
-rw-r--r-- fs/9p/vfs_super.c | 42
-rw-r--r-- fs/Kconfig | 55
-rw-r--r-- fs/adfs/adfs.h | 59
-rw-r--r-- fs/adfs/dir.c | 18
-rw-r--r-- fs/adfs/dir_f.c | 25
-rw-r--r-- fs/adfs/dir_fplus.c | 25
-rw-r--r-- fs/adfs/file.c | 6
-rw-r--r-- fs/adfs/inode.c | 14
-rw-r--r-- fs/adfs/map.c | 8
-rw-r--r-- fs/adfs/super.c | 22
-rw-r--r-- fs/affs/affs.h | 1
-rw-r--r-- fs/affs/dir.c | 2
-rw-r--r-- fs/affs/file.c | 14
-rw-r--r-- fs/affs/super.c | 54
-rw-r--r-- fs/afs/dir.c | 2
-rw-r--r-- fs/afs/file.c | 18
-rw-r--r-- fs/afs/flock.c | 1
-rw-r--r-- fs/afs/misc.c | 16
-rw-r--r-- fs/afs/mntpt.c | 3
-rw-r--r-- fs/afs/super.c | 5
-rw-r--r-- fs/afs/vlocation.c | 2
-rw-r--r-- fs/aio.c | 24
-rw-r--r-- fs/anon_inodes.c | 15
-rw-r--r-- fs/autofs/dirhash.c | 5
-rw-r--r-- fs/autofs4/autofs_i.h | 6
-rw-r--r-- fs/autofs4/dev-ioctl.c | 196
-rw-r--r-- fs/autofs4/expire.c | 15
-rw-r--r-- fs/autofs4/root.c | 7
-rw-r--r-- fs/befs/linuxvfs.c | 21
-rw-r--r-- fs/bfs/dir.c | 9
-rw-r--r-- fs/bfs/file.c | 1
-rw-r--r-- fs/bfs/inode.c | 52
-rw-r--r-- fs/binfmt_elf.c | 17
-rw-r--r-- fs/binfmt_elf_fdpic.c | 8
-rw-r--r-- fs/binfmt_flat.c | 17
-rw-r--r-- fs/bio-integrity.c | 170
-rw-r--r-- fs/bio.c | 67
-rw-r--r-- fs/block_dev.c | 41
-rw-r--r-- fs/btrfs/Makefile | 4
-rw-r--r-- fs/btrfs/acl.c | 49
-rw-r--r-- fs/btrfs/async-thread.c | 8
-rw-r--r-- fs/btrfs/btrfs_inode.h | 8
-rw-r--r-- fs/btrfs/compression.c | 7
-rw-r--r-- fs/btrfs/crc32c.h | 29
-rw-r--r-- fs/btrfs/ctree.c | 815
-rw-r--r-- fs/btrfs/ctree.h | 360
-rw-r--r-- fs/btrfs/delayed-ref.c | 509
-rw-r--r-- fs/btrfs/delayed-ref.h | 85
-rw-r--r-- fs/btrfs/disk-io.c | 205
-rw-r--r-- fs/btrfs/export.c | 4
-rw-r--r-- fs/btrfs/extent-tree.c | 3610
-rw-r--r-- fs/btrfs/extent_io.c | 18
-rw-r--r-- fs/btrfs/file.c | 84
-rw-r--r-- fs/btrfs/free-space-cache.c | 1066
-rw-r--r-- fs/btrfs/free-space-cache.h | 9
-rw-r--r-- fs/btrfs/hash.h | 4
-rw-r--r-- fs/btrfs/inode.c | 218
-rw-r--r-- fs/btrfs/ioctl.c | 204
-rw-r--r-- fs/btrfs/print-tree.c | 161
-rw-r--r-- fs/btrfs/relocation.c | 3716
-rw-r--r-- fs/btrfs/root-tree.c | 17
-rw-r--r-- fs/btrfs/super.c | 65
-rw-r--r-- fs/btrfs/transaction.c | 452
-rw-r--r-- fs/btrfs/transaction.h | 13
-rw-r--r-- fs/btrfs/tree-log.c | 105
-rw-r--r-- fs/btrfs/volumes.c | 115
-rw-r--r-- fs/btrfs/volumes.h | 12
-rw-r--r-- fs/btrfs/zlib.c | 6
-rw-r--r-- fs/buffer.c | 15
-rw-r--r-- fs/cachefiles/interface.c | 4
-rw-r--r-- fs/char_dev.c | 15
-rw-r--r-- fs/cifs/CHANGES | 22
-rw-r--r-- fs/cifs/README | 33
-rw-r--r-- fs/cifs/asn1.c | 55
-rw-r--r-- fs/cifs/cifs_debug.c | 8
-rw-r--r-- fs/cifs/cifs_dfs_ref.c | 14
-rw-r--r-- fs/cifs/cifs_spnego.c | 15
-rw-r--r-- fs/cifs/cifs_unicode.c | 2
-rw-r--r-- fs/cifs/cifsacl.c | 200
-rw-r--r-- fs/cifs/cifsfs.c | 170
-rw-r--r-- fs/cifs/cifsfs.h | 17
-rw-r--r-- fs/cifs/cifsglob.h | 32
-rw-r--r-- fs/cifs/cifspdu.h | 14
-rw-r--r-- fs/cifs/cifsproto.h | 29
-rw-r--r-- fs/cifs/cifssmb.c | 159
-rw-r--r-- fs/cifs/connect.c | 129
-rw-r--r-- fs/cifs/dir.c | 52
-rw-r--r-- fs/cifs/dns_resolve.c | 25
-rw-r--r-- fs/cifs/file.c | 42
-rw-r--r-- fs/cifs/inode.c | 785
-rw-r--r-- fs/cifs/link.c | 3
-rw-r--r-- fs/cifs/netmisc.c | 80
-rw-r--r-- fs/cifs/readdir.c | 517
-rw-r--r-- fs/cifs/sess.c | 2
-rw-r--r-- fs/cifs/xattr.c | 12
-rw-r--r-- fs/coda/file.c | 9
-rw-r--r-- fs/compat.c | 13
-rw-r--r-- fs/compat_ioctl.c | 110
-rw-r--r-- fs/configfs/configfs_internal.h | 3
-rw-r--r-- fs/configfs/dir.c | 196
-rw-r--r-- fs/configfs/inode.c | 38
-rw-r--r-- fs/dcache.c | 7
-rw-r--r-- fs/debugfs/file.c | 65
-rw-r--r-- fs/debugfs/inode.c | 11
-rw-r--r-- fs/devpts/inode.c | 14
-rw-r--r-- fs/direct-io.c | 2
-rw-r--r-- fs/dlm/dir.c | 7
-rw-r--r-- fs/dlm/lock.c | 2
-rw-r--r-- fs/dlm/lockspace.c | 17
-rw-r--r-- fs/dlm/lowcomms.c | 26
-rw-r--r-- fs/dlm/lowcomms.h | 3
-rw-r--r-- fs/dlm/member.c | 19
-rw-r--r-- fs/dlm/plock.c | 17
-rw-r--r-- fs/dlm/requestqueue.c | 2
-rw-r--r-- fs/drop_caches.c | 2
-rw-r--r-- fs/ecryptfs/keystore.c | 13
-rw-r--r-- fs/ecryptfs/super.c | 5
-rw-r--r-- fs/efs/dir.c | 5
-rw-r--r-- fs/efs/namei.c | 9
-rw-r--r-- fs/efs/symlink.c | 7
-rw-r--r-- fs/eventfd.c | 125
-rw-r--r-- fs/eventpoll.c | 21
-rw-r--r-- fs/exec.c | 25
-rw-r--r-- fs/exofs/common.h | 10
-rw-r--r-- fs/exofs/dir.c | 4
-rw-r--r-- fs/exofs/exofs.h | 7
-rw-r--r-- fs/exofs/file.c | 21
-rw-r--r-- fs/exofs/inode.c | 15
-rw-r--r-- fs/exofs/namei.c | 4
-rw-r--r-- fs/exofs/osd.c | 34
-rw-r--r-- fs/exofs/super.c | 30
-rw-r--r-- fs/exofs/symlink.c | 4
-rw-r--r-- fs/ext2/Makefile | 2
-rw-r--r-- fs/ext2/acl.c | 81
-rw-r--r-- fs/ext2/acl.h | 4
-rw-r--r-- fs/ext2/dir.c | 7
-rw-r--r-- fs/ext2/ext2.h | 11
-rw-r--r-- fs/ext2/file.c | 4
-rw-r--r-- fs/ext2/fsync.c | 50
-rw-r--r-- fs/ext2/inode.c | 15
-rw-r--r-- fs/ext2/ioctl.c | 1
-rw-r--r-- fs/ext2/namei.c | 17
-rw-r--r-- fs/ext2/super.c | 77
-rw-r--r-- fs/ext3/Kconfig | 32
-rw-r--r-- fs/ext3/acl.c | 82
-rw-r--r-- fs/ext3/acl.h | 4
-rw-r--r-- fs/ext3/balloc.c | 3
-rw-r--r-- fs/ext3/dir.c | 3
-rw-r--r-- fs/ext3/ialloc.c | 3
-rw-r--r-- fs/ext3/inode.c | 59
-rw-r--r-- fs/ext3/resize.c | 4
-rw-r--r-- fs/ext3/super.c | 97
-rw-r--r-- fs/ext3/xattr.c | 1
-rw-r--r-- fs/ext4/Makefile | 4
-rw-r--r-- fs/ext4/acl.c | 64
-rw-r--r-- fs/ext4/acl.h | 4
-rw-r--r-- fs/ext4/balloc.c | 28
-rw-r--r-- fs/ext4/block_validity.c | 244
-rw-r--r-- fs/ext4/dir.c | 3
-rw-r--r-- fs/ext4/ext4.h | 401
-rw-r--r-- fs/ext4/ext4_extents.h | 4
-rw-r--r-- fs/ext4/ext4_i.h | 140
-rw-r--r-- fs/ext4/ext4_jbd2.c | 4
-rw-r--r-- fs/ext4/ext4_jbd2.h | 6
-rw-r--r-- fs/ext4/ext4_sb.h | 161
-rw-r--r-- fs/ext4/extents.c | 90
-rw-r--r-- fs/ext4/file.c | 36
-rw-r--r-- fs/ext4/fsync.c | 8
-rw-r--r-- fs/ext4/group.h | 29
-rw-r--r-- fs/ext4/ialloc.c | 119
-rw-r--r-- fs/ext4/inode.c | 1170
-rw-r--r-- fs/ext4/ioctl.c | 57
-rw-r--r-- fs/ext4/mballoc.c | 291
-rw-r--r-- fs/ext4/mballoc.h | 2
-rw-r--r-- fs/ext4/migrate.c | 8
-rw-r--r-- fs/ext4/move_extent.c | 1320
-rw-r--r-- fs/ext4/namei.c | 37
-rw-r--r-- fs/ext4/namei.h | 8
-rw-r--r-- fs/ext4/resize.c | 38
-rw-r--r-- fs/ext4/super.c | 889
-rw-r--r-- fs/fat/cache.c | 6
-rw-r--r-- fs/fat/dir.c | 48
-rw-r--r-- fs/fat/fat.h | 13
-rw-r--r-- fs/fat/fatent.c | 17
-rw-r--r-- fs/fat/file.c | 200
-rw-r--r-- fs/fat/inode.c | 61
-rw-r--r-- fs/fat/misc.c | 22
-rw-r--r-- fs/fat/namei_msdos.c | 7
-rw-r--r-- fs/fat/namei_vfat.c | 11
-rw-r--r-- fs/fcntl.c | 34
-rw-r--r-- fs/file_table.c | 40
-rw-r--r-- fs/freevxfs/vxfs_super.c | 5
-rw-r--r-- fs/fs-writeback.c | 198
-rw-r--r-- fs/fuse/Makefile | 1
-rw-r--r-- fs/fuse/cuse.c | 610
-rw-r--r-- fs/fuse/dev.c | 106
-rw-r--r-- fs/fuse/dir.c | 90
-rw-r--r-- fs/fuse/file.c | 348
-rw-r--r-- fs/fuse/fuse_i.h | 74
-rw-r--r-- fs/fuse/inode.c | 189
-rw-r--r-- fs/gfs2/Kconfig | 3
-rw-r--r-- fs/gfs2/Makefile | 5
-rw-r--r-- fs/gfs2/aops.c (renamed from fs/gfs2/ops_address.c) | 60
-rw-r--r-- fs/gfs2/bmap.c | 15
-rw-r--r-- fs/gfs2/dentry.c (renamed from fs/gfs2/ops_dentry.c) | 0
-rw-r--r-- fs/gfs2/dir.c | 11
-rw-r--r-- fs/gfs2/eattr.c | 14
-rw-r--r-- fs/gfs2/export.c (renamed from fs/gfs2/ops_export.c) | 0
-rw-r--r-- fs/gfs2/file.c (renamed from fs/gfs2/ops_file.c) | 36
-rw-r--r-- fs/gfs2/glock.c | 171
-rw-r--r-- fs/gfs2/glock.h | 3
-rw-r--r-- fs/gfs2/glops.c | 41
-rw-r--r-- fs/gfs2/incore.h | 29
-rw-r--r-- fs/gfs2/inode.c | 150
-rw-r--r-- fs/gfs2/inode.h | 52
-rw-r--r-- fs/gfs2/log.c | 17
-rw-r--r-- fs/gfs2/lops.c | 17
-rw-r--r-- fs/gfs2/main.c | 8
-rw-r--r-- fs/gfs2/meta_io.c | 105
-rw-r--r-- fs/gfs2/mount.c | 185
-rw-r--r-- fs/gfs2/ops_address.h | 23
-rw-r--r-- fs/gfs2/ops_fstype.c | 74
-rw-r--r-- fs/gfs2/ops_inode.c | 146
-rw-r--r-- fs/gfs2/ops_super.c | 723
-rw-r--r-- fs/gfs2/quota.c | 1
-rw-r--r-- fs/gfs2/recovery.c | 102
-rw-r--r-- fs/gfs2/recovery.h | 2
-rw-r--r-- fs/gfs2/rgrp.c | 175
-rw-r--r-- fs/gfs2/rgrp.h | 47
-rw-r--r-- fs/gfs2/super.c | 930
-rw-r--r-- fs/gfs2/super.h | 4
-rw-r--r-- fs/gfs2/sys.c | 245
-rw-r--r-- fs/gfs2/trace_gfs2.h | 407
-rw-r--r-- fs/gfs2/trans.c | 9
-rw-r--r-- fs/hfs/super.c | 24
-rw-r--r-- fs/hfsplus/super.c | 26
-rw-r--r-- fs/hostfs/hostfs_kern.c | 1
-rw-r--r-- fs/hpfs/dir.c | 1
-rw-r--r-- fs/hpfs/file.c | 1
-rw-r--r-- fs/hpfs/hpfs_fn.h | 1
-rw-r--r-- fs/hpfs/inode.c | 1
-rw-r--r-- fs/hpfs/namei.c | 1
-rw-r--r-- fs/hpfs/super.c | 12
-rw-r--r-- fs/hugetlbfs/inode.c | 22
-rw-r--r-- fs/inode.c | 81
-rw-r--r-- fs/internal.h | 17
-rw-r--r-- fs/ioctl.c | 51
-rw-r--r-- fs/isofs/dir.c | 5
-rw-r--r-- fs/isofs/inode.c | 127
-rw-r--r-- fs/isofs/isofs.h | 27
-rw-r--r-- fs/isofs/joliet.c | 36
-rw-r--r-- fs/isofs/namei.c | 4
-rw-r--r-- fs/jbd/journal.c | 26
-rw-r--r-- fs/jbd/transaction.c | 116
-rw-r--r-- fs/jbd2/checkpoint.c | 5
-rw-r--r-- fs/jbd2/commit.c | 13
-rw-r--r-- fs/jbd2/journal.c | 98
-rw-r--r-- fs/jbd2/transaction.c | 117
-rw-r--r-- fs/jffs2/acl.c | 87
-rw-r--r-- fs/jffs2/acl.h | 4
-rw-r--r-- fs/jffs2/erase.c | 10
-rw-r--r-- fs/jffs2/file.c | 2
-rw-r--r-- fs/jffs2/fs.c | 18
-rw-r--r-- fs/jffs2/jffs2_fs_i.h | 4
-rw-r--r-- fs/jffs2/os-linux.h | 5
-rw-r--r-- fs/jffs2/readinode.c | 1
-rw-r--r-- fs/jffs2/scan.c | 8
-rw-r--r-- fs/jffs2/super.c | 27
-rw-r--r-- fs/jfs/acl.c | 45
-rw-r--r-- fs/jfs/jfs_extent.c | 1
-rw-r--r-- fs/jfs/jfs_imap.c | 1
-rw-r--r-- fs/jfs/jfs_incore.h | 6
-rw-r--r-- fs/jfs/super.c | 47
-rw-r--r-- fs/jfs/xattr.c | 10
-rw-r--r-- fs/libfs.c | 27
-rw-r--r-- fs/lockd/clntproc.c | 5
-rw-r--r-- fs/lockd/mon.c | 19
-rw-r--r-- fs/lockd/svc4proc.c | 1
-rw-r--r-- fs/lockd/svclock.c | 2
-rw-r--r-- fs/lockd/svcproc.c | 1
-rw-r--r-- fs/locks.c | 3
-rw-r--r-- fs/minix/bitmap.c | 25
-rw-r--r-- fs/minix/dir.c | 7
-rw-r--r-- fs/minix/file.c | 20
-rw-r--r-- fs/minix/inode.c | 35
-rw-r--r-- fs/minix/minix.h | 7
-rw-r--r-- fs/mpage.c | 6
-rw-r--r-- fs/namei.c | 151
-rw-r--r-- fs/namespace.c | 419
-rw-r--r-- fs/ncpfs/inode.c | 4
-rw-r--r-- fs/ncpfs/ncplib_kernel.c | 8
-rw-r--r-- fs/nfs/Kconfig | 11
-rw-r--r-- fs/nfs/callback.c | 218
-rw-r--r-- fs/nfs/callback.h | 68
-rw-r--r-- fs/nfs/callback_proc.c | 127
-rw-r--r-- fs/nfs/callback_xdr.c | 280
-rw-r--r-- fs/nfs/client.c | 179
-rw-r--r-- fs/nfs/delegation.c | 33
-rw-r--r-- fs/nfs/dir.c | 3
-rw-r--r-- fs/nfs/direct.c | 29
-rw-r--r-- fs/nfs/file.c | 38
-rw-r--r-- fs/nfs/getroot.c | 1
-rw-r--r-- fs/nfs/inode.c | 1
-rw-r--r-- fs/nfs/internal.h | 70
-rw-r--r-- fs/nfs/iostat.h | 6
-rw-r--r-- fs/nfs/mount_clnt.c | 337
-rw-r--r-- fs/nfs/namespace.c | 7
-rw-r--r-- fs/nfs/nfs3acl.c | 2
-rw-r--r-- fs/nfs/nfs4_fs.h | 43
-rw-r--r-- fs/nfs/nfs4proc.c | 1375
-rw-r--r-- fs/nfs/nfs4renewd.c | 6
-rw-r--r-- fs/nfs/nfs4state.c | 192
-rw-r--r-- fs/nfs/nfs4xdr.c | 1072
-rw-r--r-- fs/nfs/nfsroot.c | 5
-rw-r--r-- fs/nfs/read.c | 40
-rw-r--r-- fs/nfs/super.c | 499
-rw-r--r-- fs/nfs/unlink.c | 20
-rw-r--r-- fs/nfs/write.c | 45
-rw-r--r-- fs/nfsd/export.c | 91
-rw-r--r-- fs/nfsd/nfs3proc.c | 237
-rw-r--r-- fs/nfsd/nfs3xdr.c | 1
-rw-r--r-- fs/nfsd/nfs4callback.c | 247
-rw-r--r-- fs/nfsd/nfs4proc.c | 129
-rw-r--r-- fs/nfsd/nfs4state.c | 171
-rw-r--r-- fs/nfsd/nfs4xdr.c | 296
-rw-r--r-- fs/nfsd/nfscache.c | 33
-rw-r--r-- fs/nfsd/nfsctl.c | 295
-rw-r--r-- fs/nfsd/nfsfh.c | 6
-rw-r--r-- fs/nfsd/nfsproc.c | 198
-rw-r--r-- fs/nfsd/nfssvc.c | 13
-rw-r--r-- fs/nfsd/vfs.c | 164
-rw-r--r-- fs/nilfs2/Kconfig | 25
-rw-r--r-- fs/nilfs2/bmap.c | 277
-rw-r--r-- fs/nilfs2/bmap.h | 135
-rw-r--r-- fs/nilfs2/btnode.c | 9
-rw-r--r-- fs/nilfs2/btnode.h | 2
-rw-r--r-- fs/nilfs2/btree.c | 366
-rw-r--r-- fs/nilfs2/btree.h | 31
-rw-r--r-- fs/nilfs2/cpfile.c | 58
-rw-r--r-- fs/nilfs2/cpfile.h | 4
-rw-r--r-- fs/nilfs2/dat.c | 45
-rw-r--r-- fs/nilfs2/dat.h | 2
-rw-r--r-- fs/nilfs2/dir.c | 1
-rw-r--r-- fs/nilfs2/direct.c | 139
-rw-r--r-- fs/nilfs2/direct.h | 20
-rw-r--r-- fs/nilfs2/gcinode.c | 5
-rw-r--r-- fs/nilfs2/inode.c | 26
-rw-r--r-- fs/nilfs2/ioctl.c | 35
-rw-r--r-- fs/nilfs2/mdt.c | 7
-rw-r--r-- fs/nilfs2/nilfs.h | 5
-rw-r--r-- fs/nilfs2/recovery.c | 37
-rw-r--r-- fs/nilfs2/sb.h | 1
-rw-r--r-- fs/nilfs2/segbuf.c | 3
-rw-r--r-- fs/nilfs2/seglist.h | 85
-rw-r--r-- fs/nilfs2/segment.c | 174
-rw-r--r-- fs/nilfs2/segment.h | 12
-rw-r--r-- fs/nilfs2/sufile.c | 119
-rw-r--r-- fs/nilfs2/sufile.h | 62
-rw-r--r-- fs/nilfs2/super.c | 277
-rw-r--r-- fs/nilfs2/the_nilfs.c | 116
-rw-r--r-- fs/nilfs2/the_nilfs.h | 23
-rw-r--r-- fs/nls/nls_base.c | 166
-rw-r--r-- fs/nls/nls_utf8.c | 13
-rw-r--r-- fs/notify/Kconfig | 3
-rw-r--r-- fs/notify/Makefile | 2
-rw-r--r-- fs/notify/dnotify/Kconfig | 1
-rw-r--r-- fs/notify/dnotify/dnotify.c | 464
-rw-r--r-- fs/notify/fsnotify.c | 188
-rw-r--r-- fs/notify/fsnotify.h | 34
-rw-r--r-- fs/notify/group.c | 254
-rw-r--r-- fs/notify/inode_mark.c | 426
-rw-r--r-- fs/notify/inotify/Kconfig | 20
-rw-r--r-- fs/notify/inotify/Makefile | 2
-rw-r--r-- fs/notify/inotify/inotify.c | 20
-rw-r--r-- fs/notify/inotify/inotify.h | 22
-rw-r--r-- fs/notify/inotify/inotify_fsnotify.c | 168
-rw-r--r-- fs/notify/inotify/inotify_user.c | 911
-rw-r--r-- fs/notify/notification.c | 421
-rw-r--r-- fs/ntfs/inode.c | 3
-rw-r--r-- fs/ntfs/logfile.c | 3
-rw-r--r-- fs/ntfs/super.c | 60
-rw-r--r-- fs/ocfs2/alloc.c | 129
-rw-r--r-- fs/ocfs2/aops.c | 69
-rw-r--r-- fs/ocfs2/blockcheck.c | 184
-rw-r--r-- fs/ocfs2/blockcheck.h | 29
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c | 2
-rw-r--r-- fs/ocfs2/cluster/masklog.h | 35
-rw-r--r-- fs/ocfs2/cluster/tcp.c | 7
-rw-r--r-- fs/ocfs2/dcache.c | 35
-rw-r--r-- fs/ocfs2/dcache.h | 3
-rw-r--r-- fs/ocfs2/dir.c | 21
-rw-r--r-- fs/ocfs2/dlm/dlmast.c | 1
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c | 2
-rw-r--r-- fs/ocfs2/dlm/dlmunlock.c | 4
-rw-r--r-- fs/ocfs2/dlmglue.c | 144
-rw-r--r-- fs/ocfs2/dlmglue.h | 31
-rw-r--r-- fs/ocfs2/file.c | 67
-rw-r--r-- fs/ocfs2/inode.c | 11
-rw-r--r-- fs/ocfs2/ioctl.c | 1
-rw-r--r-- fs/ocfs2/journal.c | 132
-rw-r--r-- fs/ocfs2/journal.h | 23
-rw-r--r-- fs/ocfs2/namei.c | 15
-rw-r--r-- fs/ocfs2/ocfs2.h | 48
-rw-r--r-- fs/ocfs2/ocfs2_lockid.h | 6
-rw-r--r-- fs/ocfs2/quota.h | 1
-rw-r--r-- fs/ocfs2/quota_global.c | 146
-rw-r--r-- fs/ocfs2/quota_local.c | 131
-rw-r--r-- fs/ocfs2/stack_o2cb.c | 14
-rw-r--r-- fs/ocfs2/stack_user.c | 8
-rw-r--r-- fs/ocfs2/stackglue.c | 13
-rw-r--r-- fs/ocfs2/stackglue.h | 6
-rw-r--r-- fs/ocfs2/suballoc.c | 28
-rw-r--r-- fs/ocfs2/super.c | 159
-rw-r--r-- fs/ocfs2/sysfile.c | 19
-rw-r--r-- fs/ocfs2/xattr.c | 8
-rw-r--r-- fs/omfs/file.c | 17
-rw-r--r-- fs/open.c | 62
-rw-r--r-- fs/partitions/check.c | 54
-rw-r--r-- fs/partitions/ibm.c | 2
-rw-r--r-- fs/partitions/msdos.c | 4
-rw-r--r-- fs/pipe.c | 18
-rw-r--r-- fs/proc/Makefile | 1
-rw-r--r-- fs/proc/base.c | 33
-rw-r--r-- fs/proc/internal.h | 25
-rw-r--r-- fs/proc/loadavg.c | 18
-rw-r--r-- fs/proc/meminfo.c | 4
-rw-r--r-- fs/proc/page.c | 162
-rw-r--r-- fs/proc/proc_devtree.c | 11
-rw-r--r-- fs/proc/softirqs.c | 44
-rw-r--r-- fs/proc/stat.c | 15
-rw-r--r-- fs/proc/task_mmu.c | 1
-rw-r--r-- fs/proc/task_nommu.c | 1
-rw-r--r-- fs/proc/vmcore.c | 7
-rw-r--r-- fs/qnx4/Makefile | 2
-rw-r--r-- fs/qnx4/bitmap.c | 7
-rw-r--r-- fs/qnx4/dir.c | 9
-rw-r--r-- fs/qnx4/file.c | 5
-rw-r--r-- fs/qnx4/fsync.c | 169
-rw-r--r-- fs/qnx4/inode.c | 58
-rw-r--r-- fs/qnx4/namei.c | 13
-rw-r--r-- fs/qnx4/qnx4.h | 57
-rw-r--r-- fs/qnx4/truncate.c | 6
-rw-r--r-- fs/quota/dquot.c | 9
-rw-r--r-- fs/quota/quota.c | 25
-rw-r--r-- fs/ramfs/file-nommu.c | 1
-rw-r--r-- fs/ramfs/inode.c | 9
-rw-r--r-- fs/read_write.c | 7
-rw-r--r-- fs/reiserfs/dir.c | 10
-rw-r--r-- fs/reiserfs/do_balan.c | 5
-rw-r--r-- fs/reiserfs/inode.c | 4
-rw-r--r-- fs/reiserfs/journal.c | 2
-rw-r--r-- fs/reiserfs/lbalance.c | 10
-rw-r--r-- fs/reiserfs/resize.c | 1
-rw-r--r-- fs/reiserfs/super.c | 58
-rw-r--r-- fs/reiserfs/xattr.c | 4
-rw-r--r-- fs/reiserfs/xattr_acl.c | 58
-rw-r--r-- fs/select.c | 41
-rw-r--r-- fs/seq_file.c | 20
-rw-r--r-- fs/smbfs/inode.c | 4
-rw-r--r-- fs/splice.c | 338
-rw-r--r-- fs/squashfs/super.c | 5
-rw-r--r-- fs/super.c | 189
-rw-r--r-- fs/sync.c | 122
-rw-r--r-- fs/sysfs/bin.c | 1
-rw-r--r-- fs/sysfs/dir.c | 2
-rw-r--r-- fs/sysfs/symlink.c | 5
-rw-r--r-- fs/sysv/dir.c | 7
-rw-r--r-- fs/sysv/file.c | 17
-rw-r--r-- fs/sysv/inode.c | 74
-rw-r--r-- fs/sysv/sysv.h | 1
-rw-r--r-- fs/ubifs/budget.c | 4
-rw-r--r-- fs/ubifs/dir.c | 19
-rw-r--r-- fs/ubifs/io.c | 67
-rw-r--r-- fs/ubifs/ioctl.c | 1
-rw-r--r-- fs/ubifs/recovery.c | 88
-rw-r--r-- fs/ubifs/replay.c | 9
-rw-r--r-- fs/ubifs/scan.c | 20
-rw-r--r-- fs/ubifs/super.c | 101
-rw-r--r-- fs/ubifs/ubifs.h | 20
-rw-r--r-- fs/ubifs/xattr.c | 2
-rw-r--r-- fs/udf/Makefile | 2
-rw-r--r-- fs/udf/balloc.c | 9
-rw-r--r-- fs/udf/dir.c | 2
-rw-r--r-- fs/udf/file.c | 2
-rw-r--r-- fs/udf/fsync.c | 52
-rw-r--r-- fs/udf/lowlevel.c | 7
-rw-r--r-- fs/udf/super.c | 25
-rw-r--r-- fs/udf/udfdecl.h | 3
-rw-r--r-- fs/ufs/dir.c | 2
-rw-r--r-- fs/ufs/file.c | 23
-rw-r--r-- fs/ufs/inode.c | 10
-rw-r--r-- fs/ufs/super.c | 65
-rw-r--r-- fs/ufs/ufs.h | 1
-rw-r--r-- fs/xattr.c | 4
-rw-r--r-- fs/xfs/Kconfig | 1
-rw-r--r-- fs/xfs/Makefile | 5
-rw-r--r-- fs/xfs/linux-2.6/kmem.c | 4
-rw-r--r-- fs/xfs/linux-2.6/xfs_acl.c | 468
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.c | 8
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.c | 6
-rw-r--r-- fs/xfs/linux-2.6/xfs_file.c | 1
-rw-r--r-- fs/xfs/linux-2.6/xfs_ioctl.c | 25
-rw-r--r-- fs/xfs/linux-2.6/xfs_iops.c | 57
-rw-r--r-- fs/xfs/linux-2.6/xfs_linux.h | 2
-rw-r--r-- fs/xfs/linux-2.6/xfs_lrw.c | 1
-rw-r--r-- fs/xfs/linux-2.6/xfs_quotaops.c | 4
-rw-r--r-- fs/xfs/linux-2.6/xfs_super.c | 63
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.c | 492
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.h | 20
-rw-r--r-- fs/xfs/linux-2.6/xfs_xattr.c | 67
-rw-r--r-- fs/xfs/quota/xfs_dquot.c | 5
-rw-r--r-- fs/xfs/quota/xfs_dquot.h | 1
-rw-r--r-- fs/xfs/quota/xfs_dquot_item.c | 1
-rw-r--r-- fs/xfs/quota/xfs_qm.c | 168
-rw-r--r-- fs/xfs/quota/xfs_qm.h | 21
-rw-r--r-- fs/xfs/quota/xfs_qm_bhv.c | 77
-rw-r--r-- fs/xfs/quota/xfs_qm_stats.c | 1
-rw-r--r-- fs/xfs/quota/xfs_qm_syscalls.c | 113
-rw-r--r-- fs/xfs/quota/xfs_trans_dquot.c | 66
-rw-r--r-- fs/xfs/xfs_acl.c | 874
-rw-r--r-- fs/xfs/xfs_acl.h | 95
-rw-r--r-- fs/xfs/xfs_ag.h | 2
-rw-r--r-- fs/xfs/xfs_arch.h | 32
-rw-r--r-- fs/xfs/xfs_attr.c | 21
-rw-r--r-- fs/xfs/xfs_bmap.c | 36
-rw-r--r-- fs/xfs/xfs_bmap_btree.c | 4
-rw-r--r-- fs/xfs/xfs_btree.c | 4
-rw-r--r-- fs/xfs/xfs_da_btree.c | 6
-rw-r--r-- fs/xfs/xfs_dir2.c | 2
-rw-r--r-- fs/xfs/xfs_filestream.c | 6
-rw-r--r-- fs/xfs/xfs_fs.h | 11
-rw-r--r-- fs/xfs/xfs_fsops.c | 20
-rw-r--r-- fs/xfs/xfs_iget.c | 259
-rw-r--r-- fs/xfs/xfs_inode.c | 11
-rw-r--r-- fs/xfs/xfs_inode.h | 18
-rw-r--r-- fs/xfs/xfs_iomap.c | 13
-rw-r--r-- fs/xfs/xfs_log.c | 2
-rw-r--r-- fs/xfs/xfs_log_recover.c | 38
-rw-r--r-- fs/xfs/xfs_mount.c | 105
-rw-r--r-- fs/xfs/xfs_mount.h | 84
-rw-r--r-- fs/xfs/xfs_qmops.c | 152
-rw-r--r-- fs/xfs/xfs_quota.h | 129
-rw-r--r-- fs/xfs/xfs_rename.c | 3
-rw-r--r-- fs/xfs/xfs_rw.c | 1
-rw-r--r-- fs/xfs/xfs_trans.c | 17
-rw-r--r-- fs/xfs/xfs_utils.c | 2
-rw-r--r-- fs/xfs/xfs_vnodeops.c | 118
-rw-r--r-- fs/xfs/xfs_vnodeops.h | 1
552 files changed, 32772 insertions, 16710 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 332b5ff02fec..f7003cfac63d 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -76,7 +76,7 @@ static const match_table_t tokens = {
  * Return 0 upon success, -ERRNO upon failure.
  */
 
-static int v9fs_parse_options(struct v9fs_session_info *v9ses)
+static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 {
 	char *options;
 	substring_t args[MAX_OPT_ARGS];
@@ -90,10 +90,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses)
 	v9ses->debug = 0;
 	v9ses->cache = 0;
 
-	if (!v9ses->options)
+	if (!opts)
 		return 0;
 
-	options = kstrdup(v9ses->options, GFP_KERNEL);
+	options = kstrdup(opts, GFP_KERNEL);
 	if (!options) {
 		P9_DPRINTK(P9_DEBUG_ERROR,
 			   "failed to allocate copy of option string\n");
@@ -206,24 +206,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	v9ses->uid = ~0;
 	v9ses->dfltuid = V9FS_DEFUID;
 	v9ses->dfltgid = V9FS_DEFGID;
-	if (data) {
-		v9ses->options = kstrdup(data, GFP_KERNEL);
-		if (!v9ses->options) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				"failed to allocate copy of option string\n");
-			retval = -ENOMEM;
-			goto error;
-		}
-	}
 
-	rc = v9fs_parse_options(v9ses);
+	rc = v9fs_parse_options(v9ses, data);
 	if (rc < 0) {
 		retval = rc;
 		goto error;
 	}
 
-	v9ses->clnt = p9_client_create(dev_name, v9ses->options);
-
+	v9ses->clnt = p9_client_create(dev_name, data);
 	if (IS_ERR(v9ses->clnt)) {
 		retval = PTR_ERR(v9ses->clnt);
 		v9ses->clnt = NULL;
@@ -280,7 +270,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 
 	__putname(v9ses->uname);
 	__putname(v9ses->aname);
-	kfree(v9ses->options);
 }
 
 /**
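The hunks above stop caching the mount string in the session and instead pass it straight through, with v9fs_parse_options() taking a throwaway kstrdup() copy before tokenizing — the token parser consumes the buffer it walks, so only a disposable copy may be handed to it. A minimal userspace sketch of the same idiom (the names and the strsep()-based parser are illustrative, not the kernel code; strsep() is a BSD/glibc extension):

#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Tokenize a mount-style option string. strsep() writes NUL bytes
 * into the buffer it scans, so we parse a duplicate and leave the
 * caller's string intact -- the same reason v9fs_parse_options()
 * kstrdup()s "opts" before walking it.
 */
static int parse_options(const char *opts)
{
	char *options, *cursor, *p;

	if (!opts)
		return 0;

	options = strdup(opts);		/* userspace stand-in for kstrdup() */
	if (!options)
		return -1;

	cursor = options;
	while ((p = strsep(&cursor, ",")) != NULL) {
		if (*p != '\0')
			printf("option: %s\n", p);
	}

	free(options);
	return 0;
}

int main(void)
{
	const char *data = "debug=1,cache=loose,access=user";

	parse_options(data);
	/* "data" is untouched and can still be handed on, as to p9_client_create() */
	printf("original: %s\n", data);
	return 0;
}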
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index a7d567192998..38762bf102a9 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -85,7 +85,6 @@ struct v9fs_session_info {
 	unsigned int afid;
 	unsigned int cache;
 
-	char *options;	/* copy of mount options */
 	char *uname;	/* user name to mount as */
 	char *aname;	/* name of remote hierarchy being mounted */
 	unsigned int maxdata;	/* max data for client interface */
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 6fcb1e7095cf..92828281a30b 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -57,7 +57,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
 	buffer = kmap(page);
 	offset = page_offset(page);
 
-	retval = v9fs_file_readn(filp, buffer, NULL, offset, PAGE_CACHE_SIZE);
+	retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset);
 	if (retval < 0)
 		goto done;
 
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 81f8bbf12f9f..06a223d50a81 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -171,7 +171,6 @@ int v9fs_uflags2omode(int uflags, int extended)
 
 /**
  * v9fs_blank_wstat - helper function to setup a 9P stat structure
- * @v9ses: 9P session info (for determining extended mode)
  * @wstat: structure to initialize
  *
  */
@@ -207,65 +206,72 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 {
+	int err;
 	struct inode *inode;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
 
 	inode = new_inode(sb);
-	if (inode) {
-		inode->i_mode = mode;
-		inode->i_uid = current_fsuid();
-		inode->i_gid = current_fsgid();
-		inode->i_blocks = 0;
-		inode->i_rdev = 0;
-		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		inode->i_mapping->a_ops = &v9fs_addr_operations;
-
-		switch (mode & S_IFMT) {
-		case S_IFIFO:
-		case S_IFBLK:
-		case S_IFCHR:
-		case S_IFSOCK:
-			if (!v9fs_extended(v9ses)) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					"special files without extended mode\n");
-				return ERR_PTR(-EINVAL);
-			}
-			init_special_inode(inode, inode->i_mode,
-					   inode->i_rdev);
-			break;
-		case S_IFREG:
-			inode->i_op = &v9fs_file_inode_operations;
-			inode->i_fop = &v9fs_file_operations;
-			break;
-		case S_IFLNK:
-			if (!v9fs_extended(v9ses)) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					"extended modes used w/o 9P2000.u\n");
-				return ERR_PTR(-EINVAL);
-			}
-			inode->i_op = &v9fs_symlink_inode_operations;
-			break;
-		case S_IFDIR:
-			inc_nlink(inode);
-			if (v9fs_extended(v9ses))
-				inode->i_op = &v9fs_dir_inode_operations_ext;
-			else
-				inode->i_op = &v9fs_dir_inode_operations;
-			inode->i_fop = &v9fs_dir_operations;
-			break;
-		default:
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				"BAD mode 0x%x S_IFMT 0x%x\n",
-				mode, mode & S_IFMT);
-			return ERR_PTR(-EINVAL);
-		}
-	} else {
+	if (!inode) {
 		P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
 		return ERR_PTR(-ENOMEM);
 	}
+
+	inode->i_mode = mode;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_blocks = 0;
+	inode->i_rdev = 0;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_mapping->a_ops = &v9fs_addr_operations;
+
+	switch (mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFBLK:
+	case S_IFCHR:
+	case S_IFSOCK:
+		if (!v9fs_extended(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				   "special files without extended mode\n");
+			err = -EINVAL;
+			goto error;
+		}
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		break;
+	case S_IFREG:
+		inode->i_op = &v9fs_file_inode_operations;
+		inode->i_fop = &v9fs_file_operations;
+		break;
+	case S_IFLNK:
+		if (!v9fs_extended(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				   "extended modes used w/o 9P2000.u\n");
+			err = -EINVAL;
+			goto error;
+		}
+		inode->i_op = &v9fs_symlink_inode_operations;
+		break;
+	case S_IFDIR:
+		inc_nlink(inode);
+		if (v9fs_extended(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_ext;
+		else
+			inode->i_op = &v9fs_dir_inode_operations;
+		inode->i_fop = &v9fs_dir_operations;
+		break;
+	default:
+		P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
+			   mode, mode & S_IFMT);
+		err = -EINVAL;
+		goto error;
+	}
+
 	return inode;
+
+error:
+	iput(inode);
+	return ERR_PTR(err);
 }
 
 /*
@@ -338,30 +344,25 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 
 	ret = NULL;
 	st = p9_client_stat(fid);
-	if (IS_ERR(st)) {
-		err = PTR_ERR(st);
-		st = NULL;
-		goto error;
-	}
+	if (IS_ERR(st))
+		return ERR_CAST(st);
 
 	umode = p9mode2unixmode(v9ses, st->mode);
 	ret = v9fs_get_inode(sb, umode);
 	if (IS_ERR(ret)) {
 		err = PTR_ERR(ret);
-		ret = NULL;
 		goto error;
 	}
 
 	v9fs_stat2inode(st, ret, sb);
 	ret->i_ino = v9fs_qid2ino(&st->qid);
+	p9stat_free(st);
 	kfree(st);
 	return ret;
 
 error:
+	p9stat_free(st);
 	kfree(st);
-	if (ret)
-		iput(ret);
-
 	return ERR_PTR(err);
 }
 
@@ -403,9 +404,9 @@ v9fs_open_created(struct inode *inode, struct file *file)
  * @v9ses: session information
  * @dir: directory that dentry is being created in
  * @dentry: dentry that is being created
+ * @extension: 9p2000.u extension string to support devices, etc.
  * @perm: create permissions
  * @mode: open mode
- * @extension: 9p2000.u extension string to support devices, etc.
  *
  */
 static struct p9_fid *
@@ -470,7 +471,10 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	dentry->d_op = &v9fs_dentry_operations;
 
 	d_instantiate(dentry, inode);
-	v9fs_fid_add(dentry, fid);
+	err = v9fs_fid_add(dentry, fid);
+	if (err < 0)
+		goto error;
+
 	return ofid;
 
 error:
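These hunks lean on the kernel's error-pointer convention: ERR_PTR() packs a negative errno into the returned pointer, IS_ERR()/PTR_ERR() unpack it, and ERR_CAST() re-types an error pointer when forwarding it, as the p9_client_stat() check above now does. A compact userspace imitation of the convention (the helpers and MAX_ERRNO are re-declared here purely for illustration; in the kernel they live in <linux/err.h>):

#include <stdio.h>

/*
 * Errno values smuggled inside a pointer: anything in the last page
 * of the address space is treated as an error, everything else as a
 * valid object pointer.
 */
#define MAX_ERRNO	4095
#define NEG_EINVAL	(-22)	/* stand-in for -EINVAL */

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int the_inode = 42;	/* pretend inode object */

static void *get_inode(int mode)
{
	if (mode < 0)
		return ERR_PTR(NEG_EINVAL);	/* failure travels in the pointer */
	return &the_inode;
}

int main(void)
{
	void *inode = get_inode(-1);

	if (IS_ERR(inode)) {
		printf("get_inode failed: %ld\n", PTR_ERR(inode));
		return 1;
	}
	return 0;
}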
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index ab5547ff29a1..8961f1a8f668 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,7 +37,6 @@
 #include <linux/mount.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -82,7 +81,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
 
 static void
 v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
-		int flags)
+		int flags, void *data)
 {
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
@@ -92,6 +91,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
 		      MS_NOATIME;
+
+	save_mount_options(sb, data);
 }
 
 /**
@@ -114,14 +115,11 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 	struct v9fs_session_info *v9ses = NULL;
 	struct p9_wstat *st = NULL;
 	int mode = S_IRWXUGO | S_ISVTX;
-	uid_t uid = current_fsuid();
-	gid_t gid = current_fsgid();
 	struct p9_fid *fid;
 	int retval = 0;
 
 	P9_DPRINTK(P9_DEBUG_VFS, " \n");
 
-	st = NULL;
 	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
 	if (!v9ses)
 		return -ENOMEM;
@@ -143,7 +141,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 		retval = PTR_ERR(sb);
 		goto free_stat;
 	}
-	v9fs_fill_super(sb, v9ses, flags);
+	v9fs_fill_super(sb, v9ses, flags, data);
 
 	inode = v9fs_get_inode(sb, S_IFDIR | mode);
 	if (IS_ERR(inode)) {
@@ -151,9 +149,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 		goto release_sb;
 	}
 
-	inode->i_uid = uid;
-	inode->i_gid = gid;
-
 	root = d_alloc_root(inode);
 	if (!root) {
 		iput(inode);
@@ -174,10 +169,8 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
 	simple_set_mnt(mnt, sb);
 	return 0;
 
-release_sb:
-	deactivate_locked_super(sb);
-
 free_stat:
+	p9stat_free(st);
 	kfree(st);
 
 clunk_fid:
@@ -186,7 +179,12 @@ clunk_fid:
 close_session:
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
+	return retval;
 
+release_sb:
+	p9stat_free(st);
+	kfree(st);
+	deactivate_locked_super(sb);
 	return retval;
 }
 
@@ -208,39 +206,23 @@ static void v9fs_kill_super(struct super_block *s)
 
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
+	s->s_fs_info = NULL;
 	P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n");
 }
 
-/**
- * v9fs_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- *
- */
-
-static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-	struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info;
-
-	seq_printf(m, "%s", v9ses->options);
-	return 0;
-}
-
 static void
 v9fs_umount_begin(struct super_block *sb)
 {
 	struct v9fs_session_info *v9ses;
 
-	lock_kernel();
 	v9ses = sb->s_fs_info;
 	v9fs_session_cancel(v9ses);
-	unlock_kernel();
 }
 
 static const struct super_operations v9fs_super_ops = {
 	.statfs = simple_statfs,
 	.clear_inode = v9fs_clear_inode,
-	.show_options = v9fs_show_options,
+	.show_options = generic_show_options,
 	.umount_begin = v9fs_umount_begin,
 };
 
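The relocated release_sb label in v9fs_get_sb() keeps the usual goto cleanup ladder intact: deactivate_locked_super() ends up in v9fs_kill_super(), which closes the session attached to the superblock itself, so that path must return on its own rather than fall through into the clunk_fid/close_session cleanups and tear things down twice. A stripped-down sketch of the pattern, with hypothetical resource names rather than the real 9p API:

#include <stdlib.h>

struct session { int unused; };
struct super   { struct session *owner; };

/* Hypothetical stand-ins for the real setup/teardown calls. */
static struct session *session_open(void)
{
	return calloc(1, sizeof(struct session));
}

static void session_close(struct session *s)
{
	free(s);
}

static struct super *super_alloc(struct session *s)
{
	struct super *sb = calloc(1, sizeof(*sb));

	if (sb)
		sb->owner = s;	/* the super now owns the session */
	return sb;
}

/*
 * Like deactivate_locked_super() reaching v9fs_kill_super():
 * destroying the super also closes the session it owns.
 */
static void super_destroy(struct super *sb)
{
	session_close(sb->owner);
	free(sb);
}

static int do_mount(struct super **out, int fail_late)
{
	struct session *ses;
	struct super *sb;
	int retval = -1;

	ses = session_open();
	if (!ses)
		return -1;

	sb = super_alloc(ses);
	if (!sb)
		goto close_session;	/* no super yet: unwind by hand */

	if (fail_late)
		goto release_sb;	/* super exists: it does the unwinding */

	*out = sb;
	return 0;

close_session:
	session_close(ses);
	return retval;

release_sb:
	/* tears down ses too -- must NOT fall through into close_session */
	super_destroy(sb);
	return retval;
}

int main(void)
{
	struct super *sb;

	return do_mount(&sb, 0) ? 1 : 0;
}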
diff --git a/fs/Kconfig b/fs/Kconfig
index 9f7270f36b2a..0e7da7bb5d93 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -39,6 +39,13 @@ config FS_POSIX_ACL
 	bool
 	default n
 
+source "fs/xfs/Kconfig"
+source "fs/gfs2/Kconfig"
+source "fs/ocfs2/Kconfig"
+source "fs/btrfs/Kconfig"
+
+endif # BLOCK
+
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EMBEDDED
 	default y
@@ -47,13 +54,6 @@ config FILE_LOCKING
 	  for filesystems like NFS and for the flock() system
 	  call. Disabling this option saves about 11k.
 
-source "fs/xfs/Kconfig"
-source "fs/gfs2/Kconfig"
-source "fs/ocfs2/Kconfig"
-source "fs/btrfs/Kconfig"
-
-endif # BLOCK
-
 source "fs/notify/Kconfig"
 
 source "fs/quota/Kconfig"
@@ -62,6 +62,16 @@ source "fs/autofs/Kconfig"
 source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
 
+config CUSE
+	tristate "Character device in Userpace support"
+	depends on FUSE_FS
+	help
+	  This FUSE extension allows character devices to be
+	  implemented in userspace.
+
+	  If you want to develop or use userspace character device
+	  based on CUSE, answer Y or M.
+
 config GENERIC_ACL
 	bool
 	select FS_POSIX_ACL
@@ -124,7 +134,7 @@ config TMPFS_POSIX_ACL
 config HUGETLBFS
 	bool "HugeTLB file system support"
 	depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \
-		   (S390 && 64BIT) || BROKEN
+		   (S390 && 64BIT) || SYS_SUPPORTS_HUGETLBFS || BROKEN
 	help
 	  hugetlbfs is a filesystem backing for HugeTLB pages, based on
 	  ramfs. For architectures that support it, say Y here and read
@@ -176,32 +186,7 @@ source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
-
-config NILFS2_FS
-	tristate "NILFS2 file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
-	select CRC32
-	help
-	  NILFS2 is a log-structured file system (LFS) supporting continuous
-	  snapshotting. In addition to versioning capability of the entire
-	  file system, users can even restore files mistakenly overwritten or
-	  destroyed just a few seconds ago. Since this file system can keep
-	  consistency like conventional LFS, it achieves quick recovery after
-	  system crashes.
-
-	  NILFS2 creates a number of checkpoints every few seconds or per
-	  synchronous write basis (unless there is no change). Users can
-	  select significant versions among continuously created checkpoints,
-	  and can change them into snapshots which will be preserved for long
-	  periods until they are changed back to checkpoints. Each
-	  snapshot is mountable as a read-only file system concurrently with
-	  its writable mount, and this feature is convenient for online backup.
-
-	  Some features including atime, extended attributes, and POSIX ACLs,
-	  are not supported yet.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called nilfs2. If unsure, say N.
+source "fs/nilfs2/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
@@ -226,10 +211,12 @@ source "fs/nfsd/Kconfig"
 
 config LOCKD
 	tristate
+	depends on FILE_LOCKING
 
 config LOCKD_V4
 	bool
 	depends on NFSD_V3 || NFS_V3
+	depends on FILE_LOCKING
 	default y
 
 config EXPORTFS
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index e0a85dbeeb88..9cc18775b832 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -1,3 +1,6 @@
+#include <linux/fs.h>
+#include <linux/adfs_fs.h>
+
 /* Internal data structures for ADFS */
 
 #define ADFS_FREE_FRAG 0
@@ -17,6 +20,58 @@
 struct buffer_head;
 
 /*
+ * adfs file system inode data in memory
+ */
+struct adfs_inode_info {
+	loff_t mmu_private;
+	unsigned long parent_id;	/* object id of parent */
+	__u32 loadaddr;			/* RISC OS load address */
+	__u32 execaddr;			/* RISC OS exec address */
+	unsigned int filetype;		/* RISC OS file type */
+	unsigned int attr;		/* RISC OS permissions */
+	unsigned int stamped:1;		/* RISC OS file has date/time */
+	struct inode vfs_inode;
+};
+
+/*
+ * Forward-declare this
+ */
+struct adfs_discmap;
+struct adfs_dir_ops;
+
+/*
+ * ADFS file system superblock data in memory
+ */
+struct adfs_sb_info {
+	struct adfs_discmap *s_map;	/* bh list containing map */
+	struct adfs_dir_ops *s_dir;	/* directory operations */
+
+	uid_t s_uid;			/* owner uid */
+	gid_t s_gid;			/* owner gid */
+	umode_t s_owner_mask;		/* ADFS owner perm -> unix perm */
+	umode_t s_other_mask;		/* ADFS other perm -> unix perm */
+
+	__u32 s_ids_per_zone;		/* max. no ids in one zone */
+	__u32 s_idlen;			/* length of ID in map */
+	__u32 s_map_size;		/* sector size of a map */
+	unsigned long s_size;		/* total size (in blocks) of this fs */
+	signed int s_map2blk;		/* shift left by this for map->sector */
+	unsigned int s_log2sharesize;	/* log2 share size */
+	__le32 s_version;		/* disc format version */
+	unsigned int s_namelen;		/* maximum number of characters in name */
+};
+
+static inline struct adfs_sb_info *ADFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct adfs_inode_info *ADFS_I(struct inode *inode)
+{
+	return container_of(inode, struct adfs_inode_info, vfs_inode);
+}
+
+/*
  * Directory handling
  */
 struct adfs_dir {
@@ -53,6 +108,7 @@ struct adfs_dir_ops {
 	int (*update)(struct adfs_dir *dir, struct object_info *obj);
 	int (*create)(struct adfs_dir *dir, struct object_info *obj);
 	int (*remove)(struct adfs_dir *dir, struct object_info *obj);
+	int (*sync)(struct adfs_dir *dir);
 	void (*free)(struct adfs_dir *dir);
 };
 
@@ -90,7 +146,8 @@ extern const struct dentry_operations adfs_dentry_operations;
 extern struct adfs_dir_ops adfs_f_dir_ops;
 extern struct adfs_dir_ops adfs_fplus_dir_ops;
 
-extern int adfs_dir_update(struct super_block *sb, struct object_info *obj);
+extern int adfs_dir_update(struct super_block *sb, struct object_info *obj,
+		int wait);
 
 /* file.c */
 extern const struct inode_operations adfs_file_inode_operations;
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index e867ccf37246..23aa52f548a0 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -9,15 +9,7 @@
  *
  * Common directory handling for ADFS
  */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/spinlock.h>
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h>	/* for file_fsync() */
-
 #include "adfs.h"
 
 /*
@@ -83,7 +75,7 @@ out:
 }
 
 int
-adfs_dir_update(struct super_block *sb, struct object_info *obj)
+adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait)
 {
 	int ret = -EINVAL;
 #ifdef CONFIG_ADFS_FS_RW
@@ -106,6 +98,12 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj)
 	ret = ops->update(&dir, obj);
 	write_unlock(&adfs_dir_lock);
 
+	if (wait) {
+		int err = ops->sync(&dir);
+		if (!ret)
+			ret = err;
+	}
+
 	ops->free(&dir);
 out:
 #endif
@@ -199,7 +197,7 @@ const struct file_operations adfs_dir_operations = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
 	.readdir = adfs_readdir,
-	.fsync = file_fsync,
+	.fsync = simple_fsync,
 };
 
 static int
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index ea7df2146921..bafc71222e25 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -9,15 +9,7 @@
  *
  * E and F format directory handling
  */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/spinlock.h>
 #include <linux/buffer_head.h>
-#include <linux/string.h>
-
 #include "adfs.h"
 #include "dir_f.h"
 
@@ -437,6 +429,22 @@ bad_dir:
 #endif
 }
 
+static int
+adfs_f_sync(struct adfs_dir *dir)
+{
+	int err = 0;
+	int i;
+
+	for (i = dir->nr_buffers - 1; i >= 0; i--) {
+		struct buffer_head *bh = dir->bh[i];
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+			err = -EIO;
+	}
+
+	return err;
+}
+
 static void
 adfs_f_free(struct adfs_dir *dir)
 {
@@ -456,5 +464,6 @@ struct adfs_dir_ops adfs_f_dir_ops = {
 	.setpos = adfs_f_setpos,
 	.getnext = adfs_f_getnext,
 	.update = adfs_f_update,
+	.sync = adfs_f_sync,
 	.free = adfs_f_free
 };
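The new adfs_f_sync() (and its twin adfs_fplus_sync() in the next file) flushes every directory buffer even after one fails, so a single bad block does not leave later buffers dirty, while still reporting the failure to the caller. The same keep-going-but-remember-the-error shape in portable C, assuming only POSIX fsync() (the file names and descriptors are made up for the demo):

#include <fcntl.h>
#include <unistd.h>

/*
 * Flush every descriptor even after a failure, so all buffers get
 * pushed out, but remember that something went wrong -- the same
 * contract as adfs_f_sync() looping over dir->bh[].
 */
static int sync_all(const int *fds, int nfds)
{
	int err = 0;
	int i;

	for (i = nfds - 1; i >= 0; i--) {
		if (fsync(fds[i]) != 0)
			err = -1;	/* keep going; report the failure */
	}

	return err;
}

int main(void)
{
	int fds[2];
	int ret;

	/* Demo files; a failed open() yields fd -1 and fsync() reports it. */
	fds[0] = open("a.tmp", O_CREAT | O_WRONLY, 0644);
	fds[1] = open("b.tmp", O_CREAT | O_WRONLY, 0644);

	ret = sync_all(fds, 2);

	close(fds[0]);
	close(fds[1]);
	return ret ? 1 : 0;
}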
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 1ec644e32df9..1796bb352d05 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -7,15 +7,7 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/spinlock.h>
 #include <linux/buffer_head.h>
-#include <linux/string.h>
-
 #include "adfs.h"
 #include "dir_fplus.h"
 
@@ -161,6 +153,22 @@ out:
 	return ret;
 }
 
+static int
+adfs_fplus_sync(struct adfs_dir *dir)
+{
+	int err = 0;
+	int i;
+
+	for (i = dir->nr_buffers - 1; i >= 0; i--) {
+		struct buffer_head *bh = dir->bh[i];
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+			err = -EIO;
+	}
+
+	return err;
+}
+
 static void
 adfs_fplus_free(struct adfs_dir *dir)
 {
@@ -175,5 +183,6 @@ struct adfs_dir_ops adfs_fplus_dir_ops = {
 	.read = adfs_fplus_read,
 	.setpos = adfs_fplus_setpos,
 	.getnext = adfs_fplus_getnext,
+	.sync = adfs_fplus_sync,
 	.free = adfs_fplus_free
 };
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 36e381c6a99a..005ea34d1758 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -19,10 +19,6 @@
  *
  * adfs regular file handling primitives
  */
-#include <linux/fs.h>
-#include <linux/buffer_head.h>	/* for file_fsync() */
-#include <linux/adfs_fs.h>
-
 #include "adfs.h"
 
 const struct file_operations adfs_file_operations = {
@@ -30,7 +26,7 @@ const struct file_operations adfs_file_operations = {
 	.read = do_sync_read,
 	.aio_read = generic_file_aio_read,
 	.mmap = generic_file_mmap,
-	.fsync = file_fsync,
+	.fsync = simple_fsync,
 	.write = do_sync_write,
 	.aio_write = generic_file_aio_write,
 	.splice_read = generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index e647200262a2..798cb071d132 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -7,17 +7,8 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/mm.h>
 #include <linux/smp_lock.h>
-#include <linux/module.h>
 #include <linux/buffer_head.h>
-
 #include "adfs.h"
 
 /*
@@ -376,7 +367,7 @@ out:
  * The adfs-specific inode data has already been updated by
  * adfs_notify_change()
  */
-int adfs_write_inode(struct inode *inode, int unused)
+int adfs_write_inode(struct inode *inode, int wait)
 {
 	struct super_block *sb = inode->i_sb;
 	struct object_info obj;
@@ -391,8 +382,7 @@ int adfs_write_inode(struct inode *inode, int unused)
 	obj.attr = ADFS_I(inode)->attr;
 	obj.size = inode->i_size;
 
-	ret = adfs_dir_update(sb, &obj);
+	ret = adfs_dir_update(sb, &obj, wait);
 	unlock_kernel();
 	return ret;
 }
-MODULE_LICENSE("GPL");
diff --git a/fs/adfs/map.c b/fs/adfs/map.c
index 92ab4fbc2031..d1a5932bb0f1 100644
--- a/fs/adfs/map.c
+++ b/fs/adfs/map.c
@@ -7,14 +7,8 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/spinlock.h>
 #include <linux/buffer_head.h>
-
 #include <asm/unaligned.h>
-
 #include "adfs.h"
 
 /*
@@ -62,7 +56,7 @@ static DEFINE_RWLOCK(adfs_map_lock);
 #define GET_FRAG_ID(_map,_start,_idmask) \
 	({ \
 		unsigned char *_m = _map + (_start >> 3); \
-		u32 _frag = get_unaligned((u32 *)_m); \
+		u32 _frag = get_unaligned_le32(_m); \
 		_frag >>= (_start & 7); \
 		_frag & _idmask; \
 	})
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index dd9becca4241..6910a98bd73c 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -8,26 +8,13 @@
  * published by the Free Software Foundation.
  */
 #include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/string.h>
 #include <linux/init.h>
 #include <linux/buffer_head.h>
-#include <linux/vfs.h>
 #include <linux/parser.h>
-#include <linux/bitops.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-
-#include <stdarg.h>
-
+#include <linux/smp_lock.h>
+#include <linux/statfs.h>
 #include "adfs.h"
 #include "dir_f.h"
 #include "dir_fplus.h"
@@ -132,11 +119,15 @@ static void adfs_put_super(struct super_block *sb)
 	int i;
 	struct adfs_sb_info *asb = ADFS_SB(sb);
 
+	lock_kernel();
+
 	for (i = 0; i < asb->s_map_size; i++)
 		brelse(asb->s_map[i].dm_bh);
 	kfree(asb->s_map);
 	kfree(asb);
 	sb->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
@@ -530,3 +521,4 @@ static void __exit exit_adfs_fs(void)
 
 module_init(init_adfs_fs)
 module_exit(exit_adfs_fs)
+MODULE_LICENSE("GPL");
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 1a2d5e3c7f4e..e511dc621a2e 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -182,6 +182,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent
 
 void affs_free_prealloc(struct inode *inode);
 extern void affs_truncate(struct inode *);
+int affs_file_fsync(struct file *, struct dentry *, int);
 
 /* dir.c */
 
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 7b36904dbeac..8ca8f3a55599 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -21,7 +21,7 @@ const struct file_operations affs_dir_operations = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
 	.readdir = affs_readdir,
-	.fsync = file_fsync,
+	.fsync = affs_file_fsync,
 };
 
 /*
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 9246cb4aa018..184e55c1c9ba 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -34,7 +34,7 @@ const struct file_operations affs_file_operations = {
 	.mmap = generic_file_mmap,
 	.open = affs_file_open,
 	.release = affs_file_release,
-	.fsync = file_fsync,
+	.fsync = affs_file_fsync,
 	.splice_read = generic_file_splice_read,
 };
 
@@ -915,3 +915,15 @@ affs_truncate(struct inode *inode)
 	}
 	affs_free_prealloc(inode);
 }
+
+int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+{
+	struct inode * inode = dentry->d_inode;
+	int ret, err;
+
+	ret = write_inode_now(inode, 0);
+	err = sync_blockdev(inode->i_sb->s_bdev);
+	if (!ret)
+		ret = err;
+	return ret;
+}
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 63f5183f263b..104fdcb3a7fc 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,6 +16,7 @@
16#include <linux/parser.h> 16#include <linux/parser.h>
17#include <linux/magic.h> 17#include <linux/magic.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/smp_lock.h>
19#include "affs.h" 20#include "affs.h"
20 21
21extern struct timezone sys_tz; 22extern struct timezone sys_tz;
@@ -24,49 +25,67 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
24static int affs_remount (struct super_block *sb, int *flags, char *data); 25static int affs_remount (struct super_block *sb, int *flags, char *data);
25 26
26static void 27static void
28affs_commit_super(struct super_block *sb, int clean)
29{
30 struct affs_sb_info *sbi = AFFS_SB(sb);
31 struct buffer_head *bh = sbi->s_root_bh;
32 struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh);
33
34 tail->bm_flag = cpu_to_be32(clean);
35 secs_to_datestamp(get_seconds(), &tail->disk_change);
36 affs_fix_checksum(sb, bh);
37 mark_buffer_dirty(bh);
38}
39
40static void
27affs_put_super(struct super_block *sb) 41affs_put_super(struct super_block *sb)
28{ 42{
29 struct affs_sb_info *sbi = AFFS_SB(sb); 43 struct affs_sb_info *sbi = AFFS_SB(sb);
30 pr_debug("AFFS: put_super()\n"); 44 pr_debug("AFFS: put_super()\n");
31 45
32 if (!(sb->s_flags & MS_RDONLY)) { 46 lock_kernel();
33 AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(1); 47
34 secs_to_datestamp(get_seconds(), 48 if (!(sb->s_flags & MS_RDONLY))
35 &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change); 49 affs_commit_super(sb, 1);
36 affs_fix_checksum(sb, sbi->s_root_bh);
37 mark_buffer_dirty(sbi->s_root_bh);
38 }
39 50
40 kfree(sbi->s_prefix); 51 kfree(sbi->s_prefix);
41 affs_free_bitmap(sb); 52 affs_free_bitmap(sb);
42 affs_brelse(sbi->s_root_bh); 53 affs_brelse(sbi->s_root_bh);
43 kfree(sbi); 54 kfree(sbi);
44 sb->s_fs_info = NULL; 55 sb->s_fs_info = NULL;
45 return; 56
57 unlock_kernel();
46} 58}
47 59
48static void 60static void
49affs_write_super(struct super_block *sb) 61affs_write_super(struct super_block *sb)
50{ 62{
51 int clean = 2; 63 int clean = 2;
52 struct affs_sb_info *sbi = AFFS_SB(sb);
53 64
65 lock_super(sb);
54 if (!(sb->s_flags & MS_RDONLY)) { 66 if (!(sb->s_flags & MS_RDONLY)) {
55 // if (sbi->s_bitmap[i].bm_bh) { 67 // if (sbi->s_bitmap[i].bm_bh) {
56 // if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) { 68 // if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) {
57 // clean = 0; 69 // clean = 0;
58 AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(clean); 70 affs_commit_super(sb, clean);
59 secs_to_datestamp(get_seconds(),
60 &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change);
61 affs_fix_checksum(sb, sbi->s_root_bh);
62 mark_buffer_dirty(sbi->s_root_bh);
63 sb->s_dirt = !clean; /* redo until bitmap synced */ 71 sb->s_dirt = !clean; /* redo until bitmap synced */
64 } else 72 } else
65 sb->s_dirt = 0; 73 sb->s_dirt = 0;
74 unlock_super(sb);
66 75
67 pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean); 76 pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean);
68} 77}
69 78
79static int
80affs_sync_fs(struct super_block *sb, int wait)
81{
82 lock_super(sb);
83 affs_commit_super(sb, 2);
84 sb->s_dirt = 0;
85 unlock_super(sb);
86 return 0;
87}
88
70static struct kmem_cache * affs_inode_cachep; 89static struct kmem_cache * affs_inode_cachep;
71 90
72static struct inode *affs_alloc_inode(struct super_block *sb) 91static struct inode *affs_alloc_inode(struct super_block *sb)
@@ -124,6 +143,7 @@ static const struct super_operations affs_sops = {
124 .clear_inode = affs_clear_inode, 143 .clear_inode = affs_clear_inode,
125 .put_super = affs_put_super, 144 .put_super = affs_put_super,
126 .write_super = affs_write_super, 145 .write_super = affs_write_super,
146 .sync_fs = affs_sync_fs,
127 .statfs = affs_statfs, 147 .statfs = affs_statfs,
128 .remount_fs = affs_remount, 148 .remount_fs = affs_remount,
129 .show_options = generic_show_options, 149 .show_options = generic_show_options,
@@ -507,6 +527,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
507 kfree(new_opts); 527 kfree(new_opts);
508 return -EINVAL; 528 return -EINVAL;
509 } 529 }
530 lock_kernel();
510 replace_mount_options(sb, new_opts); 531 replace_mount_options(sb, new_opts);
511 532
512 sbi->s_flags = mount_flags; 533 sbi->s_flags = mount_flags;
@@ -514,8 +535,10 @@ affs_remount(struct super_block *sb, int *flags, char *data)
514 sbi->s_uid = uid; 535 sbi->s_uid = uid;
515 sbi->s_gid = gid; 536 sbi->s_gid = gid;
516 537
517 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 538 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
539 unlock_kernel();
518 return 0; 540 return 0;
541 }
519 if (*flags & MS_RDONLY) { 542 if (*flags & MS_RDONLY) {
520 sb->s_dirt = 1; 543 sb->s_dirt = 1;
521 while (sb->s_dirt) 544 while (sb->s_dirt)
@@ -524,6 +547,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
524 } else 547 } else
525 res = affs_init_bitmap(sb, flags); 548 res = affs_init_bitmap(sb, flags);
526 549
550 unlock_kernel();
527 return res; 551 return res;
528} 552}
529 553
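
The two affs hunks above replace the duplicated root-block update sequence with a call to affs_commit_super() and push the BKL/lock_super() handling down into the individual operations. The helper's body is not shown in this diff; a minimal sketch reconstructed from the removed lines would be:

    static void affs_commit_super(struct super_block *sb, int clean)
    {
            struct buffer_head *bh = AFFS_SB(sb)->s_root_bh;

            AFFS_ROOT_TAIL(sb, bh)->bm_flag = cpu_to_be32(clean);
            secs_to_datestamp(get_seconds(),
                              &AFFS_ROOT_TAIL(sb, bh)->disk_change);
            affs_fix_checksum(sb, bh);
            mark_buffer_dirty(bh);
    }

The new affs_sync_fs() reuses the same helper with clean == 2, matching the old write_super() behaviour.
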
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 9bd757774c9e..88067f36e5e7 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -564,7 +564,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
564static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) 564static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
565{ 565{
566 struct afs_vnode *vnode, *dir; 566 struct afs_vnode *vnode, *dir;
567 struct afs_fid fid; 567 struct afs_fid uninitialized_var(fid);
568 struct dentry *parent; 568 struct dentry *parent;
569 struct key *key; 569 struct key *key;
570 void *dir_version; 570 void *dir_version;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0149dab365e7..681c2a7b013f 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -134,9 +134,16 @@ static int afs_readpage(struct file *file, struct page *page)
134 134
135 inode = page->mapping->host; 135 inode = page->mapping->host;
136 136
137 ASSERT(file != NULL); 137 if (file) {
138 key = file->private_data; 138 key = file->private_data;
139 ASSERT(key != NULL); 139 ASSERT(key != NULL);
140 } else {
141 key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
142 if (IS_ERR(key)) {
143 ret = PTR_ERR(key);
144 goto error_nokey;
145 }
146 }
140 147
141 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); 148 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
142 149
@@ -207,12 +214,17 @@ static int afs_readpage(struct file *file, struct page *page)
207 unlock_page(page); 214 unlock_page(page);
208 } 215 }
209 216
217 if (!file)
218 key_put(key);
210 _leave(" = 0"); 219 _leave(" = 0");
211 return 0; 220 return 0;
212 221
213error: 222error:
214 SetPageError(page); 223 SetPageError(page);
215 unlock_page(page); 224 unlock_page(page);
225 if (!file)
226 key_put(key);
227error_nokey:
216 _leave(" = %d", ret); 228 _leave(" = %d", ret);
217 return ret; 229 return ret;
218} 230}
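
afs_readpage() may now be called without a struct file (the ASSERT is gone), presumably so pages can be read in contexts that only have the inode. In that case an anonymous key for the cell is requested and must be released on every exit path; condensed from the hunks above:

    key = file ? file->private_data
               : afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
    if (IS_ERR(key))
            return PTR_ERR(key);        /* the error_nokey path */
    /* ... perform the read ... */
    if (!file)
            key_put(key);               /* only drop a ref we took ourselves */
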
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 210acafe4a9b..3ff8bdd18fb3 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -432,7 +432,6 @@ vfs_rejected_lock:
432 list_del_init(&fl->fl_u.afs.link); 432 list_del_init(&fl->fl_u.afs.link);
433 if (list_empty(&vnode->granted_locks)) 433 if (list_empty(&vnode->granted_locks))
434 afs_defer_unlock(vnode, key); 434 afs_defer_unlock(vnode, key);
435 spin_unlock(&vnode->lock);
436 goto abort_attempt; 435 goto abort_attempt;
437} 436}
438 437
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 2d33a5f7d218..0dd4dafee10b 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -12,6 +12,7 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <rxrpc/packet.h>
15#include "internal.h" 16#include "internal.h"
16#include "afs_fs.h" 17#include "afs_fs.h"
17 18
@@ -54,6 +55,21 @@ int afs_abort_to_error(u32 abort_code)
54 case 0x2f6df24: return -ENOLCK; 55 case 0x2f6df24: return -ENOLCK;
55 case 0x2f6df26: return -ENOTEMPTY; 56 case 0x2f6df26: return -ENOTEMPTY;
56 case 0x2f6df78: return -EDQUOT; 57 case 0x2f6df78: return -EDQUOT;
58
59 case RXKADINCONSISTENCY: return -EPROTO;
60 case RXKADPACKETSHORT: return -EPROTO;
61 case RXKADLEVELFAIL: return -EKEYREJECTED;
62 case RXKADTICKETLEN: return -EKEYREJECTED;
63 case RXKADOUTOFSEQUENCE: return -EPROTO;
64 case RXKADNOAUTH: return -EKEYREJECTED;
65 case RXKADBADKEY: return -EKEYREJECTED;
66 case RXKADBADTICKET: return -EKEYREJECTED;
67 case RXKADUNKNOWNKEY: return -EKEYREJECTED;
68 case RXKADEXPIRED: return -EKEYEXPIRED;
69 case RXKADSEALEDINCON: return -EKEYREJECTED;
70 case RXKADDATALEN: return -EKEYREJECTED;
71 case RXKADILLEGALLEVEL: return -EKEYREJECTED;
72
57 default: return -EREMOTEIO; 73 default: return -EREMOTEIO;
58 } 74 }
59} 75}
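
Mapping the rxkad abort codes means security-layer failures now surface as distinct errnos instead of the catch-all -EREMOTEIO; the vlocation change below depends on seeing them. For illustration:

    /* illustrative only */
    int err;

    err = afs_abort_to_error(RXKADEXPIRED);   /* -EKEYEXPIRED */
    err = afs_abort_to_error(RXKADBADTICKET); /* -EKEYREJECTED */
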
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 2b9e2d03a390..5ffb570cd3a8 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -17,7 +17,6 @@
17#include <linux/pagemap.h> 17#include <linux/pagemap.h>
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/namei.h> 19#include <linux/namei.h>
20#include <linux/mnt_namespace.h>
21#include "internal.h" 20#include "internal.h"
22 21
23 22
@@ -244,7 +243,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
244 case -EBUSY: 243 case -EBUSY:
245 /* someone else made a mount here whilst we were busy */ 244 /* someone else made a mount here whilst we were busy */
246 while (d_mountpoint(nd->path.dentry) && 245 while (d_mountpoint(nd->path.dentry) &&
247 follow_down(&nd->path.mnt, &nd->path.dentry)) 246 follow_down(&nd->path))
248 ; 247 ;
249 err = 0; 248 err = 0;
250 default: 249 default:
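
This hunk reflects the series-wide change of follow_down() from taking separate vfsmount and dentry pointers to taking a single struct path, which it advances in place. The new calling convention, as a sketch:

    struct path path = { .mnt = mnt, .dentry = dentry };

    while (d_mountpoint(path.dentry) && follow_down(&path))
            ;       /* path now refers to the topmost mount stacked here */
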
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 76828e5f8a39..e1ea1c240b6a 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -18,6 +18,7 @@
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/init.h> 19#include <linux/init.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/smp_lock.h>
21#include <linux/fs.h> 22#include <linux/fs.h>
22#include <linux/pagemap.h> 23#include <linux/pagemap.h>
23#include <linux/parser.h> 24#include <linux/parser.h>
@@ -440,8 +441,12 @@ static void afs_put_super(struct super_block *sb)
440 441
441 _enter(""); 442 _enter("");
442 443
444 lock_kernel();
445
443 afs_put_volume(as->volume); 446 afs_put_volume(as->volume);
444 447
448 unlock_kernel();
449
445 _leave(""); 450 _leave("");
446} 451}
447 452
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index ec2a7431e458..6e689208def2 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -65,6 +65,8 @@ static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl,
65 goto out; 65 goto out;
66 goto rotate; 66 goto rotate;
67 case -ENOMEDIUM: 67 case -ENOMEDIUM:
68 case -EKEYREJECTED:
69 case -EKEYEXPIRED:
68 goto out; 70 goto out;
69 default: 71 default:
70 ret = -EIO; 72 ret = -EIO;
diff --git a/fs/aio.c b/fs/aio.c
index 76da12537956..d065b2c3273e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -485,6 +485,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
485{ 485{
486 assert_spin_locked(&ctx->ctx_lock); 486 assert_spin_locked(&ctx->ctx_lock);
487 487
488 if (req->ki_eventfd != NULL)
489 eventfd_ctx_put(req->ki_eventfd);
488 if (req->ki_dtor) 490 if (req->ki_dtor)
489 req->ki_dtor(req); 491 req->ki_dtor(req);
490 if (req->ki_iovec != &req->ki_inline_vec) 492 if (req->ki_iovec != &req->ki_inline_vec)
@@ -509,8 +511,6 @@ static void aio_fput_routine(struct work_struct *data)
509 /* Complete the fput(s) */ 511 /* Complete the fput(s) */
510 if (req->ki_filp != NULL) 512 if (req->ki_filp != NULL)
511 __fput(req->ki_filp); 513 __fput(req->ki_filp);
512 if (req->ki_eventfd != NULL)
513 __fput(req->ki_eventfd);
514 514
515 /* Link the iocb into the context's free list */ 515 /* Link the iocb into the context's free list */
516 spin_lock_irq(&ctx->ctx_lock); 516 spin_lock_irq(&ctx->ctx_lock);
@@ -528,8 +528,6 @@ static void aio_fput_routine(struct work_struct *data)
528 */ 528 */
529static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) 529static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
530{ 530{
531 int schedule_putreq = 0;
532
533 dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", 531 dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
534 req, atomic_long_read(&req->ki_filp->f_count)); 532 req, atomic_long_read(&req->ki_filp->f_count));
535 533
@@ -549,24 +547,16 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
549 * we would not be holding the last reference to the file*, so 547 * we would not be holding the last reference to the file*, so
550 * this function will be executed w/out any aio kthread wakeup. 548 * this function will be executed w/out any aio kthread wakeup.
551 */ 549 */
552 if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) 550 if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
553 schedule_putreq++;
554 else
555 req->ki_filp = NULL;
556 if (req->ki_eventfd != NULL) {
557 if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
558 schedule_putreq++;
559 else
560 req->ki_eventfd = NULL;
561 }
562 if (unlikely(schedule_putreq)) {
563 get_ioctx(ctx); 551 get_ioctx(ctx);
564 spin_lock(&fput_lock); 552 spin_lock(&fput_lock);
565 list_add(&req->ki_list, &fput_head); 553 list_add(&req->ki_list, &fput_head);
566 spin_unlock(&fput_lock); 554 spin_unlock(&fput_lock);
567 queue_work(aio_wq, &fput_work); 555 queue_work(aio_wq, &fput_work);
568 } else 556 } else {
557 req->ki_filp = NULL;
569 really_put_req(ctx, req); 558 really_put_req(ctx, req);
559 }
570 return 1; 560 return 1;
571} 561}
572 562
@@ -1622,7 +1612,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1622 * an eventfd() fd, and will be signaled for each completed 1612 * an eventfd() fd, and will be signaled for each completed
1623 * event using the eventfd_signal() function. 1613 * event using the eventfd_signal() function.
1624 */ 1614 */
1625 req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd); 1615 req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
1626 if (IS_ERR(req->ki_eventfd)) { 1616 if (IS_ERR(req->ki_eventfd)) {
1627 ret = PTR_ERR(req->ki_eventfd); 1617 ret = PTR_ERR(req->ki_eventfd);
1628 req->ki_eventfd = NULL; 1618 req->ki_eventfd = NULL;
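
The aio eventfd reference switches from a struct file, which could only be dropped via the deferred __fput() machinery, to a struct eventfd_ctx that really_put_req() can release directly. The new lifetime, roughly:

    struct eventfd_ctx *ev;

    ev = eventfd_ctx_fdget(resfd);  /* takes a ctx ref, not a file ref */
    if (IS_ERR(ev))
            return PTR_ERR(ev);
    /* on completion: */
    eventfd_signal(ev, 1);
    /* on teardown, safe from any context: */
    eventfd_ctx_put(ev);

This is why __aio_put_req() no longer needs the schedule_putreq juggling for the eventfd file.
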
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 1dd96d4406c0..47d4a01c5393 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -52,6 +52,19 @@ static const struct dentry_operations anon_inodefs_dentry_operations = {
52 .d_delete = anon_inodefs_delete_dentry, 52 .d_delete = anon_inodefs_delete_dentry,
53}; 53};
54 54
55/*
56 * nop .set_page_dirty method so that people can use .page_mkwrite on
57 * anon inodes.
58 */
59static int anon_set_page_dirty(struct page *page)
60{
61 return 0;
62};
63
64static const struct address_space_operations anon_aops = {
65 .set_page_dirty = anon_set_page_dirty,
66};
67
55/** 68/**
56 * anon_inode_getfd - creates a new file instance by hooking it up to an 69 * anon_inode_getfd - creates a new file instance by hooking it up to an
57 * anonymous inode, and a dentry that describe the "class" 70 * anonymous inode, and a dentry that describe the "class"
@@ -151,6 +164,8 @@ static struct inode *anon_inode_mkinode(void)
151 164
152 inode->i_fop = &anon_inode_fops; 165 inode->i_fop = &anon_inode_fops;
153 166
167 inode->i_mapping->a_ops = &anon_aops;
168
154 /* 169 /*
155 * Mark the inode dirty from the very beginning, 170 * Mark the inode dirty from the very beginning,
156 * that way it will never be moved to the dirty 171 * that way it will never be moved to the dirty
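
Anonymous inodes previously had no address_space_operations, so a write fault on an mmap'd anon inode would fall through to the buffer-head-based default set_page_dirty path; the nop method makes .page_mkwrite usable on them. A consumer would look roughly like this (names hypothetical):

    /* hypothetical driver code, for illustration */
    static const struct vm_operations_struct my_vm_ops = {
            .fault        = my_fault,
            .page_mkwrite = my_page_mkwrite,  /* now safe on an anon inode */
    };
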
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 4eb4d8dfb2f1..2316e944a109 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -85,13 +85,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
85 } 85 }
86 path.mnt = mnt; 86 path.mnt = mnt;
87 path_get(&path); 87 path_get(&path);
88 if (!follow_down(&path.mnt, &path.dentry)) { 88 if (!follow_down(&path)) {
89 path_put(&path); 89 path_put(&path);
90 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); 90 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
91 continue; 91 continue;
92 } 92 }
93 while (d_mountpoint(path.dentry) && 93 while (d_mountpoint(path.dentry) && follow_down(&path));
94 follow_down(&path.mnt, &path.dentry))
95 ; 94 ;
96 umount_ok = may_umount(path.mnt); 95 umount_ok = may_umount(path.mnt);
97 path_put(&path); 96 path_put(&path);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index b7ff33c63101..8f7cdde41733 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -223,12 +223,12 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
223int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); 223int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
224void autofs4_catatonic_mode(struct autofs_sb_info *); 224void autofs4_catatonic_mode(struct autofs_sb_info *);
225 225
226static inline int autofs4_follow_mount(struct vfsmount **mnt, struct dentry **dentry) 226static inline int autofs4_follow_mount(struct path *path)
227{ 227{
228 int res = 0; 228 int res = 0;
229 229
230 while (d_mountpoint(*dentry)) { 230 while (d_mountpoint(path->dentry)) {
231 int followed = follow_down(mnt, dentry); 231 int followed = follow_down(path);
232 if (!followed) 232 if (!followed)
233 break; 233 break;
234 res = 1; 234 res = 1;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 84168c0dcc2d..00bf8fcb245f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -19,7 +19,6 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/compat.h> 20#include <linux/compat.h>
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/smp_lock.h>
23#include <linux/magic.h> 22#include <linux/magic.h>
24#include <linux/dcache.h> 23#include <linux/dcache.h>
25#include <linux/uaccess.h> 24#include <linux/uaccess.h>
@@ -192,77 +191,42 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
192 return 0; 191 return 0;
193} 192}
194 193
195/* 194static int find_autofs_mount(const char *pathname,
196 * Walk down the mount stack looking for an autofs mount that 195 struct path *res,
197 * has the requested device number (aka. new_encode_dev(sb->s_dev). 196 int test(struct path *path, void *data),
198 */ 197 void *data)
199static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno)
200{ 198{
201 struct dentry *dentry; 199 struct path path;
202 struct inode *inode; 200 int err = kern_path(pathname, 0, &path);
203 struct super_block *sb; 201 if (err)
204 dev_t s_dev; 202 return err;
205 unsigned int err;
206
207 err = -ENOENT; 203 err = -ENOENT;
208 204 while (path.dentry == path.mnt->mnt_root) {
209 /* Lookup the dentry name at the base of our mount point */ 205 if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) {
210 dentry = d_lookup(nd->path.dentry, &nd->last); 206 if (test(&path, data)) {
211 if (!dentry) 207 path_get(&path);
212 goto out; 208 if (!err) /* already found some */
213 209 path_put(res);
214 dput(nd->path.dentry); 210 *res = path;
215 nd->path.dentry = dentry;
216
217 /* And follow the mount stack looking for our autofs mount */
218 while (follow_down(&nd->path.mnt, &nd->path.dentry)) {
219 inode = nd->path.dentry->d_inode;
220 if (!inode)
221 break;
222
223 sb = inode->i_sb;
224 s_dev = new_encode_dev(sb->s_dev);
225 if (devno == s_dev) {
226 if (sb->s_magic == AUTOFS_SUPER_MAGIC) {
227 err = 0; 211 err = 0;
228 break;
229 } 212 }
230 } 213 }
214 if (!follow_up(&path))
215 break;
231 } 216 }
232out: 217 path_put(&path);
233 return err; 218 return err;
234} 219}
235 220
236/* 221static int test_by_dev(struct path *path, void *p)
237 * Walk down the mount stack looking for an autofs mount that
238 * has the requested mount type (ie. indirect, direct or offset).
239 */
240static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type)
241{ 222{
242 struct dentry *dentry; 223 return path->mnt->mnt_sb->s_dev == *(dev_t *)p;
243 struct autofs_info *ino; 224}
244 unsigned int err;
245
246 err = -ENOENT;
247
248 /* Lookup the dentry name at the base of our mount point */
249 dentry = d_lookup(nd->path.dentry, &nd->last);
250 if (!dentry)
251 goto out;
252
253 dput(nd->path.dentry);
254 nd->path.dentry = dentry;
255 225
256 /* And follow the mount stack looking for our autofs mount */ 226static int test_by_type(struct path *path, void *p)
257 while (follow_down(&nd->path.mnt, &nd->path.dentry)) { 227{
258 ino = autofs4_dentry_ino(nd->path.dentry); 228 struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
259 if (ino && ino->sbi->type & type) { 229 return ino && ino->sbi->type & *(unsigned *)p;
260 err = 0;
261 break;
262 }
263 }
264out:
265 return err;
266} 230}
267 231
268static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) 232static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
@@ -283,31 +247,25 @@ static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
283 * Open a file descriptor on the autofs mount point corresponding 247 * Open a file descriptor on the autofs mount point corresponding
284 * to the given path and device number (aka. new_encode_dev(sb->s_dev)). 248 * to the given path and device number (aka. new_encode_dev(sb->s_dev)).
285 */ 249 */
286static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid) 250static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
287{ 251{
288 struct file *filp;
289 struct nameidata nd;
290 int err, fd; 252 int err, fd;
291 253
292 fd = get_unused_fd(); 254 fd = get_unused_fd();
293 if (likely(fd >= 0)) { 255 if (likely(fd >= 0)) {
294 /* Get nameidata of the parent directory */ 256 struct file *filp;
295 err = path_lookup(path, LOOKUP_PARENT, &nd); 257 struct path path;
258
259 err = find_autofs_mount(name, &path, test_by_dev, &devid);
296 if (err) 260 if (err)
297 goto out; 261 goto out;
298 262
299 /* 263 /*
300 * Search down, within the parent, looking for an 264 * Find autofs super block that has the device number
301 * autofs super block that has the device number
302 * corresponding to the autofs fs we want to open. 265 * corresponding to the autofs fs we want to open.
303 */ 266 */
304 err = autofs_dev_ioctl_find_super(&nd, devid);
305 if (err) {
306 path_put(&nd.path);
307 goto out;
308 }
309 267
310 filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY, 268 filp = dentry_open(path.dentry, path.mnt, O_RDONLY,
311 current_cred()); 269 current_cred());
312 if (IS_ERR(filp)) { 270 if (IS_ERR(filp)) {
313 err = PTR_ERR(filp); 271 err = PTR_ERR(filp);
@@ -340,7 +298,7 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
340 param->ioctlfd = -1; 298 param->ioctlfd = -1;
341 299
342 path = param->path; 300 path = param->path;
343 devid = param->openmount.devid; 301 devid = new_decode_dev(param->openmount.devid);
344 302
345 err = 0; 303 err = 0;
346 fd = autofs_dev_ioctl_open_mountpoint(path, devid); 304 fd = autofs_dev_ioctl_open_mountpoint(path, devid);
@@ -475,8 +433,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
475 struct autofs_dev_ioctl *param) 433 struct autofs_dev_ioctl *param)
476{ 434{
477 struct autofs_info *ino; 435 struct autofs_info *ino;
478 struct nameidata nd; 436 struct path path;
479 const char *path;
480 dev_t devid; 437 dev_t devid;
481 int err = -ENOENT; 438 int err = -ENOENT;
482 439
@@ -485,32 +442,24 @@ static int autofs_dev_ioctl_requester(struct file *fp,
485 goto out; 442 goto out;
486 } 443 }
487 444
488 path = param->path; 445 devid = sbi->sb->s_dev;
489 devid = new_encode_dev(sbi->sb->s_dev);
490 446
491 param->requester.uid = param->requester.gid = -1; 447 param->requester.uid = param->requester.gid = -1;
492 448
493 /* Get nameidata of the parent directory */ 449 err = find_autofs_mount(param->path, &path, test_by_dev, &devid);
494 err = path_lookup(path, LOOKUP_PARENT, &nd);
495 if (err) 450 if (err)
496 goto out; 451 goto out;
497 452
498 err = autofs_dev_ioctl_find_super(&nd, devid); 453 ino = autofs4_dentry_ino(path.dentry);
499 if (err)
500 goto out_release;
501
502 ino = autofs4_dentry_ino(nd.path.dentry);
503 if (ino) { 454 if (ino) {
504 err = 0; 455 err = 0;
505 autofs4_expire_wait(nd.path.dentry); 456 autofs4_expire_wait(path.dentry);
506 spin_lock(&sbi->fs_lock); 457 spin_lock(&sbi->fs_lock);
507 param->requester.uid = ino->uid; 458 param->requester.uid = ino->uid;
508 param->requester.gid = ino->gid; 459 param->requester.gid = ino->gid;
509 spin_unlock(&sbi->fs_lock); 460 spin_unlock(&sbi->fs_lock);
510 } 461 }
511 462 path_put(&path);
512out_release:
513 path_put(&nd.path);
514out: 463out:
515 return err; 464 return err;
516} 465}
@@ -569,8 +518,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
569 struct autofs_sb_info *sbi, 518 struct autofs_sb_info *sbi,
570 struct autofs_dev_ioctl *param) 519 struct autofs_dev_ioctl *param)
571{ 520{
572 struct nameidata nd; 521 struct path path;
573 const char *path; 522 const char *name;
574 unsigned int type; 523 unsigned int type;
575 unsigned int devid, magic; 524 unsigned int devid, magic;
576 int err = -ENOENT; 525 int err = -ENOENT;
@@ -580,71 +529,46 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
580 goto out; 529 goto out;
581 } 530 }
582 531
583 path = param->path; 532 name = param->path;
584 type = param->ismountpoint.in.type; 533 type = param->ismountpoint.in.type;
585 534
586 param->ismountpoint.out.devid = devid = 0; 535 param->ismountpoint.out.devid = devid = 0;
587 param->ismountpoint.out.magic = magic = 0; 536 param->ismountpoint.out.magic = magic = 0;
588 537
589 if (!fp || param->ioctlfd == -1) { 538 if (!fp || param->ioctlfd == -1) {
590 if (autofs_type_any(type)) { 539 if (autofs_type_any(type))
591 struct super_block *sb; 540 err = kern_path(name, LOOKUP_FOLLOW, &path);
592 541 else
593 err = path_lookup(path, LOOKUP_FOLLOW, &nd); 542 err = find_autofs_mount(name, &path, test_by_type, &type);
594 if (err) 543 if (err)
595 goto out; 544 goto out;
596 545 devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
597 sb = nd.path.dentry->d_sb;
598 devid = new_encode_dev(sb->s_dev);
599 } else {
600 struct autofs_info *ino;
601
602 err = path_lookup(path, LOOKUP_PARENT, &nd);
603 if (err)
604 goto out;
605
606 err = autofs_dev_ioctl_find_sbi_type(&nd, type);
607 if (err)
608 goto out_release;
609
610 ino = autofs4_dentry_ino(nd.path.dentry);
611 devid = autofs4_get_dev(ino->sbi);
612 }
613
614 err = 0; 546 err = 0;
615 if (nd.path.dentry->d_inode && 547 if (path.dentry->d_inode &&
616 nd.path.mnt->mnt_root == nd.path.dentry) { 548 path.mnt->mnt_root == path.dentry) {
617 err = 1; 549 err = 1;
618 magic = nd.path.dentry->d_inode->i_sb->s_magic; 550 magic = path.dentry->d_inode->i_sb->s_magic;
619 } 551 }
620 } else { 552 } else {
621 dev_t dev = autofs4_get_dev(sbi); 553 dev_t dev = sbi->sb->s_dev;
622 554
623 err = path_lookup(path, LOOKUP_PARENT, &nd); 555 err = find_autofs_mount(name, &path, test_by_dev, &dev);
624 if (err) 556 if (err)
625 goto out; 557 goto out;
626 558
627 err = autofs_dev_ioctl_find_super(&nd, dev); 559 devid = new_encode_dev(dev);
628 if (err)
629 goto out_release;
630
631 devid = dev;
632 560
633 err = have_submounts(nd.path.dentry); 561 err = have_submounts(path.dentry);
634 562
635 if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { 563 if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) {
636 if (follow_down(&nd.path.mnt, &nd.path.dentry)) { 564 if (follow_down(&path))
637 struct inode *inode = nd.path.dentry->d_inode; 565 magic = path.mnt->mnt_sb->s_magic;
638 magic = inode->i_sb->s_magic;
639 }
640 } 566 }
641 } 567 }
642 568
643 param->ismountpoint.out.devid = devid; 569 param->ismountpoint.out.devid = devid;
644 param->ismountpoint.out.magic = magic; 570 param->ismountpoint.out.magic = magic;
645 571 path_put(&path);
646out_release:
647 path_put(&nd.path);
648out: 572out:
649 return err; 573 return err;
650} 574}
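
All three open-coded nameidata walks collapse into find_autofs_mount(): kern_path() resolves the user-supplied path, then the loop climbs the stack of mounts at that location with follow_up(), remembering the autofs mount that satisfies the caller's test callback. Typical use, taken from the requester hunk above:

    struct path path;
    dev_t devid = sbi->sb->s_dev;
    int err;

    err = find_autofs_mount(param->path, &path, test_by_dev, &devid);
    if (!err) {
            /* ... inspect path.dentry / path.mnt ... */
            path_put(&path);
    }
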
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 3077d8f16523..aa39ae83f019 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -48,19 +48,19 @@ static inline int autofs4_can_expire(struct dentry *dentry,
48static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) 48static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
49{ 49{
50 struct dentry *top = dentry; 50 struct dentry *top = dentry;
51 struct path path = {.mnt = mnt, .dentry = dentry};
51 int status = 1; 52 int status = 1;
52 53
53 DPRINTK("dentry %p %.*s", 54 DPRINTK("dentry %p %.*s",
54 dentry, (int)dentry->d_name.len, dentry->d_name.name); 55 dentry, (int)dentry->d_name.len, dentry->d_name.name);
55 56
56 mntget(mnt); 57 path_get(&path);
57 dget(dentry);
58 58
59 if (!follow_down(&mnt, &dentry)) 59 if (!follow_down(&path))
60 goto done; 60 goto done;
61 61
62 if (is_autofs4_dentry(dentry)) { 62 if (is_autofs4_dentry(path.dentry)) {
63 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 63 struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb);
64 64
65 /* This is an autofs submount, we can't expire it */ 65 /* This is an autofs submount, we can't expire it */
66 if (autofs_type_indirect(sbi->type)) 66 if (autofs_type_indirect(sbi->type))
@@ -70,7 +70,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
70 * Otherwise it's an offset mount and we need to check 70 * Otherwise it's an offset mount and we need to check
71 * if we can umount its mount, if there is one. 71 * if we can umount its mount, if there is one.
72 */ 72 */
73 if (!d_mountpoint(dentry)) { 73 if (!d_mountpoint(path.dentry)) {
74 status = 0; 74 status = 0;
75 goto done; 75 goto done;
76 } 76 }
@@ -86,8 +86,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
86 status = 0; 86 status = 0;
87done: 87done:
88 DPRINTK("returning = %d", status); 88 DPRINTK("returning = %d", status);
89 dput(dentry); 89 path_put(&path);
90 mntput(mnt);
91 return status; 90 return status;
92} 91}
93 92
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e383bf0334f1..b96a3c57359d 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -181,7 +181,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
181 nd->flags); 181 nd->flags);
182 /* 182 /*
183 * For an expire of a covered direct or offset mount we need 183 * For an expire of a covered direct or offset mount we need
184 * to beeak out of follow_down() at the autofs mount trigger 184 * to break out of follow_down() at the autofs mount trigger
185 * (d_mounted--), so we can see the expiring flag, and manage 185 * (d_mounted--), so we can see the expiring flag, and manage
186 * the blocking and following here until the expire is completed. 186 * the blocking and following here until the expire is completed.
187 */ 187 */
@@ -190,7 +190,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
190 if (ino->flags & AUTOFS_INF_EXPIRING) { 190 if (ino->flags & AUTOFS_INF_EXPIRING) {
191 spin_unlock(&sbi->fs_lock); 191 spin_unlock(&sbi->fs_lock);
192 /* Follow down to our covering mount. */ 192 /* Follow down to our covering mount. */
193 if (!follow_down(&nd->path.mnt, &nd->path.dentry)) 193 if (!follow_down(&nd->path))
194 goto done; 194 goto done;
195 goto follow; 195 goto follow;
196 } 196 }
@@ -230,8 +230,7 @@ follow:
230 * to follow it. 230 * to follow it.
231 */ 231 */
232 if (d_mountpoint(dentry)) { 232 if (d_mountpoint(dentry)) {
233 if (!autofs4_follow_mount(&nd->path.mnt, 233 if (!autofs4_follow_mount(&nd->path)) {
234 &nd->path.dentry)) {
235 status = -ENOENT; 234 status = -ENOENT;
236 goto out_error; 235 goto out_error;
237 } 236 }
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 76afd0d6b86c..615d5496fe0f 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -513,7 +513,7 @@ befs_utf2nls(struct super_block *sb, const char *in,
513{ 513{
514 struct nls_table *nls = BEFS_SB(sb)->nls; 514 struct nls_table *nls = BEFS_SB(sb)->nls;
515 int i, o; 515 int i, o;
516 wchar_t uni; 516 unicode_t uni;
517 int unilen, utflen; 517 int unilen, utflen;
518 char *result; 518 char *result;
519 /* The utf8->nls conversion won't make the final nls string bigger 519 /* The utf8->nls conversion won't make the final nls string bigger
@@ -539,16 +539,16 @@ befs_utf2nls(struct super_block *sb, const char *in,
539 for (i = o = 0; i < in_len; i += utflen, o += unilen) { 539 for (i = o = 0; i < in_len; i += utflen, o += unilen) {
540 540
541 /* convert from UTF-8 to Unicode */ 541 /* convert from UTF-8 to Unicode */
542 utflen = utf8_mbtowc(&uni, &in[i], in_len - i); 542 utflen = utf8_to_utf32(&in[i], in_len - i, &uni);
543 if (utflen < 0) { 543 if (utflen < 0)
544 goto conv_err; 544 goto conv_err;
545 }
546 545
547 /* convert from Unicode to nls */ 546 /* convert from Unicode to nls */
547 if (uni > MAX_WCHAR_T)
548 goto conv_err;
548 unilen = nls->uni2char(uni, &result[o], in_len - o); 549 unilen = nls->uni2char(uni, &result[o], in_len - o);
549 if (unilen < 0) { 550 if (unilen < 0)
550 goto conv_err; 551 goto conv_err;
551 }
552 } 552 }
553 result[o] = '\0'; 553 result[o] = '\0';
554 *out_len = o; 554 *out_len = o;
@@ -619,15 +619,13 @@ befs_nls2utf(struct super_block *sb, const char *in,
619 619
620 /* convert from nls to unicode */ 620 /* convert from nls to unicode */
621 unilen = nls->char2uni(&in[i], in_len - i, &uni); 621 unilen = nls->char2uni(&in[i], in_len - i, &uni);
622 if (unilen < 0) { 622 if (unilen < 0)
623 goto conv_err; 623 goto conv_err;
624 }
625 624
626 /* convert from unicode to UTF-8 */ 625 /* convert from unicode to UTF-8 */
627 utflen = utf8_wctomb(&result[o], uni, 3); 626 utflen = utf32_to_utf8(uni, &result[o], 3);
628 if (utflen <= 0) { 627 if (utflen <= 0)
629 goto conv_err; 628 goto conv_err;
630 }
631 } 629 }
632 630
633 result[o] = '\0'; 631 result[o] = '\0';
@@ -747,7 +745,6 @@ befs_put_super(struct super_block *sb)
747 745
748 kfree(sb->s_fs_info); 746 kfree(sb->s_fs_info);
749 sb->s_fs_info = NULL; 747 sb->s_fs_info = NULL;
750 return;
751} 748}
752 749
753/* Allocate private field of the superblock, fill it. 750/* Allocate private field of the superblock, fill it.
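
befs moves from the wchar_t-based utf8_mbtowc()/utf8_wctomb() pair to utf8_to_utf32()/utf32_to_utf8() on a full unicode_t, so code points above the 16-bit range the NLS tables understand must now be rejected explicitly. The decode step, in essence:

    unicode_t uni;
    int utflen, unilen;

    utflen = utf8_to_utf32(&in[i], in_len - i, &uni);
    if (utflen < 0 || uni > MAX_WCHAR_T)  /* nls handles only 16-bit chars */
            goto conv_err;
    unilen = nls->uni2char(uni, &result[o], in_len - o);
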
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 4dd1b623f937..1e41aadb1068 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -8,7 +8,6 @@
8#include <linux/time.h> 8#include <linux/time.h>
9#include <linux/string.h> 9#include <linux/string.h>
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/smp_lock.h>
12#include <linux/buffer_head.h> 11#include <linux/buffer_head.h>
13#include <linux/sched.h> 12#include <linux/sched.h>
14#include "bfs.h" 13#include "bfs.h"
@@ -79,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
79const struct file_operations bfs_dir_operations = { 78const struct file_operations bfs_dir_operations = {
80 .read = generic_read_dir, 79 .read = generic_read_dir,
81 .readdir = bfs_readdir, 80 .readdir = bfs_readdir,
82 .fsync = file_fsync, 81 .fsync = simple_fsync,
83 .llseek = generic_file_llseek, 82 .llseek = generic_file_llseek,
84}; 83};
85 84
@@ -205,7 +204,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
205 inode->i_nlink = 1; 204 inode->i_nlink = 1;
206 } 205 }
207 de->ino = 0; 206 de->ino = 0;
208 mark_buffer_dirty(bh); 207 mark_buffer_dirty_inode(bh, dir);
209 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; 208 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
210 mark_inode_dirty(dir); 209 mark_inode_dirty(dir);
211 inode->i_ctime = dir->i_ctime; 210 inode->i_ctime = dir->i_ctime;
@@ -267,7 +266,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
267 new_inode->i_ctime = CURRENT_TIME_SEC; 266 new_inode->i_ctime = CURRENT_TIME_SEC;
268 inode_dec_link_count(new_inode); 267 inode_dec_link_count(new_inode);
269 } 268 }
270 mark_buffer_dirty(old_bh); 269 mark_buffer_dirty_inode(old_bh, old_dir);
271 error = 0; 270 error = 0;
272 271
273end_rename: 272end_rename:
@@ -320,7 +319,7 @@ static int bfs_add_entry(struct inode *dir, const unsigned char *name,
320 for (i = 0; i < BFS_NAMELEN; i++) 319 for (i = 0; i < BFS_NAMELEN; i++)
321 de->name[i] = 320 de->name[i] =
322 (i < namelen) ? name[i] : 0; 321 (i < namelen) ? name[i] : 0;
323 mark_buffer_dirty(bh); 322 mark_buffer_dirty_inode(bh, dir);
324 brelse(bh); 323 brelse(bh);
325 return 0; 324 return 0;
326 } 325 }
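
Using mark_buffer_dirty_inode() ties each dirty directory block to the owning inode's private buffer list, which is what allows the directory's .fsync to become the generic simple_fsync() instead of the old file_fsync(). The pattern:

    mark_buffer_dirty_inode(bh, dir);   /* attach buffer to dir for fsync */
    dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
    mark_inode_dirty(dir);
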
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 6a021265f018..88b9a3ff44e4 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -11,7 +11,6 @@
11 11
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/smp_lock.h>
15#include "bfs.h" 14#include "bfs.h"
16 15
17#undef DEBUG 16#undef DEBUG
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index cc4062d12ca2..6f60336c6628 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -30,6 +30,7 @@ MODULE_LICENSE("GPL");
30#define dprintf(x...) 30#define dprintf(x...)
31#endif 31#endif
32 32
33static void bfs_write_super(struct super_block *s);
33void dump_imap(const char *prefix, struct super_block *s); 34void dump_imap(const char *prefix, struct super_block *s);
34 35
35struct inode *bfs_iget(struct super_block *sb, unsigned long ino) 36struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
@@ -97,14 +98,15 @@ error:
97 return ERR_PTR(-EIO); 98 return ERR_PTR(-EIO);
98} 99}
99 100
100static int bfs_write_inode(struct inode *inode, int unused) 101static int bfs_write_inode(struct inode *inode, int wait)
101{ 102{
103 struct bfs_sb_info *info = BFS_SB(inode->i_sb);
102 unsigned int ino = (u16)inode->i_ino; 104 unsigned int ino = (u16)inode->i_ino;
103 unsigned long i_sblock; 105 unsigned long i_sblock;
104 struct bfs_inode *di; 106 struct bfs_inode *di;
105 struct buffer_head *bh; 107 struct buffer_head *bh;
106 int block, off; 108 int block, off;
107 struct bfs_sb_info *info = BFS_SB(inode->i_sb); 109 int err = 0;
108 110
109 dprintf("ino=%08x\n", ino); 111 dprintf("ino=%08x\n", ino);
110 112
@@ -145,9 +147,14 @@ static int bfs_write_inode(struct inode *inode, int unused)
145 di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); 147 di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
146 148
147 mark_buffer_dirty(bh); 149 mark_buffer_dirty(bh);
150 if (wait) {
151 sync_dirty_buffer(bh);
152 if (buffer_req(bh) && !buffer_uptodate(bh))
153 err = -EIO;
154 }
148 brelse(bh); 155 brelse(bh);
149 mutex_unlock(&info->bfs_lock); 156 mutex_unlock(&info->bfs_lock);
150 return 0; 157 return err;
151} 158}
152 159
153static void bfs_delete_inode(struct inode *inode) 160static void bfs_delete_inode(struct inode *inode)
@@ -209,6 +216,26 @@ static void bfs_delete_inode(struct inode *inode)
209 clear_inode(inode); 216 clear_inode(inode);
210} 217}
211 218
219static int bfs_sync_fs(struct super_block *sb, int wait)
220{
221 struct bfs_sb_info *info = BFS_SB(sb);
222
223 mutex_lock(&info->bfs_lock);
224 mark_buffer_dirty(info->si_sbh);
225 sb->s_dirt = 0;
226 mutex_unlock(&info->bfs_lock);
227
228 return 0;
229}
230
231static void bfs_write_super(struct super_block *sb)
232{
233 if (!(sb->s_flags & MS_RDONLY))
234 bfs_sync_fs(sb, 1);
235 else
236 sb->s_dirt = 0;
237}
238
212static void bfs_put_super(struct super_block *s) 239static void bfs_put_super(struct super_block *s)
213{ 240{
214 struct bfs_sb_info *info = BFS_SB(s); 241 struct bfs_sb_info *info = BFS_SB(s);
@@ -216,11 +243,18 @@ static void bfs_put_super(struct super_block *s)
216 if (!info) 243 if (!info)
217 return; 244 return;
218 245
246 lock_kernel();
247
248 if (s->s_dirt)
249 bfs_write_super(s);
250
219 brelse(info->si_sbh); 251 brelse(info->si_sbh);
220 mutex_destroy(&info->bfs_lock); 252 mutex_destroy(&info->bfs_lock);
221 kfree(info->si_imap); 253 kfree(info->si_imap);
222 kfree(info); 254 kfree(info);
223 s->s_fs_info = NULL; 255 s->s_fs_info = NULL;
256
257 unlock_kernel();
224} 258}
225 259
226static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) 260static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -240,17 +274,6 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
240 return 0; 274 return 0;
241} 275}
242 276
243static void bfs_write_super(struct super_block *s)
244{
245 struct bfs_sb_info *info = BFS_SB(s);
246
247 mutex_lock(&info->bfs_lock);
248 if (!(s->s_flags & MS_RDONLY))
249 mark_buffer_dirty(info->si_sbh);
250 s->s_dirt = 0;
251 mutex_unlock(&info->bfs_lock);
252}
253
254static struct kmem_cache *bfs_inode_cachep; 277static struct kmem_cache *bfs_inode_cachep;
255 278
256static struct inode *bfs_alloc_inode(struct super_block *sb) 279static struct inode *bfs_alloc_inode(struct super_block *sb)
@@ -298,6 +321,7 @@ static const struct super_operations bfs_sops = {
298 .delete_inode = bfs_delete_inode, 321 .delete_inode = bfs_delete_inode,
299 .put_super = bfs_put_super, 322 .put_super = bfs_put_super,
300 .write_super = bfs_write_super, 323 .write_super = bfs_write_super,
324 .sync_fs = bfs_sync_fs,
301 .statfs = bfs_statfs, 325 .statfs = bfs_statfs,
302}; 326};
303 327
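
bfs now implements ->sync_fs, write_super() becomes a thin wrapper around it, and bfs_write_inode() honours its wait argument: a synchronous writeback pushes the buffer out and reports real I/O errors instead of merely dirtying it. The wait path:

    mark_buffer_dirty(bh);
    if (wait) {
            sync_dirty_buffer(bh);
            if (buffer_req(bh) && !buffer_uptodate(bh))
                    err = -EIO;     /* the write actually failed */
    }
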
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 40381df34869..b7c1603cd4bd 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1340,8 +1340,10 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
1340 prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; 1340 prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
1341 prstatus->pr_sigpend = p->pending.signal.sig[0]; 1341 prstatus->pr_sigpend = p->pending.signal.sig[0];
1342 prstatus->pr_sighold = p->blocked.sig[0]; 1342 prstatus->pr_sighold = p->blocked.sig[0];
1343 rcu_read_lock();
1344 prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
1345 rcu_read_unlock();
1343 prstatus->pr_pid = task_pid_vnr(p); 1346 prstatus->pr_pid = task_pid_vnr(p);
1344 prstatus->pr_ppid = task_pid_vnr(p->real_parent);
1345 prstatus->pr_pgrp = task_pgrp_vnr(p); 1347 prstatus->pr_pgrp = task_pgrp_vnr(p);
1346 prstatus->pr_sid = task_session_vnr(p); 1348 prstatus->pr_sid = task_session_vnr(p);
1347 if (thread_group_leader(p)) { 1349 if (thread_group_leader(p)) {
@@ -1382,8 +1384,10 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
1382 psinfo->pr_psargs[i] = ' '; 1384 psinfo->pr_psargs[i] = ' ';
1383 psinfo->pr_psargs[len] = 0; 1385 psinfo->pr_psargs[len] = 0;
1384 1386
1387 rcu_read_lock();
1388 psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
1389 rcu_read_unlock();
1385 psinfo->pr_pid = task_pid_vnr(p); 1390 psinfo->pr_pid = task_pid_vnr(p);
1386 psinfo->pr_ppid = task_pid_vnr(p->real_parent);
1387 psinfo->pr_pgrp = task_pgrp_vnr(p); 1391 psinfo->pr_pgrp = task_pgrp_vnr(p);
1388 psinfo->pr_sid = task_session_vnr(p); 1392 psinfo->pr_sid = task_session_vnr(p);
1389 1393
@@ -1518,11 +1522,11 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
1518 info->thread = NULL; 1522 info->thread = NULL;
1519 1523
1520 psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); 1524 psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
1521 fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
1522
1523 if (psinfo == NULL) 1525 if (psinfo == NULL)
1524 return 0; 1526 return 0;
1525 1527
1528 fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
1529
1526 /* 1530 /*
1527 * Figure out how many notes we're going to need for each thread. 1531 * Figure out how many notes we're going to need for each thread.
1528 */ 1532 */
@@ -1925,7 +1929,10 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1925 elf = kmalloc(sizeof(*elf), GFP_KERNEL); 1929 elf = kmalloc(sizeof(*elf), GFP_KERNEL);
1926 if (!elf) 1930 if (!elf)
1927 goto out; 1931 goto out;
1928 1932 /*
1933 * The number of segs are recored into ELF header as 16bit value.
1934 * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
1935 */
1929 segs = current->mm->map_count; 1936 segs = current->mm->map_count;
1930#ifdef ELF_CORE_EXTRA_PHDRS 1937#ifdef ELF_CORE_EXTRA_PHDRS
1931 segs += ELF_CORE_EXTRA_PHDRS; 1938 segs += ELF_CORE_EXTRA_PHDRS;
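
task_struct->real_parent is RCU-protected, and the core-dump helpers were dereferencing it bare, racing with reparenting. Both fill_prstatus() and fill_psinfo(), here and in the fdpic variant below, now use:

    rcu_read_lock();
    ppid = task_pid_vnr(rcu_dereference(p->real_parent));
    rcu_read_unlock();
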
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index fdb66faa24f1..20fbeced472b 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1387,8 +1387,10 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
1387 prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; 1387 prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
1388 prstatus->pr_sigpend = p->pending.signal.sig[0]; 1388 prstatus->pr_sigpend = p->pending.signal.sig[0];
1389 prstatus->pr_sighold = p->blocked.sig[0]; 1389 prstatus->pr_sighold = p->blocked.sig[0];
1390 rcu_read_lock();
1391 prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
1392 rcu_read_unlock();
1390 prstatus->pr_pid = task_pid_vnr(p); 1393 prstatus->pr_pid = task_pid_vnr(p);
1391 prstatus->pr_ppid = task_pid_vnr(p->real_parent);
1392 prstatus->pr_pgrp = task_pgrp_vnr(p); 1394 prstatus->pr_pgrp = task_pgrp_vnr(p);
1393 prstatus->pr_sid = task_session_vnr(p); 1395 prstatus->pr_sid = task_session_vnr(p);
1394 if (thread_group_leader(p)) { 1396 if (thread_group_leader(p)) {
@@ -1432,8 +1434,10 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
1432 psinfo->pr_psargs[i] = ' '; 1434 psinfo->pr_psargs[i] = ' ';
1433 psinfo->pr_psargs[len] = 0; 1435 psinfo->pr_psargs[len] = 0;
1434 1436
1437 rcu_read_lock();
1438 psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
1439 rcu_read_unlock();
1435 psinfo->pr_pid = task_pid_vnr(p); 1440 psinfo->pr_pid = task_pid_vnr(p);
1436 psinfo->pr_ppid = task_pid_vnr(p->real_parent);
1437 psinfo->pr_pgrp = task_pgrp_vnr(p); 1441 psinfo->pr_pgrp = task_pgrp_vnr(p);
1438 psinfo->pr_sid = task_session_vnr(p); 1442 psinfo->pr_sid = task_session_vnr(p);
1439 1443
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 697f6b5f1313..e92f229e3c6e 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -828,15 +828,22 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
828 if (IS_ERR(bprm.file)) 828 if (IS_ERR(bprm.file))
829 return res; 829 return res;
830 830
831 bprm.cred = prepare_exec_creds();
832 res = -ENOMEM;
833 if (!bprm.cred)
834 goto out;
835
831 res = prepare_binprm(&bprm); 836 res = prepare_binprm(&bprm);
832 837
833 if (res <= (unsigned long)-4096) 838 if (res <= (unsigned long)-4096)
834 res = load_flat_file(&bprm, libs, id, NULL); 839 res = load_flat_file(&bprm, libs, id, NULL);
835 if (bprm.file) { 840
836 allow_write_access(bprm.file); 841 abort_creds(bprm.cred);
837 fput(bprm.file); 842
838 bprm.file = NULL; 843out:
839 } 844 allow_write_access(bprm.file);
845 fput(bprm.file);
846
840 return(res); 847 return(res);
841} 848}
842 849
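
load_flat_shared_library() built a linux_binprm with no credentials, yet prepare_binprm() dereferences bprm->cred; allocating exec creds for the duration of the load and discarding them with abort_creds() closes that NULL dereference. The fixed sequence, condensed:

    bprm.cred = prepare_exec_creds();
    if (!bprm.cred) {
            res = -ENOMEM;
            goto out;
    }
    res = prepare_binprm(&bprm);        /* safe: bprm.cred is valid */
    /* ... load_flat_file() ... */
    abort_creds(bprm.cred);             /* creds only needed during load */
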
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 31c46a241bac..49a34e7f7306 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * bio-integrity.c - bio data integrity extensions 2 * bio-integrity.c - bio data integrity extensions
3 * 3 *
4 * Copyright (C) 2007, 2008 Oracle Corporation 4 * Copyright (C) 2007, 2008, 2009 Oracle Corporation
5 * Written by: Martin K. Petersen <martin.petersen@oracle.com> 5 * Written by: Martin K. Petersen <martin.petersen@oracle.com>
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
@@ -25,63 +25,121 @@
25#include <linux/bio.h> 25#include <linux/bio.h>
26#include <linux/workqueue.h> 26#include <linux/workqueue.h>
27 27
28static struct kmem_cache *bio_integrity_slab __read_mostly; 28struct integrity_slab {
29static mempool_t *bio_integrity_pool; 29 struct kmem_cache *slab;
30static struct bio_set *integrity_bio_set; 30 unsigned short nr_vecs;
31 char name[8];
32};
33
34#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) }
35struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = {
36 IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES),
37};
38#undef IS
39
31static struct workqueue_struct *kintegrityd_wq; 40static struct workqueue_struct *kintegrityd_wq;
32 41
42static inline unsigned int vecs_to_idx(unsigned int nr)
43{
44 switch (nr) {
45 case 1:
46 return 0;
47 case 2 ... 4:
48 return 1;
49 case 5 ... 16:
50 return 2;
51 case 17 ... 64:
52 return 3;
53 case 65 ... 128:
54 return 4;
55 case 129 ... BIO_MAX_PAGES:
56 return 5;
57 default:
58 BUG();
59 }
60}
61
62static inline int use_bip_pool(unsigned int idx)
63{
64 if (idx == BIOVEC_NR_POOLS)
65 return 1;
66
67 return 0;
68}
69
33/** 70/**
34 * bio_integrity_alloc - Allocate integrity payload and attach it to bio 71 * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
35 * @bio: bio to attach integrity metadata to 72 * @bio: bio to attach integrity metadata to
36 * @gfp_mask: Memory allocation mask 73 * @gfp_mask: Memory allocation mask
37 * @nr_vecs: Number of integrity metadata scatter-gather elements 74 * @nr_vecs: Number of integrity metadata scatter-gather elements
75 * @bs: bio_set to allocate from
38 * 76 *
39 * Description: This function prepares a bio for attaching integrity 77 * Description: This function prepares a bio for attaching integrity
40 * metadata. nr_vecs specifies the maximum number of pages containing 78 * metadata. nr_vecs specifies the maximum number of pages containing
41 * integrity metadata that can be attached. 79 * integrity metadata that can be attached.
42 */ 80 */
43struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, 81struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
44 gfp_t gfp_mask, 82 gfp_t gfp_mask,
45 unsigned int nr_vecs) 83 unsigned int nr_vecs,
84 struct bio_set *bs)
46{ 85{
47 struct bio_integrity_payload *bip; 86 struct bio_integrity_payload *bip;
48 struct bio_vec *iv; 87 unsigned int idx = vecs_to_idx(nr_vecs);
49 unsigned long idx;
50 88
51 BUG_ON(bio == NULL); 89 BUG_ON(bio == NULL);
90 bip = NULL;
52 91
53 bip = mempool_alloc(bio_integrity_pool, gfp_mask); 92 /* Lower order allocations come straight from slab */
54 if (unlikely(bip == NULL)) { 93 if (!use_bip_pool(idx))
55 printk(KERN_ERR "%s: could not alloc bip\n", __func__); 94 bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask);
56 return NULL;
57 }
58 95
59 memset(bip, 0, sizeof(*bip)); 96 /* Use mempool if lower order alloc failed or max vecs were requested */
97 if (bip == NULL) {
98 bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
60 99
61 iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, integrity_bio_set); 100 if (unlikely(bip == NULL)) {
62 if (unlikely(iv == NULL)) { 101 printk(KERN_ERR "%s: could not alloc bip\n", __func__);
63 printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__); 102 return NULL;
64 mempool_free(bip, bio_integrity_pool); 103 }
65 return NULL;
66 } 104 }
67 105
68 bip->bip_pool = idx; 106 memset(bip, 0, sizeof(*bip));
69 bip->bip_vec = iv; 107
108 bip->bip_slab = idx;
70 bip->bip_bio = bio; 109 bip->bip_bio = bio;
71 bio->bi_integrity = bip; 110 bio->bi_integrity = bip;
72 111
73 return bip; 112 return bip;
74} 113}
114EXPORT_SYMBOL(bio_integrity_alloc_bioset);
115
116/**
117 * bio_integrity_alloc - Allocate integrity payload and attach it to bio
118 * @bio: bio to attach integrity metadata to
119 * @gfp_mask: Memory allocation mask
120 * @nr_vecs: Number of integrity metadata scatter-gather elements
121 *
122 * Description: This function prepares a bio for attaching integrity
123 * metadata. nr_vecs specifies the maximum number of pages containing
124 * integrity metadata that can be attached.
125 */
126struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
127 gfp_t gfp_mask,
128 unsigned int nr_vecs)
129{
130 return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
131}
75EXPORT_SYMBOL(bio_integrity_alloc); 132EXPORT_SYMBOL(bio_integrity_alloc);
76 133
77/** 134/**
78 * bio_integrity_free - Free bio integrity payload 135 * bio_integrity_free - Free bio integrity payload
79 * @bio: bio containing bip to be freed 136 * @bio: bio containing bip to be freed
137 * @bs: bio_set this bio was allocated from
80 * 138 *
81 * Description: Used to free the integrity portion of a bio. Usually 139 * Description: Used to free the integrity portion of a bio. Usually
82 * called from bio_free(). 140 * called from bio_free().
83 */ 141 */
84void bio_integrity_free(struct bio *bio) 142void bio_integrity_free(struct bio *bio, struct bio_set *bs)
85{ 143{
86 struct bio_integrity_payload *bip = bio->bi_integrity; 144 struct bio_integrity_payload *bip = bio->bi_integrity;
87 145
@@ -92,8 +150,10 @@ void bio_integrity_free(struct bio *bio)
92 && bip->bip_buf != NULL) 150 && bip->bip_buf != NULL)
93 kfree(bip->bip_buf); 151 kfree(bip->bip_buf);
94 152
95 bvec_free_bs(integrity_bio_set, bip->bip_vec, bip->bip_pool); 153 if (use_bip_pool(bip->bip_slab))
96 mempool_free(bip, bio_integrity_pool); 154 mempool_free(bip, bs->bio_integrity_pool);
155 else
156 kmem_cache_free(bip_slab[bip->bip_slab].slab, bip);
97 157
98 bio->bi_integrity = NULL; 158 bio->bi_integrity = NULL;
99} 159}
@@ -114,7 +174,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
114 struct bio_integrity_payload *bip = bio->bi_integrity; 174 struct bio_integrity_payload *bip = bio->bi_integrity;
115 struct bio_vec *iv; 175 struct bio_vec *iv;
116 176
117 if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) { 177 if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) {
118 printk(KERN_ERR "%s: bip_vec full\n", __func__); 178 printk(KERN_ERR "%s: bip_vec full\n", __func__);
119 return 0; 179 return 0;
120 } 180 }
@@ -647,8 +707,8 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
647 bp->iv1 = bip->bip_vec[0]; 707 bp->iv1 = bip->bip_vec[0];
648 bp->iv2 = bip->bip_vec[0]; 708 bp->iv2 = bip->bip_vec[0];
649 709
650 bp->bip1.bip_vec = &bp->iv1; 710 bp->bip1.bip_vec[0] = bp->iv1;
651 bp->bip2.bip_vec = &bp->iv2; 711 bp->bip2.bip_vec[0] = bp->iv2;
652 712
653 bp->iv1.bv_len = sectors * bi->tuple_size; 713 bp->iv1.bv_len = sectors * bi->tuple_size;
654 bp->iv2.bv_offset += sectors * bi->tuple_size; 714 bp->iv2.bv_offset += sectors * bi->tuple_size;
@@ -667,17 +727,19 @@ EXPORT_SYMBOL(bio_integrity_split);
667 * @bio: New bio 727 * @bio: New bio
668 * @bio_src: Original bio 728 * @bio_src: Original bio
669 * @gfp_mask: Memory allocation mask 729 * @gfp_mask: Memory allocation mask
730 * @bs: bio_set to allocate bip from
670 * 731 *
671 * Description: Called to allocate a bip when cloning a bio 732 * Description: Called to allocate a bip when cloning a bio
672 */ 733 */
673int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask) 734int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
735 gfp_t gfp_mask, struct bio_set *bs)
674{ 736{
675 struct bio_integrity_payload *bip_src = bio_src->bi_integrity; 737 struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
676 struct bio_integrity_payload *bip; 738 struct bio_integrity_payload *bip;
677 739
678 BUG_ON(bip_src == NULL); 740 BUG_ON(bip_src == NULL);
679 741
680 bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); 742 bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs);
681 743
682 if (bip == NULL) 744 if (bip == NULL)
683 return -EIO; 745 return -EIO;
@@ -693,25 +755,43 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask)
693} 755}
694EXPORT_SYMBOL(bio_integrity_clone); 756EXPORT_SYMBOL(bio_integrity_clone);
695 757
696static int __init bio_integrity_init(void) 758int bioset_integrity_create(struct bio_set *bs, int pool_size)
697{ 759{
698 kintegrityd_wq = create_workqueue("kintegrityd"); 760 unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
761
762 bs->bio_integrity_pool =
763 mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
699 764
765 if (!bs->bio_integrity_pool)
766 return -1;
767
768 return 0;
769}
770EXPORT_SYMBOL(bioset_integrity_create);
771
772void bioset_integrity_free(struct bio_set *bs)
773{
774 if (bs->bio_integrity_pool)
775 mempool_destroy(bs->bio_integrity_pool);
776}
777EXPORT_SYMBOL(bioset_integrity_free);
778
779void __init bio_integrity_init(void)
780{
781 unsigned int i;
782
783 kintegrityd_wq = create_workqueue("kintegrityd");
700 if (!kintegrityd_wq) 784 if (!kintegrityd_wq)
701 panic("Failed to create kintegrityd\n"); 785 panic("Failed to create kintegrityd\n");
702 786
703 bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, 787 for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) {
704 SLAB_HWCACHE_ALIGN|SLAB_PANIC); 788 unsigned int size;
705 789
706 bio_integrity_pool = mempool_create_slab_pool(BIO_POOL_SIZE, 790 size = sizeof(struct bio_integrity_payload)
707 bio_integrity_slab); 791 + bip_slab[i].nr_vecs * sizeof(struct bio_vec);
708 if (!bio_integrity_pool)
709 panic("bio_integrity: can't allocate bip pool\n");
710 792
711 integrity_bio_set = bioset_create(BIO_POOL_SIZE, 0); 793 bip_slab[i].slab =
712 if (!integrity_bio_set) 794 kmem_cache_create(bip_slab[i].name, size, 0,
713 panic("bio_integrity: can't allocate bio_set\n"); 795 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
714 796 }
715 return 0;
716} 797}
717subsys_initcall(bio_integrity_init);
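
Integrity payloads now embed their bio_vecs and are carved from six size-graded slabs, mirroring the ordinary biovec pools; only the largest size falls back to a per-bio_set mempool, preserving the forward-progress guarantee under memory pressure. The slab sizing from bio_integrity_init():

    size = sizeof(struct bio_integrity_payload)
         + bip_slab[i].nr_vecs * sizeof(struct bio_vec);

    bip_slab[i].slab = kmem_cache_create(bip_slab[i].name, size, 0,
                                         SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
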
diff --git a/fs/bio.c b/fs/bio.c
index 98711647ece4..76738005c8e8 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -25,11 +25,9 @@
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/mempool.h> 26#include <linux/mempool.h>
27#include <linux/workqueue.h> 27#include <linux/workqueue.h>
28#include <linux/blktrace_api.h>
29#include <trace/block.h>
30#include <scsi/sg.h> /* for struct sg_iovec */ 28#include <scsi/sg.h> /* for struct sg_iovec */
31 29
32DEFINE_TRACE(block_split); 30#include <trace/events/block.h>
33 31
34/* 32/*
35 * Test patch to inline a certain number of bi_io_vec's inside the bio 33 * Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -240,7 +238,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
240 bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); 238 bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
241 239
242 if (bio_integrity(bio)) 240 if (bio_integrity(bio))
243 bio_integrity_free(bio); 241 bio_integrity_free(bio, bs);
244 242
245 /* 243 /*
246 * If we have front padding, adjust the bio pointer before freeing 244 * If we have front padding, adjust the bio pointer before freeing
@@ -343,7 +341,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
343static void bio_kmalloc_destructor(struct bio *bio) 341static void bio_kmalloc_destructor(struct bio *bio)
344{ 342{
345 if (bio_integrity(bio)) 343 if (bio_integrity(bio))
346 bio_integrity_free(bio); 344 bio_integrity_free(bio, fs_bio_set);
347 kfree(bio); 345 kfree(bio);
348} 346}
349 347
@@ -359,9 +357,9 @@ static void bio_kmalloc_destructor(struct bio *bio)
359 * 357 *
360 * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate 358 * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
361 * a bio. This is due to the mempool guarantees. To make this work, callers 359 * a bio. This is due to the mempool guarantees. To make this work, callers
362 * must never allocate more than 1 bio at the time from this pool. Callers 360 * must never allocate more than 1 bio at a time from this pool. Callers
363 * that need to allocate more than 1 bio must always submit the previously 361 * that need to allocate more than 1 bio must always submit the previously
364 * allocate bio for IO before attempting to allocate a new one. Failure to 362 * allocated bio for IO before attempting to allocate a new one. Failure to
365 * do so can cause livelocks under memory pressure. 363 * do so can cause livelocks under memory pressure.
366 * 364 *
367 **/ 365 **/
@@ -474,7 +472,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
474 if (bio_integrity(bio)) { 472 if (bio_integrity(bio)) {
475 int ret; 473 int ret;
476 474
477 ret = bio_integrity_clone(b, bio, gfp_mask); 475 ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set);
478 476
479 if (ret < 0) { 477 if (ret < 0) {
480 bio_put(b); 478 bio_put(b);
@@ -499,11 +497,11 @@ int bio_get_nr_vecs(struct block_device *bdev)
499 struct request_queue *q = bdev_get_queue(bdev); 497 struct request_queue *q = bdev_get_queue(bdev);
500 int nr_pages; 498 int nr_pages;
501 499
502 nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; 500 nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
503 if (nr_pages > q->max_phys_segments) 501 if (nr_pages > queue_max_phys_segments(q))
504 nr_pages = q->max_phys_segments; 502 nr_pages = queue_max_phys_segments(q);
505 if (nr_pages > q->max_hw_segments) 503 if (nr_pages > queue_max_hw_segments(q))
506 nr_pages = q->max_hw_segments; 504 nr_pages = queue_max_hw_segments(q);
507 505
508 return nr_pages; 506 return nr_pages;
509} 507}
@@ -562,8 +560,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
562 * make this too complex. 560 * make this too complex.
563 */ 561 */
564 562
565 while (bio->bi_phys_segments >= q->max_phys_segments 563 while (bio->bi_phys_segments >= queue_max_phys_segments(q)
566 || bio->bi_phys_segments >= q->max_hw_segments) { 564 || bio->bi_phys_segments >= queue_max_hw_segments(q)) {
567 565
568 if (retried_segments) 566 if (retried_segments)
569 return 0; 567 return 0;
@@ -634,7 +632,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
634int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, 632int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
635 unsigned int len, unsigned int offset) 633 unsigned int len, unsigned int offset)
636{ 634{
637 return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors); 635 return __bio_add_page(q, bio, page, len, offset,
636 queue_max_hw_sectors(q));
638} 637}
639 638
640/** 639/**
@@ -654,7 +653,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
654 unsigned int offset) 653 unsigned int offset)
655{ 654{
656 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 655 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
657 return __bio_add_page(q, bio, page, len, offset, q->max_sectors); 656 return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
658} 657}
659 658
660struct bio_map_data { 659struct bio_map_data {
@@ -706,14 +705,13 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
706} 705}
707 706
708static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, 707static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
709 struct sg_iovec *iov, int iov_count, int uncopy, 708 struct sg_iovec *iov, int iov_count,
710 int do_free_page) 709 int to_user, int from_user, int do_free_page)
711{ 710{
712 int ret = 0, i; 711 int ret = 0, i;
713 struct bio_vec *bvec; 712 struct bio_vec *bvec;
714 int iov_idx = 0; 713 int iov_idx = 0;
715 unsigned int iov_off = 0; 714 unsigned int iov_off = 0;
716 int read = bio_data_dir(bio) == READ;
717 715
718 __bio_for_each_segment(bvec, bio, i, 0) { 716 __bio_for_each_segment(bvec, bio, i, 0) {
719 char *bv_addr = page_address(bvec->bv_page); 717 char *bv_addr = page_address(bvec->bv_page);
@@ -721,20 +719,21 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
721 719
722 while (bv_len && iov_idx < iov_count) { 720 while (bv_len && iov_idx < iov_count) {
723 unsigned int bytes; 721 unsigned int bytes;
724 char *iov_addr; 722 char __user *iov_addr;
725 723
726 bytes = min_t(unsigned int, 724 bytes = min_t(unsigned int,
727 iov[iov_idx].iov_len - iov_off, bv_len); 725 iov[iov_idx].iov_len - iov_off, bv_len);
728 iov_addr = iov[iov_idx].iov_base + iov_off; 726 iov_addr = iov[iov_idx].iov_base + iov_off;
729 727
730 if (!ret) { 728 if (!ret) {
731 if (!read && !uncopy) 729 if (to_user)
732 ret = copy_from_user(bv_addr, iov_addr,
733 bytes);
734 if (read && uncopy)
735 ret = copy_to_user(iov_addr, bv_addr, 730 ret = copy_to_user(iov_addr, bv_addr,
736 bytes); 731 bytes);
737 732
733 if (from_user)
734 ret = copy_from_user(bv_addr, iov_addr,
735 bytes);
736
738 if (ret) 737 if (ret)
739 ret = -EFAULT; 738 ret = -EFAULT;
740 } 739 }
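The old signature inferred the copy direction from bio_data_dir() plus the uncopy flag; the new one makes the caller state it outright. A hedged illustration of the two call patterns used in the hunks below:

	/* completion path: copy kernel pages back to the user iovecs */
	__bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs,
		       1 /* to_user */, 0 /* from_user */, bmd->is_our_pages);

	/* setup path: pull user data into freshly allocated pages */
	__bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count,
		       0 /* to_user */, 1 /* from_user */, 0);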
@@ -771,7 +770,8 @@ int bio_uncopy_user(struct bio *bio)
771 770
772 if (!bio_flagged(bio, BIO_NULL_MAPPED)) 771 if (!bio_flagged(bio, BIO_NULL_MAPPED))
773 ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, 772 ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
774 bmd->nr_sgvecs, 1, bmd->is_our_pages); 773 bmd->nr_sgvecs, bio_data_dir(bio) == READ,
774 0, bmd->is_our_pages);
775 bio_free_map_data(bmd); 775 bio_free_map_data(bmd);
776 bio_put(bio); 776 bio_put(bio);
777 return ret; 777 return ret;
@@ -876,8 +876,9 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
876 /* 876 /*
877 * success 877 * success
878 */ 878 */
879 if (!write_to_vm && (!map_data || !map_data->null_mapped)) { 879 if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
880 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); 880 (map_data && map_data->from_user)) {
881 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0);
881 if (ret) 882 if (ret)
882 goto cleanup; 883 goto cleanup;
883 } 884 }
@@ -1201,7 +1202,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
1201 char *addr = page_address(bvec->bv_page); 1202 char *addr = page_address(bvec->bv_page);
1202 int len = bmd->iovecs[i].bv_len; 1203 int len = bmd->iovecs[i].bv_len;
1203 1204
1204 if (read && !err) 1205 if (read)
1205 memcpy(p, addr, len); 1206 memcpy(p, addr, len);
1206 1207
1207 __free_page(bvec->bv_page); 1208 __free_page(bvec->bv_page);
@@ -1490,11 +1491,12 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1490sector_t bio_sector_offset(struct bio *bio, unsigned short index, 1491sector_t bio_sector_offset(struct bio *bio, unsigned short index,
1491 unsigned int offset) 1492 unsigned int offset)
1492{ 1493{
1493 unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue); 1494 unsigned int sector_sz;
1494 struct bio_vec *bv; 1495 struct bio_vec *bv;
1495 sector_t sectors; 1496 sector_t sectors;
1496 int i; 1497 int i;
1497 1498
1499 sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue);
1498 sectors = 0; 1500 sectors = 0;
1499 1501
1500 if (index >= bio->bi_idx) 1502 if (index >= bio->bi_idx)
@@ -1539,6 +1541,7 @@ void bioset_free(struct bio_set *bs)
1539 if (bs->bio_pool) 1541 if (bs->bio_pool)
1540 mempool_destroy(bs->bio_pool); 1542 mempool_destroy(bs->bio_pool);
1541 1543
1544 bioset_integrity_free(bs);
1542 biovec_free_pools(bs); 1545 biovec_free_pools(bs);
1543 bio_put_slab(bs); 1546 bio_put_slab(bs);
1544 1547
@@ -1579,6 +1582,9 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1579 if (!bs->bio_pool) 1582 if (!bs->bio_pool)
1580 goto bad; 1583 goto bad;
1581 1584
1585 if (bioset_integrity_create(bs, pool_size))
1586 goto bad;
1587
1582 if (!biovec_create_pools(bs, pool_size)) 1588 if (!biovec_create_pools(bs, pool_size))
1583 return bs; 1589 return bs;
1584 1590
@@ -1616,6 +1622,7 @@ static int __init init_bio(void)
1616 if (!bio_slabs) 1622 if (!bio_slabs)
1617 panic("bio: can't allocate bios\n"); 1623 panic("bio: can't allocate bios\n");
1618 1624
1625 bio_integrity_init();
1619 biovec_init_slabs(); 1626 biovec_init_slabs();
1620 1627
1621 fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); 1628 fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
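With integrity pools now embedded in each bio_set, a driver-private set picks one up automatically. A hedged usage sketch, error handling trimmed:

	struct bio_set *bs;

	bs = bioset_create(16, 0);	/* internally calls bioset_integrity_create() */
	if (!bs)
		return -ENOMEM;
	/* ... allocate bios from bs ... */
	bioset_free(bs);		/* tears the integrity pool down via bioset_integrity_free() */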
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f45dbc18dd17..94dfda24c06e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -25,6 +25,7 @@
25#include <linux/uio.h> 25#include <linux/uio.h>
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/log2.h> 27#include <linux/log2.h>
28#include <linux/kmemleak.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29#include "internal.h" 30#include "internal.h"
30 31
@@ -76,7 +77,7 @@ int set_blocksize(struct block_device *bdev, int size)
76 return -EINVAL; 77 return -EINVAL;
77 78
78 /* Size cannot be smaller than the size supported by the device */ 79 /* Size cannot be smaller than the size supported by the device */
79 if (size < bdev_hardsect_size(bdev)) 80 if (size < bdev_logical_block_size(bdev))
80 return -EINVAL; 81 return -EINVAL;
81 82
82 /* Don't change the size if it is same as current */ 83 /* Don't change the size if it is same as current */
@@ -106,7 +107,7 @@ EXPORT_SYMBOL(sb_set_blocksize);
106 107
107int sb_min_blocksize(struct super_block *sb, int size) 108int sb_min_blocksize(struct super_block *sb, int size)
108{ 109{
109 int minsize = bdev_hardsect_size(sb->s_bdev); 110 int minsize = bdev_logical_block_size(sb->s_bdev);
110 if (size < minsize) 111 if (size < minsize)
111 size = minsize; 112 size = minsize;
112 return sb_set_blocksize(sb, size); 113 return sb_set_blocksize(sb, size);
@@ -175,17 +176,22 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
175 iov, offset, nr_segs, blkdev_get_blocks, NULL); 176 iov, offset, nr_segs, blkdev_get_blocks, NULL);
176} 177}
177 178
179int __sync_blockdev(struct block_device *bdev, int wait)
180{
181 if (!bdev)
182 return 0;
183 if (!wait)
184 return filemap_flush(bdev->bd_inode->i_mapping);
185 return filemap_write_and_wait(bdev->bd_inode->i_mapping);
186}
187
178/* 188/*
179 * Write out and wait upon all the dirty data associated with a block 189 * Write out and wait upon all the dirty data associated with a block
180 * device via its mapping. Does not take the superblock lock. 190 * device via its mapping. Does not take the superblock lock.
181 */ 191 */
182int sync_blockdev(struct block_device *bdev) 192int sync_blockdev(struct block_device *bdev)
183{ 193{
184 int ret = 0; 194 return __sync_blockdev(bdev, 1);
185
186 if (bdev)
187 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
188 return ret;
189} 195}
190EXPORT_SYMBOL(sync_blockdev); 196EXPORT_SYMBOL(sync_blockdev);
191 197
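The wait flag gives __sync_blockdev() two modes, which callers elsewhere in this series can pick between:

	__sync_blockdev(bdev, 0);	/* start writeback only (filemap_flush) */
	__sync_blockdev(bdev, 1);	/* write out and wait; equivalent to sync_blockdev() */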
@@ -198,7 +204,7 @@ int fsync_bdev(struct block_device *bdev)
198{ 204{
199 struct super_block *sb = get_super(bdev); 205 struct super_block *sb = get_super(bdev);
200 if (sb) { 206 if (sb) {
201 int res = fsync_super(sb); 207 int res = sync_filesystem(sb);
202 drop_super(sb); 208 drop_super(sb);
203 return res; 209 return res;
204 } 210 }
@@ -240,7 +246,7 @@ struct super_block *freeze_bdev(struct block_device *bdev)
240 sb->s_frozen = SB_FREEZE_WRITE; 246 sb->s_frozen = SB_FREEZE_WRITE;
241 smp_wmb(); 247 smp_wmb();
242 248
243 __fsync_super(sb); 249 sync_filesystem(sb);
244 250
245 sb->s_frozen = SB_FREEZE_TRANS; 251 sb->s_frozen = SB_FREEZE_TRANS;
246 smp_wmb(); 252 smp_wmb();
@@ -492,6 +498,11 @@ void __init bdev_cache_init(void)
492 bd_mnt = kern_mount(&bd_type); 498 bd_mnt = kern_mount(&bd_type);
493 if (IS_ERR(bd_mnt)) 499 if (IS_ERR(bd_mnt))
494 panic("Cannot create bdev pseudo-fs"); 500 panic("Cannot create bdev pseudo-fs");
501 /*
502 * This vfsmount structure is only used to obtain the
503 * blockdev_superblock, so tell kmemleak not to report it.
504 */
505 kmemleak_not_leak(bd_mnt);
495 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ 506 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
496} 507}
497 508
@@ -553,6 +564,16 @@ struct block_device *bdget(dev_t dev)
553 564
554EXPORT_SYMBOL(bdget); 565EXPORT_SYMBOL(bdget);
555 566
567/**
568 * bdgrab -- Grab a reference to an already referenced block device
569 * @bdev: Block device to grab a reference to.
570 */
571struct block_device *bdgrab(struct block_device *bdev)
572{
573 atomic_inc(&bdev->bd_inode->i_count);
574 return bdev;
575}
576
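	/*
	 * Hedged usage sketch, not part of this hunk: bdgrab() performs no
	 * lookup and cannot sleep, so it is only safe while the caller
	 * already knows a reference is held (e.g. under a lock), and it
	 * pairs with bdput():
	 *
	 *	struct block_device *bd = bdgrab(bdev);
	 *	...
	 *	bdput(bd);
	 */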
556long nr_blockdev_pages(void) 577long nr_blockdev_pages(void)
557{ 578{
558 struct block_device *bdev; 579 struct block_device *bdev;
@@ -1111,7 +1132,7 @@ EXPORT_SYMBOL(check_disk_change);
1111 1132
1112void bd_set_size(struct block_device *bdev, loff_t size) 1133void bd_set_size(struct block_device *bdev, loff_t size)
1113{ 1134{
1114 unsigned bsize = bdev_hardsect_size(bdev); 1135 unsigned bsize = bdev_logical_block_size(bdev);
1115 1136
1116 bdev->bd_inode->i_size = size; 1137 bdev->bd_inode->i_size = size;
1117 while (bsize < PAGE_CACHE_SIZE) { 1138 while (bsize < PAGE_CACHE_SIZE) {
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 94212844a9bc..a35eb36b32fd 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
6 transaction.o inode.o file.o tree-defrag.o \ 6 transaction.o inode.o file.o tree-defrag.o \
7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ 9 export.o tree-log.o acl.o free-space-cache.o zlib.o \
10 compression.o delayed-ref.o 10 compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index cbba000dccbe..f128427b995b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -29,51 +29,28 @@
29 29
30#ifdef CONFIG_FS_POSIX_ACL 30#ifdef CONFIG_FS_POSIX_ACL
31 31
32static void btrfs_update_cached_acl(struct inode *inode,
33 struct posix_acl **p_acl,
34 struct posix_acl *acl)
35{
36 spin_lock(&inode->i_lock);
37 if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED)
38 posix_acl_release(*p_acl);
39 *p_acl = posix_acl_dup(acl);
40 spin_unlock(&inode->i_lock);
41}
42
43static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) 32static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
44{ 33{
45 int size; 34 int size;
46 const char *name; 35 const char *name;
47 char *value = NULL; 36 char *value = NULL;
48 struct posix_acl *acl = NULL, **p_acl; 37 struct posix_acl *acl;
38
39 acl = get_cached_acl(inode, type);
40 if (acl != ACL_NOT_CACHED)
41 return acl;
49 42
50 switch (type) { 43 switch (type) {
51 case ACL_TYPE_ACCESS: 44 case ACL_TYPE_ACCESS:
52 name = POSIX_ACL_XATTR_ACCESS; 45 name = POSIX_ACL_XATTR_ACCESS;
53 p_acl = &BTRFS_I(inode)->i_acl;
54 break; 46 break;
55 case ACL_TYPE_DEFAULT: 47 case ACL_TYPE_DEFAULT:
56 name = POSIX_ACL_XATTR_DEFAULT; 48 name = POSIX_ACL_XATTR_DEFAULT;
57 p_acl = &BTRFS_I(inode)->i_default_acl;
58 break; 49 break;
59 default: 50 default:
60 return ERR_PTR(-EINVAL); 51 BUG();
61 } 52 }
62 53
63 /* Handle the cached NULL acl case without locking */
64 acl = ACCESS_ONCE(*p_acl);
65 if (!acl)
66 return acl;
67
68 spin_lock(&inode->i_lock);
69 acl = *p_acl;
70 if (acl != BTRFS_ACL_NOT_CACHED)
71 acl = posix_acl_dup(acl);
72 spin_unlock(&inode->i_lock);
73
74 if (acl != BTRFS_ACL_NOT_CACHED)
75 return acl;
76
77 size = __btrfs_getxattr(inode, name, "", 0); 54 size = __btrfs_getxattr(inode, name, "", 0);
78 if (size > 0) { 55 if (size > 0) {
79 value = kzalloc(size, GFP_NOFS); 56 value = kzalloc(size, GFP_NOFS);
@@ -82,13 +59,13 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
82 size = __btrfs_getxattr(inode, name, value, size); 59 size = __btrfs_getxattr(inode, name, value, size);
83 if (size > 0) { 60 if (size > 0) {
84 acl = posix_acl_from_xattr(value, size); 61 acl = posix_acl_from_xattr(value, size);
85 btrfs_update_cached_acl(inode, p_acl, acl); 62 set_cached_acl(inode, type, acl);
86 } 63 }
87 kfree(value); 64 kfree(value);
88 } else if (size == -ENOENT || size == -ENODATA || size == 0) { 65 } else if (size == -ENOENT || size == -ENODATA || size == 0) {
89 /* FIXME, who returns -ENOENT? I think nobody */ 66 /* FIXME, who returns -ENOENT? I think nobody */
90 acl = NULL; 67 acl = NULL;
91 btrfs_update_cached_acl(inode, p_acl, acl); 68 set_cached_acl(inode, type, acl);
92 } else { 69 } else {
93 acl = ERR_PTR(-EIO); 70 acl = ERR_PTR(-EIO);
94 } 71 }
@@ -121,7 +98,6 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
121{ 98{
122 int ret, size = 0; 99 int ret, size = 0;
123 const char *name; 100 const char *name;
124 struct posix_acl **p_acl;
125 char *value = NULL; 101 char *value = NULL;
126 mode_t mode; 102 mode_t mode;
127 103
@@ -141,13 +117,11 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
141 ret = 0; 117 ret = 0;
142 inode->i_mode = mode; 118 inode->i_mode = mode;
143 name = POSIX_ACL_XATTR_ACCESS; 119 name = POSIX_ACL_XATTR_ACCESS;
144 p_acl = &BTRFS_I(inode)->i_acl;
145 break; 120 break;
146 case ACL_TYPE_DEFAULT: 121 case ACL_TYPE_DEFAULT:
147 if (!S_ISDIR(inode->i_mode)) 122 if (!S_ISDIR(inode->i_mode))
148 return acl ? -EINVAL : 0; 123 return acl ? -EINVAL : 0;
149 name = POSIX_ACL_XATTR_DEFAULT; 124 name = POSIX_ACL_XATTR_DEFAULT;
150 p_acl = &BTRFS_I(inode)->i_default_acl;
151 break; 125 break;
152 default: 126 default:
153 return -EINVAL; 127 return -EINVAL;
@@ -172,7 +146,7 @@ out:
172 kfree(value); 146 kfree(value);
173 147
174 if (!ret) 148 if (!ret)
175 btrfs_update_cached_acl(inode, p_acl, acl); 149 set_cached_acl(inode, type, acl);
176 150
177 return ret; 151 return ret;
178} 152}
@@ -351,9 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
351 return 0; 325 return 0;
352} 326}
353 327
354int btrfs_check_acl(struct inode *inode, int mask)
355{
356 return 0;
357}
358
359#endif /* CONFIG_FS_POSIX_ACL */ 328#endif /* CONFIG_FS_POSIX_ACL */
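The net effect in acl.c is that btrfs drops its private i_acl/i_default_acl pointers in favour of the generic inode ACL cache. The resulting lookup pattern, restated as a hedged sketch:

	acl = get_cached_acl(inode, type);
	if (acl != ACL_NOT_CACHED)
		return acl;		/* NULL (cached negative) or a referenced ACL */

	/* slow path: read the xattr, then populate the shared cache */
	acl = posix_acl_from_xattr(value, size);
	set_cached_acl(inode, type, acl);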
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 502c3d61de62..019e8af449ab 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -294,13 +294,13 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
294 INIT_LIST_HEAD(&worker->worker_list); 294 INIT_LIST_HEAD(&worker->worker_list);
295 spin_lock_init(&worker->lock); 295 spin_lock_init(&worker->lock);
296 atomic_set(&worker->num_pending, 0); 296 atomic_set(&worker->num_pending, 0);
297 worker->workers = workers;
297 worker->task = kthread_run(worker_loop, worker, 298 worker->task = kthread_run(worker_loop, worker,
298 "btrfs-%s-%d", workers->name, 299 "btrfs-%s-%d", workers->name,
299 workers->num_workers + i); 300 workers->num_workers + i);
300 worker->workers = workers;
301 if (IS_ERR(worker->task)) { 301 if (IS_ERR(worker->task)) {
302 kfree(worker);
303 ret = PTR_ERR(worker->task); 302 ret = PTR_ERR(worker->task);
303 kfree(worker);
304 goto fail; 304 goto fail;
305 } 305 }
306 306
@@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work)
424 * list 424 * list
425 */ 425 */
426 if (worker->idle) { 426 if (worker->idle) {
427 spin_lock_irqsave(&worker->workers->lock, flags); 427 spin_lock(&worker->workers->lock);
428 worker->idle = 0; 428 worker->idle = 0;
429 list_move_tail(&worker->worker_list, 429 list_move_tail(&worker->worker_list,
430 &worker->workers->worker_list); 430 &worker->workers->worker_list);
431 spin_unlock_irqrestore(&worker->workers->lock, flags); 431 spin_unlock(&worker->workers->lock);
432 } 432 }
433 if (!worker->working) { 433 if (!worker->working) {
434 wake = 1; 434 wake = 1;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b30986f00b9d..ea1ea0af8c0e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -53,10 +53,6 @@ struct btrfs_inode {
53 /* used to order data wrt metadata */ 53 /* used to order data wrt metadata */
54 struct btrfs_ordered_inode_tree ordered_tree; 54 struct btrfs_ordered_inode_tree ordered_tree;
55 55
56 /* standard acl pointers */
57 struct posix_acl *i_acl;
58 struct posix_acl *i_default_acl;
59
60 /* for keeping track of orphaned inodes */ 56 /* for keeping track of orphaned inodes */
61 struct list_head i_orphan; 57 struct list_head i_orphan;
62 58
@@ -72,6 +68,9 @@ struct btrfs_inode {
72 */ 68 */
73 struct list_head ordered_operations; 69 struct list_head ordered_operations;
74 70
71 /* node for the red-black tree that links inodes in subvolume root */
72 struct rb_node rb_node;
73
75 /* the space_info for where this inode's data allocations are done */ 74 /* the space_info for where this inode's data allocations are done */
76 struct btrfs_space_info *space_info; 75 struct btrfs_space_info *space_info;
77 76
@@ -154,5 +153,4 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
154 BTRFS_I(inode)->disk_i_size = size; 153 BTRFS_I(inode)->disk_i_size = size;
155} 154}
156 155
157
158#endif 156#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ab07627084f1..9d8ba4d54a37 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -26,7 +26,6 @@
26#include <linux/time.h> 26#include <linux/time.h>
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
31#include <linux/mpage.h> 30#include <linux/mpage.h>
32#include <linux/swap.h> 31#include <linux/swap.h>
@@ -123,7 +122,7 @@ static int check_compressed_csum(struct inode *inode,
123 u32 csum; 122 u32 csum;
124 u32 *cb_sum = &cb->sums; 123 u32 *cb_sum = &cb->sums;
125 124
126 if (btrfs_test_flag(inode, NODATASUM)) 125 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
127 return 0; 126 return 0;
128 127
129 for (i = 0; i < cb->nr_pages; i++) { 128 for (i = 0; i < cb->nr_pages; i++) {
@@ -670,7 +669,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
670 */ 669 */
671 atomic_inc(&cb->pending_bios); 670 atomic_inc(&cb->pending_bios);
672 671
673 if (!btrfs_test_flag(inode, NODATASUM)) { 672 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
674 btrfs_lookup_bio_sums(root, inode, comp_bio, 673 btrfs_lookup_bio_sums(root, inode, comp_bio,
675 sums); 674 sums);
676 } 675 }
@@ -697,7 +696,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
697 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); 696 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
698 BUG_ON(ret); 697 BUG_ON(ret);
699 698
700 if (!btrfs_test_flag(inode, NODATASUM)) 699 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
701 btrfs_lookup_bio_sums(root, inode, comp_bio, sums); 700 btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
702 701
703 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); 702 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
deleted file mode 100644
index 6e1b3de36700..000000000000
--- a/fs/btrfs/crc32c.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_CRC32C__
20#define __BTRFS_CRC32C__
21#include <linux/crc32c.h>
22
23/*
24 * this file used to do more for selecting the HW version of crc32c,
25 * perhaps it will one day again soon.
26 */
27#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
28#endif
29
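With the wrapper header gone, callers presumably include <linux/crc32c.h> and use the library routine directly; the removed macro shows the two were interchangeable:

	#include <linux/crc32c.h>

	u32 sum = crc32c(seed, data, length);	/* was btrfs_crc32c(seed, data, length) */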
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index fedf8b9f03a2..3fdcc0512d3a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -197,14 +197,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
197 u32 nritems; 197 u32 nritems;
198 int ret = 0; 198 int ret = 0;
199 int level; 199 int level;
200 struct btrfs_root *new_root; 200 struct btrfs_disk_key disk_key;
201
202 new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
203 if (!new_root)
204 return -ENOMEM;
205
206 memcpy(new_root, root, sizeof(*new_root));
207 new_root->root_key.objectid = new_root_objectid;
208 201
209 WARN_ON(root->ref_cows && trans->transid != 202 WARN_ON(root->ref_cows && trans->transid !=
210 root->fs_info->running_transaction->transid); 203 root->fs_info->running_transaction->transid);
@@ -212,28 +205,37 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
212 205
213 level = btrfs_header_level(buf); 206 level = btrfs_header_level(buf);
214 nritems = btrfs_header_nritems(buf); 207 nritems = btrfs_header_nritems(buf);
208 if (level == 0)
209 btrfs_item_key(buf, &disk_key, 0);
210 else
211 btrfs_node_key(buf, &disk_key, 0);
215 212
216 cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, 213 cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
217 new_root_objectid, trans->transid, 214 new_root_objectid, &disk_key, level,
218 level, buf->start, 0); 215 buf->start, 0);
219 if (IS_ERR(cow)) { 216 if (IS_ERR(cow))
220 kfree(new_root);
221 return PTR_ERR(cow); 217 return PTR_ERR(cow);
222 }
223 218
224 copy_extent_buffer(cow, buf, 0, 0, cow->len); 219 copy_extent_buffer(cow, buf, 0, 0, cow->len);
225 btrfs_set_header_bytenr(cow, cow->start); 220 btrfs_set_header_bytenr(cow, cow->start);
226 btrfs_set_header_generation(cow, trans->transid); 221 btrfs_set_header_generation(cow, trans->transid);
227 btrfs_set_header_owner(cow, new_root_objectid); 222 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
228 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); 223 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
224 BTRFS_HEADER_FLAG_RELOC);
225 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
226 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
227 else
228 btrfs_set_header_owner(cow, new_root_objectid);
229 229
230 write_extent_buffer(cow, root->fs_info->fsid, 230 write_extent_buffer(cow, root->fs_info->fsid,
231 (unsigned long)btrfs_header_fsid(cow), 231 (unsigned long)btrfs_header_fsid(cow),
232 BTRFS_FSID_SIZE); 232 BTRFS_FSID_SIZE);
233 233
234 WARN_ON(btrfs_header_generation(buf) > trans->transid); 234 WARN_ON(btrfs_header_generation(buf) > trans->transid);
235 ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); 235 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
236 kfree(new_root); 236 ret = btrfs_inc_ref(trans, root, cow, 1);
237 else
238 ret = btrfs_inc_ref(trans, root, cow, 0);
237 239
238 if (ret) 240 if (ret)
239 return ret; 241 return ret;
@@ -244,6 +246,125 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
244} 246}
245 247
246/* 248/*
249 * check if the tree block can be shared by multiple trees
250 */
251int btrfs_block_can_be_shared(struct btrfs_root *root,
252 struct extent_buffer *buf)
253{
254 /*
255 * Tree blocks not in reference counted trees and tree roots
256 * are never shared. If a block was allocated after the last
257 * snapshot and the block was not allocated by tree relocation,
258 * we know the block is not shared.
259 */
260 if (root->ref_cows &&
261 buf != root->node && buf != root->commit_root &&
262 (btrfs_header_generation(buf) <=
263 btrfs_root_last_snapshot(&root->root_item) ||
264 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
265 return 1;
266#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
267 if (root->ref_cows &&
268 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
269 return 1;
270#endif
271 return 0;
272}
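	/*
	 * Hedged worked example of the test above: if
	 * btrfs_root_last_snapshot() reports generation 60, a block written
	 * in generation 50 of a ref-counted tree predates the last snapshot
	 * and may be shared, so COW must fix up its backrefs; a
	 * generation-61 block allocated by the tree itself (no RELOC flag)
	 * is provably private.
	 */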
273
274static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
275 struct btrfs_root *root,
276 struct extent_buffer *buf,
277 struct extent_buffer *cow)
278{
279 u64 refs;
280 u64 owner;
281 u64 flags;
282 u64 new_flags = 0;
283 int ret;
284
285 /*
286 * Backrefs update rules:
287 *
288 * Always use full backrefs for extent pointers in tree block
289 * allocated by tree relocation.
290 *
291 * If a shared tree block is no longer referenced by its owner
292 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
293 * use full backrefs for extent pointers in tree block.
294 *
295 * If a tree block is being relocated
296 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
297 * use full backrefs for extent pointers in tree block.
298 * The reason for this is that some operations (such as drop tree)
299 * are only allowed for blocks that use full backrefs.
300 */
301
302 if (btrfs_block_can_be_shared(root, buf)) {
303 ret = btrfs_lookup_extent_info(trans, root, buf->start,
304 buf->len, &refs, &flags);
305 BUG_ON(ret);
306 BUG_ON(refs == 0);
307 } else {
308 refs = 1;
309 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
310 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
311 flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
312 else
313 flags = 0;
314 }
315
316 owner = btrfs_header_owner(buf);
317 BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
318 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
319
320 if (refs > 1) {
321 if ((owner == root->root_key.objectid ||
322 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
323 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
324 ret = btrfs_inc_ref(trans, root, buf, 1);
325 BUG_ON(ret);
326
327 if (root->root_key.objectid ==
328 BTRFS_TREE_RELOC_OBJECTID) {
329 ret = btrfs_dec_ref(trans, root, buf, 0);
330 BUG_ON(ret);
331 ret = btrfs_inc_ref(trans, root, cow, 1);
332 BUG_ON(ret);
333 }
334 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
335 } else {
336
337 if (root->root_key.objectid ==
338 BTRFS_TREE_RELOC_OBJECTID)
339 ret = btrfs_inc_ref(trans, root, cow, 1);
340 else
341 ret = btrfs_inc_ref(trans, root, cow, 0);
342 BUG_ON(ret);
343 }
344 if (new_flags != 0) {
345 ret = btrfs_set_disk_extent_flags(trans, root,
346 buf->start,
347 buf->len,
348 new_flags, 0);
349 BUG_ON(ret);
350 }
351 } else {
352 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
353 if (root->root_key.objectid ==
354 BTRFS_TREE_RELOC_OBJECTID)
355 ret = btrfs_inc_ref(trans, root, cow, 1);
356 else
357 ret = btrfs_inc_ref(trans, root, cow, 0);
358 BUG_ON(ret);
359 ret = btrfs_dec_ref(trans, root, buf, 1);
360 BUG_ON(ret);
361 }
362 clean_tree_block(trans, root, buf);
363 }
364 return 0;
365}
366
367/*
247 * does the dirty work in cow of a single block. The parent block (if 368 * does the dirty work in cow of a single block. The parent block (if
248 * supplied) is updated to point to the new cow copy. The new buffer is marked 369 * supplied) is updated to point to the new cow copy. The new buffer is marked
249 * dirty and returned locked. If you modify the block it needs to be marked 370 * dirty and returned locked. If you modify the block it needs to be marked
@@ -262,34 +383,39 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
262 struct extent_buffer **cow_ret, 383 struct extent_buffer **cow_ret,
263 u64 search_start, u64 empty_size) 384 u64 search_start, u64 empty_size)
264{ 385{
265 u64 parent_start; 386 struct btrfs_disk_key disk_key;
266 struct extent_buffer *cow; 387 struct extent_buffer *cow;
267 u32 nritems;
268 int ret = 0;
269 int level; 388 int level;
270 int unlock_orig = 0; 389 int unlock_orig = 0;
390 u64 parent_start;
271 391
272 if (*cow_ret == buf) 392 if (*cow_ret == buf)
273 unlock_orig = 1; 393 unlock_orig = 1;
274 394
275 btrfs_assert_tree_locked(buf); 395 btrfs_assert_tree_locked(buf);
276 396
277 if (parent)
278 parent_start = parent->start;
279 else
280 parent_start = 0;
281
282 WARN_ON(root->ref_cows && trans->transid != 397 WARN_ON(root->ref_cows && trans->transid !=
283 root->fs_info->running_transaction->transid); 398 root->fs_info->running_transaction->transid);
284 WARN_ON(root->ref_cows && trans->transid != root->last_trans); 399 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
285 400
286 level = btrfs_header_level(buf); 401 level = btrfs_header_level(buf);
287 nritems = btrfs_header_nritems(buf);
288 402
289 cow = btrfs_alloc_free_block(trans, root, buf->len, 403 if (level == 0)
290 parent_start, root->root_key.objectid, 404 btrfs_item_key(buf, &disk_key, 0);
291 trans->transid, level, 405 else
292 search_start, empty_size); 406 btrfs_node_key(buf, &disk_key, 0);
407
408 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
409 if (parent)
410 parent_start = parent->start;
411 else
412 parent_start = 0;
413 } else
414 parent_start = 0;
415
416 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
417 root->root_key.objectid, &disk_key,
418 level, search_start, empty_size);
293 if (IS_ERR(cow)) 419 if (IS_ERR(cow))
294 return PTR_ERR(cow); 420 return PTR_ERR(cow);
295 421
@@ -298,83 +424,53 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
298 copy_extent_buffer(cow, buf, 0, 0, cow->len); 424 copy_extent_buffer(cow, buf, 0, 0, cow->len);
299 btrfs_set_header_bytenr(cow, cow->start); 425 btrfs_set_header_bytenr(cow, cow->start);
300 btrfs_set_header_generation(cow, trans->transid); 426 btrfs_set_header_generation(cow, trans->transid);
301 btrfs_set_header_owner(cow, root->root_key.objectid); 427 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
302 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); 428 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
429 BTRFS_HEADER_FLAG_RELOC);
430 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
431 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
432 else
433 btrfs_set_header_owner(cow, root->root_key.objectid);
303 434
304 write_extent_buffer(cow, root->fs_info->fsid, 435 write_extent_buffer(cow, root->fs_info->fsid,
305 (unsigned long)btrfs_header_fsid(cow), 436 (unsigned long)btrfs_header_fsid(cow),
306 BTRFS_FSID_SIZE); 437 BTRFS_FSID_SIZE);
307 438
308 WARN_ON(btrfs_header_generation(buf) > trans->transid); 439 update_ref_for_cow(trans, root, buf, cow);
309 if (btrfs_header_generation(buf) != trans->transid) {
310 u32 nr_extents;
311 ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
312 if (ret)
313 return ret;
314
315 ret = btrfs_cache_ref(trans, root, buf, nr_extents);
316 WARN_ON(ret);
317 } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
318 /*
319 * There are only two places that can drop reference to
320 * tree blocks owned by living reloc trees, one is here,
321 * the other place is btrfs_drop_subtree. In both places,
322 * we check reference count while tree block is locked.
323 * Furthermore, if reference count is one, it won't get
324 * increased by someone else.
325 */
326 u32 refs;
327 ret = btrfs_lookup_extent_ref(trans, root, buf->start,
328 buf->len, &refs);
329 BUG_ON(ret);
330 if (refs == 1) {
331 ret = btrfs_update_ref(trans, root, buf, cow,
332 0, nritems);
333 clean_tree_block(trans, root, buf);
334 } else {
335 ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
336 }
337 BUG_ON(ret);
338 } else {
339 ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
340 if (ret)
341 return ret;
342 clean_tree_block(trans, root, buf);
343 }
344
345 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
346 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
347 WARN_ON(ret);
348 }
349 440
350 if (buf == root->node) { 441 if (buf == root->node) {
351 WARN_ON(parent && parent != buf); 442 WARN_ON(parent && parent != buf);
443 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
444 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
445 parent_start = buf->start;
446 else
447 parent_start = 0;
352 448
353 spin_lock(&root->node_lock); 449 spin_lock(&root->node_lock);
354 root->node = cow; 450 root->node = cow;
355 extent_buffer_get(cow); 451 extent_buffer_get(cow);
356 spin_unlock(&root->node_lock); 452 spin_unlock(&root->node_lock);
357 453
358 if (buf != root->commit_root) { 454 btrfs_free_extent(trans, root, buf->start, buf->len,
359 btrfs_free_extent(trans, root, buf->start, 455 parent_start, root->root_key.objectid,
360 buf->len, buf->start, 456 level, 0);
361 root->root_key.objectid,
362 btrfs_header_generation(buf),
363 level, 1);
364 }
365 free_extent_buffer(buf); 457 free_extent_buffer(buf);
366 add_root_to_dirty_list(root); 458 add_root_to_dirty_list(root);
367 } else { 459 } else {
460 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
461 parent_start = parent->start;
462 else
463 parent_start = 0;
464
465 WARN_ON(trans->transid != btrfs_header_generation(parent));
368 btrfs_set_node_blockptr(parent, parent_slot, 466 btrfs_set_node_blockptr(parent, parent_slot,
369 cow->start); 467 cow->start);
370 WARN_ON(trans->transid == 0);
371 btrfs_set_node_ptr_generation(parent, parent_slot, 468 btrfs_set_node_ptr_generation(parent, parent_slot,
372 trans->transid); 469 trans->transid);
373 btrfs_mark_buffer_dirty(parent); 470 btrfs_mark_buffer_dirty(parent);
374 WARN_ON(btrfs_header_generation(parent) != trans->transid);
375 btrfs_free_extent(trans, root, buf->start, buf->len, 471 btrfs_free_extent(trans, root, buf->start, buf->len,
376 parent_start, btrfs_header_owner(parent), 472 parent_start, root->root_key.objectid,
377 btrfs_header_generation(parent), level, 1); 473 level, 0);
378 } 474 }
379 if (unlock_orig) 475 if (unlock_orig)
380 btrfs_tree_unlock(buf); 476 btrfs_tree_unlock(buf);
@@ -384,6 +480,18 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
384 return 0; 480 return 0;
385} 481}
386 482
483static inline int should_cow_block(struct btrfs_trans_handle *trans,
484 struct btrfs_root *root,
485 struct extent_buffer *buf)
486{
487 if (btrfs_header_generation(buf) == trans->transid &&
488 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
489 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
490 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
491 return 0;
492 return 1;
493}
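	/*
	 * Hedged reading of the fast path above: a block already COWed in
	 * this transaction (generation == trans->transid), not yet written
	 * out, and carrying no stale RELOC flag can be modified in place,
	 * which is what lets btrfs_cow_block() and btrfs_search_slot()
	 * below skip the blocking COW path.
	 */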
494
387/* 495/*
388 * cows a single block, see __btrfs_cow_block for the real work. 496 * cows a single block, see __btrfs_cow_block for the real work.
389 * This version of it has extra checks so that a block isn't cow'd more than 497 * This version of it has extra checks so that a block isn't cow'd more than
@@ -411,9 +519,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
411 WARN_ON(1); 519 WARN_ON(1);
412 } 520 }
413 521
414 if (btrfs_header_generation(buf) == trans->transid && 522 if (!should_cow_block(trans, root, buf)) {
415 btrfs_header_owner(buf) == root->root_key.objectid &&
416 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
417 *cow_ret = buf; 523 *cow_ret = buf;
418 return 0; 524 return 0;
419 } 525 }
@@ -451,25 +557,13 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
451 557
452 btrfs_disk_key_to_cpu(&k1, disk); 558 btrfs_disk_key_to_cpu(&k1, disk);
453 559
454 if (k1.objectid > k2->objectid) 560 return btrfs_comp_cpu_keys(&k1, k2);
455 return 1;
456 if (k1.objectid < k2->objectid)
457 return -1;
458 if (k1.type > k2->type)
459 return 1;
460 if (k1.type < k2->type)
461 return -1;
462 if (k1.offset > k2->offset)
463 return 1;
464 if (k1.offset < k2->offset)
465 return -1;
466 return 0;
467} 561}
468 562
469/* 563/*
470 * same as comp_keys only with two btrfs_key's 564 * same as comp_keys only with two btrfs_key's
471 */ 565 */
472static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) 566int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
473{ 567{
474 if (k1->objectid > k2->objectid) 568 if (k1->objectid > k2->objectid)
475 return 1; 569 return 1;
@@ -845,6 +939,12 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
845 return -1; 939 return -1;
846} 940}
847 941
942int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
943 int level, int *slot)
944{
945 return bin_search(eb, key, level, slot);
946}
947
848/* given a node and slot number, this reads the blocks it points to. The 948/* given a node and slot number, this reads the blocks it points to. The
849 * extent buffer is returned with a reference taken (but unlocked). 949 * extent buffer is returned with a reference taken (but unlocked).
850 * NULL is returned on error. 950 * NULL is returned on error.
@@ -921,13 +1021,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
921 root->node = child; 1021 root->node = child;
922 spin_unlock(&root->node_lock); 1022 spin_unlock(&root->node_lock);
923 1023
924 ret = btrfs_update_extent_ref(trans, root, child->start,
925 child->len,
926 mid->start, child->start,
927 root->root_key.objectid,
928 trans->transid, level - 1);
929 BUG_ON(ret);
930
931 add_root_to_dirty_list(root); 1024 add_root_to_dirty_list(root);
932 btrfs_tree_unlock(child); 1025 btrfs_tree_unlock(child);
933 1026
@@ -938,9 +1031,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
938 /* once for the path */ 1031 /* once for the path */
939 free_extent_buffer(mid); 1032 free_extent_buffer(mid);
940 ret = btrfs_free_extent(trans, root, mid->start, mid->len, 1033 ret = btrfs_free_extent(trans, root, mid->start, mid->len,
941 mid->start, root->root_key.objectid, 1034 0, root->root_key.objectid, level, 1);
942 btrfs_header_generation(mid),
943 level, 1);
944 /* once for the root ptr */ 1035 /* once for the root ptr */
945 free_extent_buffer(mid); 1036 free_extent_buffer(mid);
946 return ret; 1037 return ret;
@@ -949,10 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
949 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1040 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
950 return 0; 1041 return 0;
951 1042
952 if (trans->transaction->delayed_refs.flushing &&
953 btrfs_header_nritems(mid) > 2)
954 return 0;
955
956 if (btrfs_header_nritems(mid) < 2) 1043 if (btrfs_header_nritems(mid) < 2)
957 err_on_enospc = 1; 1044 err_on_enospc = 1;
958 1045
@@ -998,7 +1085,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
998 ret = wret; 1085 ret = wret;
999 if (btrfs_header_nritems(right) == 0) { 1086 if (btrfs_header_nritems(right) == 0) {
1000 u64 bytenr = right->start; 1087 u64 bytenr = right->start;
1001 u64 generation = btrfs_header_generation(parent);
1002 u32 blocksize = right->len; 1088 u32 blocksize = right->len;
1003 1089
1004 clean_tree_block(trans, root, right); 1090 clean_tree_block(trans, root, right);
@@ -1010,9 +1096,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1010 if (wret) 1096 if (wret)
1011 ret = wret; 1097 ret = wret;
1012 wret = btrfs_free_extent(trans, root, bytenr, 1098 wret = btrfs_free_extent(trans, root, bytenr,
1013 blocksize, parent->start, 1099 blocksize, 0,
1014 btrfs_header_owner(parent), 1100 root->root_key.objectid,
1015 generation, level, 1); 1101 level, 0);
1016 if (wret) 1102 if (wret)
1017 ret = wret; 1103 ret = wret;
1018 } else { 1104 } else {
@@ -1047,7 +1133,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1047 } 1133 }
1048 if (btrfs_header_nritems(mid) == 0) { 1134 if (btrfs_header_nritems(mid) == 0) {
1049 /* we've managed to empty the middle node, drop it */ 1135 /* we've managed to empty the middle node, drop it */
1050 u64 root_gen = btrfs_header_generation(parent);
1051 u64 bytenr = mid->start; 1136 u64 bytenr = mid->start;
1052 u32 blocksize = mid->len; 1137 u32 blocksize = mid->len;
1053 1138
@@ -1059,9 +1144,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1059 if (wret) 1144 if (wret)
1060 ret = wret; 1145 ret = wret;
1061 wret = btrfs_free_extent(trans, root, bytenr, blocksize, 1146 wret = btrfs_free_extent(trans, root, bytenr, blocksize,
1062 parent->start, 1147 0, root->root_key.objectid,
1063 btrfs_header_owner(parent), 1148 level, 0);
1064 root_gen, level, 1);
1065 if (wret) 1149 if (wret)
1066 ret = wret; 1150 ret = wret;
1067 } else { 1151 } else {
@@ -1437,7 +1521,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1437{ 1521{
1438 int i; 1522 int i;
1439 1523
1440 if (path->keep_locks || path->lowest_level) 1524 if (path->keep_locks)
1441 return; 1525 return;
1442 1526
1443 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 1527 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -1552,7 +1636,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
1552 } 1636 }
1553 b = p->nodes[level]; 1637 b = p->nodes[level];
1554 } else if (ins_len < 0 && btrfs_header_nritems(b) < 1638 } else if (ins_len < 0 && btrfs_header_nritems(b) <
1555 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { 1639 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
1556 int sret; 1640 int sret;
1557 1641
1558 sret = reada_for_balance(root, p, level); 1642 sret = reada_for_balance(root, p, level);
@@ -1602,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1602 struct extent_buffer *b; 1686 struct extent_buffer *b;
1603 int slot; 1687 int slot;
1604 int ret; 1688 int ret;
1689 int err;
1605 int level; 1690 int level;
1606 int lowest_unlock = 1; 1691 int lowest_unlock = 1;
1607 u8 lowest_level = 0; 1692 u8 lowest_level = 0;
@@ -1614,10 +1699,17 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1614 lowest_unlock = 2; 1699 lowest_unlock = 2;
1615 1700
1616again: 1701again:
1617 if (p->skip_locking) 1702 if (p->search_commit_root) {
1618 b = btrfs_root_node(root); 1703 b = root->commit_root;
1619 else 1704 extent_buffer_get(b);
1620 b = btrfs_lock_root_node(root); 1705 if (!p->skip_locking)
1706 btrfs_tree_lock(b);
1707 } else {
1708 if (p->skip_locking)
1709 b = btrfs_root_node(root);
1710 else
1711 b = btrfs_lock_root_node(root);
1712 }
1621 1713
1622 while (b) { 1714 while (b) {
1623 level = btrfs_header_level(b); 1715 level = btrfs_header_level(b);
@@ -1631,26 +1723,22 @@ again:
1631 p->locks[level] = 1; 1723 p->locks[level] = 1;
1632 1724
1633 if (cow) { 1725 if (cow) {
1634 int wret;
1635
1636 /* 1726 /*
1637 * if we don't really need to cow this block 1727 * if we don't really need to cow this block
1638 * then we don't want to set the path blocking, 1728 * then we don't want to set the path blocking,
1639 * so we test it here 1729 * so we test it here
1640 */ 1730 */
1641 if (btrfs_header_generation(b) == trans->transid && 1731 if (!should_cow_block(trans, root, b))
1642 btrfs_header_owner(b) == root->root_key.objectid &&
1643 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1644 goto cow_done; 1732 goto cow_done;
1645 } 1733
1646 btrfs_set_path_blocking(p); 1734 btrfs_set_path_blocking(p);
1647 1735
1648 wret = btrfs_cow_block(trans, root, b, 1736 err = btrfs_cow_block(trans, root, b,
1649 p->nodes[level + 1], 1737 p->nodes[level + 1],
1650 p->slots[level + 1], &b); 1738 p->slots[level + 1], &b);
1651 if (wret) { 1739 if (err) {
1652 free_extent_buffer(b); 1740 free_extent_buffer(b);
1653 ret = wret; 1741 ret = err;
1654 goto done; 1742 goto done;
1655 } 1743 }
1656 } 1744 }
@@ -1689,41 +1777,45 @@ cow_done:
1689 ret = bin_search(b, key, level, &slot); 1777 ret = bin_search(b, key, level, &slot);
1690 1778
1691 if (level != 0) { 1779 if (level != 0) {
1692 if (ret && slot > 0) 1780 int dec = 0;
1781 if (ret && slot > 0) {
1782 dec = 1;
1693 slot -= 1; 1783 slot -= 1;
1784 }
1694 p->slots[level] = slot; 1785 p->slots[level] = slot;
1695 ret = setup_nodes_for_search(trans, root, p, b, level, 1786 err = setup_nodes_for_search(trans, root, p, b, level,
1696 ins_len); 1787 ins_len);
1697 if (ret == -EAGAIN) 1788 if (err == -EAGAIN)
1698 goto again; 1789 goto again;
1699 else if (ret) 1790 if (err) {
1791 ret = err;
1700 goto done; 1792 goto done;
1793 }
1701 b = p->nodes[level]; 1794 b = p->nodes[level];
1702 slot = p->slots[level]; 1795 slot = p->slots[level];
1703 1796
1704 unlock_up(p, level, lowest_unlock); 1797 unlock_up(p, level, lowest_unlock);
1705 1798
1706 /* this is only true while dropping a snapshot */
1707 if (level == lowest_level) { 1799 if (level == lowest_level) {
1708 ret = 0; 1800 if (dec)
1801 p->slots[level]++;
1709 goto done; 1802 goto done;
1710 } 1803 }
1711 1804
1712 ret = read_block_for_search(trans, root, p, 1805 err = read_block_for_search(trans, root, p,
1713 &b, level, slot, key); 1806 &b, level, slot, key);
1714 if (ret == -EAGAIN) 1807 if (err == -EAGAIN)
1715 goto again; 1808 goto again;
1716 1809 if (err) {
1717 if (ret == -EIO) 1810 ret = err;
1718 goto done; 1811 goto done;
1812 }
1719 1813
1720 if (!p->skip_locking) { 1814 if (!p->skip_locking) {
1721 int lret;
1722
1723 btrfs_clear_path_blocking(p, NULL); 1815 btrfs_clear_path_blocking(p, NULL);
1724 lret = btrfs_try_spin_lock(b); 1816 err = btrfs_try_spin_lock(b);
1725 1817
1726 if (!lret) { 1818 if (!err) {
1727 btrfs_set_path_blocking(p); 1819 btrfs_set_path_blocking(p);
1728 btrfs_tree_lock(b); 1820 btrfs_tree_lock(b);
1729 btrfs_clear_path_blocking(p, b); 1821 btrfs_clear_path_blocking(p, b);
@@ -1733,16 +1825,14 @@ cow_done:
1733 p->slots[level] = slot; 1825 p->slots[level] = slot;
1734 if (ins_len > 0 && 1826 if (ins_len > 0 &&
1735 btrfs_leaf_free_space(root, b) < ins_len) { 1827 btrfs_leaf_free_space(root, b) < ins_len) {
1736 int sret;
1737
1738 btrfs_set_path_blocking(p); 1828 btrfs_set_path_blocking(p);
1739 sret = split_leaf(trans, root, key, 1829 err = split_leaf(trans, root, key,
1740 p, ins_len, ret == 0); 1830 p, ins_len, ret == 0);
1741 btrfs_clear_path_blocking(p, NULL); 1831 btrfs_clear_path_blocking(p, NULL);
1742 1832
1743 BUG_ON(sret > 0); 1833 BUG_ON(err > 0);
1744 if (sret) { 1834 if (err) {
1745 ret = sret; 1835 ret = err;
1746 goto done; 1836 goto done;
1747 } 1837 }
1748 } 1838 }
@@ -1764,138 +1854,6 @@ done:
1764 return ret; 1854 return ret;
1765} 1855}
1766 1856
1767int btrfs_merge_path(struct btrfs_trans_handle *trans,
1768 struct btrfs_root *root,
1769 struct btrfs_key *node_keys,
1770 u64 *nodes, int lowest_level)
1771{
1772 struct extent_buffer *eb;
1773 struct extent_buffer *parent;
1774 struct btrfs_key key;
1775 u64 bytenr;
1776 u64 generation;
1777 u32 blocksize;
1778 int level;
1779 int slot;
1780 int key_match;
1781 int ret;
1782
1783 eb = btrfs_lock_root_node(root);
1784 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
1785 BUG_ON(ret);
1786
1787 btrfs_set_lock_blocking(eb);
1788
1789 parent = eb;
1790 while (1) {
1791 level = btrfs_header_level(parent);
1792 if (level == 0 || level <= lowest_level)
1793 break;
1794
1795 ret = bin_search(parent, &node_keys[lowest_level], level,
1796 &slot);
1797 if (ret && slot > 0)
1798 slot--;
1799
1800 bytenr = btrfs_node_blockptr(parent, slot);
1801 if (nodes[level - 1] == bytenr)
1802 break;
1803
1804 blocksize = btrfs_level_size(root, level - 1);
1805 generation = btrfs_node_ptr_generation(parent, slot);
1806 btrfs_node_key_to_cpu(eb, &key, slot);
1807 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1808
1809 if (generation == trans->transid) {
1810 eb = read_tree_block(root, bytenr, blocksize,
1811 generation);
1812 btrfs_tree_lock(eb);
1813 btrfs_set_lock_blocking(eb);
1814 }
1815
1816 /*
1817 * if node keys match and node pointer hasn't been modified
1818 * in the running transaction, we can merge the path. for
1819 * blocks owned by reloc trees, the node pointer check is
1820 * skipped, this is because these blocks are fully controlled
1821 * by the space balance code, no one else can modify them.
1822 */
1823 if (!nodes[level - 1] || !key_match ||
1824 (generation == trans->transid &&
1825 btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
1826 if (level == 1 || level == lowest_level + 1) {
1827 if (generation == trans->transid) {
1828 btrfs_tree_unlock(eb);
1829 free_extent_buffer(eb);
1830 }
1831 break;
1832 }
1833
1834 if (generation != trans->transid) {
1835 eb = read_tree_block(root, bytenr, blocksize,
1836 generation);
1837 btrfs_tree_lock(eb);
1838 btrfs_set_lock_blocking(eb);
1839 }
1840
1841 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1842 &eb);
1843 BUG_ON(ret);
1844
1845 if (root->root_key.objectid ==
1846 BTRFS_TREE_RELOC_OBJECTID) {
1847 if (!nodes[level - 1]) {
1848 nodes[level - 1] = eb->start;
1849 memcpy(&node_keys[level - 1], &key,
1850 sizeof(node_keys[0]));
1851 } else {
1852 WARN_ON(1);
1853 }
1854 }
1855
1856 btrfs_tree_unlock(parent);
1857 free_extent_buffer(parent);
1858 parent = eb;
1859 continue;
1860 }
1861
1862 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1863 btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1864 btrfs_mark_buffer_dirty(parent);
1865
1866 ret = btrfs_inc_extent_ref(trans, root,
1867 nodes[level - 1],
1868 blocksize, parent->start,
1869 btrfs_header_owner(parent),
1870 btrfs_header_generation(parent),
1871 level - 1);
1872 BUG_ON(ret);
1873
1874 /*
1875 * If the block was created in the running transaction,
1876 * it's possible this is the last reference to it, so we
1877 * should drop the subtree.
1878 */
1879 if (generation == trans->transid) {
1880 ret = btrfs_drop_subtree(trans, root, eb, parent);
1881 BUG_ON(ret);
1882 btrfs_tree_unlock(eb);
1883 free_extent_buffer(eb);
1884 } else {
1885 ret = btrfs_free_extent(trans, root, bytenr,
1886 blocksize, parent->start,
1887 btrfs_header_owner(parent),
1888 btrfs_header_generation(parent),
1889 level - 1, 1);
1890 BUG_ON(ret);
1891 }
1892 break;
1893 }
1894 btrfs_tree_unlock(parent);
1895 free_extent_buffer(parent);
1896 return 0;
1897}
1898
1899/* 1857/*
1900 * adjust the pointers going up the tree, starting at level 1858 * adjust the pointers going up the tree, starting at level
1901 * making sure the right key of each node points to 'key'. 1859 * making sure the right key of each node points to 'key'.
@@ -2021,9 +1979,6 @@ static int push_node_left(struct btrfs_trans_handle *trans,
2021 btrfs_mark_buffer_dirty(src); 1979 btrfs_mark_buffer_dirty(src);
2022 btrfs_mark_buffer_dirty(dst); 1980 btrfs_mark_buffer_dirty(dst);
2023 1981
2024 ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
2025 BUG_ON(ret);
2026
2027 return ret; 1982 return ret;
2028} 1983}
2029 1984
@@ -2083,9 +2038,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
2083 btrfs_mark_buffer_dirty(src); 2038 btrfs_mark_buffer_dirty(src);
2084 btrfs_mark_buffer_dirty(dst); 2039 btrfs_mark_buffer_dirty(dst);
2085 2040
2086 ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
2087 BUG_ON(ret);
2088
2089 return ret; 2041 return ret;
2090} 2042}
2091 2043
@@ -2105,7 +2057,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2105 struct extent_buffer *c; 2057 struct extent_buffer *c;
2106 struct extent_buffer *old; 2058 struct extent_buffer *old;
2107 struct btrfs_disk_key lower_key; 2059 struct btrfs_disk_key lower_key;
2108 int ret;
2109 2060
2110 BUG_ON(path->nodes[level]); 2061 BUG_ON(path->nodes[level]);
2111 BUG_ON(path->nodes[level-1] != root->node); 2062 BUG_ON(path->nodes[level-1] != root->node);
@@ -2117,16 +2068,17 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2117 btrfs_node_key(lower, &lower_key, 0); 2068 btrfs_node_key(lower, &lower_key, 0);
2118 2069
2119 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2070 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2120 root->root_key.objectid, trans->transid, 2071 root->root_key.objectid, &lower_key,
2121 level, root->node->start, 0); 2072 level, root->node->start, 0);
2122 if (IS_ERR(c)) 2073 if (IS_ERR(c))
2123 return PTR_ERR(c); 2074 return PTR_ERR(c);
2124 2075
2125 memset_extent_buffer(c, 0, 0, root->nodesize); 2076 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
2126 btrfs_set_header_nritems(c, 1); 2077 btrfs_set_header_nritems(c, 1);
2127 btrfs_set_header_level(c, level); 2078 btrfs_set_header_level(c, level);
2128 btrfs_set_header_bytenr(c, c->start); 2079 btrfs_set_header_bytenr(c, c->start);
2129 btrfs_set_header_generation(c, trans->transid); 2080 btrfs_set_header_generation(c, trans->transid);
2081 btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
2130 btrfs_set_header_owner(c, root->root_key.objectid); 2082 btrfs_set_header_owner(c, root->root_key.objectid);
2131 2083
2132 write_extent_buffer(c, root->fs_info->fsid, 2084 write_extent_buffer(c, root->fs_info->fsid,
@@ -2151,12 +2103,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2151 root->node = c; 2103 root->node = c;
2152 spin_unlock(&root->node_lock); 2104 spin_unlock(&root->node_lock);
2153 2105
2154 ret = btrfs_update_extent_ref(trans, root, lower->start,
2155 lower->len, lower->start, c->start,
2156 root->root_key.objectid,
2157 trans->transid, level - 1);
2158 BUG_ON(ret);
2159
2160 /* the super has an extra ref to root->node */ 2106 /* the super has an extra ref to root->node */
2161 free_extent_buffer(old); 2107 free_extent_buffer(old);
2162 2108
@@ -2233,7 +2179,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2233 ret = insert_new_root(trans, root, path, level + 1); 2179 ret = insert_new_root(trans, root, path, level + 1);
2234 if (ret) 2180 if (ret)
2235 return ret; 2181 return ret;
2236 } else if (!trans->transaction->delayed_refs.flushing) { 2182 } else {
2237 ret = push_nodes_for_insert(trans, root, path, level); 2183 ret = push_nodes_for_insert(trans, root, path, level);
2238 c = path->nodes[level]; 2184 c = path->nodes[level];
2239 if (!ret && btrfs_header_nritems(c) < 2185 if (!ret && btrfs_header_nritems(c) <
@@ -2244,20 +2190,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2244 } 2190 }
2245 2191
2246 c_nritems = btrfs_header_nritems(c); 2192 c_nritems = btrfs_header_nritems(c);
2193 mid = (c_nritems + 1) / 2;
2194 btrfs_node_key(c, &disk_key, mid);
2247 2195
2248 split = btrfs_alloc_free_block(trans, root, root->nodesize, 2196 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2249 path->nodes[level + 1]->start,
2250 root->root_key.objectid, 2197 root->root_key.objectid,
2251 trans->transid, level, c->start, 0); 2198 &disk_key, level, c->start, 0);
2252 if (IS_ERR(split)) 2199 if (IS_ERR(split))
2253 return PTR_ERR(split); 2200 return PTR_ERR(split);
2254 2201
2255 btrfs_set_header_flags(split, btrfs_header_flags(c)); 2202 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
2256 btrfs_set_header_level(split, btrfs_header_level(c)); 2203 btrfs_set_header_level(split, btrfs_header_level(c));
2257 btrfs_set_header_bytenr(split, split->start); 2204 btrfs_set_header_bytenr(split, split->start);
2258 btrfs_set_header_generation(split, trans->transid); 2205 btrfs_set_header_generation(split, trans->transid);
2206 btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
2259 btrfs_set_header_owner(split, root->root_key.objectid); 2207 btrfs_set_header_owner(split, root->root_key.objectid);
2260 btrfs_set_header_flags(split, 0);
2261 write_extent_buffer(split, root->fs_info->fsid, 2208 write_extent_buffer(split, root->fs_info->fsid,
2262 (unsigned long)btrfs_header_fsid(split), 2209 (unsigned long)btrfs_header_fsid(split),
2263 BTRFS_FSID_SIZE); 2210 BTRFS_FSID_SIZE);
@@ -2265,7 +2212,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2265 (unsigned long)btrfs_header_chunk_tree_uuid(split), 2212 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2266 BTRFS_UUID_SIZE); 2213 BTRFS_UUID_SIZE);
2267 2214
2268 mid = (c_nritems + 1) / 2;
2269 2215
2270 copy_extent_buffer(split, c, 2216 copy_extent_buffer(split, c,
2271 btrfs_node_key_ptr_offset(0), 2217 btrfs_node_key_ptr_offset(0),
@@ -2278,16 +2224,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2278 btrfs_mark_buffer_dirty(c); 2224 btrfs_mark_buffer_dirty(c);
2279 btrfs_mark_buffer_dirty(split); 2225 btrfs_mark_buffer_dirty(split);
2280 2226
2281 btrfs_node_key(split, &disk_key, 0);
2282 wret = insert_ptr(trans, root, path, &disk_key, split->start, 2227 wret = insert_ptr(trans, root, path, &disk_key, split->start,
2283 path->slots[level + 1] + 1, 2228 path->slots[level + 1] + 1,
2284 level + 1); 2229 level + 1);
2285 if (wret) 2230 if (wret)
2286 ret = wret; 2231 ret = wret;
2287 2232
2288 ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2289 BUG_ON(ret);
2290
2291 if (path->slots[level] >= mid) { 2233 if (path->slots[level] >= mid) {
2292 path->slots[level] -= mid; 2234 path->slots[level] -= mid;
2293 btrfs_tree_unlock(c); 2235 btrfs_tree_unlock(c);
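
For readers tracking the reshuffle in split_node() above: mid = (c_nritems + 1) / 2 is now computed before the allocation, so the separating key at mid can be handed to btrfs_alloc_free_block() as &disk_key. A minimal userspace sketch of the arithmetic only (illustrative names, not kernel API):

#include <stdio.h>

/* split_node moves items [mid, nritems) into the new right node,
 * so the right node receives nritems - (nritems + 1) / 2 items. */
static int split_point(int nritems)
{
	return (nritems + 1) / 2;
}

int main(void)
{
	int n;

	for (n = 2; n <= 6; n++)
		printf("nritems=%d: mid=%d, right gets %d item(s)\n",
		       n, split_point(n), n - split_point(n));
	return 0;
}
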
@@ -2360,7 +2302,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2360 u32 right_nritems; 2302 u32 right_nritems;
2361 u32 data_end; 2303 u32 data_end;
2362 u32 this_item_size; 2304 u32 this_item_size;
2363 int ret;
2364 2305
2365 if (empty) 2306 if (empty)
2366 nr = 0; 2307 nr = 0;
@@ -2473,9 +2414,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2473 btrfs_mark_buffer_dirty(left); 2414 btrfs_mark_buffer_dirty(left);
2474 btrfs_mark_buffer_dirty(right); 2415 btrfs_mark_buffer_dirty(right);
2475 2416
2476 ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2477 BUG_ON(ret);
2478
2479 btrfs_item_key(right, &disk_key, 0); 2417 btrfs_item_key(right, &disk_key, 0);
2480 btrfs_set_node_key(upper, &disk_key, slot + 1); 2418 btrfs_set_node_key(upper, &disk_key, slot + 1);
2481 btrfs_mark_buffer_dirty(upper); 2419 btrfs_mark_buffer_dirty(upper);
@@ -2720,10 +2658,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2720 if (right_nritems) 2658 if (right_nritems)
2721 btrfs_mark_buffer_dirty(right); 2659 btrfs_mark_buffer_dirty(right);
2722 2660
2723 ret = btrfs_update_ref(trans, root, right, left,
2724 old_left_nritems, push_items);
2725 BUG_ON(ret);
2726
2727 btrfs_item_key(right, &disk_key, 0); 2661 btrfs_item_key(right, &disk_key, 0);
2728 wret = fixup_low_keys(trans, root, path, &disk_key, 1); 2662 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2729 if (wret) 2663 if (wret)
@@ -2880,9 +2814,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2880 btrfs_mark_buffer_dirty(l); 2814 btrfs_mark_buffer_dirty(l);
2881 BUG_ON(path->slots[0] != slot); 2815 BUG_ON(path->slots[0] != slot);
2882 2816
2883 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2884 BUG_ON(ret);
2885
2886 if (mid <= slot) { 2817 if (mid <= slot) {
2887 btrfs_tree_unlock(path->nodes[0]); 2818 btrfs_tree_unlock(path->nodes[0]);
2888 free_extent_buffer(path->nodes[0]); 2819 free_extent_buffer(path->nodes[0]);
@@ -2911,6 +2842,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2911 struct btrfs_path *path, int data_size, 2842 struct btrfs_path *path, int data_size,
2912 int extend) 2843 int extend)
2913{ 2844{
2845 struct btrfs_disk_key disk_key;
2914 struct extent_buffer *l; 2846 struct extent_buffer *l;
2915 u32 nritems; 2847 u32 nritems;
2916 int mid; 2848 int mid;
@@ -2918,12 +2850,11 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2918 struct extent_buffer *right; 2850 struct extent_buffer *right;
2919 int ret = 0; 2851 int ret = 0;
2920 int wret; 2852 int wret;
2921 int double_split; 2853 int split;
2922 int num_doubles = 0; 2854 int num_doubles = 0;
2923 2855
2924 /* first try to make some room by pushing left and right */ 2856 /* first try to make some room by pushing left and right */
2925 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY && 2857 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2926 !trans->transaction->delayed_refs.flushing) {
2927 wret = push_leaf_right(trans, root, path, data_size, 0); 2858 wret = push_leaf_right(trans, root, path, data_size, 0);
2928 if (wret < 0) 2859 if (wret < 0)
2929 return wret; 2860 return wret;
@@ -2945,16 +2876,53 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2945 return ret; 2876 return ret;
2946 } 2877 }
2947again: 2878again:
2948 double_split = 0; 2879 split = 1;
2949 l = path->nodes[0]; 2880 l = path->nodes[0];
2950 slot = path->slots[0]; 2881 slot = path->slots[0];
2951 nritems = btrfs_header_nritems(l); 2882 nritems = btrfs_header_nritems(l);
2952 mid = (nritems + 1) / 2; 2883 mid = (nritems + 1) / 2;
2953 2884
2954 right = btrfs_alloc_free_block(trans, root, root->leafsize, 2885 if (mid <= slot) {
2955 path->nodes[1]->start, 2886 if (nritems == 1 ||
2887 leaf_space_used(l, mid, nritems - mid) + data_size >
2888 BTRFS_LEAF_DATA_SIZE(root)) {
2889 if (slot >= nritems) {
2890 split = 0;
2891 } else {
2892 mid = slot;
2893 if (mid != nritems &&
2894 leaf_space_used(l, mid, nritems - mid) +
2895 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2896 split = 2;
2897 }
2898 }
2899 }
2900 } else {
2901 if (leaf_space_used(l, 0, mid) + data_size >
2902 BTRFS_LEAF_DATA_SIZE(root)) {
2903 if (!extend && data_size && slot == 0) {
2904 split = 0;
2905 } else if ((extend || !data_size) && slot == 0) {
2906 mid = 1;
2907 } else {
2908 mid = slot;
2909 if (mid != nritems &&
2910 leaf_space_used(l, mid, nritems - mid) +
2911 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2912 split = 2;
2913 }
2914 }
2915 }
2916 }
2917
2918 if (split == 0)
2919 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2920 else
2921 btrfs_item_key(l, &disk_key, mid);
2922
2923 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2956 root->root_key.objectid, 2924 root->root_key.objectid,
2957 trans->transid, 0, l->start, 0); 2925 &disk_key, 0, l->start, 0);
2958 if (IS_ERR(right)) { 2926 if (IS_ERR(right)) {
2959 BUG_ON(1); 2927 BUG_ON(1);
2960 return PTR_ERR(right); 2928 return PTR_ERR(right);
@@ -2963,6 +2931,7 @@ again:
2963 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 2931 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2964 btrfs_set_header_bytenr(right, right->start); 2932 btrfs_set_header_bytenr(right, right->start);
2965 btrfs_set_header_generation(right, trans->transid); 2933 btrfs_set_header_generation(right, trans->transid);
2934 btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
2966 btrfs_set_header_owner(right, root->root_key.objectid); 2935 btrfs_set_header_owner(right, root->root_key.objectid);
2967 btrfs_set_header_level(right, 0); 2936 btrfs_set_header_level(right, 0);
2968 write_extent_buffer(right, root->fs_info->fsid, 2937 write_extent_buffer(right, root->fs_info->fsid,
@@ -2973,79 +2942,47 @@ again:
2973 (unsigned long)btrfs_header_chunk_tree_uuid(right), 2942 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2974 BTRFS_UUID_SIZE); 2943 BTRFS_UUID_SIZE);
2975 2944
2976 if (mid <= slot) { 2945 if (split == 0) {
2977 if (nritems == 1 || 2946 if (mid <= slot) {
2978 leaf_space_used(l, mid, nritems - mid) + data_size > 2947 btrfs_set_header_nritems(right, 0);
2979 BTRFS_LEAF_DATA_SIZE(root)) { 2948 wret = insert_ptr(trans, root, path,
2980 if (slot >= nritems) { 2949 &disk_key, right->start,
2981 struct btrfs_disk_key disk_key; 2950 path->slots[1] + 1, 1);
2982 2951 if (wret)
2983 btrfs_cpu_key_to_disk(&disk_key, ins_key); 2952 ret = wret;
2984 btrfs_set_header_nritems(right, 0);
2985 wret = insert_ptr(trans, root, path,
2986 &disk_key, right->start,
2987 path->slots[1] + 1, 1);
2988 if (wret)
2989 ret = wret;
2990 2953
2991 btrfs_tree_unlock(path->nodes[0]); 2954 btrfs_tree_unlock(path->nodes[0]);
2992 free_extent_buffer(path->nodes[0]); 2955 free_extent_buffer(path->nodes[0]);
2993 path->nodes[0] = right; 2956 path->nodes[0] = right;
2994 path->slots[0] = 0; 2957 path->slots[0] = 0;
2995 path->slots[1] += 1; 2958 path->slots[1] += 1;
2996 btrfs_mark_buffer_dirty(right); 2959 } else {
2997 return ret; 2960 btrfs_set_header_nritems(right, 0);
2998 } 2961 wret = insert_ptr(trans, root, path,
2999 mid = slot; 2962 &disk_key,
3000 if (mid != nritems && 2963 right->start,
3001 leaf_space_used(l, mid, nritems - mid) + 2964 path->slots[1], 1);
3002 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 2965 if (wret)
3003 double_split = 1; 2966 ret = wret;
3004 } 2967 btrfs_tree_unlock(path->nodes[0]);
3005 } 2968 free_extent_buffer(path->nodes[0]);
3006 } else { 2969 path->nodes[0] = right;
3007 if (leaf_space_used(l, 0, mid) + data_size > 2970 path->slots[0] = 0;
3008 BTRFS_LEAF_DATA_SIZE(root)) { 2971 if (path->slots[1] == 0) {
3009 if (!extend && data_size && slot == 0) { 2972 wret = fixup_low_keys(trans, root,
3010 struct btrfs_disk_key disk_key; 2973 path, &disk_key, 1);
3011
3012 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3013 btrfs_set_header_nritems(right, 0);
3014 wret = insert_ptr(trans, root, path,
3015 &disk_key,
3016 right->start,
3017 path->slots[1], 1);
3018 if (wret) 2974 if (wret)
3019 ret = wret; 2975 ret = wret;
3020 btrfs_tree_unlock(path->nodes[0]);
3021 free_extent_buffer(path->nodes[0]);
3022 path->nodes[0] = right;
3023 path->slots[0] = 0;
3024 if (path->slots[1] == 0) {
3025 wret = fixup_low_keys(trans, root,
3026 path, &disk_key, 1);
3027 if (wret)
3028 ret = wret;
3029 }
3030 btrfs_mark_buffer_dirty(right);
3031 return ret;
3032 } else if ((extend || !data_size) && slot == 0) {
3033 mid = 1;
3034 } else {
3035 mid = slot;
3036 if (mid != nritems &&
3037 leaf_space_used(l, mid, nritems - mid) +
3038 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3039 double_split = 1;
3040 }
3041 } 2976 }
3042 } 2977 }
2978 btrfs_mark_buffer_dirty(right);
2979 return ret;
3043 } 2980 }
3044 2981
3045 ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); 2982 ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
3046 BUG_ON(ret); 2983 BUG_ON(ret);
3047 2984
3048 if (double_split) { 2985 if (split == 2) {
3049 BUG_ON(num_doubles != 0); 2986 BUG_ON(num_doubles != 0);
3050 num_doubles++; 2987 num_doubles++;
3051 goto again; 2988 goto again;
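
For orientation in the rewritten split_leaf() above: the boolean double_split is replaced by a three-valued split that is decided before the right leaf is allocated, which is what allows the separating key to be passed to the allocator up front. A hedged summary (the enum names are illustrative, not kernel definitions):

/* Meaning of 'split' in the new split_leaf():
 *
 *   split == 0  the new item gets a brand-new empty leaf: insert_ptr()
 *               links it at slots[1] (or slots[1] + 1) and the function
 *               returns without copying any items.
 *   split == 1  the common case: copy_for_split() moves items
 *               [mid, nritems) of leaf l into the new right leaf.
 *   split == 2  one split cannot make enough room at the target slot,
 *               so after copy_for_split() control jumps back to 'again'
 *               for exactly one more pass (BUG_ON(num_doubles != 0)).
 */
enum { LEAF_SPLIT_NONE = 0, LEAF_SPLIT_ONE = 1, LEAF_SPLIT_DOUBLE = 2 };
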
@@ -3447,7 +3384,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3447 /* figure out how many keys we can insert in here */ 3384 /* figure out how many keys we can insert in here */
3448 total_data = data_size[0]; 3385 total_data = data_size[0];
3449 for (i = 1; i < nr; i++) { 3386 for (i = 1; i < nr; i++) {
3450 if (comp_cpu_keys(&found_key, cpu_key + i) <= 0) 3387 if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3451 break; 3388 break;
3452 total_data += data_size[i]; 3389 total_data += data_size[i];
3453 } 3390 }
@@ -3745,9 +3682,7 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3745 3682
3746/* 3683/*
3747 * a helper function to delete the leaf pointed to by path->slots[1] and 3684 * a helper function to delete the leaf pointed to by path->slots[1] and
3748 * path->nodes[1]. bytenr is the node block pointer, but since the callers 3685 * path->nodes[1].
3749 * already know it, it is faster to have them pass it down than to
3750 * read it out of the node again.
3751 * 3686 *
3752 * This deletes the pointer in path->nodes[1] and frees the leaf 3687 * This deletes the pointer in path->nodes[1] and frees the leaf
3753 * block extent. zero is returned if it all worked out, < 0 otherwise. 3688 * block extent. zero is returned if it all worked out, < 0 otherwise.
@@ -3755,15 +3690,14 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3755 * The path must have already been setup for deleting the leaf, including 3690 * The path must have already been setup for deleting the leaf, including
3756 * all the proper balancing. path->nodes[1] must be locked. 3691 * all the proper balancing. path->nodes[1] must be locked.
3757 */ 3692 */
3758noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, 3693static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3759 struct btrfs_root *root, 3694 struct btrfs_root *root,
3760 struct btrfs_path *path, u64 bytenr) 3695 struct btrfs_path *path,
3696 struct extent_buffer *leaf)
3761{ 3697{
3762 int ret; 3698 int ret;
3763 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3764 u64 parent_start = path->nodes[1]->start;
3765 u64 parent_owner = btrfs_header_owner(path->nodes[1]);
3766 3699
3700 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
3767 ret = del_ptr(trans, root, path, 1, path->slots[1]); 3701 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3768 if (ret) 3702 if (ret)
3769 return ret; 3703 return ret;
@@ -3774,10 +3708,8 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3774 */ 3708 */
3775 btrfs_unlock_up_safe(path, 0); 3709 btrfs_unlock_up_safe(path, 0);
3776 3710
3777 ret = btrfs_free_extent(trans, root, bytenr, 3711 ret = btrfs_free_extent(trans, root, leaf->start, leaf->len,
3778 btrfs_level_size(root, 0), 3712 0, root->root_key.objectid, 0, 0);
3779 parent_start, parent_owner,
3780 root_gen, 0, 1);
3781 return ret; 3713 return ret;
3782} 3714}
3783/* 3715/*
@@ -3845,7 +3777,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3845 if (leaf == root->node) { 3777 if (leaf == root->node) {
3846 btrfs_set_header_level(leaf, 0); 3778 btrfs_set_header_level(leaf, 0);
3847 } else { 3779 } else {
3848 ret = btrfs_del_leaf(trans, root, path, leaf->start); 3780 ret = btrfs_del_leaf(trans, root, path, leaf);
3849 BUG_ON(ret); 3781 BUG_ON(ret);
3850 } 3782 }
3851 } else { 3783 } else {
@@ -3861,8 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3861 } 3793 }
3862 3794
3863 /* delete the leaf if it is mostly empty */ 3795 /* delete the leaf if it is mostly empty */
3864 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 && 3796 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
3865 !trans->transaction->delayed_refs.flushing) {
3866 /* push_leaf_left fixes the path. 3797 /* push_leaf_left fixes the path.
3867 * make sure the path still points to our leaf 3798 * make sure the path still points to our leaf
3868 * for possible call to del_ptr below 3799 * for possible call to del_ptr below
@@ -3884,8 +3815,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3884 3815
3885 if (btrfs_header_nritems(leaf) == 0) { 3816 if (btrfs_header_nritems(leaf) == 0) {
3886 path->slots[1] = slot; 3817 path->slots[1] = slot;
3887 ret = btrfs_del_leaf(trans, root, path, 3818 ret = btrfs_del_leaf(trans, root, path, leaf);
3888 leaf->start);
3889 BUG_ON(ret); 3819 BUG_ON(ret);
3890 free_extent_buffer(leaf); 3820 free_extent_buffer(leaf);
3891 } else { 3821 } else {
@@ -4098,10 +4028,9 @@ out:
4098 * calling this function. 4028 * calling this function.
4099 */ 4029 */
4100int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 4030int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4101 struct btrfs_key *key, int lowest_level, 4031 struct btrfs_key *key, int level,
4102 int cache_only, u64 min_trans) 4032 int cache_only, u64 min_trans)
4103{ 4033{
4104 int level = lowest_level;
4105 int slot; 4034 int slot;
4106 struct extent_buffer *c; 4035 struct extent_buffer *c;
4107 4036
@@ -4114,11 +4043,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4114 c = path->nodes[level]; 4043 c = path->nodes[level];
4115next: 4044next:
4116 if (slot >= btrfs_header_nritems(c)) { 4045 if (slot >= btrfs_header_nritems(c)) {
4117 level++; 4046 int ret;
4118 if (level == BTRFS_MAX_LEVEL) 4047 int orig_lowest;
4048 struct btrfs_key cur_key;
4049 if (level + 1 >= BTRFS_MAX_LEVEL ||
4050 !path->nodes[level + 1])
4119 return 1; 4051 return 1;
4120 continue; 4052
4053 if (path->locks[level + 1]) {
4054 level++;
4055 continue;
4056 }
4057
4058 slot = btrfs_header_nritems(c) - 1;
4059 if (level == 0)
4060 btrfs_item_key_to_cpu(c, &cur_key, slot);
4061 else
4062 btrfs_node_key_to_cpu(c, &cur_key, slot);
4063
4064 orig_lowest = path->lowest_level;
4065 btrfs_release_path(root, path);
4066 path->lowest_level = level;
4067 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4068 0, 0);
4069 path->lowest_level = orig_lowest;
4070 if (ret < 0)
4071 return ret;
4072
4073 c = path->nodes[level];
4074 slot = path->slots[level];
4075 if (ret == 0)
4076 slot++;
4077 goto next;
4121 } 4078 }
4079
4122 if (level == 0) 4080 if (level == 0)
4123 btrfs_item_key_to_cpu(c, key, slot); 4081 btrfs_item_key_to_cpu(c, key, slot);
4124 else { 4082 else {
@@ -4202,7 +4160,8 @@ again:
4202 * advance the path if there are now more items available. 4160 * advance the path if there are now more items available.
4203 */ 4161 */
4204 if (nritems > 0 && path->slots[0] < nritems - 1) { 4162 if (nritems > 0 && path->slots[0] < nritems - 1) {
4205 path->slots[0]++; 4163 if (ret == 0)
4164 path->slots[0]++;
4206 ret = 0; 4165 ret = 0;
4207 goto done; 4166 goto done;
4208 } 4167 }
@@ -4334,10 +4293,10 @@ int btrfs_previous_item(struct btrfs_root *root,
4334 path->slots[0]--; 4293 path->slots[0]--;
4335 4294
4336 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4295 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4337 if (found_key.type == type)
4338 return 0;
4339 if (found_key.objectid < min_objectid) 4296 if (found_key.objectid < min_objectid)
4340 break; 4297 break;
4298 if (found_key.type == type)
4299 return 0;
4341 if (found_key.objectid == min_objectid && 4300 if (found_key.objectid == min_objectid &&
4342 found_key.type < type) 4301 found_key.type < type)
4343 break; 4302 break;
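
The btrfs_previous_item() reordering just above is a correctness fix rather than a cleanup: the objectid lower bound has to be tested before the type match, or a key whose type matches but whose objectid has already dropped below min_objectid would be reported as a hit. A standalone restatement of the corrected predicate (toy key struct and names, purely illustrative):

#include <stdbool.h>
#include <stdint.h>

struct toy_key { uint64_t objectid; uint8_t type; };

/* Returns true when the backwards walk should stop; *match tells the
 * caller whether it stopped on a valid hit. The bound check comes first. */
static bool previous_item_stop(const struct toy_key *found,
			       uint64_t min_objectid, uint8_t type,
			       bool *match)
{
	*match = false;
	if (found->objectid < min_objectid)
		return true;			/* walked out of range */
	if (found->type == type) {
		*match = true;			/* hit inside the range */
		return true;
	}
	return found->objectid == min_objectid && found->type < type;
}
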
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4414a5d9983a..837435ce84ca 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -41,10 +41,10 @@ struct btrfs_ordered_sum;
41 41
42#define BTRFS_MAGIC "_BHRfS_M" 42#define BTRFS_MAGIC "_BHRfS_M"
43 43
44#define BTRFS_ACL_NOT_CACHED ((void *)-1)
45
46#define BTRFS_MAX_LEVEL 8 44#define BTRFS_MAX_LEVEL 8
47 45
46#define BTRFS_COMPAT_EXTENT_TREE_V0
47
48/* 48/*
49 * files bigger than this get some pre-flushing when they are added 49 * files bigger than this get some pre-flushing when they are added
50 * to the ordered operations list. That way we limit the total 50 * to the ordered operations list. That way we limit the total
@@ -267,7 +267,18 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
267} 267}
268 268
269#define BTRFS_FSID_SIZE 16 269#define BTRFS_FSID_SIZE 16
270#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0) 270#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
271#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
272#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
273#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
274
275#define BTRFS_BACKREF_REV_MAX 256
276#define BTRFS_BACKREF_REV_SHIFT 56
277#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \
278 BTRFS_BACKREF_REV_SHIFT)
279
280#define BTRFS_OLD_BACKREF_REV 0
281#define BTRFS_MIXED_BACKREF_REV 1
271 282
272/* 283/*
273 * every tree block (leaf or node) starts with this header. 284 * every tree block (leaf or node) starts with this header.
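
The new BTRFS_BACKREF_REV_* constants carve the top byte out of the 64-bit header flags to version the backref scheme; btrfs_set_header_backref_rev()/btrfs_header_backref_rev() later in this diff do the actual packing. A self-contained check of the bit arithmetic (userspace types, same shift and mask values as above):

#include <assert.h>
#include <stdint.h>

#define BACKREF_REV_MAX		256
#define BACKREF_REV_SHIFT	56
#define BACKREF_REV_MASK	(((uint64_t)BACKREF_REV_MAX - 1) << \
				 BACKREF_REV_SHIFT)
#define MIXED_BACKREF_REV	1

static uint64_t set_backref_rev(uint64_t flags, int rev)
{
	flags &= ~BACKREF_REV_MASK;
	return flags | ((uint64_t)rev << BACKREF_REV_SHIFT);
}

int main(void)
{
	uint64_t flags = 0x3;	/* say, WRITTEN | RELOC */

	flags = set_backref_rev(flags, MIXED_BACKREF_REV);
	assert((int)(flags >> BACKREF_REV_SHIFT) == MIXED_BACKREF_REV);
	assert((flags & 0x3) == 0x3);	/* low flag bits untouched */
	return 0;
}
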
@@ -296,7 +307,6 @@ struct btrfs_header {
296 sizeof(struct btrfs_item) - \ 307 sizeof(struct btrfs_item) - \
297 sizeof(struct btrfs_file_extent_item)) 308 sizeof(struct btrfs_file_extent_item))
298 309
299#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
300 310
301/* 311/*
302 * this is a very generous portion of the super block, giving us 312 * this is a very generous portion of the super block, giving us
@@ -355,9 +365,12 @@ struct btrfs_super_block {
355 * Compat flags that we support. If any incompat flags are set other than the 365 * Compat flags that we support. If any incompat flags are set other than the
356 * ones specified below then we will fail to mount 366 * ones specified below then we will fail to mount
357 */ 367 */
358#define BTRFS_FEATURE_COMPAT_SUPP 0x0 368#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
359#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0 369
360#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0 370#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
371#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
372#define BTRFS_FEATURE_INCOMPAT_SUPP \
373 BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
361 374
362/* 375/*
363 * A leaf is full of items. offset and size tell us where to find 376 * A leaf is full of items. offset and size tell us where to find
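
MIXED_BACKREF is the first real incompat bit, so the *_SUPP masks switch from 0x0 to genuine masks. The usual consequence at mount time is rejecting any superblock that advertises bits outside the supported mask; a sketch of that check (not the kernel's actual open_ctree() code):

#include <stdint.h>
#include <stdio.h>

#define FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
#define FEATURE_INCOMPAT_SUPP		FEATURE_INCOMPAT_MIXED_BACKREF

/* Returns 0 when every incompat bit set on disk is understood. */
static int check_incompat(uint64_t incompat)
{
	uint64_t unsupported = incompat & ~FEATURE_INCOMPAT_SUPP;

	if (unsupported) {
		fprintf(stderr, "unsupported incompat features: 0x%llx\n",
			(unsigned long long)unsupported);
		return -1;
	}
	return 0;
}
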
@@ -421,23 +434,65 @@ struct btrfs_path {
421 unsigned int keep_locks:1; 434 unsigned int keep_locks:1;
422 unsigned int skip_locking:1; 435 unsigned int skip_locking:1;
423 unsigned int leave_spinning:1; 436 unsigned int leave_spinning:1;
437 unsigned int search_commit_root:1;
424}; 438};
425 439
426/* 440/*
427 * items in the extent btree are used to record the objectid of the 441 * items in the extent btree are used to record the objectid of the
428 * owner of the block and the number of references 442 * owner of the block and the number of references
429 */ 443 */
444
430struct btrfs_extent_item { 445struct btrfs_extent_item {
446 __le64 refs;
447 __le64 generation;
448 __le64 flags;
449} __attribute__ ((__packed__));
450
451struct btrfs_extent_item_v0 {
431 __le32 refs; 452 __le32 refs;
432} __attribute__ ((__packed__)); 453} __attribute__ ((__packed__));
433 454
434struct btrfs_extent_ref { 455#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
456 sizeof(struct btrfs_item))
457
458#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0)
459#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1)
460
461/* following flags only apply to tree blocks */
462
463/* use full backrefs for extent pointers in the block */
464#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
465
466struct btrfs_tree_block_info {
467 struct btrfs_disk_key key;
468 u8 level;
469} __attribute__ ((__packed__));
470
471struct btrfs_extent_data_ref {
472 __le64 root;
473 __le64 objectid;
474 __le64 offset;
475 __le32 count;
476} __attribute__ ((__packed__));
477
478struct btrfs_shared_data_ref {
479 __le32 count;
480} __attribute__ ((__packed__));
481
482struct btrfs_extent_inline_ref {
483 u8 type;
484 __le64 offset;
485} __attribute__ ((__packed__));
486
487/* old style backrefs item */
488struct btrfs_extent_ref_v0 {
435 __le64 root; 489 __le64 root;
436 __le64 generation; 490 __le64 generation;
437 __le64 objectid; 491 __le64 objectid;
438 __le32 num_refs; 492 __le32 count;
439} __attribute__ ((__packed__)); 493} __attribute__ ((__packed__));
440 494
495
441/* dev extents record free space on individual devices. The owner 496/* dev extents record free space on individual devices. The owner
442 * field points back to the chunk allocation mapping tree that allocated 497 * field points back to the chunk allocation mapping tree that allocated
443 * the extent. The chunk tree uuid field is a way to double check the owner 498 * the extent. The chunk tree uuid field is a way to double check the owner
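
Since every new on-disk structure above is packed, their byte sizes can be sanity-checked on the host, which helps when reasoning about BTRFS_MAX_EXTENT_ITEM_SIZE and inline backref layout. The structs below are local redefinitions with fixed-width types, for illustration only:

#include <stdint.h>
#include <stdio.h>

struct disk_key { uint64_t objectid; uint8_t type; uint64_t offset; }
	__attribute__((__packed__));
struct extent_item { uint64_t refs, generation, flags; }
	__attribute__((__packed__));
struct tree_block_info { struct disk_key key; uint8_t level; }
	__attribute__((__packed__));
struct extent_data_ref { uint64_t root, objectid, offset; uint32_t count; }
	__attribute__((__packed__));
struct extent_inline_ref { uint8_t type; uint64_t offset; }
	__attribute__((__packed__));

int main(void)
{
	printf("extent_item:       %zu\n", sizeof(struct extent_item));       /* 24 */
	printf("tree_block_info:   %zu\n", sizeof(struct tree_block_info));   /* 18 */
	printf("extent_data_ref:   %zu\n", sizeof(struct extent_data_ref));   /* 28 */
	printf("extent_inline_ref: %zu\n", sizeof(struct extent_inline_ref)); /* 9  */
	return 0;
}
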
@@ -634,6 +689,7 @@ struct btrfs_space_info {
634 struct list_head block_groups; 689 struct list_head block_groups;
635 spinlock_t lock; 690 spinlock_t lock;
636 struct rw_semaphore groups_sem; 691 struct rw_semaphore groups_sem;
692 atomic_t caching_threads;
637}; 693};
638 694
639/* 695/*
@@ -652,6 +708,9 @@ struct btrfs_free_cluster {
652 /* first extent starting offset */ 708 /* first extent starting offset */
653 u64 window_start; 709 u64 window_start;
654 710
711 /* if this cluster simply points at a bitmap in the block group */
712 bool points_to_bitmap;
713
655 struct btrfs_block_group_cache *block_group; 714 struct btrfs_block_group_cache *block_group;
656 /* 715 /*
657 * when a cluster is allocated from a block group, we put the 716 * when a cluster is allocated from a block group, we put the
@@ -661,24 +720,37 @@ struct btrfs_free_cluster {
661 struct list_head block_group_list; 720 struct list_head block_group_list;
662}; 721};
663 722
723enum btrfs_caching_type {
724 BTRFS_CACHE_NO = 0,
725 BTRFS_CACHE_STARTED = 1,
726 BTRFS_CACHE_FINISHED = 2,
727};
728
664struct btrfs_block_group_cache { 729struct btrfs_block_group_cache {
665 struct btrfs_key key; 730 struct btrfs_key key;
666 struct btrfs_block_group_item item; 731 struct btrfs_block_group_item item;
732 struct btrfs_fs_info *fs_info;
667 spinlock_t lock; 733 spinlock_t lock;
668 struct mutex cache_mutex;
669 u64 pinned; 734 u64 pinned;
670 u64 reserved; 735 u64 reserved;
671 u64 flags; 736 u64 flags;
672 int cached; 737 u64 sectorsize;
738 int extents_thresh;
739 int free_extents;
740 int total_bitmaps;
673 int ro; 741 int ro;
674 int dirty; 742 int dirty;
675 743
744 /* cache tracking stuff */
745 wait_queue_head_t caching_q;
746 int cached;
747
676 struct btrfs_space_info *space_info; 748 struct btrfs_space_info *space_info;
677 749
678 /* free space cache stuff */ 750 /* free space cache stuff */
679 spinlock_t tree_lock; 751 spinlock_t tree_lock;
680 struct rb_root free_space_bytes;
681 struct rb_root free_space_offset; 752 struct rb_root free_space_offset;
753 u64 free_space;
682 754
683 /* block group cache stuff */ 755 /* block group cache stuff */
684 struct rb_node cache_node; 756 struct rb_node cache_node;
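
The new cached field plus the caching_q wait queue turn block group free-space caching into an asynchronous state machine (BTRFS_CACHE_NO -> STARTED -> FINISHED, per the enum above). A much-simplified userspace analogue of a consumer waiting for the FINISHED state, with a pthread condvar standing in for the kernel wait queue (all names illustrative):

#include <pthread.h>

enum cache_state { CACHE_NO, CACHE_STARTED, CACHE_FINISHED };

struct toy_block_group {
	enum cache_state cached;
	pthread_mutex_t lock;
	pthread_cond_t caching_q;	/* stands in for wait_queue_head_t */
};

static void wait_for_cache(struct toy_block_group *bg)
{
	pthread_mutex_lock(&bg->lock);
	while (bg->cached != CACHE_FINISHED)
		pthread_cond_wait(&bg->caching_q, &bg->lock);
	pthread_mutex_unlock(&bg->lock);
}
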
@@ -695,12 +767,7 @@ struct btrfs_block_group_cache {
695 struct list_head cluster_list; 767 struct list_head cluster_list;
696}; 768};
697 769
698struct btrfs_leaf_ref_tree { 770struct reloc_control;
699 struct rb_root root;
700 struct list_head list;
701 spinlock_t lock;
702};
703
704struct btrfs_device; 771struct btrfs_device;
705struct btrfs_fs_devices; 772struct btrfs_fs_devices;
706struct btrfs_fs_info { 773struct btrfs_fs_info {
@@ -758,6 +825,7 @@ struct btrfs_fs_info {
758 struct mutex drop_mutex; 825 struct mutex drop_mutex;
759 struct mutex volume_mutex; 826 struct mutex volume_mutex;
760 struct mutex tree_reloc_mutex; 827 struct mutex tree_reloc_mutex;
828 struct rw_semaphore extent_commit_sem;
761 829
762 /* 830 /*
763 * this protects the ordered operations list only while we are 831 * this protects the ordered operations list only while we are
@@ -831,18 +899,11 @@ struct btrfs_fs_info {
831 struct task_struct *cleaner_kthread; 899 struct task_struct *cleaner_kthread;
832 int thread_pool_size; 900 int thread_pool_size;
833 901
834 /* tree relocation relocated fields */
835 struct list_head dead_reloc_roots;
836 struct btrfs_leaf_ref_tree reloc_ref_tree;
837 struct btrfs_leaf_ref_tree shared_ref_tree;
838
839 struct kobject super_kobj; 902 struct kobject super_kobj;
840 struct completion kobj_unregister; 903 struct completion kobj_unregister;
841 int do_barriers; 904 int do_barriers;
842 int closing; 905 int closing;
843 int log_root_recovering; 906 int log_root_recovering;
844 atomic_t throttles;
845 atomic_t throttle_gen;
846 907
847 u64 total_pinned; 908 u64 total_pinned;
848 909
@@ -861,6 +922,8 @@ struct btrfs_fs_info {
861 */ 922 */
862 struct list_head space_info; 923 struct list_head space_info;
863 924
925 struct reloc_control *reloc_ctl;
926
864 spinlock_t delalloc_lock; 927 spinlock_t delalloc_lock;
865 spinlock_t new_trans_lock; 928 spinlock_t new_trans_lock;
866 u64 delalloc_bytes; 929 u64 delalloc_bytes;
@@ -891,7 +954,6 @@ struct btrfs_fs_info {
891 * in ram representation of the tree. extent_root is used for all allocations 954 * in ram representation of the tree. extent_root is used for all allocations
892 * and for the extent tree extent_root root. 955 * and for the extent tree extent_root root.
893 */ 956 */
894struct btrfs_dirty_root;
895struct btrfs_root { 957struct btrfs_root {
896 struct extent_buffer *node; 958 struct extent_buffer *node;
897 959
@@ -899,9 +961,6 @@ struct btrfs_root {
899 spinlock_t node_lock; 961 spinlock_t node_lock;
900 962
901 struct extent_buffer *commit_root; 963 struct extent_buffer *commit_root;
902 struct btrfs_leaf_ref_tree *ref_tree;
903 struct btrfs_leaf_ref_tree ref_tree_struct;
904 struct btrfs_dirty_root *dirty_root;
905 struct btrfs_root *log_root; 964 struct btrfs_root *log_root;
906 struct btrfs_root *reloc_root; 965 struct btrfs_root *reloc_root;
907 966
@@ -952,10 +1011,15 @@ struct btrfs_root {
952 /* the dirty list is only used by non-reference counted roots */ 1011 /* the dirty list is only used by non-reference counted roots */
953 struct list_head dirty_list; 1012 struct list_head dirty_list;
954 1013
1014 struct list_head root_list;
1015
955 spinlock_t list_lock; 1016 spinlock_t list_lock;
956 struct list_head dead_list;
957 struct list_head orphan_list; 1017 struct list_head orphan_list;
958 1018
1019 spinlock_t inode_lock;
1020 /* red-black tree that keeps track of in-memory inodes */
1021 struct rb_root inode_tree;
1022
959 /* 1023 /*
960 * right now this just gets used so that a root has its own devid 1024 * right now this just gets used so that a root has its own devid
961 * for stat. It may be used for more later 1025 * for stat. It may be used for more later
@@ -1017,7 +1081,16 @@ struct btrfs_root {
1017 * are used, and how many references there are to each block 1081 * are used, and how many references there are to each block
1018 */ 1082 */
1019#define BTRFS_EXTENT_ITEM_KEY 168 1083#define BTRFS_EXTENT_ITEM_KEY 168
1020#define BTRFS_EXTENT_REF_KEY 180 1084
1085#define BTRFS_TREE_BLOCK_REF_KEY 176
1086
1087#define BTRFS_EXTENT_DATA_REF_KEY 178
1088
1089#define BTRFS_EXTENT_REF_V0_KEY 180
1090
1091#define BTRFS_SHARED_BLOCK_REF_KEY 182
1092
1093#define BTRFS_SHARED_DATA_REF_KEY 184
1021 1094
1022/* 1095/*
1023 * block groups give us hints into the extent allocation trees. Which 1096 * block groups give us hints into the extent allocation trees. Which
@@ -1043,6 +1116,8 @@ struct btrfs_root {
1043#define BTRFS_MOUNT_COMPRESS (1 << 5) 1116#define BTRFS_MOUNT_COMPRESS (1 << 5)
1044#define BTRFS_MOUNT_NOTREELOG (1 << 6) 1117#define BTRFS_MOUNT_NOTREELOG (1 << 6)
1045#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) 1118#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
1119#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
1120#define BTRFS_MOUNT_NOSSD (1 << 9)
1046 1121
1047#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1122#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1048#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1123#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1056,12 +1131,14 @@ struct btrfs_root {
1056#define BTRFS_INODE_READONLY (1 << 2) 1131#define BTRFS_INODE_READONLY (1 << 2)
1057#define BTRFS_INODE_NOCOMPRESS (1 << 3) 1132#define BTRFS_INODE_NOCOMPRESS (1 << 3)
1058#define BTRFS_INODE_PREALLOC (1 << 4) 1133#define BTRFS_INODE_PREALLOC (1 << 4)
1059#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ 1134#define BTRFS_INODE_SYNC (1 << 5)
1060 ~BTRFS_INODE_##flag) 1135#define BTRFS_INODE_IMMUTABLE (1 << 6)
1061#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ 1136#define BTRFS_INODE_APPEND (1 << 7)
1062 BTRFS_INODE_##flag) 1137#define BTRFS_INODE_NODUMP (1 << 8)
1063#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \ 1138#define BTRFS_INODE_NOATIME (1 << 9)
1064 BTRFS_INODE_##flag) 1139#define BTRFS_INODE_DIRSYNC (1 << 10)
1140
1141
1065/* some macros to generate set/get funcs for the struct fields. This 1142/* some macros to generate set/get funcs for the struct fields. This
1066 * assumes there is a lefoo_to_cpu for every type, so lets make a simple 1143 * assumes there is a lefoo_to_cpu for every type, so lets make a simple
1067 * one for u8: 1144 * one for u8:
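
With the btrfs_set_flag()/btrfs_clear_flag()/btrfs_test_flag() wrappers gone, callers are expected to use plain bit operations on the widened flag set. A sketch of the replacement pattern with two of the flag values from the hunk above (the flags word here is a stand-in for BTRFS_I(inode)->flags):

#include <stdint.h>

#define INODE_NOCOMPRESS	(1 << 3)
#define INODE_NOATIME		(1 << 9)

static inline void set_noatime(uint32_t *flags)		/* was btrfs_set_flag */
{
	*flags |= INODE_NOATIME;
}

static inline void clear_nocompress(uint32_t *flags)	/* was btrfs_clear_flag */
{
	*flags &= ~INODE_NOCOMPRESS;
}

static inline int noatime_set(uint32_t flags)		/* was btrfs_test_flag */
{
	return !!(flags & INODE_NOATIME);
}
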
@@ -1317,24 +1394,67 @@ static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
1317 return (u8 *)((unsigned long)dev + ptr); 1394 return (u8 *)((unsigned long)dev + ptr);
1318} 1395}
1319 1396
1320/* struct btrfs_extent_ref */ 1397BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
1321BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64); 1398BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item,
1322BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64); 1399 generation, 64);
1323BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64); 1400BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
1324BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32); 1401
1402BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32);
1325 1403
1326BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
1327BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
1328 generation, 64);
1329BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
1330 objectid, 64);
1331BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
1332 num_refs, 32);
1333 1404
1334/* struct btrfs_extent_item */ 1405BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
1335BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); 1406
1336BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item, 1407static inline void btrfs_tree_block_key(struct extent_buffer *eb,
1337 refs, 32); 1408 struct btrfs_tree_block_info *item,
1409 struct btrfs_disk_key *key)
1410{
1411 read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
1412}
1413
1414static inline void btrfs_set_tree_block_key(struct extent_buffer *eb,
1415 struct btrfs_tree_block_info *item,
1416 struct btrfs_disk_key *key)
1417{
1418 write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
1419}
1420
1421BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref,
1422 root, 64);
1423BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref,
1424 objectid, 64);
1425BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref,
1426 offset, 64);
1427BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref,
1428 count, 32);
1429
1430BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref,
1431 count, 32);
1432
1433BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
1434 type, 8);
1435BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
1436 offset, 64);
1437
1438static inline u32 btrfs_extent_inline_ref_size(int type)
1439{
1440 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1441 type == BTRFS_SHARED_BLOCK_REF_KEY)
1442 return sizeof(struct btrfs_extent_inline_ref);
1443 if (type == BTRFS_SHARED_DATA_REF_KEY)
1444 return sizeof(struct btrfs_shared_data_ref) +
1445 sizeof(struct btrfs_extent_inline_ref);
1446 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1447 return sizeof(struct btrfs_extent_data_ref) +
1448 offsetof(struct btrfs_extent_inline_ref, offset);
1449 BUG();
1450 return 0;
1451}
1452
1453BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64);
1454BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0,
1455 generation, 64);
1456BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64);
1457BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32);
1338 1458
1339/* struct btrfs_node */ 1459/* struct btrfs_node */
1340BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); 1460BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
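
btrfs_extent_inline_ref_size() above encodes a subtle layout rule: for EXTENT_DATA_REF the btrfs_extent_data_ref body replaces the inline ref's offset field (hence offsetof rather than sizeof), while the two block-ref types fit entirely inside the 9-byte inline ref. A host-side restatement with the expected byte counts (local packed structs, illustrative only):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct extent_inline_ref { uint8_t type; uint64_t offset; }
	__attribute__((__packed__));
struct extent_data_ref { uint64_t root, objectid, offset; uint32_t count; }
	__attribute__((__packed__));
struct shared_data_ref { uint32_t count; } __attribute__((__packed__));

enum { TREE_BLOCK_REF, SHARED_BLOCK_REF, EXTENT_DATA_REF, SHARED_DATA_REF };

static size_t inline_ref_size(int type)
{
	switch (type) {
	case TREE_BLOCK_REF:
	case SHARED_BLOCK_REF:
		return sizeof(struct extent_inline_ref);	   /* 9  */
	case SHARED_DATA_REF:
		return sizeof(struct shared_data_ref) +
		       sizeof(struct extent_inline_ref);	   /* 13 */
	case EXTENT_DATA_REF:
		return sizeof(struct extent_data_ref) +
		       offsetof(struct extent_inline_ref, offset); /* 29 */
	}
	return 0;
}

int main(void)
{
	assert(inline_ref_size(TREE_BLOCK_REF) == 9);
	assert(inline_ref_size(SHARED_DATA_REF) == 13);
	assert(inline_ref_size(EXTENT_DATA_REF) == 29);
	return 0;
}
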
@@ -1558,6 +1678,21 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
1558 return (flags & flag) == flag; 1678 return (flags & flag) == flag;
1559} 1679}
1560 1680
1681static inline int btrfs_header_backref_rev(struct extent_buffer *eb)
1682{
1683 u64 flags = btrfs_header_flags(eb);
1684 return flags >> BTRFS_BACKREF_REV_SHIFT;
1685}
1686
1687static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
1688 int rev)
1689{
1690 u64 flags = btrfs_header_flags(eb);
1691 flags &= ~BTRFS_BACKREF_REV_MASK;
1692 flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
1693 btrfs_set_header_flags(eb, flags);
1694}
1695
1561static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) 1696static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
1562{ 1697{
1563 unsigned long ptr = offsetof(struct btrfs_header, fsid); 1698 unsigned long ptr = offsetof(struct btrfs_header, fsid);
@@ -1790,39 +1925,32 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
1790int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 1925int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1791 struct btrfs_root *root, struct extent_buffer *leaf); 1926 struct btrfs_root *root, struct extent_buffer *leaf);
1792int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 1927int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1793 struct btrfs_root *root, u64 objectid, u64 bytenr); 1928 struct btrfs_root *root,
1929 u64 objectid, u64 offset, u64 bytenr);
1794int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); 1930int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1795struct btrfs_block_group_cache *btrfs_lookup_block_group( 1931struct btrfs_block_group_cache *btrfs_lookup_block_group(
1796 struct btrfs_fs_info *info, 1932 struct btrfs_fs_info *info,
1797 u64 bytenr); 1933 u64 bytenr);
1934void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1798u64 btrfs_find_block_group(struct btrfs_root *root, 1935u64 btrfs_find_block_group(struct btrfs_root *root,
1799 u64 search_start, u64 search_hint, int owner); 1936 u64 search_start, u64 search_hint, int owner);
1800struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 1937struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1801 struct btrfs_root *root, 1938 struct btrfs_root *root, u32 blocksize,
1802 u32 blocksize, u64 parent, 1939 u64 parent, u64 root_objectid,
1803 u64 root_objectid, 1940 struct btrfs_disk_key *key, int level,
1804 u64 ref_generation, 1941 u64 hint, u64 empty_size);
1805 int level,
1806 u64 hint,
1807 u64 empty_size);
1808struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 1942struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1809 struct btrfs_root *root, 1943 struct btrfs_root *root,
1810 u64 bytenr, u32 blocksize, 1944 u64 bytenr, u32 blocksize,
1811 int level); 1945 int level);
1812int btrfs_alloc_extent(struct btrfs_trans_handle *trans, 1946int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
1813 struct btrfs_root *root, 1947 struct btrfs_root *root,
1814 u64 num_bytes, u64 parent, u64 min_bytes, 1948 u64 root_objectid, u64 owner,
1815 u64 root_objectid, u64 ref_generation, 1949 u64 offset, struct btrfs_key *ins);
1816 u64 owner, u64 empty_size, u64 hint_byte, 1950int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
1817 u64 search_end, struct btrfs_key *ins, u64 data); 1951 struct btrfs_root *root,
1818int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 1952 u64 root_objectid, u64 owner, u64 offset,
1819 struct btrfs_root *root, u64 parent, 1953 struct btrfs_key *ins);
1820 u64 root_objectid, u64 ref_generation,
1821 u64 owner, struct btrfs_key *ins);
1822int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
1823 struct btrfs_root *root, u64 parent,
1824 u64 root_objectid, u64 ref_generation,
1825 u64 owner, struct btrfs_key *ins);
1826int btrfs_reserve_extent(struct btrfs_trans_handle *trans, 1954int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
1827 struct btrfs_root *root, 1955 struct btrfs_root *root,
1828 u64 num_bytes, u64 min_alloc_size, 1956 u64 num_bytes, u64 min_alloc_size,
@@ -1830,18 +1958,18 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
1830 u64 search_end, struct btrfs_key *ins, 1958 u64 search_end, struct btrfs_key *ins,
1831 u64 data); 1959 u64 data);
1832int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1960int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1833 struct extent_buffer *orig_buf, struct extent_buffer *buf, 1961 struct extent_buffer *buf, int full_backref);
1834 u32 *nr_extents); 1962int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1835int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1963 struct extent_buffer *buf, int full_backref);
1836 struct extent_buffer *buf, u32 nr_extents); 1964int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
1837int btrfs_update_ref(struct btrfs_trans_handle *trans, 1965 struct btrfs_root *root,
1838 struct btrfs_root *root, struct extent_buffer *orig_buf, 1966 u64 bytenr, u64 num_bytes, u64 flags,
1839 struct extent_buffer *buf, int start_slot, int nr); 1967 int is_data);
1840int btrfs_free_extent(struct btrfs_trans_handle *trans, 1968int btrfs_free_extent(struct btrfs_trans_handle *trans,
1841 struct btrfs_root *root, 1969 struct btrfs_root *root,
1842 u64 bytenr, u64 num_bytes, u64 parent, 1970 u64 bytenr, u64 num_bytes, u64 parent,
1843 u64 root_objectid, u64 ref_generation, 1971 u64 root_objectid, u64 owner, u64 offset);
1844 u64 owner_objectid, int pin); 1972
1845int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 1973int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
1846int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 1974int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1847 struct btrfs_root *root, 1975 struct btrfs_root *root,
@@ -1849,13 +1977,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1849int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1977int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1850 struct btrfs_root *root, 1978 struct btrfs_root *root,
1851 u64 bytenr, u64 num_bytes, u64 parent, 1979 u64 bytenr, u64 num_bytes, u64 parent,
1852 u64 root_objectid, u64 ref_generation, 1980 u64 root_objectid, u64 owner, u64 offset);
1853 u64 owner_objectid); 1981
1854int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1855 struct btrfs_root *root, u64 bytenr, u64 num_bytes,
1856 u64 orig_parent, u64 parent,
1857 u64 root_objectid, u64 ref_generation,
1858 u64 owner_objectid);
1859int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 1982int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1860 struct btrfs_root *root); 1983 struct btrfs_root *root);
1861int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); 1984int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
@@ -1867,16 +1990,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
1867 u64 size); 1990 u64 size);
1868int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 1991int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1869 struct btrfs_root *root, u64 group_start); 1992 struct btrfs_root *root, u64 group_start);
1870int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); 1993int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
1871int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, 1994 struct btrfs_block_group_cache *group);
1872 struct btrfs_root *root); 1995
1873int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
1874int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
1875 struct btrfs_root *root,
1876 struct extent_buffer *buf, u64 orig_start);
1877int btrfs_add_dead_reloc_root(struct btrfs_root *root);
1878int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
1879int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
1880u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 1996u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
1881void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 1997void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
1882void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 1998void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
@@ -1890,14 +2006,14 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
1890 u64 bytes); 2006 u64 bytes);
1891void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 2007void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
1892 u64 bytes); 2008 u64 bytes);
2009void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
1893/* ctree.c */ 2010/* ctree.c */
2011int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2012 int level, int *slot);
2013int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
1894int btrfs_previous_item(struct btrfs_root *root, 2014int btrfs_previous_item(struct btrfs_root *root,
1895 struct btrfs_path *path, u64 min_objectid, 2015 struct btrfs_path *path, u64 min_objectid,
1896 int type); 2016 int type);
1897int btrfs_merge_path(struct btrfs_trans_handle *trans,
1898 struct btrfs_root *root,
1899 struct btrfs_key *node_keys,
1900 u64 *nodes, int lowest_level);
1901int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, 2017int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1902 struct btrfs_root *root, struct btrfs_path *path, 2018 struct btrfs_root *root, struct btrfs_path *path,
1903 struct btrfs_key *new_key); 2019 struct btrfs_key *new_key);
@@ -1918,6 +2034,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
1918 struct btrfs_root *root, 2034 struct btrfs_root *root,
1919 struct extent_buffer *buf, 2035 struct extent_buffer *buf,
1920 struct extent_buffer **cow_ret, u64 new_root_objectid); 2036 struct extent_buffer **cow_ret, u64 new_root_objectid);
2037int btrfs_block_can_be_shared(struct btrfs_root *root,
2038 struct extent_buffer *buf);
1921int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root 2039int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
1922 *root, struct btrfs_path *path, u32 data_size); 2040 *root, struct btrfs_path *path, u32 data_size);
1923int btrfs_truncate_item(struct btrfs_trans_handle *trans, 2041int btrfs_truncate_item(struct btrfs_trans_handle *trans,
@@ -1944,9 +2062,6 @@ void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
1944 2062
1945int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2063int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1946 struct btrfs_path *path, int slot, int nr); 2064 struct btrfs_path *path, int slot, int nr);
1947int btrfs_del_leaf(struct btrfs_trans_handle *trans,
1948 struct btrfs_root *root,
1949 struct btrfs_path *path, u64 bytenr);
1950static inline int btrfs_del_item(struct btrfs_trans_handle *trans, 2065static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
1951 struct btrfs_root *root, 2066 struct btrfs_root *root,
1952 struct btrfs_path *path) 2067 struct btrfs_path *path)
@@ -1978,8 +2093,7 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
1978int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2093int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
1979int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2094int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
1980int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2095int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
1981int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root 2096int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref);
1982 *root);
1983int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2097int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
1984 struct btrfs_root *root, 2098 struct btrfs_root *root,
1985 struct extent_buffer *node, 2099 struct extent_buffer *node,
@@ -2005,8 +2119,9 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
2005 btrfs_root_item *item, struct btrfs_key *key); 2119 btrfs_root_item *item, struct btrfs_key *key);
2006int btrfs_search_root(struct btrfs_root *root, u64 search_start, 2120int btrfs_search_root(struct btrfs_root *root, u64 search_start,
2007 u64 *found_objectid); 2121 u64 *found_objectid);
2008int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, 2122int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
2009 struct btrfs_root *latest_root); 2123int btrfs_set_root_node(struct btrfs_root_item *item,
2124 struct extent_buffer *node);
2010/* dir-item.c */ 2125/* dir-item.c */
2011int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 2126int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
2012 struct btrfs_root *root, const char *name, 2127 struct btrfs_root *root, const char *name,
@@ -2139,7 +2254,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2139int btrfs_readpage(struct file *file, struct page *page); 2254int btrfs_readpage(struct file *file, struct page *page);
2140void btrfs_delete_inode(struct inode *inode); 2255void btrfs_delete_inode(struct inode *inode);
2141void btrfs_put_inode(struct inode *inode); 2256void btrfs_put_inode(struct inode *inode);
2142void btrfs_read_locked_inode(struct inode *inode);
2143int btrfs_write_inode(struct inode *inode, int wait); 2257int btrfs_write_inode(struct inode *inode, int wait);
2144void btrfs_dirty_inode(struct inode *inode); 2258void btrfs_dirty_inode(struct inode *inode);
2145struct inode *btrfs_alloc_inode(struct super_block *sb); 2259struct inode *btrfs_alloc_inode(struct super_block *sb);
@@ -2147,12 +2261,8 @@ void btrfs_destroy_inode(struct inode *inode);
2147int btrfs_init_cachep(void); 2261int btrfs_init_cachep(void);
2148void btrfs_destroy_cachep(void); 2262void btrfs_destroy_cachep(void);
2149long btrfs_ioctl_trans_end(struct file *file); 2263long btrfs_ioctl_trans_end(struct file *file);
2150struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
2151 struct btrfs_root *root, int wait);
2152struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
2153 struct btrfs_root *root);
2154struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 2264struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
2155 struct btrfs_root *root, int *is_new); 2265 struct btrfs_root *root);
2156int btrfs_commit_write(struct file *file, struct page *page, 2266int btrfs_commit_write(struct file *file, struct page *page,
2157 unsigned from, unsigned to); 2267 unsigned from, unsigned to);
2158struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 2268struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2168,6 +2278,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size);
2168 2278
2169/* ioctl.c */ 2279/* ioctl.c */
2170long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 2280long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
2281void btrfs_update_iflags(struct inode *inode);
2282void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
2171 2283
2172/* file.c */ 2284/* file.c */
2173int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); 2285int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
@@ -2205,8 +2317,20 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
2205int btrfs_sync_fs(struct super_block *sb, int wait); 2317int btrfs_sync_fs(struct super_block *sb, int wait);
2206 2318
2207/* acl.c */ 2319/* acl.c */
2320#ifdef CONFIG_FS_POSIX_ACL
2208int btrfs_check_acl(struct inode *inode, int mask); 2321int btrfs_check_acl(struct inode *inode, int mask);
2322#else
2323#define btrfs_check_acl NULL
2324#endif
2209int btrfs_init_acl(struct inode *inode, struct inode *dir); 2325int btrfs_init_acl(struct inode *inode, struct inode *dir);
2210int btrfs_acl_chmod(struct inode *inode); 2326int btrfs_acl_chmod(struct inode *inode);
2211 2327
2328/* relocation.c */
2329int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
2330int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
2331 struct btrfs_root *root);
2332int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
2333 struct btrfs_root *root);
2334int btrfs_recover_relocation(struct btrfs_root *root);
2335int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
2212#endif 2336#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index d6c01c096a40..84e6781413b1 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -29,27 +29,87 @@
29 * add extents in the middle of btrfs_search_slot, and it allows 29 * add extents in the middle of btrfs_search_slot, and it allows
30 * us to buffer up frequently modified backrefs in an rb tree instead 30 * us to buffer up frequently modified backrefs in an rb tree instead
31 * of hammering updates on the extent allocation tree. 31 * of hammering updates on the extent allocation tree.
32 *
33 * Right now this code is only used for reference counted trees, but
34 * the long term goal is to get rid of the similar code for delayed
35 * extent tree modifications.
36 */ 32 */
37 33
38/* 34/*
39 * entries in the rb tree are ordered by the byte number of the extent 35 * compare two delayed tree backrefs with same bytenr and type
40 * and by the byte number of the parent block. 36 */
37static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
38 struct btrfs_delayed_tree_ref *ref1)
39{
40 if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
41 if (ref1->root < ref2->root)
42 return -1;
43 if (ref1->root > ref2->root)
44 return 1;
45 } else {
46 if (ref1->parent < ref2->parent)
47 return -1;
48 if (ref1->parent > ref2->parent)
49 return 1;
50 }
51 return 0;
52}
53
54/*
55 * compare two delayed data backrefs with same bytenr and type
41 */ 56 */
42static int comp_entry(struct btrfs_delayed_ref_node *ref, 57static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
43 u64 bytenr, u64 parent) 58 struct btrfs_delayed_data_ref *ref1)
44{ 59{
45 if (bytenr < ref->bytenr) 60 if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
61 if (ref1->root < ref2->root)
62 return -1;
63 if (ref1->root > ref2->root)
64 return 1;
65 if (ref1->objectid < ref2->objectid)
66 return -1;
67 if (ref1->objectid > ref2->objectid)
68 return 1;
69 if (ref1->offset < ref2->offset)
70 return -1;
71 if (ref1->offset > ref2->offset)
72 return 1;
73 } else {
74 if (ref1->parent < ref2->parent)
75 return -1;
76 if (ref1->parent > ref2->parent)
77 return 1;
78 }
79 return 0;
80}
81
82/*
83 * entries in the rb tree are ordered by the byte number of the extent,
84 * the type of the delayed backref and the content of the backref.
85 */
86static int comp_entry(struct btrfs_delayed_ref_node *ref2,
87 struct btrfs_delayed_ref_node *ref1)
88{
89 if (ref1->bytenr < ref2->bytenr)
46 return -1; 90 return -1;
47 if (bytenr > ref->bytenr) 91 if (ref1->bytenr > ref2->bytenr)
48 return 1; 92 return 1;
49 if (parent < ref->parent) 93 if (ref1->is_head && ref2->is_head)
94 return 0;
95 if (ref2->is_head)
50 return -1; 96 return -1;
51 if (parent > ref->parent) 97 if (ref1->is_head)
52 return 1; 98 return 1;
99 if (ref1->type < ref2->type)
100 return -1;
101 if (ref1->type > ref2->type)
102 return 1;
103 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
104 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
105 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
106 btrfs_delayed_node_to_tree_ref(ref1));
107 } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
108 ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
109 return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
110 btrfs_delayed_node_to_data_ref(ref1));
111 }
112 BUG();
53 return 0; 113 return 0;
54} 114}
55 115
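
The rewritten comp_entry() yields a total order in which, for a given bytenr, the head node sorts after every plain ref node; that is what lets find_ref_head() further down walk right past same-bytenr refs. A simplified, self-contained demonstration (toy struct; 176 and 178 are the TREE_BLOCK_REF/EXTENT_DATA_REF key values from ctree.h earlier in this diff):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

struct toy_ref { uint64_t bytenr; int is_head; int type; };

/* Same precedence as comp_entry(): bytenr, then head-ness (head sorts
 * last), then type; per-type content comparison is omitted here. */
static int toy_cmp(const void *a, const void *b)
{
	const struct toy_ref *r1 = a, *r2 = b;

	if (r1->bytenr != r2->bytenr)
		return r1->bytenr < r2->bytenr ? -1 : 1;
	if (r1->is_head != r2->is_head)
		return r1->is_head ? 1 : -1;
	return r1->type - r2->type;
}

int main(void)
{
	struct toy_ref refs[] = {
		{ 4096, 1, 0 },		/* head */
		{ 4096, 0, 178 },	/* extent data ref */
		{ 4096, 0, 176 },	/* tree block ref */
	};

	qsort(refs, 3, sizeof(refs[0]), toy_cmp);
	assert(refs[2].is_head);	/* head lands after its refs */
	return 0;
}
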
@@ -59,20 +119,21 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref,
  * inserted.
  */
 static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
-                          u64 bytenr, u64 parent,
                           struct rb_node *node)
 {
     struct rb_node **p = &root->rb_node;
     struct rb_node *parent_node = NULL;
     struct btrfs_delayed_ref_node *entry;
+    struct btrfs_delayed_ref_node *ins;
     int cmp;
 
+    ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
     while (*p) {
         parent_node = *p;
         entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
                  rb_node);
 
-        cmp = comp_entry(entry, bytenr, parent);
+        cmp = comp_entry(entry, ins);
         if (cmp < 0)
             p = &(*p)->rb_left;
         else if (cmp > 0)
@@ -81,18 +142,17 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
             return entry;
     }
 
-    entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
     rb_link_node(node, parent_node, p);
     rb_insert_color(node, root);
     return NULL;
 }
 
 /*
- * find an entry based on (bytenr,parent). This returns the delayed
- * ref if it was able to find one, or NULL if nothing was in that spot
+ * find a head entry based on bytenr. This returns the delayed ref
+ * head if it was able to find one, or NULL if nothing was in that spot
  */
-static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
-                  u64 bytenr, u64 parent,
+static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
+                  u64 bytenr,
                   struct btrfs_delayed_ref_node **last)
 {
     struct rb_node *n = root->rb_node;
@@ -105,7 +165,15 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
         if (last)
             *last = entry;
 
-        cmp = comp_entry(entry, bytenr, parent);
+        if (bytenr < entry->bytenr)
+            cmp = -1;
+        else if (bytenr > entry->bytenr)
+            cmp = 1;
+        else if (!btrfs_delayed_ref_is_head(entry))
+            cmp = 1;
+        else
+            cmp = 0;
+
         if (cmp < 0)
             n = n->rb_left;
         else if (cmp > 0)
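The search above is deliberately biased: when the bytenr matches but the entry is not a head, the comparison is forced to 1 so the walk keeps descending right until it reaches the head, which the new ordering places last among entries sharing a bytenr. The *last out-parameter records the final node visited, so callers such as btrfs_find_ref_cluster get a usable cursor even on a miss. A flattened, hypothetical model of the same idea over a sorted array:

    #include <stdio.h>

    /* model: entries sorted by key, with the "head" last among equals */
    struct ent { int key; int is_head; };

    static const struct ent *find_head(const struct ent *a, int n, int key,
                                       const struct ent **last)
    {
        int lo = 0, hi = n - 1;

        while (lo <= hi) {
            int mid = (lo + hi) / 2;
            int cmp;

            if (last)
                *last = &a[mid];
            if (key < a[mid].key)
                cmp = -1;
            else if (key > a[mid].key)
                cmp = 1;
            else if (!a[mid].is_head)
                cmp = 1;   /* same key, not the head: keep searching right */
            else
                cmp = 0;

            if (cmp < 0)
                hi = mid - 1;
            else if (cmp > 0)
                lo = mid + 1;
            else
                return &a[mid];
        }
        return NULL;   /* *last is still a useful cursor on a miss */
    }

    int main(void)
    {
        struct ent a[] = { {1, 0}, {1, 0}, {1, 1}, {2, 0}, {2, 1} };
        const struct ent *last = NULL;
        const struct ent *h = find_head(a, 5, 2, &last);

        printf("found head: %d\n", h != NULL && h->is_head);
        return 0;
    }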
@@ -154,7 +222,7 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
         node = rb_first(&delayed_refs->root);
     } else {
         ref = NULL;
-        tree_search(&delayed_refs->root, start, (u64)-1, &ref);
+        find_ref_head(&delayed_refs->root, start, &ref);
         if (ref) {
             struct btrfs_delayed_ref_node *tmp;
 
@@ -234,7 +302,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
     delayed_refs = &trans->transaction->delayed_refs;
     spin_lock(&delayed_refs->lock);
 
-    ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+    ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
     if (ref) {
         prev_node = rb_prev(&ref->rb_node);
         if (!prev_node)
@@ -250,25 +318,28 @@ out:
 }
 
 /*
- * helper function to lookup reference count
+ * helper function to lookup reference count and flags of extent.
  *
  * the head node for delayed ref is used to store the sum of all the
- * reference count modifications queued up in the rbtree. This way you
- * can check to see what the reference count would be if all of the
- * delayed refs are processed.
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags would be once all of
+ * the delayed refs are processed.
  */
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                 struct btrfs_root *root, u64 bytenr,
-                u64 num_bytes, u32 *refs)
+                u64 num_bytes, u64 *refs, u64 *flags)
 {
     struct btrfs_delayed_ref_node *ref;
     struct btrfs_delayed_ref_head *head;
     struct btrfs_delayed_ref_root *delayed_refs;
     struct btrfs_path *path;
-    struct extent_buffer *leaf;
     struct btrfs_extent_item *ei;
+    struct extent_buffer *leaf;
     struct btrfs_key key;
-    u32 num_refs;
+    u32 item_size;
+    u64 num_refs;
+    u64 extent_flags;
     int ret;
 
     path = btrfs_alloc_path();
@@ -287,37 +358,60 @@ again:
 
     if (ret == 0) {
         leaf = path->nodes[0];
-        ei = btrfs_item_ptr(leaf, path->slots[0],
-                    struct btrfs_extent_item);
-        num_refs = btrfs_extent_refs(leaf, ei);
+        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+        if (item_size >= sizeof(*ei)) {
+            ei = btrfs_item_ptr(leaf, path->slots[0],
+                        struct btrfs_extent_item);
+            num_refs = btrfs_extent_refs(leaf, ei);
+            extent_flags = btrfs_extent_flags(leaf, ei);
+        } else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+            struct btrfs_extent_item_v0 *ei0;
+            BUG_ON(item_size != sizeof(*ei0));
+            ei0 = btrfs_item_ptr(leaf, path->slots[0],
+                         struct btrfs_extent_item_v0);
+            num_refs = btrfs_extent_refs_v0(leaf, ei0);
+            /* FIXME: this isn't correct for data */
+            extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+            BUG();
+#endif
+        }
+        BUG_ON(num_refs == 0);
     } else {
         num_refs = 0;
+        extent_flags = 0;
         ret = 0;
     }
 
     spin_lock(&delayed_refs->lock);
-    ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+    ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
     if (ref) {
         head = btrfs_delayed_node_to_head(ref);
-        if (mutex_trylock(&head->mutex)) {
-            num_refs += ref->ref_mod;
-            mutex_unlock(&head->mutex);
-            *refs = num_refs;
-            goto out;
-        }
+        if (!mutex_trylock(&head->mutex)) {
+            atomic_inc(&ref->refs);
+            spin_unlock(&delayed_refs->lock);
 
-        atomic_inc(&ref->refs);
-        spin_unlock(&delayed_refs->lock);
+            btrfs_release_path(root->fs_info->extent_root, path);
 
-        btrfs_release_path(root->fs_info->extent_root, path);
+            mutex_lock(&head->mutex);
+            mutex_unlock(&head->mutex);
+            btrfs_put_delayed_ref(ref);
+            goto again;
+        }
+        if (head->extent_op && head->extent_op->update_flags)
+            extent_flags |= head->extent_op->flags_to_set;
+        else
+            BUG_ON(num_refs == 0);
 
-        mutex_lock(&head->mutex);
+        num_refs += ref->ref_mod;
         mutex_unlock(&head->mutex);
-        btrfs_put_delayed_ref(ref);
-        goto again;
-    } else {
-        *refs = num_refs;
     }
+    WARN_ON(num_refs == 0);
+    if (refs)
+        *refs = num_refs;
+    if (flags)
+        *flags = extent_flags;
 out:
     spin_unlock(&delayed_refs->lock);
     btrfs_free_path(path);
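The rewritten lookup inverts the old trylock logic: if the head's mutex cannot be taken while the delayed-refs spinlock is held, the code pins the head with a reference count, drops the spinlock, blocks on the mutex purely to wait out whoever is processing the head, and then retries from scratch. A userspace sketch of that trylock-or-retry idiom (simplified: the reference-count pinning is omitted, and a plain mutex stands in for the kernel spinlock):

    #include <pthread.h>
    #include <stdio.h>

    struct obj {
        pthread_mutex_t mutex;  /* held while the object is being processed */
        int value;
    };

    static pthread_mutex_t index_lock = PTHREAD_MUTEX_INITIALIZER;

    static int read_value(struct obj *o)
    {
        int v;

    again:
        pthread_mutex_lock(&index_lock);
        if (pthread_mutex_trylock(&o->mutex) != 0) {
            /* someone is processing it: drop the index lock ... */
            pthread_mutex_unlock(&index_lock);
            /* ... wait for them to finish ... */
            pthread_mutex_lock(&o->mutex);
            pthread_mutex_unlock(&o->mutex);
            /* ... and retry from the top */
            goto again;
        }
        v = o->value;
        pthread_mutex_unlock(&o->mutex);
        pthread_mutex_unlock(&index_lock);
        return v;
    }

    int main(void)
    {
        struct obj o = { PTHREAD_MUTEX_INITIALIZER, 42 };
        printf("%d\n", read_value(&o));
        return 0;
    }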
@@ -338,16 +432,7 @@ update_existing_ref(struct btrfs_trans_handle *trans,
                 struct btrfs_delayed_ref_node *existing,
                 struct btrfs_delayed_ref_node *update)
 {
-    struct btrfs_delayed_ref *existing_ref;
-    struct btrfs_delayed_ref *ref;
-
-    existing_ref = btrfs_delayed_node_to_ref(existing);
-    ref = btrfs_delayed_node_to_ref(update);
-
-    if (ref->pin)
-        existing_ref->pin = 1;
-
-    if (ref->action != existing_ref->action) {
+    if (update->action != existing->action) {
         /*
          * this is effectively undoing either an add or a
          * drop. We decrement the ref_mod, and if it goes
@@ -363,20 +448,13 @@ update_existing_ref(struct btrfs_trans_handle *trans,
             delayed_refs->num_entries--;
             if (trans->delayed_ref_updates)
                 trans->delayed_ref_updates--;
+        } else {
+            WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
+                existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
         }
     } else {
-        if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
-            /* if we're adding refs, make sure all the
-             * details match up. The extent could
-             * have been totally freed and reallocated
-             * by a different owner before the delayed
-             * ref entries were removed.
-             */
-            existing_ref->owner_objectid = ref->owner_objectid;
-            existing_ref->generation = ref->generation;
-            existing_ref->root = ref->root;
-            existing->num_bytes = update->num_bytes;
-        }
+        WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
+            existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
         /*
          * the action on the existing ref matches
          * the action on the ref we're trying to add.
@@ -401,6 +479,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 
     existing_ref = btrfs_delayed_node_to_head(existing);
     ref = btrfs_delayed_node_to_head(update);
+    BUG_ON(existing_ref->is_data != ref->is_data);
 
     if (ref->must_insert_reserved) {
         /* if the extent was freed and then
@@ -420,6 +499,24 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 
     }
 
+    if (ref->extent_op) {
+        if (!existing_ref->extent_op) {
+            existing_ref->extent_op = ref->extent_op;
+        } else {
+            if (ref->extent_op->update_key) {
+                memcpy(&existing_ref->extent_op->key,
+                       &ref->extent_op->key,
+                       sizeof(ref->extent_op->key));
+                existing_ref->extent_op->update_key = 1;
+            }
+            if (ref->extent_op->update_flags) {
+                existing_ref->extent_op->flags_to_set |=
+                    ref->extent_op->flags_to_set;
+                existing_ref->extent_op->update_flags = 1;
+            }
+            kfree(ref->extent_op);
+        }
+    }
     /*
      * update the reference mod on the head to reflect this new operation
      */
@@ -427,19 +524,16 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 }
 
 /*
- * helper function to actually insert a delayed ref into the rbtree.
+ * helper function to actually insert a head node into the rbtree.
  * this does all the dirty work in terms of maintaining the correct
- * overall modification count in the head node and properly dealing
- * with updating existing nodes as new modifications are queued.
+ * overall modification count.
  */
-static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
             struct btrfs_delayed_ref_node *ref,
-            u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
-            u64 ref_generation, u64 owner_objectid, int action,
-            int pin)
+            u64 bytenr, u64 num_bytes,
+            int action, int is_data)
 {
     struct btrfs_delayed_ref_node *existing;
-    struct btrfs_delayed_ref *full_ref;
     struct btrfs_delayed_ref_head *head_ref = NULL;
     struct btrfs_delayed_ref_root *delayed_refs;
     int count_mod = 1;
@@ -449,12 +543,10 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
      * the head node stores the sum of all the mods, so dropping a ref
      * should drop the sum in the head node by one.
      */
-    if (parent == (u64)-1) {
-        if (action == BTRFS_DROP_DELAYED_REF)
-            count_mod = -1;
-        else if (action == BTRFS_UPDATE_DELAYED_HEAD)
-            count_mod = 0;
-    }
+    if (action == BTRFS_UPDATE_DELAYED_HEAD)
+        count_mod = 0;
+    else if (action == BTRFS_DROP_DELAYED_REF)
+        count_mod = -1;
 
     /*
      * BTRFS_ADD_DELAYED_EXTENT means that we need to update
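With the old parent == (u64)-1 sentinel gone, the action alone now selects the head's contribution to the running ref_mod sum. A small worked example of that accounting, using local stand-in enum values rather than the kernel constants:

    #include <stdio.h>

    enum { ADD_DELAYED_REF, DROP_DELAYED_REF,
           ADD_DELAYED_EXTENT, UPDATE_DELAYED_HEAD };

    /* contribution of one queued operation to the head's ref_mod sum */
    static int count_mod(int action)
    {
        if (action == UPDATE_DELAYED_HEAD)
            return 0;   /* flag/key updates don't change the count */
        if (action == DROP_DELAYED_REF)
            return -1;
        return 1;       /* ADD_DELAYED_REF and ADD_DELAYED_EXTENT */
    }

    int main(void)
    {
        int actions[] = { ADD_DELAYED_EXTENT, ADD_DELAYED_REF,
                          UPDATE_DELAYED_HEAD, DROP_DELAYED_REF };
        int i, sum = 0;

        for (i = 0; i < 4; i++)
            sum += count_mod(actions[i]);
        printf("pending ref_mod for this extent: %d\n", sum);  /* 1 */
        return 0;
    }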
@@ -467,57 +559,148 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
      * Once we record must_insert_reserved, switch the action to
      * BTRFS_ADD_DELAYED_REF because other special casing is not required.
      */
-    if (action == BTRFS_ADD_DELAYED_EXTENT) {
+    if (action == BTRFS_ADD_DELAYED_EXTENT)
         must_insert_reserved = 1;
-        action = BTRFS_ADD_DELAYED_REF;
-    } else {
+    else
         must_insert_reserved = 0;
-    }
-
 
     delayed_refs = &trans->transaction->delayed_refs;
 
     /* first set the basic ref node struct up */
     atomic_set(&ref->refs, 1);
     ref->bytenr = bytenr;
-    ref->parent = parent;
+    ref->num_bytes = num_bytes;
     ref->ref_mod = count_mod;
+    ref->type = 0;
+    ref->action = 0;
+    ref->is_head = 1;
     ref->in_tree = 1;
+
+    head_ref = btrfs_delayed_node_to_head(ref);
+    head_ref->must_insert_reserved = must_insert_reserved;
+    head_ref->is_data = is_data;
+
+    INIT_LIST_HEAD(&head_ref->cluster);
+    mutex_init(&head_ref->mutex);
+
+    existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+
+    if (existing) {
+        update_existing_head_ref(existing, ref);
+        /*
+         * we've updated the existing ref, free the newly
+         * allocated ref
+         */
+        kfree(ref);
+    } else {
+        delayed_refs->num_heads++;
+        delayed_refs->num_heads_ready++;
+        delayed_refs->num_entries++;
+        trans->delayed_ref_updates++;
+    }
+    return 0;
+}
+
+/*
+ * helper to insert a delayed tree ref into the rbtree.
+ */
+static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+             struct btrfs_delayed_ref_node *ref,
+             u64 bytenr, u64 num_bytes, u64 parent,
+             u64 ref_root, int level, int action)
+{
+    struct btrfs_delayed_ref_node *existing;
+    struct btrfs_delayed_tree_ref *full_ref;
+    struct btrfs_delayed_ref_root *delayed_refs;
+
+    if (action == BTRFS_ADD_DELAYED_EXTENT)
+        action = BTRFS_ADD_DELAYED_REF;
+
+    delayed_refs = &trans->transaction->delayed_refs;
+
+    /* first set the basic ref node struct up */
+    atomic_set(&ref->refs, 1);
+    ref->bytenr = bytenr;
     ref->num_bytes = num_bytes;
+    ref->ref_mod = 1;
+    ref->action = action;
+    ref->is_head = 0;
+    ref->in_tree = 1;
 
-    if (btrfs_delayed_ref_is_head(ref)) {
-        head_ref = btrfs_delayed_node_to_head(ref);
-        head_ref->must_insert_reserved = must_insert_reserved;
-        INIT_LIST_HEAD(&head_ref->cluster);
-        mutex_init(&head_ref->mutex);
+    full_ref = btrfs_delayed_node_to_tree_ref(ref);
+    if (parent) {
+        full_ref->parent = parent;
+        ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
     } else {
-        full_ref = btrfs_delayed_node_to_ref(ref);
         full_ref->root = ref_root;
-        full_ref->generation = ref_generation;
-        full_ref->owner_objectid = owner_objectid;
-        full_ref->pin = pin;
-        full_ref->action = action;
+        ref->type = BTRFS_TREE_BLOCK_REF_KEY;
     }
+    full_ref->level = level;
 
-    existing = tree_insert(&delayed_refs->root, bytenr,
-                   parent, &ref->rb_node);
+    existing = tree_insert(&delayed_refs->root, &ref->rb_node);
 
     if (existing) {
-        if (btrfs_delayed_ref_is_head(ref))
-            update_existing_head_ref(existing, ref);
-        else
-            update_existing_ref(trans, delayed_refs, existing, ref);
+        update_existing_ref(trans, delayed_refs, existing, ref);
+        /*
+         * we've updated the existing ref, free the newly
+         * allocated ref
+         */
+        kfree(ref);
+    } else {
+        delayed_refs->num_entries++;
+        trans->delayed_ref_updates++;
+    }
+    return 0;
+}
+
+/*
+ * helper to insert a delayed data ref into the rbtree.
+ */
+static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
+             struct btrfs_delayed_ref_node *ref,
+             u64 bytenr, u64 num_bytes, u64 parent,
+             u64 ref_root, u64 owner, u64 offset,
+             int action)
+{
+    struct btrfs_delayed_ref_node *existing;
+    struct btrfs_delayed_data_ref *full_ref;
+    struct btrfs_delayed_ref_root *delayed_refs;
+
+    if (action == BTRFS_ADD_DELAYED_EXTENT)
+        action = BTRFS_ADD_DELAYED_REF;
+
+    delayed_refs = &trans->transaction->delayed_refs;
+
+    /* first set the basic ref node struct up */
+    atomic_set(&ref->refs, 1);
+    ref->bytenr = bytenr;
+    ref->num_bytes = num_bytes;
+    ref->ref_mod = 1;
+    ref->action = action;
+    ref->is_head = 0;
+    ref->in_tree = 1;
+
+    full_ref = btrfs_delayed_node_to_data_ref(ref);
+    if (parent) {
+        full_ref->parent = parent;
+        ref->type = BTRFS_SHARED_DATA_REF_KEY;
+    } else {
+        full_ref->root = ref_root;
+        ref->type = BTRFS_EXTENT_DATA_REF_KEY;
+    }
+    full_ref->objectid = owner;
+    full_ref->offset = offset;
 
+    existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+
+    if (existing) {
+        update_existing_ref(trans, delayed_refs, existing, ref);
         /*
          * we've updated the existing ref, free the newly
          * allocated ref
          */
         kfree(ref);
     } else {
-        if (btrfs_delayed_ref_is_head(ref)) {
-            delayed_refs->num_heads++;
-            delayed_refs->num_heads_ready++;
-        }
         delayed_refs->num_entries++;
         trans->delayed_ref_updates++;
     }
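Both insert helpers share one convention: a non-zero parent selects the "shared" backref flavor keyed by the parent block, otherwise the ref is keyed by the owning root. A minimal sketch of that type selection, with simplified local types and enum values standing in for the kernel's key constants:

    #include <stdint.h>
    #include <stdio.h>

    enum ref_type { TREE_BLOCK_REF, SHARED_BLOCK_REF };

    struct tree_ref {
        enum ref_type type;
        union {         /* only one of the two is meaningful, per type */
            uint64_t root;
            uint64_t parent;
        };
    };

    static void init_tree_ref(struct tree_ref *r, uint64_t parent, uint64_t root)
    {
        if (parent) {
            r->parent = parent;
            r->type = SHARED_BLOCK_REF;
        } else {
            r->root = root;
            r->type = TREE_BLOCK_REF;
        }
    }

    int main(void)
    {
        struct tree_ref a, b;

        init_tree_ref(&a, 8192, 0);  /* shared: keyed by parent block */
        init_tree_ref(&b, 0, 5);     /* keyed by owning root */
        printf("%d %d\n", a.type == SHARED_BLOCK_REF, b.type == TREE_BLOCK_REF);
        return 0;
    }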
@@ -525,37 +708,78 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 }
 
 /*
- * add a delayed ref to the tree. This does all of the accounting required
+ * add a delayed tree ref. This does all of the accounting required
  * to make sure the delayed ref is eventually processed before this
  * transaction commits.
  */
-int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
-        u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
-        u64 ref_generation, u64 owner_objectid, int action,
-        int pin)
+int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+               u64 bytenr, u64 num_bytes, u64 parent,
+               u64 ref_root, int level, int action,
+               struct btrfs_delayed_extent_op *extent_op)
 {
-    struct btrfs_delayed_ref *ref;
+    struct btrfs_delayed_tree_ref *ref;
     struct btrfs_delayed_ref_head *head_ref;
     struct btrfs_delayed_ref_root *delayed_refs;
     int ret;
 
+    BUG_ON(extent_op && extent_op->is_data);
     ref = kmalloc(sizeof(*ref), GFP_NOFS);
     if (!ref)
         return -ENOMEM;
 
+    head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+    if (!head_ref) {
+        kfree(ref);
+        return -ENOMEM;
+    }
+
+    head_ref->extent_op = extent_op;
+
+    delayed_refs = &trans->transaction->delayed_refs;
+    spin_lock(&delayed_refs->lock);
+
     /*
-     * the parent = 0 case comes from cases where we don't actually
-     * know the parent yet. It will get updated later via a add/drop
-     * pair.
+     * insert both the head node and the new ref without dropping
+     * the spin lock
      */
-    if (parent == 0)
-        parent = bytenr;
+    ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
+                   action, 0);
+    BUG_ON(ret);
+
+    ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
+                   parent, ref_root, level, action);
+    BUG_ON(ret);
+    spin_unlock(&delayed_refs->lock);
+    return 0;
+}
+
+/*
+ * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
+ */
+int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+               u64 bytenr, u64 num_bytes,
+               u64 parent, u64 ref_root,
+               u64 owner, u64 offset, int action,
+               struct btrfs_delayed_extent_op *extent_op)
+{
+    struct btrfs_delayed_data_ref *ref;
+    struct btrfs_delayed_ref_head *head_ref;
+    struct btrfs_delayed_ref_root *delayed_refs;
+    int ret;
+
+    BUG_ON(extent_op && !extent_op->is_data);
+    ref = kmalloc(sizeof(*ref), GFP_NOFS);
+    if (!ref)
+        return -ENOMEM;
 
     head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
     if (!head_ref) {
         kfree(ref);
         return -ENOMEM;
     }
+
+    head_ref->extent_op = extent_op;
+
     delayed_refs = &trans->transaction->delayed_refs;
     spin_lock(&delayed_refs->lock);
 
@@ -563,14 +787,39 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
      * insert both the head node and the new ref without dropping
      * the spin lock
      */
-    ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
-                      (u64)-1, 0, 0, 0, action, pin);
+    ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
+                   action, 1);
     BUG_ON(ret);
 
-    ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
-                      parent, ref_root, ref_generation,
-                      owner_objectid, action, pin);
+    ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
+                   parent, ref_root, owner, offset, action);
+    BUG_ON(ret);
+    spin_unlock(&delayed_refs->lock);
+    return 0;
+}
+
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+                u64 bytenr, u64 num_bytes,
+                struct btrfs_delayed_extent_op *extent_op)
+{
+    struct btrfs_delayed_ref_head *head_ref;
+    struct btrfs_delayed_ref_root *delayed_refs;
+    int ret;
+
+    head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+    if (!head_ref)
+        return -ENOMEM;
+
+    head_ref->extent_op = extent_op;
+
+    delayed_refs = &trans->transaction->delayed_refs;
+    spin_lock(&delayed_refs->lock);
+
+    ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
+                   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+                   extent_op->is_data);
     BUG_ON(ret);
 
     spin_unlock(&delayed_refs->lock);
     return 0;
 }
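As a usage illustration, a hypothetical caller fragment (not code from this patch; the surrounding declarations of trans, bytenr, num_bytes and ret are assumed) that queues a flags-only update through the new entry point:

    /* hypothetical fragment: queue a flags-only update for an extent */
    struct btrfs_delayed_extent_op *op;

    op = kmalloc(sizeof(*op), GFP_NOFS);
    if (!op)
        return -ENOMEM;
    op->update_flags = 1;           /* only the flags are touched */
    op->update_key = 0;
    op->is_data = 0;                /* this extent holds metadata */
    op->flags_to_set = BTRFS_BLOCK_FLAG_FULL_BACKREF;

    ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, op);

Ownership of op passes to the delayed-ref code on success; if a head already exists for the extent, update_existing_head_ref() merges the requested key and flag updates and frees the duplicate.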
@@ -587,7 +836,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
     struct btrfs_delayed_ref_root *delayed_refs;
 
     delayed_refs = &trans->transaction->delayed_refs;
-    ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+    ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
     if (ref)
         return btrfs_delayed_node_to_head(ref);
     return NULL;
@@ -603,6 +852,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
  *
  * It is the same as doing a ref add and delete in two separate calls.
  */
+#if 0
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
               u64 bytenr, u64 num_bytes, u64 orig_parent,
               u64 parent, u64 orig_ref_root, u64 ref_root,
@@ -666,3 +916,4 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
     spin_unlock(&delayed_refs->lock);
     return 0;
 }
+#endif
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 3bec2ff0b15c..f6fc67ddad36 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -30,9 +30,6 @@ struct btrfs_delayed_ref_node {
     /* the starting bytenr of the extent */
     u64 bytenr;
 
-    /* the parent our backref will point to */
-    u64 parent;
-
     /* the size of the extent */
     u64 num_bytes;
 
@@ -50,10 +47,21 @@ struct btrfs_delayed_ref_node {
      */
     int ref_mod;
 
+    unsigned int action:8;
+    unsigned int type:8;
     /* is this node still in the rbtree? */
+    unsigned int is_head:1;
     unsigned int in_tree:1;
 };
 
+struct btrfs_delayed_extent_op {
+    struct btrfs_disk_key key;
+    u64 flags_to_set;
+    unsigned int update_key:1;
+    unsigned int update_flags:1;
+    unsigned int is_data:1;
+};
+
 /*
  * the head refs are used to hold a lock on a given extent, which allows us
  * to make sure that only one process is running the delayed refs
@@ -71,6 +79,7 @@ struct btrfs_delayed_ref_head {
 
     struct list_head cluster;
 
+    struct btrfs_delayed_extent_op *extent_op;
     /*
      * when a new extent is allocated, it is just reserved in memory
      * The actual extent isn't inserted into the extent allocation tree
@@ -84,27 +93,26 @@ struct btrfs_delayed_ref_head {
      * the free has happened.
      */
     unsigned int must_insert_reserved:1;
+    unsigned int is_data:1;
 };
 
-struct btrfs_delayed_ref {
+struct btrfs_delayed_tree_ref {
     struct btrfs_delayed_ref_node node;
+    union {
+        u64 root;
+        u64 parent;
+    };
+    int level;
+};
 
-    /* the root objectid our ref will point to */
-    u64 root;
-
-    /* the generation for the backref */
-    u64 generation;
-
-    /* owner_objectid of the backref */
-    u64 owner_objectid;
-
-    /* operation done by this entry in the rbtree */
-    u8 action;
-
-    /* if pin == 1, when the extent is freed it will be pinned until
-     * transaction commit
-     */
-    unsigned int pin:1;
+struct btrfs_delayed_data_ref {
+    struct btrfs_delayed_ref_node node;
+    union {
+        u64 root;
+        u64 parent;
+    };
+    u64 objectid;
+    u64 offset;
 };
 
 struct btrfs_delayed_ref_root {
@@ -143,17 +151,25 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
     }
 }
 
-int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
-              u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
-              u64 ref_generation, u64 owner_objectid, int action,
-              int pin);
+int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+                   u64 bytenr, u64 num_bytes, u64 parent,
+                   u64 ref_root, int level, int action,
+                   struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+                   u64 bytenr, u64 num_bytes,
+                   u64 parent, u64 ref_root,
+                   u64 owner, u64 offset, int action,
+                   struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+                u64 bytenr, u64 num_bytes,
+                struct btrfs_delayed_extent_op *extent_op);
 
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                 struct btrfs_root *root, u64 bytenr,
-                u64 num_bytes, u32 *refs);
+                u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
                  u64 bytenr, u64 num_bytes, u64 orig_parent,
                  u64 parent, u64 orig_ref_root, u64 ref_root,
@@ -169,18 +185,24 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
  */
 static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
 {
-    return node->parent == (u64)-1;
+    return node->is_head;
 }
 
 /*
  * helper functions to cast a node into its container
  */
-static inline struct btrfs_delayed_ref *
-btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
+static inline struct btrfs_delayed_tree_ref *
+btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
 {
     WARN_ON(btrfs_delayed_ref_is_head(node));
-    return container_of(node, struct btrfs_delayed_ref, node);
+    return container_of(node, struct btrfs_delayed_tree_ref, node);
+}
 
+static inline struct btrfs_delayed_data_ref *
+btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
+{
+    WARN_ON(btrfs_delayed_ref_is_head(node));
+    return container_of(node, struct btrfs_delayed_data_ref, node);
 }
 
 static inline struct btrfs_delayed_ref_head *
@@ -188,6 +210,5 @@ btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
 {
     WARN_ON(!btrfs_delayed_ref_is_head(node));
     return container_of(node, struct btrfs_delayed_ref_head, node);
-
 }
 #endif
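All three cast helpers lean on the container_of() pattern: an embedded struct member's address is walked back to the enclosing structure. A self-contained userspace re-implementation of the same trick:

    #include <stddef.h>
    #include <stdio.h>

    /* userspace version of the pattern the btrfs_delayed_node_to_*()
     * helpers rely on */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct node { int dummy; };
    struct tree_ref { long level; struct node node; };

    int main(void)
    {
        struct tree_ref r = { 3, { 0 } };
        struct node *n = &r.node;        /* what the rb-tree hands back */
        struct tree_ref *back = container_of(n, struct tree_ref, node);

        printf("level=%ld\n", back->level);  /* 3 */
        return 0;
    }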
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4b0ea0b80c23..e83be2e4602c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -26,8 +26,8 @@
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/crc32c.h>
 #include "compat.h"
-#include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -36,13 +36,14 @@
 #include "print-tree.h"
 #include "async-thread.h"
 #include "locking.h"
-#include "ref-cache.h"
 #include "tree-log.h"
 #include "free-space-cache.h"
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 
+static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
+
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
  * complete. This is used during reads to verify checksums, and it is used
@@ -172,7 +173,7 @@ out:
 
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
 {
-    return btrfs_crc32c(seed, data, len);
+    return crc32c(seed, data, len);
 }
 
 void btrfs_csum_final(u32 crc, char *result)
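This hunk drops btrfs's private crc32c.h wrapper in favour of the kernel's shared <linux/crc32c.h> helper, so btrfs_csum_data() becomes a thin pass-through. A minimal sketch of the call shape (kernel context assumed; this is an illustration, not code from the patch):

    #include <linux/crc32c.h>

    /* checksum a metadata buffer, seeding with 0 or a running crc */
    static u32 csum_block(u32 seed, const char *data, size_t len)
    {
        return crc32c(seed, data, len);
    }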
@@ -884,7 +885,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 {
     root->node = NULL;
     root->commit_root = NULL;
-    root->ref_tree = NULL;
     root->sectorsize = sectorsize;
     root->nodesize = nodesize;
     root->leafsize = leafsize;
@@ -899,12 +899,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
     root->last_inode_alloc = 0;
     root->name = NULL;
     root->in_sysfs = 0;
+    root->inode_tree.rb_node = NULL;
 
     INIT_LIST_HEAD(&root->dirty_list);
     INIT_LIST_HEAD(&root->orphan_list);
-    INIT_LIST_HEAD(&root->dead_list);
+    INIT_LIST_HEAD(&root->root_list);
     spin_lock_init(&root->node_lock);
     spin_lock_init(&root->list_lock);
+    spin_lock_init(&root->inode_lock);
     mutex_init(&root->objectid_mutex);
     mutex_init(&root->log_mutex);
     init_waitqueue_head(&root->log_writer_wait);
@@ -918,9 +920,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
     extent_io_tree_init(&root->dirty_log_pages,
                 fs_info->btree_inode->i_mapping, GFP_NOFS);
 
-    btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
-    root->ref_tree = &root->ref_tree_struct;
-
     memset(&root->root_key, 0, sizeof(root->root_key));
     memset(&root->root_item, 0, sizeof(root->root_item));
     memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -959,6 +958,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
     blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
     root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                      blocksize, generation);
+    root->commit_root = btrfs_root_node(root);
     BUG_ON(!root->node);
     return 0;
 }
@@ -1025,20 +1025,19 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
      */
     root->ref_cows = 0;
 
-    leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
-                      0, BTRFS_TREE_LOG_OBJECTID,
-                      trans->transid, 0, 0, 0);
+    leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+                      BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
     if (IS_ERR(leaf)) {
         kfree(root);
         return ERR_CAST(leaf);
     }
 
+    memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+    btrfs_set_header_bytenr(leaf, leaf->start);
+    btrfs_set_header_generation(leaf, trans->transid);
+    btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+    btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
     root->node = leaf;
-    btrfs_set_header_nritems(root->node, 0);
-    btrfs_set_header_level(root->node, 0);
-    btrfs_set_header_bytenr(root->node, root->node->start);
-    btrfs_set_header_generation(root->node, trans->transid);
-    btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
 
     write_extent_buffer(root->node, root->fs_info->fsid,
                 (unsigned long)btrfs_header_fsid(root->node),
@@ -1081,8 +1080,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
     inode_item->nbytes = cpu_to_le64(root->leafsize);
     inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
 
-    btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
-    btrfs_set_root_generation(&log_root->root_item, trans->transid);
+    btrfs_set_root_node(&log_root->root_item, log_root->node);
 
     WARN_ON(root->log_root);
     root->log_root = log_root;
@@ -1144,6 +1142,7 @@ out:
     blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
     root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                      blocksize, generation);
+    root->commit_root = btrfs_root_node(root);
     BUG_ON(!root->node);
 insert:
     if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -1210,7 +1209,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
     }
     if (!(fs_info->sb->s_flags & MS_RDONLY)) {
         ret = btrfs_find_dead_roots(fs_info->tree_root,
-                        root->root_key.objectid, root);
+                        root->root_key.objectid);
         BUG_ON(ret);
         btrfs_orphan_cleanup(root);
     }
@@ -1345,12 +1344,25 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
     free_extent_map(em);
 }
 
+/*
+ * If this fails, caller must call bdi_destroy() to get rid of the
+ * bdi again.
+ */
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
-    bdi_init(bdi);
+    int err;
+
+    bdi->capabilities = BDI_CAP_MAP_COPY;
+    err = bdi_init(bdi);
+    if (err)
+        return err;
+
+    err = bdi_register(bdi, NULL, "btrfs-%d",
+               atomic_inc_return(&btrfs_bdi_num));
+    if (err)
+        return err;
+
     bdi->ra_pages = default_backing_dev_info.ra_pages;
-    bdi->state = 0;
-    bdi->capabilities = default_backing_dev_info.capabilities;
     bdi->unplug_io_fn = btrfs_unplug_io_fn;
     bdi->unplug_io_data = info;
     bdi->congested_fn = btrfs_congested_fn;
@@ -1569,12 +1581,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     atomic_set(&fs_info->async_delalloc_pages, 0);
     atomic_set(&fs_info->async_submit_draining, 0);
     atomic_set(&fs_info->nr_async_bios, 0);
-    atomic_set(&fs_info->throttles, 0);
-    atomic_set(&fs_info->throttle_gen, 0);
     fs_info->sb = sb;
     fs_info->max_extent = (u64)-1;
     fs_info->max_inline = 8192 * 1024;
-    setup_bdi(fs_info, &fs_info->bdi);
+    if (setup_bdi(fs_info, &fs_info->bdi))
+        goto fail_bdi;
     fs_info->btree_inode = new_inode(sb);
     fs_info->btree_inode->i_ino = 1;
     fs_info->btree_inode->i_nlink = 1;
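The setup_bdi() contract spelled out above ("if this fails, caller must call bdi_destroy()") is the standard goto-based unwind open_ctree() uses throughout: each successfully acquired resource gets a label, and later failures fall through the labels in reverse acquisition order. A self-contained sketch of the idiom:

    #include <stdio.h>

    static int acquire_a(void) { return 0; }
    static int acquire_b(void) { return -1; }  /* pretend this fails */
    static void release_a(void) { puts("release a"); }

    static int setup(void)
    {
        int err;

        err = acquire_a();
        if (err)
            goto fail;
        err = acquire_b();
        if (err)
            goto fail_a;    /* a was acquired; b was not */
        return 0;

    fail_a:
        release_a();
    fail:
        return err;
    }

    int main(void) { return setup() ? 1 : 0; }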
@@ -1598,6 +1609,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
     fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
 
+    RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
     extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
                 fs_info->btree_inode->i_mapping,
                 GFP_NOFS);
@@ -1613,10 +1625,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                 fs_info->btree_inode->i_mapping, GFP_NOFS);
     fs_info->do_barriers = 1;
 
-    INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
-    btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
-    btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
-
     BTRFS_I(fs_info->btree_inode)->root = tree_root;
     memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
            sizeof(struct btrfs_key));
@@ -1631,6 +1639,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     mutex_init(&fs_info->cleaner_mutex);
     mutex_init(&fs_info->volume_mutex);
     mutex_init(&fs_info->tree_reloc_mutex);
+    init_rwsem(&fs_info->extent_commit_sem);
 
     btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
     btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1674,6 +1683,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         goto fail_iput;
     }
 
+    features = btrfs_super_incompat_flags(disk_super);
+    if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
+        features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+        btrfs_set_super_incompat_flags(disk_super, features);
+    }
+
     features = btrfs_super_compat_ro_flags(disk_super) &
         ~BTRFS_FEATURE_COMPAT_RO_SUPP;
     if (!(sb->s_flags & MS_RDONLY) && features) {
@@ -1771,7 +1786,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     if (ret) {
         printk(KERN_WARNING "btrfs: failed to read the system "
                "array on %s\n", sb->s_id);
-        goto fail_sys_array;
+        goto fail_sb_buffer;
     }
 
     blocksize = btrfs_level_size(tree_root,
@@ -1785,6 +1800,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                        btrfs_super_chunk_root(disk_super),
                        blocksize, generation);
     BUG_ON(!chunk_root->node);
+    if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+        printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
+               sb->s_id);
+        goto fail_chunk_root;
+    }
+    btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
+    chunk_root->commit_root = btrfs_root_node(chunk_root);
 
     read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
        (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
@@ -1810,7 +1832,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                       blocksize, generation);
     if (!tree_root->node)
         goto fail_chunk_root;
-
+    if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+        printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
+               sb->s_id);
+        goto fail_tree_root;
+    }
+    btrfs_set_root_node(&tree_root->root_item, tree_root->node);
+    tree_root->commit_root = btrfs_root_node(tree_root);
 
     ret = find_and_setup_root(tree_root, fs_info,
                   BTRFS_EXTENT_TREE_OBJECTID, extent_root);
@@ -1820,14 +1848,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
     ret = find_and_setup_root(tree_root, fs_info,
                   BTRFS_DEV_TREE_OBJECTID, dev_root);
-    dev_root->track_dirty = 1;
     if (ret)
         goto fail_extent_root;
+    dev_root->track_dirty = 1;
 
     ret = find_and_setup_root(tree_root, fs_info,
                   BTRFS_CSUM_TREE_OBJECTID, csum_root);
     if (ret)
-        goto fail_extent_root;
+        goto fail_dev_root;
 
     csum_root->track_dirty = 1;
 
@@ -1849,6 +1877,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     if (IS_ERR(fs_info->transaction_kthread))
         goto fail_cleaner;
 
+    if (!btrfs_test_opt(tree_root, SSD) &&
+        !btrfs_test_opt(tree_root, NOSSD) &&
+        !fs_info->fs_devices->rotating) {
+        printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
+               "mode\n");
+        btrfs_set_opt(fs_info->mount_opt, SSD);
+    }
+
     if (btrfs_super_log_root(disk_super) != 0) {
         u64 bytenr = btrfs_super_log_root(disk_super);
 
@@ -1881,7 +1917,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     }
 
     if (!(sb->s_flags & MS_RDONLY)) {
-        ret = btrfs_cleanup_reloc_trees(tree_root);
+        ret = btrfs_recover_relocation(tree_root);
         BUG_ON(ret);
     }
 
@@ -1892,6 +1928,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
     if (!fs_info->fs_root)
         goto fail_trans_kthread;
+
     return tree_root;
 
 fail_trans_kthread:
@@ -1908,14 +1945,19 @@ fail_cleaner:
 
 fail_csum_root:
     free_extent_buffer(csum_root->node);
+    free_extent_buffer(csum_root->commit_root);
+fail_dev_root:
+    free_extent_buffer(dev_root->node);
+    free_extent_buffer(dev_root->commit_root);
 fail_extent_root:
     free_extent_buffer(extent_root->node);
+    free_extent_buffer(extent_root->commit_root);
 fail_tree_root:
     free_extent_buffer(tree_root->node);
+    free_extent_buffer(tree_root->commit_root);
 fail_chunk_root:
     free_extent_buffer(chunk_root->node);
-fail_sys_array:
-    free_extent_buffer(dev_root->node);
+    free_extent_buffer(chunk_root->commit_root);
 fail_sb_buffer:
     btrfs_stop_workers(&fs_info->fixup_workers);
     btrfs_stop_workers(&fs_info->delalloc_workers);
@@ -1931,8 +1973,8 @@ fail_iput:
 
     btrfs_close_devices(fs_info->fs_devices);
     btrfs_mapping_tree_free(&fs_info->mapping_tree);
+fail_bdi:
     bdi_destroy(&fs_info->bdi);
-
 fail:
     kfree(extent_root);
     kfree(tree_root);
@@ -2005,6 +2047,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
     return latest;
 }
 
+/*
+ * this should be called twice, once with wait == 0 and
+ * once with wait == 1. When wait == 0 is done, all the buffer heads
+ * we write are pinned.
+ *
+ * They are released when wait == 1 is done.
+ * max_mirrors must be the same for both runs, and it indicates how
+ * many supers on this one device should be written.
+ *
+ * max_mirrors == 0 means to write them all.
+ */
 static int write_dev_supers(struct btrfs_device *device,
                 struct btrfs_super_block *sb,
                 int do_barriers, int wait, int max_mirrors)
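The new comment describes a two-phase contract: one pass submits the superblock writes and leaves the buffer heads pinned, a second pass waits for completion and drops the pinned references. A hypothetical caller fragment (dev, sb, do_barriers, max_mirrors and ret assumed in scope; illustration only, not code from the patch):

    /* phase 1: submit all supers for this device, keep the bh's pinned */
    ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
    if (!ret)
        /* phase 2: same arguments, now wait and release the pins */
        ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);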
@@ -2040,12 +2093,16 @@ static int write_dev_supers(struct btrfs_device *device,
             bh = __find_get_block(device->bdev, bytenr / 4096,
                           BTRFS_SUPER_INFO_SIZE);
             BUG_ON(!bh);
-            brelse(bh);
             wait_on_buffer(bh);
-            if (buffer_uptodate(bh)) {
-                brelse(bh);
-                continue;
-            }
+            if (!buffer_uptodate(bh))
+                errors++;
+
+            /* drop our reference */
+            brelse(bh);
+
+            /* drop the reference from the wait == 0 run */
+            brelse(bh);
+            continue;
         } else {
             btrfs_set_super_bytenr(sb, bytenr);
 
@@ -2056,12 +2113,18 @@ static int write_dev_supers(struct btrfs_device *device,
                        BTRFS_CSUM_SIZE);
             btrfs_csum_final(crc, sb->csum);
 
+            /*
+             * one reference for us, and we leave it for the
+             * caller
+             */
             bh = __getblk(device->bdev, bytenr / 4096,
                       BTRFS_SUPER_INFO_SIZE);
             memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
 
-            set_buffer_uptodate(bh);
+            /* one reference for submit_bh */
             get_bh(bh);
+
+            set_buffer_uptodate(bh);
             lock_buffer(bh);
             bh->b_end_io = btrfs_end_buffer_write_sync;
         }
@@ -2073,6 +2136,7 @@ static int write_dev_supers(struct btrfs_device *device,
                    device->name);
             set_buffer_uptodate(bh);
             device->barriers = 0;
+            /* one reference for submit_bh */
             get_bh(bh);
             lock_buffer(bh);
             ret = submit_bh(WRITE_SYNC, bh);
@@ -2081,22 +2145,15 @@ static int write_dev_supers(struct btrfs_device *device,
             ret = submit_bh(WRITE_SYNC, bh);
         }
 
-        if (!ret && wait) {
-            wait_on_buffer(bh);
-            if (!buffer_uptodate(bh))
-                errors++;
-        } else if (ret) {
+        if (ret)
             errors++;
-        }
-        if (wait)
-            brelse(bh);
     }
     return errors < i ? 0 : -1;
 }
 
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
-    struct list_head *head = &root->fs_info->fs_devices->devices;
+    struct list_head *head;
     struct btrfs_device *dev;
     struct btrfs_super_block *sb;
     struct btrfs_dev_item *dev_item;
@@ -2111,6 +2168,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 
     sb = &root->fs_info->super_for_commit;
     dev_item = &sb->dev_item;
+
+    mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+    head = &root->fs_info->fs_devices->devices;
     list_for_each_entry(dev, head, dev_list) {
         if (!dev->bdev) {
             total_errors++;
@@ -2154,6 +2214,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
         if (ret)
             total_errors++;
     }
+    mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
     if (total_errors > max_errors) {
         printk(KERN_ERR "btrfs: %d errors while writing supers\n",
                total_errors);
@@ -2173,6 +2234,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
+    WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
     radix_tree_delete(&fs_info->fs_roots_radix,
               (unsigned long)root->root_key.objectid);
     if (root->anon_super.s_dev) {
@@ -2219,10 +2281,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
                          ARRAY_SIZE(gang));
         if (!ret)
             break;
+
+        root_objectid = gang[ret - 1]->root_key.objectid + 1;
         for (i = 0; i < ret; i++) {
             root_objectid = gang[i]->root_key.objectid;
             ret = btrfs_find_dead_roots(fs_info->tree_root,
-                            root_objectid, gang[i]);
+                            root_objectid);
             BUG_ON(ret);
             btrfs_orphan_cleanup(gang[i]);
         }
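The added line advances the lookup cursor past the last key returned by the gang lookup before the next iteration, the standard pagination pattern for radix-tree scans. A self-contained model of the loop shape:

    #include <stdio.h>

    /* toy stand-in for a gang lookup: return up to max keys >= first */
    static int gang_lookup(const int *keys, int nkeys, int first,
                           int *out, int max)
    {
        int n = 0, i;

        for (i = 0; i < nkeys && n < max; i++)
            if (keys[i] >= first)
                out[n++] = keys[i];
        return n;
    }

    int main(void)
    {
        int keys[] = { 3, 5, 9, 12, 20 };
        int got[2], first = 0, n;

        while ((n = gang_lookup(keys, 5, first, got, 2)) > 0) {
            first = got[n - 1] + 1;  /* cursor past the last result */
            printf("batch ends at %d\n", got[n - 1]);
        }
        return 0;
    }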
@@ -2269,6 +2333,9 @@ int close_ctree(struct btrfs_root *root)
         printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
     }
 
+    fs_info->closing = 2;
+    smp_mb();
+
     if (fs_info->delalloc_bytes) {
         printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
                (unsigned long long)fs_info->delalloc_bytes);
@@ -2278,22 +2345,19 @@ int close_ctree(struct btrfs_root *root)
2278 (unsigned long long)fs_info->total_ref_cache_size); 2345 (unsigned long long)fs_info->total_ref_cache_size);
2279 } 2346 }
2280 2347
2281 if (fs_info->extent_root->node) 2348 free_extent_buffer(fs_info->extent_root->node);
2282 free_extent_buffer(fs_info->extent_root->node); 2349 free_extent_buffer(fs_info->extent_root->commit_root);
2283 2350 free_extent_buffer(fs_info->tree_root->node);
2284 if (fs_info->tree_root->node) 2351 free_extent_buffer(fs_info->tree_root->commit_root);
2285 free_extent_buffer(fs_info->tree_root->node); 2352 free_extent_buffer(root->fs_info->chunk_root->node);
2286 2353 free_extent_buffer(root->fs_info->chunk_root->commit_root);
2287 if (root->fs_info->chunk_root->node) 2354 free_extent_buffer(root->fs_info->dev_root->node);
2288 free_extent_buffer(root->fs_info->chunk_root->node); 2355 free_extent_buffer(root->fs_info->dev_root->commit_root);
2289 2356 free_extent_buffer(root->fs_info->csum_root->node);
2290 if (root->fs_info->dev_root->node) 2357 free_extent_buffer(root->fs_info->csum_root->commit_root);
2291 free_extent_buffer(root->fs_info->dev_root->node);
2292
2293 if (root->fs_info->csum_root->node)
2294 free_extent_buffer(root->fs_info->csum_root->node);
2295 2358
2296 btrfs_free_block_groups(root->fs_info); 2359 btrfs_free_block_groups(root->fs_info);
2360 btrfs_free_pinned_extents(root->fs_info);
2297 2361
2298 del_fs_roots(fs_info); 2362 del_fs_roots(fs_info);
2299 2363
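The closing = 2 store plus smp_mb() added above pairs with the smp_mb()-then-test in the block group caching kthread later in this diff (the closing > 1 check), telling the scanner to bail out during unmount. A rough sketch of that one-writer/many-readers flag handshake, using the real smp_mb() primitive but invented names:

#include <linux/kernel.h>

static int demo_closing;

/* unmount side: publish the flag before waiting on the workers */
static void demo_begin_close(void)
{
	demo_closing = 2;
	smp_mb();	/* order the store before anything we do next */
}

/* worker side: re-check on every loop iteration */
static int demo_should_abort(void)
{
	smp_mb();	/* pairs with the barrier in demo_begin_close() */
	return demo_closing > 1;
}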
@@ -2373,17 +2437,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2373 * looks as though older kernels can get into trouble with 2437 * looks as though older kernels can get into trouble with
2374 * this code; they end up stuck in balance_dirty_pages forever 2438 * this code; they end up stuck in balance_dirty_pages forever
2375 */ 2439 */
2376 struct extent_io_tree *tree;
2377 u64 num_dirty; 2440 u64 num_dirty;
2378 u64 start = 0;
2379 unsigned long thresh = 32 * 1024 * 1024; 2441 unsigned long thresh = 32 * 1024 * 1024;
2380 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2381 2442
2382 if (current->flags & PF_MEMALLOC) 2443 if (current->flags & PF_MEMALLOC)
2383 return; 2444 return;
2384 2445
2385 num_dirty = count_range_bits(tree, &start, (u64)-1, 2446 num_dirty = root->fs_info->dirty_metadata_bytes;
2386 thresh, EXTENT_DIRTY); 2447
2387 if (num_dirty > thresh) { 2448 if (num_dirty > thresh) {
2388 balance_dirty_pages_ratelimited_nr( 2449 balance_dirty_pages_ratelimited_nr(
2389 root->fs_info->btree_inode->i_mapping, 1); 2450 root->fs_info->btree_inode->i_mapping, 1);
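The hunk above swaps an O(tree) scan (count_range_bits over the btree inode's io_tree) for an O(1) read of a counter that is maintained as metadata pages are dirtied and cleaned. A tiny userspace-flavoured sketch of the idea, all names invented:

#include <stdint.h>
#include <stdio.h>

#define DEMO_THRESH (32UL * 1024 * 1024)	/* same 32MB threshold */

/* bumped when a metadata page is dirtied, decremented on writeback */
static uint64_t demo_dirty_metadata_bytes;

static void demo_balance_dirty(void)
{
	/* constant-time check instead of walking extent state records */
	if (demo_dirty_metadata_bytes > DEMO_THRESH)
		printf("throttle: start background writeback\n");
}

int main(void)
{
	demo_dirty_metadata_bytes = 48UL * 1024 * 1024;
	demo_balance_dirty();
	return 0;
}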
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 85315d2c90de..9596b40caa4e 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -78,7 +78,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
78 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 78 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
79 key.offset = 0; 79 key.offset = 0;
80 80
81 inode = btrfs_iget(sb, &key, root, NULL); 81 inode = btrfs_iget(sb, &key, root);
82 if (IS_ERR(inode)) 82 if (IS_ERR(inode))
83 return (void *)inode; 83 return (void *)inode;
84 84
@@ -192,7 +192,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
192 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 192 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
193 key.offset = 0; 193 key.offset = 0;
194 194
195 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); 195 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
196} 196}
197 197
198const struct export_operations btrfs_export_ops = { 198const struct export_operations btrfs_export_ops = {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 35af93355063..72a2b9c28e9f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -21,57 +21,54 @@
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/sort.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h>
24#include "compat.h" 25#include "compat.h"
25#include "hash.h" 26#include "hash.h"
26#include "crc32c.h"
27#include "ctree.h" 27#include "ctree.h"
28#include "disk-io.h" 28#include "disk-io.h"
29#include "print-tree.h" 29#include "print-tree.h"
30#include "transaction.h" 30#include "transaction.h"
31#include "volumes.h" 31#include "volumes.h"
32#include "locking.h" 32#include "locking.h"
33#include "ref-cache.h"
34#include "free-space-cache.h" 33#include "free-space-cache.h"
35 34
36#define PENDING_EXTENT_INSERT 0
37#define PENDING_EXTENT_DELETE 1
38#define PENDING_BACKREF_UPDATE 2
39
40struct pending_extent_op {
41 int type;
42 u64 bytenr;
43 u64 num_bytes;
44 u64 parent;
45 u64 orig_parent;
46 u64 generation;
47 u64 orig_generation;
48 int level;
49 struct list_head list;
50 int del;
51};
52
53static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, u64 parent,
55 u64 root_objectid, u64 ref_generation,
56 u64 owner, struct btrfs_key *ins,
57 int ref_mod);
58static int update_reserved_extents(struct btrfs_root *root, 35static int update_reserved_extents(struct btrfs_root *root,
59 u64 bytenr, u64 num, int reserve); 36 u64 bytenr, u64 num, int reserve);
60static int update_block_group(struct btrfs_trans_handle *trans, 37static int update_block_group(struct btrfs_trans_handle *trans,
61 struct btrfs_root *root, 38 struct btrfs_root *root,
62 u64 bytenr, u64 num_bytes, int alloc, 39 u64 bytenr, u64 num_bytes, int alloc,
63 int mark_free); 40 int mark_free);
64static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans, 41static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
65 struct btrfs_root *root, 42 struct btrfs_root *root,
66 u64 bytenr, u64 num_bytes, u64 parent, 43 u64 bytenr, u64 num_bytes, u64 parent,
67 u64 root_objectid, u64 ref_generation, 44 u64 root_objectid, u64 owner_objectid,
68 u64 owner_objectid, int pin, 45 u64 owner_offset, int refs_to_drop,
69 int ref_to_drop); 46 struct btrfs_delayed_extent_op *extra_op);
47static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
48 struct extent_buffer *leaf,
49 struct btrfs_extent_item *ei);
50static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root,
52 u64 parent, u64 root_objectid,
53 u64 flags, u64 owner, u64 offset,
54 struct btrfs_key *ins, int ref_mod);
55static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
56 struct btrfs_root *root,
57 u64 parent, u64 root_objectid,
58 u64 flags, struct btrfs_disk_key *key,
59 int level, struct btrfs_key *ins);
70 60
71static int do_chunk_alloc(struct btrfs_trans_handle *trans, 61static int do_chunk_alloc(struct btrfs_trans_handle *trans,
72 struct btrfs_root *extent_root, u64 alloc_bytes, 62 struct btrfs_root *extent_root, u64 alloc_bytes,
73 u64 flags, int force); 63 u64 flags, int force);
74 64
65static noinline int
66block_group_cache_done(struct btrfs_block_group_cache *cache)
67{
68 smp_mb();
69 return cache->cached == BTRFS_CACHE_FINISHED;
70}
71
75static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 72static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
76{ 73{
77 return (cache->flags & bits) == bits; 74 return (cache->flags & bits) == bits;
@@ -157,20 +154,70 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
157} 154}
158 155
159/* 156/*
157 * We always set EXTENT_LOCKED for the super mirror extents so we don't
158 * overwrite them, so those bits need to be unset. Also, if we are unmounting
159 * with pinned extents still sitting there because a block group was still caching,
160 * we need to clear those now, since we are done.
161 */
162void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
163{
164 u64 start, end, last = 0;
165 int ret;
166
167 while (1) {
168 ret = find_first_extent_bit(&info->pinned_extents, last,
169 &start, &end,
170 EXTENT_LOCKED|EXTENT_DIRTY);
171 if (ret)
172 break;
173
174 clear_extent_bits(&info->pinned_extents, start, end,
175 EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
176 last = end+1;
177 }
178}
179
180static int remove_sb_from_cache(struct btrfs_root *root,
181 struct btrfs_block_group_cache *cache)
182{
183 struct btrfs_fs_info *fs_info = root->fs_info;
184 u64 bytenr;
185 u64 *logical;
186 int stripe_len;
187 int i, nr, ret;
188
189 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
190 bytenr = btrfs_sb_offset(i);
191 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
192 cache->key.objectid, bytenr,
193 0, &logical, &nr, &stripe_len);
194 BUG_ON(ret);
195 while (nr--) {
196 try_lock_extent(&fs_info->pinned_extents,
197 logical[nr],
198 logical[nr] + stripe_len - 1, GFP_NOFS);
199 }
200 kfree(logical);
201 }
202
203 return 0;
204}
205
206/*
160 * this is only called by cache_block_group, since we could have freed extents, 207 * this is only called by cache_block_group, since we could have freed extents,
161 * we need to check the pinned_extents for any extents that can't be used yet 208 * we need to check the pinned_extents for any extents that can't be used yet
162 * since their free space will be released as soon as the transaction commits. 209 * since their free space will be released as soon as the transaction commits.
163 */ 210 */
164static int add_new_free_space(struct btrfs_block_group_cache *block_group, 211static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
165 struct btrfs_fs_info *info, u64 start, u64 end) 212 struct btrfs_fs_info *info, u64 start, u64 end)
166{ 213{
167 u64 extent_start, extent_end, size; 214 u64 extent_start, extent_end, size, total_added = 0;
168 int ret; 215 int ret;
169 216
170 while (start < end) { 217 while (start < end) {
171 ret = find_first_extent_bit(&info->pinned_extents, start, 218 ret = find_first_extent_bit(&info->pinned_extents, start,
172 &extent_start, &extent_end, 219 &extent_start, &extent_end,
173 EXTENT_DIRTY); 220 EXTENT_DIRTY|EXTENT_LOCKED);
174 if (ret) 221 if (ret)
175 break; 222 break;
176 223
@@ -178,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
178 start = extent_end + 1; 225 start = extent_end + 1;
179 } else if (extent_start > start && extent_start < end) { 226 } else if (extent_start > start && extent_start < end) {
180 size = extent_start - start; 227 size = extent_start - start;
228 total_added += size;
181 ret = btrfs_add_free_space(block_group, start, 229 ret = btrfs_add_free_space(block_group, start,
182 size); 230 size);
183 BUG_ON(ret); 231 BUG_ON(ret);
@@ -189,84 +237,93 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
189 237
190 if (start < end) { 238 if (start < end) {
191 size = end - start; 239 size = end - start;
240 total_added += size;
192 ret = btrfs_add_free_space(block_group, start, size); 241 ret = btrfs_add_free_space(block_group, start, size);
193 BUG_ON(ret); 242 BUG_ON(ret);
194 } 243 }
195 244
196 return 0; 245 return total_added;
197}
198
199static int remove_sb_from_cache(struct btrfs_root *root,
200 struct btrfs_block_group_cache *cache)
201{
202 u64 bytenr;
203 u64 *logical;
204 int stripe_len;
205 int i, nr, ret;
206
207 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
208 bytenr = btrfs_sb_offset(i);
209 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
210 cache->key.objectid, bytenr, 0,
211 &logical, &nr, &stripe_len);
212 BUG_ON(ret);
213 while (nr--) {
214 btrfs_remove_free_space(cache, logical[nr],
215 stripe_len);
216 }
217 kfree(logical);
218 }
219 return 0;
220} 246}
221 247
222static int cache_block_group(struct btrfs_root *root, 248static int caching_kthread(void *data)
223 struct btrfs_block_group_cache *block_group)
224{ 249{
250 struct btrfs_block_group_cache *block_group = data;
251 struct btrfs_fs_info *fs_info = block_group->fs_info;
252 u64 last = 0;
225 struct btrfs_path *path; 253 struct btrfs_path *path;
226 int ret = 0; 254 int ret = 0;
227 struct btrfs_key key; 255 struct btrfs_key key;
228 struct extent_buffer *leaf; 256 struct extent_buffer *leaf;
229 int slot; 257 int slot;
230 u64 last; 258 u64 total_found = 0;
231 259
232 if (!block_group) 260 BUG_ON(!fs_info);
233 return 0;
234
235 root = root->fs_info->extent_root;
236
237 if (block_group->cached)
238 return 0;
239 261
240 path = btrfs_alloc_path(); 262 path = btrfs_alloc_path();
241 if (!path) 263 if (!path)
242 return -ENOMEM; 264 return -ENOMEM;
243 265
244 path->reada = 2; 266 atomic_inc(&block_group->space_info->caching_threads);
267 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
245 /* 268 /*
246 * we get into deadlocks with paths held by callers of this function. 269 * We don't want to deadlock with somebody trying to allocate a new
247 * since the alloc_mutex is protecting things right now, just 270 * extent for the extent root while also trying to search the extent
248 * skip the locking here 271 * root to add free space. So we skip locking and search the commit
272 * root, since its read-only
249 */ 273 */
250 path->skip_locking = 1; 274 path->skip_locking = 1;
251 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 275 path->search_commit_root = 1;
276 path->reada = 2;
277
252 key.objectid = last; 278 key.objectid = last;
253 key.offset = 0; 279 key.offset = 0;
254 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 280 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
255 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 281again:
282 /* need to make sure the commit_root doesn't disappear */
283 down_read(&fs_info->extent_commit_sem);
284
285 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
256 if (ret < 0) 286 if (ret < 0)
257 goto err; 287 goto err;
258 288
259 while (1) { 289 while (1) {
290 smp_mb();
291 if (block_group->fs_info->closing > 1) {
292 last = (u64)-1;
293 break;
294 }
295
260 leaf = path->nodes[0]; 296 leaf = path->nodes[0];
261 slot = path->slots[0]; 297 slot = path->slots[0];
262 if (slot >= btrfs_header_nritems(leaf)) { 298 if (slot >= btrfs_header_nritems(leaf)) {
263 ret = btrfs_next_leaf(root, path); 299 ret = btrfs_next_leaf(fs_info->extent_root, path);
264 if (ret < 0) 300 if (ret < 0)
265 goto err; 301 goto err;
266 if (ret == 0) 302 else if (ret)
267 continue;
268 else
269 break; 303 break;
304
305 if (need_resched() ||
306 btrfs_transaction_in_commit(fs_info)) {
307 leaf = path->nodes[0];
308
309 /* this shouldn't happen, but if the
310 * leaf is empty just move on.
311 */
312 if (btrfs_header_nritems(leaf) == 0)
313 break;
314 /*
315 * we need to copy the key out so that
316 * we are sure the next search advances
317 * us forward in the btree.
318 */
319 btrfs_item_key_to_cpu(leaf, &key, 0);
320 btrfs_release_path(fs_info->extent_root, path);
321 up_read(&fs_info->extent_commit_sem);
322 schedule_timeout(1);
323 goto again;
324 }
325
326 continue;
270 } 327 }
271 btrfs_item_key_to_cpu(leaf, &key, slot); 328 btrfs_item_key_to_cpu(leaf, &key, slot);
272 if (key.objectid < block_group->key.objectid) 329 if (key.objectid < block_group->key.objectid)
@@ -277,24 +334,59 @@ static int cache_block_group(struct btrfs_root *root,
277 break; 334 break;
278 335
279 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { 336 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
280 add_new_free_space(block_group, root->fs_info, last, 337 total_found += add_new_free_space(block_group,
281 key.objectid); 338 fs_info, last,
282 339 key.objectid);
283 last = key.objectid + key.offset; 340 last = key.objectid + key.offset;
284 } 341 }
342
343 if (total_found > (1024 * 1024 * 2)) {
344 total_found = 0;
345 wake_up(&block_group->caching_q);
346 }
285next: 347next:
286 path->slots[0]++; 348 path->slots[0]++;
287 } 349 }
350 ret = 0;
288 351
289 add_new_free_space(block_group, root->fs_info, last, 352 total_found += add_new_free_space(block_group, fs_info, last,
290 block_group->key.objectid + 353 block_group->key.objectid +
291 block_group->key.offset); 354 block_group->key.offset);
355
356 spin_lock(&block_group->lock);
357 block_group->cached = BTRFS_CACHE_FINISHED;
358 spin_unlock(&block_group->lock);
292 359
293 block_group->cached = 1;
294 remove_sb_from_cache(root, block_group);
295 ret = 0;
296err: 360err:
297 btrfs_free_path(path); 361 btrfs_free_path(path);
362 up_read(&fs_info->extent_commit_sem);
363 atomic_dec(&block_group->space_info->caching_threads);
364 wake_up(&block_group->caching_q);
365
366 return 0;
367}
368
369static int cache_block_group(struct btrfs_block_group_cache *cache)
370{
371 struct task_struct *tsk;
372 int ret = 0;
373
374 spin_lock(&cache->lock);
375 if (cache->cached != BTRFS_CACHE_NO) {
376 spin_unlock(&cache->lock);
377 return ret;
378 }
379 cache->cached = BTRFS_CACHE_STARTED;
380 spin_unlock(&cache->lock);
381
382 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu",
383 cache->key.objectid);
384 if (IS_ERR(tsk)) {
385 ret = PTR_ERR(tsk);
386 printk(KERN_ERR "error running thread %d\n", ret);
387 BUG();
388 }
389
298 return ret; 390 return ret;
299} 391}
300 392
@@ -453,199 +545,968 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
453 * maintenance. This is actually the same as #2, but with a slightly 545 * maintenance. This is actually the same as #2, but with a slightly
454 * different use case. 546 * different use case.
455 * 547 *
548 * There are two kinds of back refs. Implicit back refs are optimized
549 * for pointers in non-shared tree blocks. For a given pointer in a block,
550 * back refs of this kind provide information about the block's owner tree
551 * and the pointer's key. This information allows us to find the block by
552 * b-tree searching. Full back refs are for pointers in tree blocks not
553 * referenced by their owner trees. The location of the tree block is
554 * recorded in the back refs. Full back refs are actually generic and can
555 * be used in all cases where implicit back refs are used. Their major
556 * shortcoming is the overhead: every time a tree block gets COWed, we
557 * have to update the back ref entries for all pointers in it.
558 *
559 * For a newly allocated tree block, we use implicit back refs for
560 * pointers in it. This means most tree related operations only involve
561 * implicit back refs. For a tree block created in an old transaction, the
562 * only way to drop a reference to it is to COW it. So we can detect the
563 * event that a tree block loses its owner tree's reference and do the
564 * back refs conversion.
565 *
566 * When a tree block is COW'd through a tree, there are four cases:
567 *
568 * The reference count of the block is one and the tree is the block's
569 * owner tree. Nothing to do in this case.
570 *
571 * The reference count of the block is one and the tree is not the
572 * block's owner tree. In this case, full back refs are used for pointers
573 * in the block. Remove these full back refs and add implicit back refs
574 * for every pointer in the new block.
575 *
576 * The reference count of the block is greater than one and the tree is
577 * the block's owner tree. In this case, implicit back refs are used for
578 * pointers in the block. Add full back refs for every pointer in the
579 * block and increase the lower level extents' reference counts. The
580 * original implicit back refs are carried over to the new block.
581 *
582 * The reference count of the block is greater than one and the tree is
583 * not the block's owner tree. Add implicit back refs for every pointer in
584 * the new block and increase the lower level extents' reference counts.
585 *
586 * Back Reference Key composing:
587 *
588 * The key objectid corresponds to the first byte in the extent,
589 * The key type is used to differentiate between types of back refs.
590 * There are different meanings of the key offset for different types
591 * of back refs.
592 *
456 * File extents can be referenced by: 593 * File extents can be referenced by:
457 * 594 *
458 * - multiple snapshots, subvolumes, or different generations in one subvol 595 * - multiple snapshots, subvolumes, or different generations in one subvol
459 * - different files inside a single subvolume 596 * - different files inside a single subvolume
460 * - different offsets inside a file (bookend extents in file.c) 597 * - different offsets inside a file (bookend extents in file.c)
461 * 598 *
462 * The extent ref structure has fields for: 599 * The extent ref structure for the implicit back refs has fields for:
463 * 600 *
464 * - Objectid of the subvolume root 601 * - Objectid of the subvolume root
465 * - Generation number of the tree holding the reference
466 * - objectid of the file holding the reference 602 * - objectid of the file holding the reference
467 * - number of references held by the parent node (always 1 for tree blocks) 603 * - original offset in the file
604 * - how many bookend extents
468 * 605 *
469 * A btree leaf may hold multiple references to a file extent. In most cases, 606 * The key offset for the implicit back refs is a hash of the first
470 * these references are from same file and the corresponding offsets inside 607 * three fields.
471 * the file are close together.
472 * 608 *
473 * When a file extent is allocated, the fields are filled in: 609 * The extent ref structure for the full back refs has a field for:
474 * (root_key.objectid, trans->transid, inode objectid, 1)
475 * 610 *
476 * When a leaf is cow'd new references are added for every file extent found 611 * - number of pointers in the tree leaf
477 * in the leaf. It looks similar to the create case, but trans->transid will
478 * be different when the block is cow'd.
479 * 612 *
480 * (root_key.objectid, trans->transid, inode objectid, 613 * The key offset for the implicit back refs is the first byte of
481 * number of references in the leaf) 614 * the tree leaf
482 * 615 *
483 * When a file extent is removed either during snapshot deletion or 616 * When a file extent is allocated, the implicit back refs are used.
484 * file truncation, we find the corresponding back reference and check 617 * The fields are filled in:
485 * the following fields:
486 * 618 *
487 * (btrfs_header_owner(leaf), btrfs_header_generation(leaf), 619 * (root_key.objectid, inode objectid, offset in file, 1)
488 * inode objectid)
489 * 620 *
490 * Btree extents can be referenced by: 621 * When a file extent is removed during file truncation, we find the
491 * 622 * corresponding implicit back refs and check the following fields:
492 * - Different subvolumes
493 * - Different generations of the same subvolume
494 *
495 * When a tree block is created, back references are inserted:
496 *
497 * (root->root_key.objectid, trans->transid, level, 1)
498 *
499 * When a tree block is cow'd, new back references are added for all the
500 * blocks it points to. If the tree block isn't in reference counted root,
501 * the old back references are removed. These new back references are of
502 * the form (trans->transid will have increased since creation):
503 * 623 *
504 * (root->root_key.objectid, trans->transid, level, 1) 624 * (btrfs_header_owner(leaf), inode objectid, offset in file)
505 * 625 *
506 * When a backref is being deleted, the following fields are checked: 626 * Btree extents can be referenced by:
507 * 627 *
508 * if backref was for a tree root: 628 * - Different subvolumes
509 * (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
510 * else
511 * (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
512 * 629 *
513 * Back Reference Key composing: 630 * Both the implicit back refs and the full back refs for tree blocks
631 * only consist of a key. The key offset for the implicit back refs is
632 * the objectid of the block's owner tree. The key offset for the full
633 * back refs is the first byte of the parent block.
514 * 634 *
515 * The key objectid corresponds to the first byte in the extent, the key 635 * When implicit back refs are used, information about the lowest key and
516 * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first 636 * level of the tree block is required. This information is stored in
517 * byte of parent extent. If an extent is a tree root, the key offset is set 637 * the tree block info structure.
518 * to the key objectid.
519 */ 638 */
520 639
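To make the key layouts described above concrete, here is a hedged sketch of the four back-ref keys for a single extent at bytenr, using the key types and the hash helper that appear in the new code below (all variables are placeholders):

struct btrfs_key key;

/* implicit data ref: keyed by a hash of (root, inode, file offset) */
key.objectid = bytenr;
key.type = BTRFS_EXTENT_DATA_REF_KEY;
key.offset = hash_extent_data_ref(root_objectid, inode_objectid, file_offset);

/* full (shared) data ref: keyed by the referencing leaf's bytenr */
key.objectid = bytenr;
key.type = BTRFS_SHARED_DATA_REF_KEY;
key.offset = parent_bytenr;

/* implicit tree block ref: keyed by the owner root's objectid */
key.objectid = bytenr;
key.type = BTRFS_TREE_BLOCK_REF_KEY;
key.offset = root_objectid;

/* full (shared) tree block ref: keyed by the parent node's bytenr */
key.objectid = bytenr;
key.type = BTRFS_SHARED_BLOCK_REF_KEY;
key.offset = parent_bytenr;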
521static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, 640#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
522 struct btrfs_root *root, 641static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
523 struct btrfs_path *path, 642 struct btrfs_root *root,
524 u64 bytenr, u64 parent, 643 struct btrfs_path *path,
525 u64 ref_root, u64 ref_generation, 644 u64 owner, u32 extra_size)
526 u64 owner_objectid, int del)
527{ 645{
646 struct btrfs_extent_item *item;
647 struct btrfs_extent_item_v0 *ei0;
648 struct btrfs_extent_ref_v0 *ref0;
649 struct btrfs_tree_block_info *bi;
650 struct extent_buffer *leaf;
528 struct btrfs_key key; 651 struct btrfs_key key;
529 struct btrfs_extent_ref *ref; 652 struct btrfs_key found_key;
653 u32 new_size = sizeof(*item);
654 u64 refs;
655 int ret;
656
657 leaf = path->nodes[0];
658 BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
659
660 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
661 ei0 = btrfs_item_ptr(leaf, path->slots[0],
662 struct btrfs_extent_item_v0);
663 refs = btrfs_extent_refs_v0(leaf, ei0);
664
665 if (owner == (u64)-1) {
666 while (1) {
667 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
668 ret = btrfs_next_leaf(root, path);
669 if (ret < 0)
670 return ret;
671 BUG_ON(ret > 0);
672 leaf = path->nodes[0];
673 }
674 btrfs_item_key_to_cpu(leaf, &found_key,
675 path->slots[0]);
676 BUG_ON(key.objectid != found_key.objectid);
677 if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
678 path->slots[0]++;
679 continue;
680 }
681 ref0 = btrfs_item_ptr(leaf, path->slots[0],
682 struct btrfs_extent_ref_v0);
683 owner = btrfs_ref_objectid_v0(leaf, ref0);
684 break;
685 }
686 }
687 btrfs_release_path(root, path);
688
689 if (owner < BTRFS_FIRST_FREE_OBJECTID)
690 new_size += sizeof(*bi);
691
692 new_size -= sizeof(*ei0);
693 ret = btrfs_search_slot(trans, root, &key, path,
694 new_size + extra_size, 1);
695 if (ret < 0)
696 return ret;
697 BUG_ON(ret);
698
699 ret = btrfs_extend_item(trans, root, path, new_size);
700 BUG_ON(ret);
701
702 leaf = path->nodes[0];
703 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
704 btrfs_set_extent_refs(leaf, item, refs);
705 /* FIXME: get real generation */
706 btrfs_set_extent_generation(leaf, item, 0);
707 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
708 btrfs_set_extent_flags(leaf, item,
709 BTRFS_EXTENT_FLAG_TREE_BLOCK |
710 BTRFS_BLOCK_FLAG_FULL_BACKREF);
711 bi = (struct btrfs_tree_block_info *)(item + 1);
712 /* FIXME: get first key of the block */
713 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
714 btrfs_set_tree_block_level(leaf, bi, (int)owner);
715 } else {
716 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
717 }
718 btrfs_mark_buffer_dirty(leaf);
719 return 0;
720}
721#endif
722
723static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
724{
725 u32 high_crc = ~(u32)0;
726 u32 low_crc = ~(u32)0;
727 __le64 lenum;
728
729 lenum = cpu_to_le64(root_objectid);
730 high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
731 lenum = cpu_to_le64(owner);
732 low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
733 lenum = cpu_to_le64(offset);
734 low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
735
736 return ((u64)high_crc << 31) ^ (u64)low_crc;
737}
738
739static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
740 struct btrfs_extent_data_ref *ref)
741{
742 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
743 btrfs_extent_data_ref_objectid(leaf, ref),
744 btrfs_extent_data_ref_offset(leaf, ref));
745}
746
747static int match_extent_data_ref(struct extent_buffer *leaf,
748 struct btrfs_extent_data_ref *ref,
749 u64 root_objectid, u64 owner, u64 offset)
750{
751 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
752 btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
753 btrfs_extent_data_ref_offset(leaf, ref) != offset)
754 return 0;
755 return 1;
756}
757
758static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
759 struct btrfs_root *root,
760 struct btrfs_path *path,
761 u64 bytenr, u64 parent,
762 u64 root_objectid,
763 u64 owner, u64 offset)
764{
765 struct btrfs_key key;
766 struct btrfs_extent_data_ref *ref;
530 struct extent_buffer *leaf; 767 struct extent_buffer *leaf;
531 u64 ref_objectid; 768 u32 nritems;
532 int ret; 769 int ret;
770 int recow;
771 int err = -ENOENT;
533 772
534 key.objectid = bytenr; 773 key.objectid = bytenr;
535 key.type = BTRFS_EXTENT_REF_KEY; 774 if (parent) {
536 key.offset = parent; 775 key.type = BTRFS_SHARED_DATA_REF_KEY;
776 key.offset = parent;
777 } else {
778 key.type = BTRFS_EXTENT_DATA_REF_KEY;
779 key.offset = hash_extent_data_ref(root_objectid,
780 owner, offset);
781 }
782again:
783 recow = 0;
784 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
785 if (ret < 0) {
786 err = ret;
787 goto fail;
788 }
537 789
538 ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1); 790 if (parent) {
539 if (ret < 0) 791 if (!ret)
540 goto out; 792 return 0;
541 if (ret > 0) { 793#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
542 ret = -ENOENT; 794 key.type = BTRFS_EXTENT_REF_V0_KEY;
543 goto out; 795 btrfs_release_path(root, path);
796 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
797 if (ret < 0) {
798 err = ret;
799 goto fail;
800 }
801 if (!ret)
802 return 0;
803#endif
804 goto fail;
544 } 805 }
545 806
546 leaf = path->nodes[0]; 807 leaf = path->nodes[0];
547 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); 808 nritems = btrfs_header_nritems(leaf);
548 ref_objectid = btrfs_ref_objectid(leaf, ref); 809 while (1) {
549 if (btrfs_ref_root(leaf, ref) != ref_root || 810 if (path->slots[0] >= nritems) {
550 btrfs_ref_generation(leaf, ref) != ref_generation || 811 ret = btrfs_next_leaf(root, path);
551 (ref_objectid != owner_objectid && 812 if (ret < 0)
552 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { 813 err = ret;
553 ret = -EIO; 814 if (ret)
554 WARN_ON(1); 815 goto fail;
555 goto out; 816
817 leaf = path->nodes[0];
818 nritems = btrfs_header_nritems(leaf);
819 recow = 1;
820 }
821
822 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
823 if (key.objectid != bytenr ||
824 key.type != BTRFS_EXTENT_DATA_REF_KEY)
825 goto fail;
826
827 ref = btrfs_item_ptr(leaf, path->slots[0],
828 struct btrfs_extent_data_ref);
829
830 if (match_extent_data_ref(leaf, ref, root_objectid,
831 owner, offset)) {
832 if (recow) {
833 btrfs_release_path(root, path);
834 goto again;
835 }
836 err = 0;
837 break;
838 }
839 path->slots[0]++;
556 } 840 }
557 ret = 0; 841fail:
558out: 842 return err;
559 return ret;
560} 843}
561 844
562static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, 845static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
563 struct btrfs_root *root, 846 struct btrfs_root *root,
564 struct btrfs_path *path, 847 struct btrfs_path *path,
565 u64 bytenr, u64 parent, 848 u64 bytenr, u64 parent,
566 u64 ref_root, u64 ref_generation, 849 u64 root_objectid, u64 owner,
567 u64 owner_objectid, 850 u64 offset, int refs_to_add)
568 int refs_to_add)
569{ 851{
570 struct btrfs_key key; 852 struct btrfs_key key;
571 struct extent_buffer *leaf; 853 struct extent_buffer *leaf;
572 struct btrfs_extent_ref *ref; 854 u32 size;
573 u32 num_refs; 855 u32 num_refs;
574 int ret; 856 int ret;
575 857
576 key.objectid = bytenr; 858 key.objectid = bytenr;
577 key.type = BTRFS_EXTENT_REF_KEY; 859 if (parent) {
578 key.offset = parent; 860 key.type = BTRFS_SHARED_DATA_REF_KEY;
861 key.offset = parent;
862 size = sizeof(struct btrfs_shared_data_ref);
863 } else {
864 key.type = BTRFS_EXTENT_DATA_REF_KEY;
865 key.offset = hash_extent_data_ref(root_objectid,
866 owner, offset);
867 size = sizeof(struct btrfs_extent_data_ref);
868 }
579 869
580 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref)); 870 ret = btrfs_insert_empty_item(trans, root, path, &key, size);
581 if (ret == 0) { 871 if (ret && ret != -EEXIST)
582 leaf = path->nodes[0]; 872 goto fail;
583 ref = btrfs_item_ptr(leaf, path->slots[0], 873
584 struct btrfs_extent_ref); 874 leaf = path->nodes[0];
585 btrfs_set_ref_root(leaf, ref, ref_root); 875 if (parent) {
586 btrfs_set_ref_generation(leaf, ref, ref_generation); 876 struct btrfs_shared_data_ref *ref;
587 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
588 btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
589 } else if (ret == -EEXIST) {
590 u64 existing_owner;
591
592 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
593 leaf = path->nodes[0];
594 ref = btrfs_item_ptr(leaf, path->slots[0], 877 ref = btrfs_item_ptr(leaf, path->slots[0],
595 struct btrfs_extent_ref); 878 struct btrfs_shared_data_ref);
596 if (btrfs_ref_root(leaf, ref) != ref_root || 879 if (ret == 0) {
597 btrfs_ref_generation(leaf, ref) != ref_generation) { 880 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
598 ret = -EIO; 881 } else {
599 WARN_ON(1); 882 num_refs = btrfs_shared_data_ref_count(leaf, ref);
600 goto out; 883 num_refs += refs_to_add;
884 btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
601 } 885 }
886 } else {
887 struct btrfs_extent_data_ref *ref;
888 while (ret == -EEXIST) {
889 ref = btrfs_item_ptr(leaf, path->slots[0],
890 struct btrfs_extent_data_ref);
891 if (match_extent_data_ref(leaf, ref, root_objectid,
892 owner, offset))
893 break;
894 btrfs_release_path(root, path);
895 key.offset++;
896 ret = btrfs_insert_empty_item(trans, root, path, &key,
897 size);
898 if (ret && ret != -EEXIST)
899 goto fail;
602 900
603 num_refs = btrfs_ref_num_refs(leaf, ref); 901 leaf = path->nodes[0];
604 BUG_ON(num_refs == 0); 902 }
605 btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add); 903 ref = btrfs_item_ptr(leaf, path->slots[0],
606 904 struct btrfs_extent_data_ref);
607 existing_owner = btrfs_ref_objectid(leaf, ref); 905 if (ret == 0) {
608 if (existing_owner != owner_objectid && 906 btrfs_set_extent_data_ref_root(leaf, ref,
609 existing_owner != BTRFS_MULTIPLE_OBJECTIDS) { 907 root_objectid);
610 btrfs_set_ref_objectid(leaf, ref, 908 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
611 BTRFS_MULTIPLE_OBJECTIDS); 909 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
910 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
911 } else {
912 num_refs = btrfs_extent_data_ref_count(leaf, ref);
913 num_refs += refs_to_add;
914 btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
612 } 915 }
613 ret = 0;
614 } else {
615 goto out;
616 } 916 }
617 btrfs_unlock_up_safe(path, 1); 917 btrfs_mark_buffer_dirty(leaf);
618 btrfs_mark_buffer_dirty(path->nodes[0]); 918 ret = 0;
619out: 919fail:
620 btrfs_release_path(root, path); 920 btrfs_release_path(root, path);
621 return ret; 921 return ret;
622} 922}
623 923
624static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, 924static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
625 struct btrfs_root *root, 925 struct btrfs_root *root,
626 struct btrfs_path *path, 926 struct btrfs_path *path,
627 int refs_to_drop) 927 int refs_to_drop)
628{ 928{
929 struct btrfs_key key;
930 struct btrfs_extent_data_ref *ref1 = NULL;
931 struct btrfs_shared_data_ref *ref2 = NULL;
629 struct extent_buffer *leaf; 932 struct extent_buffer *leaf;
630 struct btrfs_extent_ref *ref; 933 u32 num_refs = 0;
631 u32 num_refs;
632 int ret = 0; 934 int ret = 0;
633 935
634 leaf = path->nodes[0]; 936 leaf = path->nodes[0];
635 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); 937 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
636 num_refs = btrfs_ref_num_refs(leaf, ref); 938
939 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
940 ref1 = btrfs_item_ptr(leaf, path->slots[0],
941 struct btrfs_extent_data_ref);
942 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
943 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
944 ref2 = btrfs_item_ptr(leaf, path->slots[0],
945 struct btrfs_shared_data_ref);
946 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
947#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
948 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
949 struct btrfs_extent_ref_v0 *ref0;
950 ref0 = btrfs_item_ptr(leaf, path->slots[0],
951 struct btrfs_extent_ref_v0);
952 num_refs = btrfs_ref_count_v0(leaf, ref0);
953#endif
954 } else {
955 BUG();
956 }
957
637 BUG_ON(num_refs < refs_to_drop); 958 BUG_ON(num_refs < refs_to_drop);
638 num_refs -= refs_to_drop; 959 num_refs -= refs_to_drop;
960
639 if (num_refs == 0) { 961 if (num_refs == 0) {
640 ret = btrfs_del_item(trans, root, path); 962 ret = btrfs_del_item(trans, root, path);
641 } else { 963 } else {
642 btrfs_set_ref_num_refs(leaf, ref, num_refs); 964 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
965 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
966 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
967 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
968#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
969 else {
970 struct btrfs_extent_ref_v0 *ref0;
971 ref0 = btrfs_item_ptr(leaf, path->slots[0],
972 struct btrfs_extent_ref_v0);
973 btrfs_set_ref_count_v0(leaf, ref0, num_refs);
974 }
975#endif
643 btrfs_mark_buffer_dirty(leaf); 976 btrfs_mark_buffer_dirty(leaf);
644 } 977 }
978 return ret;
979}
980
981static noinline u32 extent_data_ref_count(struct btrfs_root *root,
982 struct btrfs_path *path,
983 struct btrfs_extent_inline_ref *iref)
984{
985 struct btrfs_key key;
986 struct extent_buffer *leaf;
987 struct btrfs_extent_data_ref *ref1;
988 struct btrfs_shared_data_ref *ref2;
989 u32 num_refs = 0;
990
991 leaf = path->nodes[0];
992 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
993 if (iref) {
994 if (btrfs_extent_inline_ref_type(leaf, iref) ==
995 BTRFS_EXTENT_DATA_REF_KEY) {
996 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
997 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
998 } else {
999 ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1000 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1001 }
1002 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1003 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1004 struct btrfs_extent_data_ref);
1005 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1006 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1007 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1008 struct btrfs_shared_data_ref);
1009 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1010#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1011 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1012 struct btrfs_extent_ref_v0 *ref0;
1013 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1014 struct btrfs_extent_ref_v0);
1015 num_refs = btrfs_ref_count_v0(leaf, ref0);
1016#endif
1017 } else {
1018 WARN_ON(1);
1019 }
1020 return num_refs;
1021}
1022
1023static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1024 struct btrfs_root *root,
1025 struct btrfs_path *path,
1026 u64 bytenr, u64 parent,
1027 u64 root_objectid)
1028{
1029 struct btrfs_key key;
1030 int ret;
1031
1032 key.objectid = bytenr;
1033 if (parent) {
1034 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1035 key.offset = parent;
1036 } else {
1037 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1038 key.offset = root_objectid;
1039 }
1040
1041 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1042 if (ret > 0)
1043 ret = -ENOENT;
1044#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1045 if (ret == -ENOENT && parent) {
1046 btrfs_release_path(root, path);
1047 key.type = BTRFS_EXTENT_REF_V0_KEY;
1048 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1049 if (ret > 0)
1050 ret = -ENOENT;
1051 }
1052#endif
1053 return ret;
1054}
1055
1056static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1057 struct btrfs_root *root,
1058 struct btrfs_path *path,
1059 u64 bytenr, u64 parent,
1060 u64 root_objectid)
1061{
1062 struct btrfs_key key;
1063 int ret;
1064
1065 key.objectid = bytenr;
1066 if (parent) {
1067 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1068 key.offset = parent;
1069 } else {
1070 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1071 key.offset = root_objectid;
1072 }
1073
1074 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
645 btrfs_release_path(root, path); 1075 btrfs_release_path(root, path);
646 return ret; 1076 return ret;
647} 1077}
648 1078
1079static inline int extent_ref_type(u64 parent, u64 owner)
1080{
1081 int type;
1082 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1083 if (parent > 0)
1084 type = BTRFS_SHARED_BLOCK_REF_KEY;
1085 else
1086 type = BTRFS_TREE_BLOCK_REF_KEY;
1087 } else {
1088 if (parent > 0)
1089 type = BTRFS_SHARED_DATA_REF_KEY;
1090 else
1091 type = BTRFS_EXTENT_DATA_REF_KEY;
1092 }
1093 return type;
1094}
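Spelled out, extent_ref_type() is this decision table (owner below BTRFS_FIRST_FREE_OBJECTID means a tree block, since owner then carries the block's level):

	owner < FIRST_FREE	parent != 0	resulting key type
	yes			yes		BTRFS_SHARED_BLOCK_REF_KEY
	yes			no		BTRFS_TREE_BLOCK_REF_KEY
	no			yes		BTRFS_SHARED_DATA_REF_KEY
	no			no		BTRFS_EXTENT_DATA_REF_KEY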
1095
1096static int find_next_key(struct btrfs_path *path, int level,
1097 struct btrfs_key *key)
1098
1099{
1100 for (; level < BTRFS_MAX_LEVEL; level++) {
1101 if (!path->nodes[level])
1102 break;
1103 if (path->slots[level] + 1 >=
1104 btrfs_header_nritems(path->nodes[level]))
1105 continue;
1106 if (level == 0)
1107 btrfs_item_key_to_cpu(path->nodes[level], key,
1108 path->slots[level] + 1);
1109 else
1110 btrfs_node_key_to_cpu(path->nodes[level], key,
1111 path->slots[level] + 1);
1112 return 0;
1113 }
1114 return 1;
1115}
1116
1117/*
1118 * Look for an inline back ref. If the back ref is found, *ref_ret is set
1119 * to the address of the inline back ref, and 0 is returned.
1120 *
1121 * If the back ref isn't found, *ref_ret is set to the address where it
1122 * should be inserted, and -ENOENT is returned.
1123 *
1124 * If insert is true and there are too many inline back refs, the path
1125 * points to the extent item, and -EAGAIN is returned.
1126 *
1127 * NOTE: inline back refs are ordered in the same way that back ref
1128 * items in the tree are ordered.
1129 */
1130static noinline_for_stack
1131int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1132 struct btrfs_root *root,
1133 struct btrfs_path *path,
1134 struct btrfs_extent_inline_ref **ref_ret,
1135 u64 bytenr, u64 num_bytes,
1136 u64 parent, u64 root_objectid,
1137 u64 owner, u64 offset, int insert)
1138{
1139 struct btrfs_key key;
1140 struct extent_buffer *leaf;
1141 struct btrfs_extent_item *ei;
1142 struct btrfs_extent_inline_ref *iref;
1143 u64 flags;
1144 u64 item_size;
1145 unsigned long ptr;
1146 unsigned long end;
1147 int extra_size;
1148 int type;
1149 int want;
1150 int ret;
1151 int err = 0;
1152
1153 key.objectid = bytenr;
1154 key.type = BTRFS_EXTENT_ITEM_KEY;
1155 key.offset = num_bytes;
1156
1157 want = extent_ref_type(parent, owner);
1158 if (insert) {
1159 extra_size = btrfs_extent_inline_ref_size(want);
1160 path->keep_locks = 1;
1161 } else
1162 extra_size = -1;
1163 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1164 if (ret < 0) {
1165 err = ret;
1166 goto out;
1167 }
1168 BUG_ON(ret);
1169
1170 leaf = path->nodes[0];
1171 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1172#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1173 if (item_size < sizeof(*ei)) {
1174 if (!insert) {
1175 err = -ENOENT;
1176 goto out;
1177 }
1178 ret = convert_extent_item_v0(trans, root, path, owner,
1179 extra_size);
1180 if (ret < 0) {
1181 err = ret;
1182 goto out;
1183 }
1184 leaf = path->nodes[0];
1185 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1186 }
1187#endif
1188 BUG_ON(item_size < sizeof(*ei));
1189
1190 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1191 flags = btrfs_extent_flags(leaf, ei);
1192
1193 ptr = (unsigned long)(ei + 1);
1194 end = (unsigned long)ei + item_size;
1195
1196 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1197 ptr += sizeof(struct btrfs_tree_block_info);
1198 BUG_ON(ptr > end);
1199 } else {
1200 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1201 }
1202
1203 err = -ENOENT;
1204 while (1) {
1205 if (ptr >= end) {
1206 WARN_ON(ptr > end);
1207 break;
1208 }
1209 iref = (struct btrfs_extent_inline_ref *)ptr;
1210 type = btrfs_extent_inline_ref_type(leaf, iref);
1211 if (want < type)
1212 break;
1213 if (want > type) {
1214 ptr += btrfs_extent_inline_ref_size(type);
1215 continue;
1216 }
1217
1218 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1219 struct btrfs_extent_data_ref *dref;
1220 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1221 if (match_extent_data_ref(leaf, dref, root_objectid,
1222 owner, offset)) {
1223 err = 0;
1224 break;
1225 }
1226 if (hash_extent_data_ref_item(leaf, dref) <
1227 hash_extent_data_ref(root_objectid, owner, offset))
1228 break;
1229 } else {
1230 u64 ref_offset;
1231 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1232 if (parent > 0) {
1233 if (parent == ref_offset) {
1234 err = 0;
1235 break;
1236 }
1237 if (ref_offset < parent)
1238 break;
1239 } else {
1240 if (root_objectid == ref_offset) {
1241 err = 0;
1242 break;
1243 }
1244 if (ref_offset < root_objectid)
1245 break;
1246 }
1247 }
1248 ptr += btrfs_extent_inline_ref_size(type);
1249 }
1250 if (err == -ENOENT && insert) {
1251 if (item_size + extra_size >=
1252 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1253 err = -EAGAIN;
1254 goto out;
1255 }
1256 /*
1257 * To add a new inline back ref, we have to make sure
1258 * there is no corresponding back ref item.
1259 * For simplicity, we just do not add a new inline back
1260 * ref if there is any kind of item for this block.
1261 */
1262 if (find_next_key(path, 0, &key) == 0 &&
1263 key.objectid == bytenr &&
1264 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1265 err = -EAGAIN;
1266 goto out;
1267 }
1268 }
1269 *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1270out:
1271 if (insert) {
1272 path->keep_locks = 0;
1273 btrfs_unlock_up_safe(path, 1);
1274 }
1275 return err;
1276}
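A hedged sketch of how callers are expected to dispatch on the three return values documented above (compare insert_inline_extent_backref below, which handles 0 and -ENOENT, while its callers treat -EAGAIN as "fall back to a separate keyed back-ref item"):

ret = lookup_inline_extent_backref(trans, root, path, &iref, bytenr,
				   num_bytes, parent, root_objectid,
				   owner, offset, 1);
if (ret == 0) {
	/* found: bump the refcount inside the existing inline ref */
	ret = update_inline_extent_backref(trans, root, path, iref,
					   refs_to_add, extent_op);
} else if (ret == -ENOENT) {
	/* not found: iref points at the insertion position */
	ret = setup_inline_extent_backref(trans, root, path, iref,
					  parent, root_objectid, owner,
					  offset, refs_to_add, extent_op);
} else if (ret == -EAGAIN) {
	/* extent item full: insert a standalone back-ref item instead */
}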
1277
1278/*
1279 * helper to add new inline back ref
1280 */
1281static noinline_for_stack
1282int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1283 struct btrfs_root *root,
1284 struct btrfs_path *path,
1285 struct btrfs_extent_inline_ref *iref,
1286 u64 parent, u64 root_objectid,
1287 u64 owner, u64 offset, int refs_to_add,
1288 struct btrfs_delayed_extent_op *extent_op)
1289{
1290 struct extent_buffer *leaf;
1291 struct btrfs_extent_item *ei;
1292 unsigned long ptr;
1293 unsigned long end;
1294 unsigned long item_offset;
1295 u64 refs;
1296 int size;
1297 int type;
1298 int ret;
1299
1300 leaf = path->nodes[0];
1301 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1302 item_offset = (unsigned long)iref - (unsigned long)ei;
1303
1304 type = extent_ref_type(parent, owner);
1305 size = btrfs_extent_inline_ref_size(type);
1306
1307 ret = btrfs_extend_item(trans, root, path, size);
1308 BUG_ON(ret);
1309
1310 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1311 refs = btrfs_extent_refs(leaf, ei);
1312 refs += refs_to_add;
1313 btrfs_set_extent_refs(leaf, ei, refs);
1314 if (extent_op)
1315 __run_delayed_extent_op(extent_op, leaf, ei);
1316
1317 ptr = (unsigned long)ei + item_offset;
1318 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1319 if (ptr < end - size)
1320 memmove_extent_buffer(leaf, ptr + size, ptr,
1321 end - size - ptr);
1322
1323 iref = (struct btrfs_extent_inline_ref *)ptr;
1324 btrfs_set_extent_inline_ref_type(leaf, iref, type);
1325 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1326 struct btrfs_extent_data_ref *dref;
1327 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1328 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1329 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1330 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1331 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1332 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1333 struct btrfs_shared_data_ref *sref;
1334 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1335 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1336 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1337 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1338 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1339 } else {
1340 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1341 }
1342 btrfs_mark_buffer_dirty(leaf);
1343 return 0;
1344}
1345
1346static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1347 struct btrfs_root *root,
1348 struct btrfs_path *path,
1349 struct btrfs_extent_inline_ref **ref_ret,
1350 u64 bytenr, u64 num_bytes, u64 parent,
1351 u64 root_objectid, u64 owner, u64 offset)
1352{
1353 int ret;
1354
1355 ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1356 bytenr, num_bytes, parent,
1357 root_objectid, owner, offset, 0);
1358 if (ret != -ENOENT)
1359 return ret;
1360
1361 btrfs_release_path(root, path);
1362 *ref_ret = NULL;
1363
1364 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1365 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1366 root_objectid);
1367 } else {
1368 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1369 root_objectid, owner, offset);
1370 }
1371 return ret;
1372}
1373
1374/*
1375 * helper to update/remove inline back ref
1376 */
1377static noinline_for_stack
1378int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1379 struct btrfs_root *root,
1380 struct btrfs_path *path,
1381 struct btrfs_extent_inline_ref *iref,
1382 int refs_to_mod,
1383 struct btrfs_delayed_extent_op *extent_op)
1384{
1385 struct extent_buffer *leaf;
1386 struct btrfs_extent_item *ei;
1387 struct btrfs_extent_data_ref *dref = NULL;
1388 struct btrfs_shared_data_ref *sref = NULL;
1389 unsigned long ptr;
1390 unsigned long end;
1391 u32 item_size;
1392 int size;
1393 int type;
1394 int ret;
1395 u64 refs;
1396
1397 leaf = path->nodes[0];
1398 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1399 refs = btrfs_extent_refs(leaf, ei);
1400 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1401 refs += refs_to_mod;
1402 btrfs_set_extent_refs(leaf, ei, refs);
1403 if (extent_op)
1404 __run_delayed_extent_op(extent_op, leaf, ei);
1405
1406 type = btrfs_extent_inline_ref_type(leaf, iref);
1407
1408 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1409 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1410 refs = btrfs_extent_data_ref_count(leaf, dref);
1411 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1412 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1413 refs = btrfs_shared_data_ref_count(leaf, sref);
1414 } else {
1415 refs = 1;
1416 BUG_ON(refs_to_mod != -1);
1417 }
1418
1419 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1420 refs += refs_to_mod;
1421
1422 if (refs > 0) {
1423 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1424 btrfs_set_extent_data_ref_count(leaf, dref, refs);
1425 else
1426 btrfs_set_shared_data_ref_count(leaf, sref, refs);
1427 } else {
1428 size = btrfs_extent_inline_ref_size(type);
1429 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1430 ptr = (unsigned long)iref;
1431 end = (unsigned long)ei + item_size;
1432 if (ptr + size < end)
1433 memmove_extent_buffer(leaf, ptr, ptr + size,
1434 end - ptr - size);
1435 item_size -= size;
1436 ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1437 BUG_ON(ret);
1438 }
1439 btrfs_mark_buffer_dirty(leaf);
1440 return 0;
1441}
1442
1443static noinline_for_stack
1444int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1445 struct btrfs_root *root,
1446 struct btrfs_path *path,
1447 u64 bytenr, u64 num_bytes, u64 parent,
1448 u64 root_objectid, u64 owner,
1449 u64 offset, int refs_to_add,
1450 struct btrfs_delayed_extent_op *extent_op)
1451{
1452 struct btrfs_extent_inline_ref *iref;
1453 int ret;
1454
1455 ret = lookup_inline_extent_backref(trans, root, path, &iref,
1456 bytenr, num_bytes, parent,
1457 root_objectid, owner, offset, 1);
1458 if (ret == 0) {
1459 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1460 ret = update_inline_extent_backref(trans, root, path, iref,
1461 refs_to_add, extent_op);
1462 } else if (ret == -ENOENT) {
1463 ret = setup_inline_extent_backref(trans, root, path, iref,
1464 parent, root_objectid,
1465 owner, offset, refs_to_add,
1466 extent_op);
1467 }
1468 return ret;
1469}
1470
1471static int insert_extent_backref(struct btrfs_trans_handle *trans,
1472 struct btrfs_root *root,
1473 struct btrfs_path *path,
1474 u64 bytenr, u64 parent, u64 root_objectid,
1475 u64 owner, u64 offset, int refs_to_add)
1476{
1477 int ret;
1478 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1479 BUG_ON(refs_to_add != 1);
1480 ret = insert_tree_block_ref(trans, root, path, bytenr,
1481 parent, root_objectid);
1482 } else {
1483 ret = insert_extent_data_ref(trans, root, path, bytenr,
1484 parent, root_objectid,
1485 owner, offset, refs_to_add);
1486 }
1487 return ret;
1488}
1489
1490static int remove_extent_backref(struct btrfs_trans_handle *trans,
1491 struct btrfs_root *root,
1492 struct btrfs_path *path,
1493 struct btrfs_extent_inline_ref *iref,
1494 int refs_to_drop, int is_data)
1495{
1496 int ret;
1497
1498 BUG_ON(!is_data && refs_to_drop != 1);
1499 if (iref) {
1500 ret = update_inline_extent_backref(trans, root, path, iref,
1501 -refs_to_drop, NULL);
1502 } else if (is_data) {
1503 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1504 } else {
1505 ret = btrfs_del_item(trans, root, path);
1506 }
1507 return ret;
1508}
1509
649#ifdef BIO_RW_DISCARD 1510#ifdef BIO_RW_DISCARD
650static void btrfs_issue_discard(struct block_device *bdev, 1511static void btrfs_issue_discard(struct block_device *bdev,
651 u64 start, u64 len) 1512 u64 start, u64 len)
@@ -686,71 +1547,40 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
686#endif 1547#endif
687} 1548}
688 1549
689static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 1550int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
690 struct btrfs_root *root, u64 bytenr, 1551 struct btrfs_root *root,
691 u64 num_bytes, 1552 u64 bytenr, u64 num_bytes, u64 parent,
692 u64 orig_parent, u64 parent, 1553 u64 root_objectid, u64 owner, u64 offset)
693 u64 orig_root, u64 ref_root,
694 u64 orig_generation, u64 ref_generation,
695 u64 owner_objectid)
696{ 1554{
697 int ret; 1555 int ret;
698 int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID; 1556 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1557 root_objectid == BTRFS_TREE_LOG_OBJECTID);
699 1558
700 ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes, 1559 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
701 orig_parent, parent, orig_root, 1560 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
702 ref_root, orig_generation, 1561 parent, root_objectid, (int)owner,
703 ref_generation, owner_objectid, pin); 1562 BTRFS_ADD_DELAYED_REF, NULL);
704 BUG_ON(ret); 1563 } else {
1564 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1565 parent, root_objectid, owner, offset,
1566 BTRFS_ADD_DELAYED_REF, NULL);
1567 }
705 return ret; 1568 return ret;
706} 1569}
707 1570
708int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
709 struct btrfs_root *root, u64 bytenr,
710 u64 num_bytes, u64 orig_parent, u64 parent,
711 u64 ref_root, u64 ref_generation,
712 u64 owner_objectid)
713{
714 int ret;
715 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
716 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
717 return 0;
718
719 ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
720 orig_parent, parent, ref_root,
721 ref_root, ref_generation,
722 ref_generation, owner_objectid);
723 return ret;
724}
725static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1571static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
726 struct btrfs_root *root, u64 bytenr, 1572 struct btrfs_root *root,
727 u64 num_bytes, 1573 u64 bytenr, u64 num_bytes,
728 u64 orig_parent, u64 parent, 1574 u64 parent, u64 root_objectid,
729 u64 orig_root, u64 ref_root, 1575 u64 owner, u64 offset, int refs_to_add,
730 u64 orig_generation, u64 ref_generation, 1576 struct btrfs_delayed_extent_op *extent_op)
731 u64 owner_objectid)
732{
733 int ret;
734
735 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
736 ref_generation, owner_objectid,
737 BTRFS_ADD_DELAYED_REF, 0);
738 BUG_ON(ret);
739 return ret;
740}
741
742static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
743 struct btrfs_root *root, u64 bytenr,
744 u64 num_bytes, u64 parent, u64 ref_root,
745 u64 ref_generation, u64 owner_objectid,
746 int refs_to_add)
747{ 1577{
748 struct btrfs_path *path; 1578 struct btrfs_path *path;
749 int ret; 1579 struct extent_buffer *leaf;
750 struct btrfs_key key;
751 struct extent_buffer *l;
752 struct btrfs_extent_item *item; 1580 struct btrfs_extent_item *item;
753 u32 refs; 1581 u64 refs;
1582 int ret;
1583 int err = 0;
754 1584
755 path = btrfs_alloc_path(); 1585 path = btrfs_alloc_path();
756 if (!path) 1586 if (!path)
@@ -758,43 +1588,27 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
758 1588
759 path->reada = 1; 1589 path->reada = 1;
760 path->leave_spinning = 1; 1590 path->leave_spinning = 1;
761 key.objectid = bytenr; 1591 /* this will set up the path even if it fails to insert the back ref */
762 key.type = BTRFS_EXTENT_ITEM_KEY; 1592 ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
763 key.offset = num_bytes; 1593 path, bytenr, num_bytes, parent,
764 1594 root_objectid, owner, offset,
765 /* first find the extent item and update its reference count */ 1595 refs_to_add, extent_op);
766 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 1596 if (ret == 0)
767 path, 0, 1); 1597 goto out;
768 if (ret < 0) {
769 btrfs_set_path_blocking(path);
770 return ret;
771 }
772
773 if (ret > 0) {
774 WARN_ON(1);
775 btrfs_free_path(path);
776 return -EIO;
777 }
778 l = path->nodes[0];
779 1598
780 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 1599 if (ret != -EAGAIN) {
781 if (key.objectid != bytenr) { 1600 err = ret;
782 btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]); 1601 goto out;
783 printk(KERN_ERR "btrfs wanted %llu found %llu\n",
784 (unsigned long long)bytenr,
785 (unsigned long long)key.objectid);
786 BUG();
787 } 1602 }
788 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
789
790 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
791 1603
792 refs = btrfs_extent_refs(l, item); 1604 leaf = path->nodes[0];
793 btrfs_set_extent_refs(l, item, refs + refs_to_add); 1605 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
794 btrfs_unlock_up_safe(path, 1); 1606 refs = btrfs_extent_refs(leaf, item);
795 1607 btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
796 btrfs_mark_buffer_dirty(path->nodes[0]); 1608 if (extent_op)
1609 __run_delayed_extent_op(extent_op, leaf, item);
797 1610
1611 btrfs_mark_buffer_dirty(leaf);
798 btrfs_release_path(root->fs_info->extent_root, path); 1612 btrfs_release_path(root->fs_info->extent_root, path);
799 1613
800 path->reada = 1; 1614 path->reada = 1;
@@ -802,56 +1616,197 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
802 1616
803 /* now insert the actual backref */ 1617 /* now insert the actual backref */
804 ret = insert_extent_backref(trans, root->fs_info->extent_root, 1618 ret = insert_extent_backref(trans, root->fs_info->extent_root,
805 path, bytenr, parent, 1619 path, bytenr, parent, root_objectid,
806 ref_root, ref_generation, 1620 owner, offset, refs_to_add);
807 owner_objectid, refs_to_add);
808 BUG_ON(ret); 1621 BUG_ON(ret);
1622out:
809 btrfs_free_path(path); 1623 btrfs_free_path(path);
810 return 0; 1624 return err;
811} 1625}
812 1626
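The new __btrfs_inc_extent_ref() above leans on a contract with insert_inline_extent_backref(): 0 means the reference was folded into the extent item and the job is done; -EAGAIN means it did not fit inline but the path is now positioned on the extent item, so the caller bumps the refcount there and then inserts a standalone backref item. A sketch of that try-inline-then-fallback shape, with a hypothetical try_insert_inline() in place of the real helper:

    #include <errno.h>
    #include <stdio.h>

    /* 0: the ref fit inline; -EAGAIN: caller must bump the refcount and
     * add a standalone backref item (models insert_inline_extent_backref) */
    static int try_insert_inline(int room_left)
    {
            return room_left > 0 ? 0 : -EAGAIN;
    }

    static int add_ref(int room_left, unsigned *refs)
    {
            int ret = try_insert_inline(room_left);
            if (ret == 0)
                    return 0;       /* handled entirely inline */
            if (ret != -EAGAIN)
                    return ret;     /* hard error */
            *refs += 1;             /* bump the count on the extent item... */
            printf("insert standalone backref, refs now %u\n", *refs);
            return 0;               /* ...then insert the separate item */
    }

    int main(void)
    {
            unsigned refs = 1;
            return add_ref(0, &refs);   /* no inline room: fallback path */
    }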
813int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1627static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
814 struct btrfs_root *root, 1628 struct btrfs_root *root,
815 u64 bytenr, u64 num_bytes, u64 parent, 1629 struct btrfs_delayed_ref_node *node,
816 u64 ref_root, u64 ref_generation, 1630 struct btrfs_delayed_extent_op *extent_op,
817 u64 owner_objectid) 1631 int insert_reserved)
818{ 1632{
819 int ret; 1633 int ret = 0;
820 if (ref_root == BTRFS_TREE_LOG_OBJECTID && 1634 struct btrfs_delayed_data_ref *ref;
821 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 1635 struct btrfs_key ins;
822 return 0; 1636 u64 parent = 0;
1637 u64 ref_root = 0;
1638 u64 flags = 0;
823 1639
824 ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent, 1640 ins.objectid = node->bytenr;
825 0, ref_root, 0, ref_generation, 1641 ins.offset = node->num_bytes;
826 owner_objectid); 1642 ins.type = BTRFS_EXTENT_ITEM_KEY;
1643
1644 ref = btrfs_delayed_node_to_data_ref(node);
1645 if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1646 parent = ref->parent;
1647 else
1648 ref_root = ref->root;
1649
1650 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1651 if (extent_op) {
1652 BUG_ON(extent_op->update_key);
1653 flags |= extent_op->flags_to_set;
1654 }
1655 ret = alloc_reserved_file_extent(trans, root,
1656 parent, ref_root, flags,
1657 ref->objectid, ref->offset,
1658 &ins, node->ref_mod);
1659 update_reserved_extents(root, ins.objectid, ins.offset, 0);
1660 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1661 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1662 node->num_bytes, parent,
1663 ref_root, ref->objectid,
1664 ref->offset, node->ref_mod,
1665 extent_op);
1666 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1667 ret = __btrfs_free_extent(trans, root, node->bytenr,
1668 node->num_bytes, parent,
1669 ref_root, ref->objectid,
1670 ref->offset, node->ref_mod,
1671 extent_op);
1672 } else {
1673 BUG();
1674 }
827 return ret; 1675 return ret;
828} 1676}
829 1677
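run_delayed_data_ref() above reduces to a three-way decision table: a first-time insertion of a reserved extent, a plain reference bump, or a reference drop. The sketch below mirrors just that table; the handlers are print placeholders for alloc_reserved_file_extent(), __btrfs_inc_extent_ref() and __btrfs_free_extent():

    #include <stdio.h>

    enum action { ADD_DELAYED_REF, DROP_DELAYED_REF };

    static void run_data_ref(enum action a, int insert_reserved)
    {
            if (a == ADD_DELAYED_REF && insert_reserved)
                    puts("alloc_reserved_file_extent: create the extent item");
            else if (a == ADD_DELAYED_REF)
                    puts("__btrfs_inc_extent_ref: add one more reference");
            else if (a == DROP_DELAYED_REF)
                    puts("__btrfs_free_extent: drop one reference");
            else
                    puts("BUG: unknown action");
    }

    int main(void)
    {
            run_data_ref(ADD_DELAYED_REF, 1);   /* newly reserved extent */
            run_data_ref(ADD_DELAYED_REF, 0);   /* existing extent */
            run_data_ref(DROP_DELAYED_REF, 0);
            return 0;
    }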
830static int drop_delayed_ref(struct btrfs_trans_handle *trans, 1678static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
831 struct btrfs_root *root, 1679 struct extent_buffer *leaf,
832 struct btrfs_delayed_ref_node *node) 1680 struct btrfs_extent_item *ei)
1681{
1682 u64 flags = btrfs_extent_flags(leaf, ei);
1683 if (extent_op->update_flags) {
1684 flags |= extent_op->flags_to_set;
1685 btrfs_set_extent_flags(leaf, ei, flags);
1686 }
1687
1688 if (extent_op->update_key) {
1689 struct btrfs_tree_block_info *bi;
1690 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1691 bi = (struct btrfs_tree_block_info *)(ei + 1);
1692 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1693 }
1694}
1695
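__run_delayed_extent_op() applies at most two independent edits carried by the op: OR extra flags into the extent item, and/or rewrite the key stored in the tree-block info that follows it. A compact model of that record and its additive flag semantics (field names follow btrfs_delayed_extent_op, but the struct here is a simplification):

    #include <stdint.h>
    #include <stdio.h>

    struct extent_op {                 /* models btrfs_delayed_extent_op */
            uint64_t flags_to_set;
            unsigned update_flags:1;
            unsigned update_key:1;
    };

    static void run_op(const struct extent_op *op, uint64_t *flags)
    {
            if (op->update_flags)
                    *flags |= op->flags_to_set;  /* additive, never clears */
            if (op->update_key)
                    puts("rewrite the tree_block_info key");
    }

    int main(void)
    {
            uint64_t flags = 0x1;
            struct extent_op op = { .flags_to_set = 0x2, .update_flags = 1 };
            run_op(&op, &flags);
            printf("flags=0x%llx\n", (unsigned long long)flags);  /* 0x3 */
            return 0;
    }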
1696static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1697 struct btrfs_root *root,
1698 struct btrfs_delayed_ref_node *node,
1699 struct btrfs_delayed_extent_op *extent_op)
1700{
1701 struct btrfs_key key;
1702 struct btrfs_path *path;
1703 struct btrfs_extent_item *ei;
1704 struct extent_buffer *leaf;
1705 u32 item_size;
1706 int ret;
1707 int err = 0;
1708
1709 path = btrfs_alloc_path();
1710 if (!path)
1711 return -ENOMEM;
1712
1713 key.objectid = node->bytenr;
1714 key.type = BTRFS_EXTENT_ITEM_KEY;
1715 key.offset = node->num_bytes;
1716
1717 path->reada = 1;
1718 path->leave_spinning = 1;
1719 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1720 path, 0, 1);
1721 if (ret < 0) {
1722 err = ret;
1723 goto out;
1724 }
1725 if (ret > 0) {
1726 err = -EIO;
1727 goto out;
1728 }
1729
1730 leaf = path->nodes[0];
1731 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1732#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1733 if (item_size < sizeof(*ei)) {
1734 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
1735 path, (u64)-1, 0);
1736 if (ret < 0) {
1737 err = ret;
1738 goto out;
1739 }
1740 leaf = path->nodes[0];
1741 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1742 }
1743#endif
1744 BUG_ON(item_size < sizeof(*ei));
1745 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1746 __run_delayed_extent_op(extent_op, leaf, ei);
1747
1748 btrfs_mark_buffer_dirty(leaf);
1749out:
1750 btrfs_free_path(path);
1751 return err;
1752}
1753
1754static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1755 struct btrfs_root *root,
1756 struct btrfs_delayed_ref_node *node,
1757 struct btrfs_delayed_extent_op *extent_op,
1758 int insert_reserved)
833{ 1759{
834 int ret = 0; 1760 int ret = 0;
835 struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node); 1761 struct btrfs_delayed_tree_ref *ref;
1762 struct btrfs_key ins;
1763 u64 parent = 0;
1764 u64 ref_root = 0;
836 1765
837 BUG_ON(node->ref_mod == 0); 1766 ins.objectid = node->bytenr;
838 ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, 1767 ins.offset = node->num_bytes;
839 node->parent, ref->root, ref->generation, 1768 ins.type = BTRFS_EXTENT_ITEM_KEY;
840 ref->owner_objectid, ref->pin, node->ref_mod);
841 1769
1770 ref = btrfs_delayed_node_to_tree_ref(node);
1771 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1772 parent = ref->parent;
1773 else
1774 ref_root = ref->root;
1775
1776 BUG_ON(node->ref_mod != 1);
1777 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1778 BUG_ON(!extent_op || !extent_op->update_flags ||
1779 !extent_op->update_key);
1780 ret = alloc_reserved_tree_block(trans, root,
1781 parent, ref_root,
1782 extent_op->flags_to_set,
1783 &extent_op->key,
1784 ref->level, &ins);
1785 update_reserved_extents(root, ins.objectid, ins.offset, 0);
1786 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1787 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1788 node->num_bytes, parent, ref_root,
1789 ref->level, 0, 1, extent_op);
1790 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1791 ret = __btrfs_free_extent(trans, root, node->bytenr,
1792 node->num_bytes, parent, ref_root,
1793 ref->level, 0, 1, extent_op);
1794 } else {
1795 BUG();
1796 }
842 return ret; 1797 return ret;
843} 1798}
844 1799
1800
845/* helper function to actually process a single delayed ref entry */ 1801/* helper function to actually process a single delayed ref entry */
846static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1802static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
847 struct btrfs_root *root, 1803 struct btrfs_root *root,
848 struct btrfs_delayed_ref_node *node, 1804 struct btrfs_delayed_ref_node *node,
849 int insert_reserved) 1805 struct btrfs_delayed_extent_op *extent_op,
1806 int insert_reserved)
850{ 1807{
851 int ret; 1808 int ret;
852 struct btrfs_delayed_ref *ref; 1809 if (btrfs_delayed_ref_is_head(node)) {
853
854 if (node->parent == (u64)-1) {
855 struct btrfs_delayed_ref_head *head; 1810 struct btrfs_delayed_ref_head *head;
856 /* 1811 /*
857 * we've hit the end of the chain and we were supposed 1812 * we've hit the end of the chain and we were supposed
@@ -859,44 +1814,35 @@ static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
859 * deleted before we ever needed to insert it, so all 1814 * deleted before we ever needed to insert it, so all
860 * we have to do is clean up the accounting 1815 * we have to do is clean up the accounting
861 */ 1816 */
1817 BUG_ON(extent_op);
1818 head = btrfs_delayed_node_to_head(node);
862 if (insert_reserved) { 1819 if (insert_reserved) {
1820 if (head->is_data) {
1821 ret = btrfs_del_csums(trans, root,
1822 node->bytenr,
1823 node->num_bytes);
1824 BUG_ON(ret);
1825 }
1826 btrfs_update_pinned_extents(root, node->bytenr,
1827 node->num_bytes, 1);
863 update_reserved_extents(root, node->bytenr, 1828 update_reserved_extents(root, node->bytenr,
864 node->num_bytes, 0); 1829 node->num_bytes, 0);
865 } 1830 }
866 head = btrfs_delayed_node_to_head(node);
867 mutex_unlock(&head->mutex); 1831 mutex_unlock(&head->mutex);
868 return 0; 1832 return 0;
869 } 1833 }
870 1834
871 ref = btrfs_delayed_node_to_ref(node); 1835 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
872 if (ref->action == BTRFS_ADD_DELAYED_REF) { 1836 node->type == BTRFS_SHARED_BLOCK_REF_KEY)
873 if (insert_reserved) { 1837 ret = run_delayed_tree_ref(trans, root, node, extent_op,
874 struct btrfs_key ins; 1838 insert_reserved);
875 1839 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
876 ins.objectid = node->bytenr; 1840 node->type == BTRFS_SHARED_DATA_REF_KEY)
877 ins.offset = node->num_bytes; 1841 ret = run_delayed_data_ref(trans, root, node, extent_op,
878 ins.type = BTRFS_EXTENT_ITEM_KEY; 1842 insert_reserved);
879 1843 else
880 /* record the full extent allocation */ 1844 BUG();
881 ret = __btrfs_alloc_reserved_extent(trans, root, 1845 return ret;
882 node->parent, ref->root,
883 ref->generation, ref->owner_objectid,
884 &ins, node->ref_mod);
885 update_reserved_extents(root, node->bytenr,
886 node->num_bytes, 0);
887 } else {
888 /* just add one backref */
889 ret = add_extent_ref(trans, root, node->bytenr,
890 node->num_bytes,
891 node->parent, ref->root, ref->generation,
892 ref->owner_objectid, node->ref_mod);
893 }
894 BUG_ON(ret);
895 } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
896 WARN_ON(insert_reserved);
897 ret = drop_delayed_ref(trans, root, node);
898 }
899 return 0;
900} 1846}
901 1847
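After this rewrite, run_one_delayed_ref() has exactly three exits: ref heads (end-of-chain accounting cleanup), tree-block refs, and data refs, selected by the node's key type. Schematically, assuming the four key types partition into metadata and data pairs as the kernel constants do:

    #include <stdio.h>

    enum key_type {
            TREE_BLOCK_REF, SHARED_BLOCK_REF,   /* metadata refs */
            EXTENT_DATA_REF, SHARED_DATA_REF,   /* data refs */
    };

    static void run_one(int is_head, enum key_type t)
    {
            if (is_head) {
                    puts("head: clean up accounting, unlock, done");
                    return;
            }
            if (t == TREE_BLOCK_REF || t == SHARED_BLOCK_REF)
                    puts("run_delayed_tree_ref");
            else if (t == EXTENT_DATA_REF || t == SHARED_DATA_REF)
                    puts("run_delayed_data_ref");
            else
                    puts("BUG");
    }

    int main(void)
    {
            run_one(1, TREE_BLOCK_REF);
            run_one(0, EXTENT_DATA_REF);
            return 0;
    }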
902static noinline struct btrfs_delayed_ref_node * 1848static noinline struct btrfs_delayed_ref_node *
@@ -919,7 +1865,7 @@ again:
919 rb_node); 1865 rb_node);
920 if (ref->bytenr != head->node.bytenr) 1866 if (ref->bytenr != head->node.bytenr)
921 break; 1867 break;
922 if (btrfs_delayed_node_to_ref(ref)->action == action) 1868 if (ref->action == action)
923 return ref; 1869 return ref;
924 node = rb_prev(node); 1870 node = rb_prev(node);
925 } 1871 }
@@ -937,6 +1883,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
937 struct btrfs_delayed_ref_root *delayed_refs; 1883 struct btrfs_delayed_ref_root *delayed_refs;
938 struct btrfs_delayed_ref_node *ref; 1884 struct btrfs_delayed_ref_node *ref;
939 struct btrfs_delayed_ref_head *locked_ref = NULL; 1885 struct btrfs_delayed_ref_head *locked_ref = NULL;
1886 struct btrfs_delayed_extent_op *extent_op;
940 int ret; 1887 int ret;
941 int count = 0; 1888 int count = 0;
942 int must_insert_reserved = 0; 1889 int must_insert_reserved = 0;
@@ -975,6 +1922,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
975 must_insert_reserved = locked_ref->must_insert_reserved; 1922 must_insert_reserved = locked_ref->must_insert_reserved;
976 locked_ref->must_insert_reserved = 0; 1923 locked_ref->must_insert_reserved = 0;
977 1924
1925 extent_op = locked_ref->extent_op;
1926 locked_ref->extent_op = NULL;
1927
978 /* 1928 /*
979 * locked_ref is the head node, so we have to go one 1929 * locked_ref is the head node, so we have to go one
980 * node back for any delayed ref updates 1930 * node back for any delayed ref updates
@@ -986,6 +1936,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
986 * so that any accounting fixes can happen 1936 * so that any accounting fixes can happen
987 */ 1937 */
988 ref = &locked_ref->node; 1938 ref = &locked_ref->node;
1939
1940 if (extent_op && must_insert_reserved) {
1941 kfree(extent_op);
1942 extent_op = NULL;
1943 }
1944
1945 if (extent_op) {
1946 spin_unlock(&delayed_refs->lock);
1947
1948 ret = run_delayed_extent_op(trans, root,
1949 ref, extent_op);
1950 BUG_ON(ret);
1951 kfree(extent_op);
1952
1953 cond_resched();
1954 spin_lock(&delayed_refs->lock);
1955 continue;
1956 }
1957
989 list_del_init(&locked_ref->cluster); 1958 list_del_init(&locked_ref->cluster);
990 locked_ref = NULL; 1959 locked_ref = NULL;
991 } 1960 }
@@ -993,14 +1962,17 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
993 ref->in_tree = 0; 1962 ref->in_tree = 0;
994 rb_erase(&ref->rb_node, &delayed_refs->root); 1963 rb_erase(&ref->rb_node, &delayed_refs->root);
995 delayed_refs->num_entries--; 1964 delayed_refs->num_entries--;
1965
996 spin_unlock(&delayed_refs->lock); 1966 spin_unlock(&delayed_refs->lock);
997 1967
998 ret = run_one_delayed_ref(trans, root, ref, 1968 ret = run_one_delayed_ref(trans, root, ref, extent_op,
999 must_insert_reserved); 1969 must_insert_reserved);
1000 BUG_ON(ret); 1970 BUG_ON(ret);
1001 btrfs_put_delayed_ref(ref);
1002 1971
1972 btrfs_put_delayed_ref(ref);
1973 kfree(extent_op);
1003 count++; 1974 count++;
1975
1004 cond_resched(); 1976 cond_resched();
1005 spin_lock(&delayed_refs->lock); 1977 spin_lock(&delayed_refs->lock);
1006 } 1978 }
@@ -1095,25 +2067,112 @@ out:
1095 return 0; 2067 return 0;
1096} 2068}
1097 2069
1098int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2070int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
1099 struct btrfs_root *root, u64 objectid, u64 bytenr) 2071 struct btrfs_root *root,
2072 u64 bytenr, u64 num_bytes, u64 flags,
2073 int is_data)
2074{
2075 struct btrfs_delayed_extent_op *extent_op;
2076 int ret;
2077
2078 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2079 if (!extent_op)
2080 return -ENOMEM;
2081
2082 extent_op->flags_to_set = flags;
2083 extent_op->update_flags = 1;
2084 extent_op->update_key = 0;
2085 extent_op->is_data = is_data ? 1 : 0;
2086
2087 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2088 if (ret)
2089 kfree(extent_op);
2090 return ret;
2091}
2092
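btrfs_set_disk_extent_flags() above is the standard ownership handoff for a queued operation: allocate, fill in, hand to the queue, and free only if the queue refused it. The same shape in userspace, with malloc/free and a hypothetical queue_op() standing in for btrfs_add_delayed_extent_op():

    #include <stdlib.h>
    #include <string.h>

    struct op { unsigned long flags; int update_flags; };

    /* hypothetical queue; takes ownership of 'o' when it returns 0 */
    static int queue_op(struct op *o) { (void)o; return 0; }

    static int set_flags(unsigned long flags)
    {
            struct op *o = malloc(sizeof(*o));
            if (!o)
                    return -1;          /* -ENOMEM in the kernel */
            memset(o, 0, sizeof(*o));
            o->flags = flags;
            o->update_flags = 1;
            int ret = queue_op(o);
            if (ret)
                    free(o);            /* the queue didn't take it */
            return ret;                 /* on success the consumer frees it */
    }

    int main(void)
    {
            return set_flags(0x4);
    }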
2093static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2094 struct btrfs_root *root,
2095 struct btrfs_path *path,
2096 u64 objectid, u64 offset, u64 bytenr)
2097{
2098 struct btrfs_delayed_ref_head *head;
2099 struct btrfs_delayed_ref_node *ref;
2100 struct btrfs_delayed_data_ref *data_ref;
2101 struct btrfs_delayed_ref_root *delayed_refs;
2102 struct rb_node *node;
2103 int ret = 0;
2104
2105 ret = -ENOENT;
2106 delayed_refs = &trans->transaction->delayed_refs;
2107 spin_lock(&delayed_refs->lock);
2108 head = btrfs_find_delayed_ref_head(trans, bytenr);
2109 if (!head)
2110 goto out;
2111
2112 if (!mutex_trylock(&head->mutex)) {
2113 atomic_inc(&head->node.refs);
2114 spin_unlock(&delayed_refs->lock);
2115
2116 btrfs_release_path(root->fs_info->extent_root, path);
2117
2118 mutex_lock(&head->mutex);
2119 mutex_unlock(&head->mutex);
2120 btrfs_put_delayed_ref(&head->node);
2121 return -EAGAIN;
2122 }
2123
2124 node = rb_prev(&head->node.rb_node);
2125 if (!node)
2126 goto out_unlock;
2127
2128 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2129
2130 if (ref->bytenr != bytenr)
2131 goto out_unlock;
2132
2133 ret = 1;
2134 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2135 goto out_unlock;
2136
2137 data_ref = btrfs_delayed_node_to_data_ref(ref);
2138
2139 node = rb_prev(node);
2140 if (node) {
2141 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2142 if (ref->bytenr == bytenr)
2143 goto out_unlock;
2144 }
2145
2146 if (data_ref->root != root->root_key.objectid ||
2147 data_ref->objectid != objectid || data_ref->offset != offset)
2148 goto out_unlock;
2149
2150 ret = 0;
2151out_unlock:
2152 mutex_unlock(&head->mutex);
2153out:
2154 spin_unlock(&delayed_refs->lock);
2155 return ret;
2156}
2157
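check_delayed_ref() cannot sleep on head->mutex while holding delayed_refs->lock, so on contention it pins the head, drops the spinlock, blocks on the mutex only to wait out the current holder, and returns -EAGAIN so the caller redoes the whole lookup. A pthread sketch of the same trylock-then-retry shape (simplified: no refcounting on the head object):

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t head_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* 0 on a clean look, -EAGAIN when the caller must retry from scratch */
    static int check_head(void)
    {
            if (pthread_mutex_trylock(&head_mutex) != 0) {
                    /* kernel: take a ref, drop the spinlock, then block on
                     * the mutex purely to wait out the current holder */
                    pthread_mutex_lock(&head_mutex);
                    pthread_mutex_unlock(&head_mutex);
                    return -EAGAIN;     /* state may have changed */
            }
            puts("inspect the delayed refs under head_mutex");
            pthread_mutex_unlock(&head_mutex);
            return 0;
    }

    int main(void)
    {
            while (check_head() == -EAGAIN)
                    ;
            return 0;
    }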
2158static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2159 struct btrfs_root *root,
2160 struct btrfs_path *path,
2161 u64 objectid, u64 offset, u64 bytenr)
1100{ 2162{
1101 struct btrfs_root *extent_root = root->fs_info->extent_root; 2163 struct btrfs_root *extent_root = root->fs_info->extent_root;
1102 struct btrfs_path *path;
1103 struct extent_buffer *leaf; 2164 struct extent_buffer *leaf;
1104 struct btrfs_extent_ref *ref_item; 2165 struct btrfs_extent_data_ref *ref;
2166 struct btrfs_extent_inline_ref *iref;
2167 struct btrfs_extent_item *ei;
1105 struct btrfs_key key; 2168 struct btrfs_key key;
1106 struct btrfs_key found_key; 2169 u32 item_size;
1107 u64 ref_root;
1108 u64 last_snapshot;
1109 u32 nritems;
1110 int ret; 2170 int ret;
1111 2171
1112 key.objectid = bytenr; 2172 key.objectid = bytenr;
1113 key.offset = (u64)-1; 2173 key.offset = (u64)-1;
1114 key.type = BTRFS_EXTENT_ITEM_KEY; 2174 key.type = BTRFS_EXTENT_ITEM_KEY;
1115 2175
1116 path = btrfs_alloc_path();
1117 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2176 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1118 if (ret < 0) 2177 if (ret < 0)
1119 goto out; 2178 goto out;
@@ -1125,55 +2184,83 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1125 2184
1126 path->slots[0]--; 2185 path->slots[0]--;
1127 leaf = path->nodes[0]; 2186 leaf = path->nodes[0];
1128 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2187 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1129 2188
1130 if (found_key.objectid != bytenr || 2189 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
1131 found_key.type != BTRFS_EXTENT_ITEM_KEY)
1132 goto out; 2190 goto out;
1133 2191
1134 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 2192 ret = 1;
1135 while (1) { 2193 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1136 leaf = path->nodes[0]; 2194#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1137 nritems = btrfs_header_nritems(leaf); 2195 if (item_size < sizeof(*ei)) {
1138 if (path->slots[0] >= nritems) { 2196 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
1139 ret = btrfs_next_leaf(extent_root, path); 2197 goto out;
1140 if (ret < 0) 2198 }
1141 goto out; 2199#endif
1142 if (ret == 0) 2200 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1143 continue;
1144 break;
1145 }
1146 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1147 if (found_key.objectid != bytenr)
1148 break;
1149 2201
1150 if (found_key.type != BTRFS_EXTENT_REF_KEY) { 2202 if (item_size != sizeof(*ei) +
1151 path->slots[0]++; 2203 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
1152 continue; 2204 goto out;
1153 }
1154 2205
1155 ref_item = btrfs_item_ptr(leaf, path->slots[0], 2206 if (btrfs_extent_generation(leaf, ei) <=
1156 struct btrfs_extent_ref); 2207 btrfs_root_last_snapshot(&root->root_item))
1157 ref_root = btrfs_ref_root(leaf, ref_item); 2208 goto out;
1158 if ((ref_root != root->root_key.objectid && 2209
1159 ref_root != BTRFS_TREE_LOG_OBJECTID) || 2210 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
1160 objectid != btrfs_ref_objectid(leaf, ref_item)) { 2211 if (btrfs_extent_inline_ref_type(leaf, iref) !=
1161 ret = 1; 2212 BTRFS_EXTENT_DATA_REF_KEY)
1162 goto out; 2213 goto out;
1163 } 2214
1164 if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) { 2215 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
1165 ret = 1; 2216 if (btrfs_extent_refs(leaf, ei) !=
2217 btrfs_extent_data_ref_count(leaf, ref) ||
2218 btrfs_extent_data_ref_root(leaf, ref) !=
2219 root->root_key.objectid ||
2220 btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2221 btrfs_extent_data_ref_offset(leaf, ref) != offset)
2222 goto out;
2223
2224 ret = 0;
2225out:
2226 return ret;
2227}
2228
2229int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2230 struct btrfs_root *root,
2231 u64 objectid, u64 offset, u64 bytenr)
2232{
2233 struct btrfs_path *path;
2234 int ret;
2235 int ret2;
2236
2237 path = btrfs_alloc_path();
2238 if (!path)
2239 return -ENOENT;
2240
2241 do {
2242 ret = check_committed_ref(trans, root, path, objectid,
2243 offset, bytenr);
2244 if (ret && ret != -ENOENT)
1166 goto out; 2245 goto out;
1167 }
1168 2246
1169 path->slots[0]++; 2247 ret2 = check_delayed_ref(trans, root, path, objectid,
2248 offset, bytenr);
2249 } while (ret2 == -EAGAIN);
2250
2251 if (ret2 && ret2 != -ENOENT) {
2252 ret = ret2;
2253 goto out;
1170 } 2254 }
1171 ret = 0; 2255
2256 if (ret != -ENOENT || ret2 != -ENOENT)
2257 ret = 0;
1172out: 2258out:
1173 btrfs_free_path(path); 2259 btrfs_free_path(path);
1174 return ret; 2260 return ret;
1175} 2261}
1176 2262
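btrfs_cross_ref_exist() merges the two probes above: check_committed_ref() looks at the on-disk extent item, check_delayed_ref() at the in-memory queue. -ENOENT from a probe means "nothing found here", -EAGAIN reruns both, any other nonzero value short-circuits as shared-or-error, and 0 is returned only when at least one probe positively cleared the extent. A model of that result merging, with hypothetical probes wired to exercise the retry:

    #include <errno.h>
    #include <stdio.h>

    /* hypothetical probes: >0 shared, 0 only this root, -ENOENT unknown */
    static int committed_probe(void) { return -ENOENT; }
    static int delayed_probe(int *calls)
    {
            return (*calls)++ == 0 ? -EAGAIN : 0;  /* busy once, then clear */
    }

    static int cross_ref_exist(void)
    {
            int calls = 0, ret, ret2;
            do {
                    ret = committed_probe();
                    if (ret && ret != -ENOENT)
                            return ret;         /* shared, or a hard error */
                    ret2 = delayed_probe(&calls);
            } while (ret2 == -EAGAIN);          /* head was busy: redo both */

            if (ret2 && ret2 != -ENOENT)
                    return ret2;
            return (ret != -ENOENT || ret2 != -ENOENT) ? 0 : -ENOENT;
    }

    int main(void)
    {
            printf("cross_ref_exist() = %d\n", cross_ref_exist());  /* 0 */
            return 0;
    }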
2263#if 0
1177int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2264int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1178 struct extent_buffer *buf, u32 nr_extents) 2265 struct extent_buffer *buf, u32 nr_extents)
1179{ 2266{
@@ -1291,62 +2378,44 @@ static int refsort_cmp(const void *a_void, const void *b_void)
1291 return 1; 2378 return 1;
1292 return 0; 2379 return 0;
1293} 2380}
2381#endif
1294 2382
1295 2383static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
1296noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1297 struct btrfs_root *root, 2384 struct btrfs_root *root,
1298 struct extent_buffer *orig_buf, 2385 struct extent_buffer *buf,
1299 struct extent_buffer *buf, u32 *nr_extents) 2386 int full_backref, int inc)
1300{ 2387{
1301 u64 bytenr; 2388 u64 bytenr;
2389 u64 num_bytes;
2390 u64 parent;
1302 u64 ref_root; 2391 u64 ref_root;
1303 u64 orig_root;
1304 u64 ref_generation;
1305 u64 orig_generation;
1306 struct refsort *sorted;
1307 u32 nritems; 2392 u32 nritems;
1308 u32 nr_file_extents = 0;
1309 struct btrfs_key key; 2393 struct btrfs_key key;
1310 struct btrfs_file_extent_item *fi; 2394 struct btrfs_file_extent_item *fi;
1311 int i; 2395 int i;
1312 int level; 2396 int level;
1313 int ret = 0; 2397 int ret = 0;
1314 int faili = 0;
1315 int refi = 0;
1316 int slot;
1317 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 2398 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1318 u64, u64, u64, u64, u64, u64, u64, u64, u64); 2399 u64, u64, u64, u64, u64, u64);
1319 2400
1320 ref_root = btrfs_header_owner(buf); 2401 ref_root = btrfs_header_owner(buf);
1321 ref_generation = btrfs_header_generation(buf);
1322 orig_root = btrfs_header_owner(orig_buf);
1323 orig_generation = btrfs_header_generation(orig_buf);
1324
1325 nritems = btrfs_header_nritems(buf); 2402 nritems = btrfs_header_nritems(buf);
1326 level = btrfs_header_level(buf); 2403 level = btrfs_header_level(buf);
1327 2404
1328 sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS); 2405 if (!root->ref_cows && level == 0)
1329 BUG_ON(!sorted); 2406 return 0;
2407
2408 if (inc)
2409 process_func = btrfs_inc_extent_ref;
2410 else
2411 process_func = btrfs_free_extent;
1330 2412
1331 if (root->ref_cows) { 2413 if (full_backref)
1332 process_func = __btrfs_inc_extent_ref; 2414 parent = buf->start;
1333 } else { 2415 else
1334 if (level == 0 && 2416 parent = 0;
1335 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1336 goto out;
1337 if (level != 0 &&
1338 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1339 goto out;
1340 process_func = __btrfs_update_extent_ref;
1341 }
1342 2417
1343 /*
1344 * we make two passes through the items. In the first pass we
1345 * only record the byte number and slot. Then we sort based on
1346 * byte number and do the actual work based on the sorted results
1347 */
1348 for (i = 0; i < nritems; i++) { 2418 for (i = 0; i < nritems; i++) {
1349 cond_resched();
1350 if (level == 0) { 2419 if (level == 0) {
1351 btrfs_item_key_to_cpu(buf, &key, i); 2420 btrfs_item_key_to_cpu(buf, &key, i);
1352 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 2421 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
@@ -1360,151 +2429,38 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1360 if (bytenr == 0) 2429 if (bytenr == 0)
1361 continue; 2430 continue;
1362 2431
1363 nr_file_extents++; 2432 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
1364 sorted[refi].bytenr = bytenr; 2433 key.offset -= btrfs_file_extent_offset(buf, fi);
1365 sorted[refi].slot = i; 2434 ret = process_func(trans, root, bytenr, num_bytes,
1366 refi++; 2435 parent, ref_root, key.objectid,
1367 } else { 2436 key.offset);
1368 bytenr = btrfs_node_blockptr(buf, i); 2437 if (ret)
1369 sorted[refi].bytenr = bytenr;
1370 sorted[refi].slot = i;
1371 refi++;
1372 }
1373 }
1374 /*
1375 * if refi == 0, we didn't actually put anything into the sorted
1376 * array and we're done
1377 */
1378 if (refi == 0)
1379 goto out;
1380
1381 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1382
1383 for (i = 0; i < refi; i++) {
1384 cond_resched();
1385 slot = sorted[i].slot;
1386 bytenr = sorted[i].bytenr;
1387
1388 if (level == 0) {
1389 btrfs_item_key_to_cpu(buf, &key, slot);
1390 fi = btrfs_item_ptr(buf, slot,
1391 struct btrfs_file_extent_item);
1392
1393 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1394 if (bytenr == 0)
1395 continue;
1396
1397 ret = process_func(trans, root, bytenr,
1398 btrfs_file_extent_disk_num_bytes(buf, fi),
1399 orig_buf->start, buf->start,
1400 orig_root, ref_root,
1401 orig_generation, ref_generation,
1402 key.objectid);
1403
1404 if (ret) {
1405 faili = slot;
1406 WARN_ON(1);
1407 goto fail; 2438 goto fail;
1408 }
1409 } else { 2439 } else {
1410 ret = process_func(trans, root, bytenr, buf->len, 2440 bytenr = btrfs_node_blockptr(buf, i);
1411 orig_buf->start, buf->start, 2441 num_bytes = btrfs_level_size(root, level - 1);
1412 orig_root, ref_root, 2442 ret = process_func(trans, root, bytenr, num_bytes,
1413 orig_generation, ref_generation, 2443 parent, ref_root, level - 1, 0);
1414 level - 1); 2444 if (ret)
1415 if (ret) {
1416 faili = slot;
1417 WARN_ON(1);
1418 goto fail; 2445 goto fail;
1419 }
1420 } 2446 }
1421 } 2447 }
1422out:
1423 kfree(sorted);
1424 if (nr_extents) {
1425 if (level == 0)
1426 *nr_extents = nr_file_extents;
1427 else
1428 *nr_extents = nritems;
1429 }
1430 return 0; 2448 return 0;
1431fail: 2449fail:
1432 kfree(sorted); 2450 BUG();
1433 WARN_ON(1);
1434 return ret; 2451 return ret;
1435} 2452}
1436 2453
1437int btrfs_update_ref(struct btrfs_trans_handle *trans, 2454int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1438 struct btrfs_root *root, struct extent_buffer *orig_buf, 2455 struct extent_buffer *buf, int full_backref)
1439 struct extent_buffer *buf, int start_slot, int nr)
1440
1441{ 2456{
1442 u64 bytenr; 2457 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
1443 u64 ref_root; 2458}
1444 u64 orig_root;
1445 u64 ref_generation;
1446 u64 orig_generation;
1447 struct btrfs_key key;
1448 struct btrfs_file_extent_item *fi;
1449 int i;
1450 int ret;
1451 int slot;
1452 int level;
1453
1454 BUG_ON(start_slot < 0);
1455 BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1456
1457 ref_root = btrfs_header_owner(buf);
1458 ref_generation = btrfs_header_generation(buf);
1459 orig_root = btrfs_header_owner(orig_buf);
1460 orig_generation = btrfs_header_generation(orig_buf);
1461 level = btrfs_header_level(buf);
1462
1463 if (!root->ref_cows) {
1464 if (level == 0 &&
1465 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1466 return 0;
1467 if (level != 0 &&
1468 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1469 return 0;
1470 }
1471 2459
1472 for (i = 0, slot = start_slot; i < nr; i++, slot++) { 2460int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1473 cond_resched(); 2461 struct extent_buffer *buf, int full_backref)
1474 if (level == 0) { 2462{
1475 btrfs_item_key_to_cpu(buf, &key, slot); 2463 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
1476 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1477 continue;
1478 fi = btrfs_item_ptr(buf, slot,
1479 struct btrfs_file_extent_item);
1480 if (btrfs_file_extent_type(buf, fi) ==
1481 BTRFS_FILE_EXTENT_INLINE)
1482 continue;
1483 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1484 if (bytenr == 0)
1485 continue;
1486 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1487 btrfs_file_extent_disk_num_bytes(buf, fi),
1488 orig_buf->start, buf->start,
1489 orig_root, ref_root, orig_generation,
1490 ref_generation, key.objectid);
1491 if (ret)
1492 goto fail;
1493 } else {
1494 bytenr = btrfs_node_blockptr(buf, slot);
1495 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1496 buf->len, orig_buf->start,
1497 buf->start, orig_root, ref_root,
1498 orig_generation, ref_generation,
1499 level - 1);
1500 if (ret)
1501 goto fail;
1502 }
1503 }
1504 return 0;
1505fail:
1506 WARN_ON(1);
1507 return -1;
1508} 2464}
1509 2465
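__btrfs_mod_ref() collapses the old inc/update walkers into one: it binds btrfs_inc_extent_ref or btrfs_free_extent to a function pointer once, up front, and applies it to every non-hole extent the buffer references. The shape in miniature, with toy handlers in place of the real ones:

    #include <stdio.h>

    static int inc_ref(unsigned long bytenr)  { printf("inc  %lu\n", bytenr); return 0; }
    static int drop_ref(unsigned long bytenr) { printf("drop %lu\n", bytenr); return 0; }

    static int mod_ref(const unsigned long *ptrs, int n, int inc)
    {
            int (*process)(unsigned long) = inc ? inc_ref : drop_ref;
            for (int i = 0; i < n; i++) {
                    if (ptrs[i] == 0)
                            continue;   /* holes and inline items are skipped */
                    if (process(ptrs[i]))
                            return -1;
            }
            return 0;
    }

    int main(void)
    {
            unsigned long ptrs[] = { 4096, 0, 8192 };
            mod_ref(ptrs, 3, 1);        /* btrfs_inc_ref() path */
            mod_ref(ptrs, 3, 0);        /* btrfs_dec_ref() path */
            return 0;
    }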
1510static int write_one_cache_group(struct btrfs_trans_handle *trans, 2466static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -1534,13 +2490,29 @@ fail:
1534 2490
1535} 2491}
1536 2492
2493static struct btrfs_block_group_cache *
2494next_block_group(struct btrfs_root *root,
2495 struct btrfs_block_group_cache *cache)
2496{
2497 struct rb_node *node;
2498 spin_lock(&root->fs_info->block_group_cache_lock);
2499 node = rb_next(&cache->cache_node);
2500 btrfs_put_block_group(cache);
2501 if (node) {
2502 cache = rb_entry(node, struct btrfs_block_group_cache,
2503 cache_node);
2504 atomic_inc(&cache->count);
2505 } else
2506 cache = NULL;
2507 spin_unlock(&root->fs_info->block_group_cache_lock);
2508 return cache;
2509}
2510
1537int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2511int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1538 struct btrfs_root *root) 2512 struct btrfs_root *root)
1539{ 2513{
1540 struct btrfs_block_group_cache *cache, *entry; 2514 struct btrfs_block_group_cache *cache;
1541 struct rb_node *n;
1542 int err = 0; 2515 int err = 0;
1543 int werr = 0;
1544 struct btrfs_path *path; 2516 struct btrfs_path *path;
1545 u64 last = 0; 2517 u64 last = 0;
1546 2518
@@ -1549,39 +2521,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1549 return -ENOMEM; 2521 return -ENOMEM;
1550 2522
1551 while (1) { 2523 while (1) {
1552 cache = NULL; 2524 if (last == 0) {
1553 spin_lock(&root->fs_info->block_group_cache_lock); 2525 err = btrfs_run_delayed_refs(trans, root,
1554 for (n = rb_first(&root->fs_info->block_group_cache_tree); 2526 (unsigned long)-1);
1555 n; n = rb_next(n)) { 2527 BUG_ON(err);
1556 entry = rb_entry(n, struct btrfs_block_group_cache,
1557 cache_node);
1558 if (entry->dirty) {
1559 cache = entry;
1560 break;
1561 }
1562 } 2528 }
1563 spin_unlock(&root->fs_info->block_group_cache_lock);
1564 2529
1565 if (!cache) 2530 cache = btrfs_lookup_first_block_group(root->fs_info, last);
1566 break; 2531 while (cache) {
2532 if (cache->dirty)
2533 break;
2534 cache = next_block_group(root, cache);
2535 }
2536 if (!cache) {
2537 if (last == 0)
2538 break;
2539 last = 0;
2540 continue;
2541 }
1567 2542
1568 cache->dirty = 0; 2543 cache->dirty = 0;
1569 last += cache->key.offset; 2544 last = cache->key.objectid + cache->key.offset;
1570 2545
1571 err = write_one_cache_group(trans, root, 2546 err = write_one_cache_group(trans, root, path, cache);
1572 path, cache); 2547 BUG_ON(err);
1573 /* 2548 btrfs_put_block_group(cache);
1574 * if we fail to write the cache group, we want
1575 * to keep it marked dirty in hopes that a later
1576 * write will work
1577 */
1578 if (err) {
1579 werr = err;
1580 continue;
1581 }
1582 } 2549 }
2550
1583 btrfs_free_path(path); 2551 btrfs_free_path(path);
1584 return werr; 2552 return 0;
1585} 2553}
1586 2554
1587int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 2555int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -1631,6 +2599,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1631 found->force_alloc = 0; 2599 found->force_alloc = 0;
1632 *space_info = found; 2600 *space_info = found;
1633 list_add_rcu(&found->list, &info->space_info); 2601 list_add_rcu(&found->list, &info->space_info);
2602 atomic_set(&found->caching_threads, 0);
1634 return 0; 2603 return 0;
1635} 2604}
1636 2605
@@ -1843,7 +2812,7 @@ again:
1843 2812
1844 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 2813 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
1845 ", %llu bytes_used, %llu bytes_reserved, " 2814 ", %llu bytes_used, %llu bytes_reserved, "
1846 "%llu bytes_pinned, %llu bytes_readonly, %llu may use" 2815 "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
1847 "%llu total\n", (unsigned long long)bytes, 2816 "%llu total\n", (unsigned long long)bytes,
1848 (unsigned long long)data_sinfo->bytes_delalloc, 2817 (unsigned long long)data_sinfo->bytes_delalloc,
1849 (unsigned long long)data_sinfo->bytes_used, 2818 (unsigned long long)data_sinfo->bytes_used,
@@ -2007,6 +2976,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
2007 u64 old_val; 2976 u64 old_val;
2008 u64 byte_in_group; 2977 u64 byte_in_group;
2009 2978
2979 /* block accounting for super block */
2980 spin_lock(&info->delalloc_lock);
2981 old_val = btrfs_super_bytes_used(&info->super_copy);
2982 if (alloc)
2983 old_val += num_bytes;
2984 else
2985 old_val -= num_bytes;
2986 btrfs_set_super_bytes_used(&info->super_copy, old_val);
2987
2988 /* block accounting for root item */
2989 old_val = btrfs_root_used(&root->root_item);
2990 if (alloc)
2991 old_val += num_bytes;
2992 else
2993 old_val -= num_bytes;
2994 btrfs_set_root_used(&root->root_item, old_val);
2995 spin_unlock(&info->delalloc_lock);
2996
2010 while (total) { 2997 while (total) {
2011 cache = btrfs_lookup_block_group(info, bytenr); 2998 cache = btrfs_lookup_block_group(info, bytenr);
2012 if (!cache) 2999 if (!cache)
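This hunk moves the superblock and root-item byte accounting into update_block_group(), so allocation and free share one locked add-or-subtract instead of each path open-coding it. The arithmetic in isolation (a pthread mutex models the kernel's delalloc_lock spinlock):

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static uint64_t super_bytes_used, root_bytes_used;

    static void account(uint64_t num_bytes, int alloc)
    {
            pthread_mutex_lock(&lock);
            if (alloc) {
                    super_bytes_used += num_bytes;
                    root_bytes_used  += num_bytes;
            } else {
                    super_bytes_used -= num_bytes;
                    root_bytes_used  -= num_bytes;
            }
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            account(4096, 1);   /* alloc */
            account(4096, 0);   /* free */
            printf("%llu %llu\n", (unsigned long long)super_bytes_used,
                   (unsigned long long)root_bytes_used);   /* 0 0 */
            return 0;
    }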
@@ -2076,13 +3063,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2076 struct btrfs_block_group_cache *cache; 3063 struct btrfs_block_group_cache *cache;
2077 struct btrfs_fs_info *fs_info = root->fs_info; 3064 struct btrfs_fs_info *fs_info = root->fs_info;
2078 3065
2079 if (pin) { 3066 if (pin)
2080 set_extent_dirty(&fs_info->pinned_extents, 3067 set_extent_dirty(&fs_info->pinned_extents,
2081 bytenr, bytenr + num - 1, GFP_NOFS); 3068 bytenr, bytenr + num - 1, GFP_NOFS);
2082 } else {
2083 clear_extent_dirty(&fs_info->pinned_extents,
2084 bytenr, bytenr + num - 1, GFP_NOFS);
2085 }
2086 3069
2087 while (num > 0) { 3070 while (num > 0) {
2088 cache = btrfs_lookup_block_group(fs_info, bytenr); 3071 cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2098,14 +3081,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2098 spin_unlock(&cache->space_info->lock); 3081 spin_unlock(&cache->space_info->lock);
2099 fs_info->total_pinned += len; 3082 fs_info->total_pinned += len;
2100 } else { 3083 } else {
3084 int unpin = 0;
3085
3086 /*
3087 * in order to not race with the block group caching, we
3088 * only want to unpin the extent if we are cached. If
3089 * we aren't cached, we want to start async caching this
3090 * block group so we can free the extent the next time
3091 * around.
3092 */
2101 spin_lock(&cache->space_info->lock); 3093 spin_lock(&cache->space_info->lock);
2102 spin_lock(&cache->lock); 3094 spin_lock(&cache->lock);
2103 cache->pinned -= len; 3095 unpin = (cache->cached == BTRFS_CACHE_FINISHED);
2104 cache->space_info->bytes_pinned -= len; 3096 if (likely(unpin)) {
3097 cache->pinned -= len;
3098 cache->space_info->bytes_pinned -= len;
3099 fs_info->total_pinned -= len;
3100 }
2105 spin_unlock(&cache->lock); 3101 spin_unlock(&cache->lock);
2106 spin_unlock(&cache->space_info->lock); 3102 spin_unlock(&cache->space_info->lock);
2107 fs_info->total_pinned -= len; 3103
2108 if (cache->cached) 3104 if (likely(unpin))
3105 clear_extent_dirty(&fs_info->pinned_extents,
3106 bytenr, bytenr + len -1,
3107 GFP_NOFS);
3108 else
3109 cache_block_group(cache);
3110
3111 if (unpin)
2109 btrfs_add_free_space(cache, bytenr, len); 3112 btrfs_add_free_space(cache, bytenr, len);
2110 } 3113 }
2111 btrfs_put_block_group(cache); 3114 btrfs_put_block_group(cache);
@@ -2159,6 +3162,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2159 &start, &end, EXTENT_DIRTY); 3162 &start, &end, EXTENT_DIRTY);
2160 if (ret) 3163 if (ret)
2161 break; 3164 break;
3165
2162 set_extent_dirty(copy, start, end, GFP_NOFS); 3166 set_extent_dirty(copy, start, end, GFP_NOFS);
2163 last = end + 1; 3167 last = end + 1;
2164 } 3168 }
@@ -2187,6 +3191,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2187 3191
2188 cond_resched(); 3192 cond_resched();
2189 } 3193 }
3194
2190 return ret; 3195 return ret;
2191} 3196}
2192 3197
@@ -2216,8 +3221,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
2216 u64 header_owner = btrfs_header_owner(buf); 3221 u64 header_owner = btrfs_header_owner(buf);
2217 u64 header_transid = btrfs_header_generation(buf); 3222 u64 header_transid = btrfs_header_generation(buf);
2218 if (header_owner != BTRFS_TREE_LOG_OBJECTID && 3223 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2219 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2220 header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
2221 header_transid == trans->transid && 3224 header_transid == trans->transid &&
2222 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 3225 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2223 *must_clean = buf; 3226 *must_clean = buf;
@@ -2235,63 +3238,77 @@ pinit:
2235 return 0; 3238 return 0;
2236} 3239}
2237 3240
2238/* 3241
2239 * remove an extent from the root, returns 0 on success 3242static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2240 */ 3243 struct btrfs_root *root,
2241static int __free_extent(struct btrfs_trans_handle *trans, 3244 u64 bytenr, u64 num_bytes, u64 parent,
2242 struct btrfs_root *root, 3245 u64 root_objectid, u64 owner_objectid,
2243 u64 bytenr, u64 num_bytes, u64 parent, 3246 u64 owner_offset, int refs_to_drop,
2244 u64 root_objectid, u64 ref_generation, 3247 struct btrfs_delayed_extent_op *extent_op)
2245 u64 owner_objectid, int pin, int mark_free,
2246 int refs_to_drop)
2247{ 3248{
2248 struct btrfs_path *path;
2249 struct btrfs_key key; 3249 struct btrfs_key key;
3250 struct btrfs_path *path;
2250 struct btrfs_fs_info *info = root->fs_info; 3251 struct btrfs_fs_info *info = root->fs_info;
2251 struct btrfs_root *extent_root = info->extent_root; 3252 struct btrfs_root *extent_root = info->extent_root;
2252 struct extent_buffer *leaf; 3253 struct extent_buffer *leaf;
3254 struct btrfs_extent_item *ei;
3255 struct btrfs_extent_inline_ref *iref;
2253 int ret; 3256 int ret;
3257 int is_data;
2254 int extent_slot = 0; 3258 int extent_slot = 0;
2255 int found_extent = 0; 3259 int found_extent = 0;
2256 int num_to_del = 1; 3260 int num_to_del = 1;
2257 struct btrfs_extent_item *ei; 3261 u32 item_size;
2258 u32 refs; 3262 u64 refs;
2259 3263
2260 key.objectid = bytenr;
2261 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2262 key.offset = num_bytes;
2263 path = btrfs_alloc_path(); 3264 path = btrfs_alloc_path();
2264 if (!path) 3265 if (!path)
2265 return -ENOMEM; 3266 return -ENOMEM;
2266 3267
2267 path->reada = 1; 3268 path->reada = 1;
2268 path->leave_spinning = 1; 3269 path->leave_spinning = 1;
2269 ret = lookup_extent_backref(trans, extent_root, path, 3270
2270 bytenr, parent, root_objectid, 3271 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
2271 ref_generation, owner_objectid, 1); 3272 BUG_ON(!is_data && refs_to_drop != 1);
3273
3274 ret = lookup_extent_backref(trans, extent_root, path, &iref,
3275 bytenr, num_bytes, parent,
3276 root_objectid, owner_objectid,
3277 owner_offset);
2272 if (ret == 0) { 3278 if (ret == 0) {
2273 struct btrfs_key found_key;
2274 extent_slot = path->slots[0]; 3279 extent_slot = path->slots[0];
2275 while (extent_slot > 0) { 3280 while (extent_slot >= 0) {
2276 extent_slot--; 3281 btrfs_item_key_to_cpu(path->nodes[0], &key,
2277 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2278 extent_slot); 3282 extent_slot);
2279 if (found_key.objectid != bytenr) 3283 if (key.objectid != bytenr)
2280 break; 3284 break;
2281 if (found_key.type == BTRFS_EXTENT_ITEM_KEY && 3285 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
2282 found_key.offset == num_bytes) { 3286 key.offset == num_bytes) {
2283 found_extent = 1; 3287 found_extent = 1;
2284 break; 3288 break;
2285 } 3289 }
2286 if (path->slots[0] - extent_slot > 5) 3290 if (path->slots[0] - extent_slot > 5)
2287 break; 3291 break;
3292 extent_slot--;
2288 } 3293 }
3294#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3295 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
3296 if (found_extent && item_size < sizeof(*ei))
3297 found_extent = 0;
3298#endif
2289 if (!found_extent) { 3299 if (!found_extent) {
3300 BUG_ON(iref);
2290 ret = remove_extent_backref(trans, extent_root, path, 3301 ret = remove_extent_backref(trans, extent_root, path,
2291 refs_to_drop); 3302 NULL, refs_to_drop,
3303 is_data);
2292 BUG_ON(ret); 3304 BUG_ON(ret);
2293 btrfs_release_path(extent_root, path); 3305 btrfs_release_path(extent_root, path);
2294 path->leave_spinning = 1; 3306 path->leave_spinning = 1;
3307
3308 key.objectid = bytenr;
3309 key.type = BTRFS_EXTENT_ITEM_KEY;
3310 key.offset = num_bytes;
3311
2295 ret = btrfs_search_slot(trans, extent_root, 3312 ret = btrfs_search_slot(trans, extent_root,
2296 &key, path, -1, 1); 3313 &key, path, -1, 1);
2297 if (ret) { 3314 if (ret) {
@@ -2307,82 +3324,98 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2307 btrfs_print_leaf(extent_root, path->nodes[0]); 3324 btrfs_print_leaf(extent_root, path->nodes[0]);
2308 WARN_ON(1); 3325 WARN_ON(1);
2309 printk(KERN_ERR "btrfs unable to find ref byte nr %llu " 3326 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2310 "parent %llu root %llu gen %llu owner %llu\n", 3327 "parent %llu root %llu owner %llu offset %llu\n",
2311 (unsigned long long)bytenr, 3328 (unsigned long long)bytenr,
2312 (unsigned long long)parent, 3329 (unsigned long long)parent,
2313 (unsigned long long)root_objectid, 3330 (unsigned long long)root_objectid,
2314 (unsigned long long)ref_generation, 3331 (unsigned long long)owner_objectid,
2315 (unsigned long long)owner_objectid); 3332 (unsigned long long)owner_offset);
2316 } 3333 }
2317 3334
2318 leaf = path->nodes[0]; 3335 leaf = path->nodes[0];
3336 item_size = btrfs_item_size_nr(leaf, extent_slot);
3337#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3338 if (item_size < sizeof(*ei)) {
3339 BUG_ON(found_extent || extent_slot != path->slots[0]);
3340 ret = convert_extent_item_v0(trans, extent_root, path,
3341 owner_objectid, 0);
3342 BUG_ON(ret < 0);
3343
3344 btrfs_release_path(extent_root, path);
3345 path->leave_spinning = 1;
3346
3347 key.objectid = bytenr;
3348 key.type = BTRFS_EXTENT_ITEM_KEY;
3349 key.offset = num_bytes;
3350
3351 ret = btrfs_search_slot(trans, extent_root, &key, path,
3352 -1, 1);
3353 if (ret) {
3354 printk(KERN_ERR "umm, got %d back from search"
3355 ", was looking for %llu\n", ret,
3356 (unsigned long long)bytenr);
3357 btrfs_print_leaf(extent_root, path->nodes[0]);
3358 }
3359 BUG_ON(ret);
3360 extent_slot = path->slots[0];
3361 leaf = path->nodes[0];
3362 item_size = btrfs_item_size_nr(leaf, extent_slot);
3363 }
3364#endif
3365 BUG_ON(item_size < sizeof(*ei));
2319 ei = btrfs_item_ptr(leaf, extent_slot, 3366 ei = btrfs_item_ptr(leaf, extent_slot,
2320 struct btrfs_extent_item); 3367 struct btrfs_extent_item);
2321 refs = btrfs_extent_refs(leaf, ei); 3368 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2322 3369 struct btrfs_tree_block_info *bi;
2323 /* 3370 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
2324 * we're not allowed to delete the extent item if there 3371 bi = (struct btrfs_tree_block_info *)(ei + 1);
2325 * are other delayed ref updates pending 3372 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
2326 */ 3373 }
2327 3374
3375 refs = btrfs_extent_refs(leaf, ei);
2328 BUG_ON(refs < refs_to_drop); 3376 BUG_ON(refs < refs_to_drop);
2329 refs -= refs_to_drop; 3377 refs -= refs_to_drop;
2330 btrfs_set_extent_refs(leaf, ei, refs);
2331 btrfs_mark_buffer_dirty(leaf);
2332 3378
2333 if (refs == 0 && found_extent && 3379 if (refs > 0) {
2334 path->slots[0] == extent_slot + 1) { 3380 if (extent_op)
2335 struct btrfs_extent_ref *ref; 3381 __run_delayed_extent_op(extent_op, leaf, ei);
2336 ref = btrfs_item_ptr(leaf, path->slots[0], 3382 /*
2337 struct btrfs_extent_ref); 3383 * In the case of inline back ref, reference count will
2338 BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop); 3384 * be updated by remove_extent_backref
2339 /* if the back ref and the extent are next to each other
2340 * they get deleted below in one shot
2341 */ 3385 */
2342 path->slots[0] = extent_slot; 3386 if (iref) {
2343 num_to_del = 2; 3387 BUG_ON(!found_extent);
2344 } else if (found_extent) { 3388 } else {
2345 /* otherwise delete the extent back ref */ 3389 btrfs_set_extent_refs(leaf, ei, refs);
2346 ret = remove_extent_backref(trans, extent_root, path, 3390 btrfs_mark_buffer_dirty(leaf);
2347 refs_to_drop); 3391 }
2348 BUG_ON(ret); 3392 if (found_extent) {
2349 /* if refs are 0, we need to setup the path for deletion */ 3393 ret = remove_extent_backref(trans, extent_root, path,
2350 if (refs == 0) { 3394 iref, refs_to_drop,
2351 btrfs_release_path(extent_root, path); 3395 is_data);
2352 path->leave_spinning = 1;
2353 ret = btrfs_search_slot(trans, extent_root, &key, path,
2354 -1, 1);
2355 BUG_ON(ret); 3396 BUG_ON(ret);
2356 } 3397 }
2357 } 3398 } else {
2358 3399 int mark_free = 0;
2359 if (refs == 0) {
2360 u64 super_used;
2361 u64 root_used;
2362 struct extent_buffer *must_clean = NULL; 3400 struct extent_buffer *must_clean = NULL;
2363 3401
2364 if (pin) { 3402 if (found_extent) {
2365 ret = pin_down_bytes(trans, root, path, 3403 BUG_ON(is_data && refs_to_drop !=
2366 bytenr, num_bytes, 3404 extent_data_ref_count(root, path, iref));
2367 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, 3405 if (iref) {
2368 &must_clean); 3406 BUG_ON(path->slots[0] != extent_slot);
2369 if (ret > 0) 3407 } else {
2370 mark_free = 1; 3408 BUG_ON(path->slots[0] != extent_slot + 1);
2371 BUG_ON(ret < 0); 3409 path->slots[0] = extent_slot;
3410 num_to_del = 2;
3411 }
2372 } 3412 }
2373 3413
2374 /* block accounting for super block */ 3414 ret = pin_down_bytes(trans, root, path, bytenr,
2375 spin_lock(&info->delalloc_lock); 3415 num_bytes, is_data, &must_clean);
2376 super_used = btrfs_super_bytes_used(&info->super_copy); 3416 if (ret > 0)
2377 btrfs_set_super_bytes_used(&info->super_copy, 3417 mark_free = 1;
2378 super_used - num_bytes); 3418 BUG_ON(ret < 0);
2379
2380 /* block accounting for root item */
2381 root_used = btrfs_root_used(&root->root_item);
2382 btrfs_set_root_used(&root->root_item,
2383 root_used - num_bytes);
2384 spin_unlock(&info->delalloc_lock);
2385
2386 /* 3419 /*
2387 * it is going to be very rare for someone to be waiting 3420 * it is going to be very rare for someone to be waiting
2388 * on the block we're freeing. del_items might need to 3421 * on the block we're freeing. del_items might need to
@@ -2403,7 +3436,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2403 free_extent_buffer(must_clean); 3436 free_extent_buffer(must_clean);
2404 } 3437 }
2405 3438
2406 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 3439 if (is_data) {
2407 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 3440 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2408 BUG_ON(ret); 3441 BUG_ON(ret);
2409 } else { 3442 } else {
@@ -2421,34 +3454,6 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2421} 3454}
2422 3455
2423/* 3456/*
2424 * remove an extent from the root, returns 0 on success
2425 */
2426static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2427 struct btrfs_root *root,
2428 u64 bytenr, u64 num_bytes, u64 parent,
2429 u64 root_objectid, u64 ref_generation,
2430 u64 owner_objectid, int pin,
2431 int refs_to_drop)
2432{
2433 WARN_ON(num_bytes < root->sectorsize);
2434
2435 /*
2436 * if metadata always pin
2437 * if data pin when any transaction has committed this
2438 */
2439 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
2440 ref_generation != trans->transid)
2441 pin = 1;
2442
2443 if (ref_generation != trans->transid)
2444 pin = 1;
2445
2446 return __free_extent(trans, root, bytenr, num_bytes, parent,
2447 root_objectid, ref_generation,
2448 owner_objectid, pin, pin == 0, refs_to_drop);
2449}
2450
2451/*
2452 * when we free an extent, it is possible (and likely) that we free the last 3457 * when we free an extent, it is possible (and likely) that we free the last
2453 * delayed ref for that extent as well. This searches the delayed ref tree for 3458 * delayed ref for that extent as well. This searches the delayed ref tree for
2454 * a given extent, and if there are no other delayed refs to be processed, it 3459 * a given extent, and if there are no other delayed refs to be processed, it
@@ -2479,6 +3484,13 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2479 if (ref->bytenr == bytenr) 3484 if (ref->bytenr == bytenr)
2480 goto out; 3485 goto out;
2481 3486
3487 if (head->extent_op) {
3488 if (!head->must_insert_reserved)
3489 goto out;
3490 kfree(head->extent_op);
3491 head->extent_op = NULL;
3492 }
3493
2482 /* 3494 /*
2483 * waiting for the lock here would deadlock. If someone else has it 3495 * waiting for the lock here would deadlock. If someone else has it
2484 * locked they are already in the process of dropping it anyway 3496 * locked they are already in the process of dropping it anyway
@@ -2507,7 +3519,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2507 spin_unlock(&delayed_refs->lock); 3519 spin_unlock(&delayed_refs->lock);
2508 3520
2509 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 3521 ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
2510 &head->node, head->must_insert_reserved); 3522 &head->node, head->extent_op,
3523 head->must_insert_reserved);
2511 BUG_ON(ret); 3524 BUG_ON(ret);
2512 btrfs_put_delayed_ref(&head->node); 3525 btrfs_put_delayed_ref(&head->node);
2513 return 0; 3526 return 0;
@@ -2519,32 +3532,32 @@ out:
2519int btrfs_free_extent(struct btrfs_trans_handle *trans, 3532int btrfs_free_extent(struct btrfs_trans_handle *trans,
2520 struct btrfs_root *root, 3533 struct btrfs_root *root,
2521 u64 bytenr, u64 num_bytes, u64 parent, 3534 u64 bytenr, u64 num_bytes, u64 parent,
2522 u64 root_objectid, u64 ref_generation, 3535 u64 root_objectid, u64 owner, u64 offset)
2523 u64 owner_objectid, int pin)
2524{ 3536{
2525 int ret; 3537 int ret;
2526 3538
2527 /* 3539 /*
2528 * tree log blocks never actually go into the extent allocation 3540 * tree log blocks never actually go into the extent allocation
2529 * tree, just update pinning info and exit early. 3541 * tree, just update pinning info and exit early.
2530 *
2531 * data extents referenced by the tree log do need to have
2532 * their reference counts bumped.
2533 */ 3542 */
2534 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && 3543 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
2535 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 3544 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
2536 /* unlocks the pinned mutex */ 3545 /* unlocks the pinned mutex */
2537 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 3546 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2538 update_reserved_extents(root, bytenr, num_bytes, 0); 3547 update_reserved_extents(root, bytenr, num_bytes, 0);
2539 ret = 0; 3548 ret = 0;
2540 } else { 3549 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2541 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, 3550 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
2542 root_objectid, ref_generation, 3551 parent, root_objectid, (int)owner,
2543 owner_objectid, 3552 BTRFS_DROP_DELAYED_REF, NULL);
2544 BTRFS_DROP_DELAYED_REF, 1);
2545 BUG_ON(ret); 3553 BUG_ON(ret);
2546 ret = check_ref_cleanup(trans, root, bytenr); 3554 ret = check_ref_cleanup(trans, root, bytenr);
2547 BUG_ON(ret); 3555 BUG_ON(ret);
3556 } else {
3557 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
3558 parent, root_objectid, owner,
3559 offset, BTRFS_DROP_DELAYED_REF, NULL);
3560 BUG_ON(ret);
2548 } 3561 }
2549 return ret; 3562 return ret;
2550} 3563}
@@ -2557,6 +3570,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
2557} 3570}
2558 3571
2559/* 3572/*
3573 * when we wait for progress in the block group caching, it's because
3574 * our allocation attempt failed at least once. So, we must sleep
3575 * and let some progress happen before we try again.
3576 *
3577 * This function will sleep at least once waiting for new free space to
3578 * show up, and then it will check the block group free space numbers
3579 * for our min num_bytes. Another option is to have it go ahead
3580 * and look in the rbtree for a free extent of a given size, but this
3581 * is a good start.
3582 */
3583static noinline int
3584wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
3585 u64 num_bytes)
3586{
3587 DEFINE_WAIT(wait);
3588
3589 prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
3590
3591 if (block_group_cache_done(cache)) {
3592 finish_wait(&cache->caching_q, &wait);
3593 return 0;
3594 }
3595 schedule();
3596 finish_wait(&cache->caching_q, &wait);
3597
3598 wait_event(cache->caching_q, block_group_cache_done(cache) ||
3599 (cache->free_space >= num_bytes));
3600 return 0;
3601}
3602
3603enum btrfs_loop_type {
3604 LOOP_CACHED_ONLY = 0,
3605 LOOP_CACHING_NOWAIT = 1,
3606 LOOP_CACHING_WAIT = 2,
3607 LOOP_ALLOC_CHUNK = 3,
3608 LOOP_NO_EMPTY_SIZE = 4,
3609};
3610
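The btrfs_loop_type enum above replaces bare retry numbers with named escalation stages: first only fully cached groups, then uncached ones without waiting, then waiting on caching progress, then allocating a new chunk, and finally retrying without the empty-size padding. Schematically, with a hypothetical try_alloc() that only succeeds once waiting is permitted:

    #include <stdio.h>

    enum loop_type {                  /* mirrors btrfs_loop_type */
            LOOP_CACHED_ONLY,
            LOOP_CACHING_NOWAIT,
            LOOP_CACHING_WAIT,
            LOOP_ALLOC_CHUNK,
            LOOP_NO_EMPTY_SIZE,
    };

    static int try_alloc(int loop)
    {
            return loop >= LOOP_CACHING_WAIT;   /* hypothetical outcome */
    }

    int main(void)
    {
            for (int loop = LOOP_CACHED_ONLY; loop <= LOOP_NO_EMPTY_SIZE; loop++) {
                    if (try_alloc(loop)) {
                            printf("allocated at stage %d\n", loop);
                            return 0;
                    }
                    /* each failed pass relaxes one more constraint */
            }
            puts("ENOSPC");
            return 1;
    }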
3611/*
2560 * walks the btree of allocated extents and find a hole of a given size. 3612 * walks the btree of allocated extents and find a hole of a given size.
2561 * The key ins is changed to record the hole: 3613 * The key ins is changed to record the hole:
2562 * ins->objectid == block start 3614 * ins->objectid == block start
@@ -2581,6 +3633,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2581 struct btrfs_space_info *space_info; 3633 struct btrfs_space_info *space_info;
2582 int last_ptr_loop = 0; 3634 int last_ptr_loop = 0;
2583 int loop = 0; 3635 int loop = 0;
3636 bool found_uncached_bg = false;
2584 3637
2585 WARN_ON(num_bytes < root->sectorsize); 3638 WARN_ON(num_bytes < root->sectorsize);
2586 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 3639 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -2612,15 +3665,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2612 search_start = max(search_start, first_logical_byte(root, 0)); 3665 search_start = max(search_start, first_logical_byte(root, 0));
2613 search_start = max(search_start, hint_byte); 3666 search_start = max(search_start, hint_byte);
2614 3667
2615 if (!last_ptr) { 3668 if (!last_ptr)
2616 empty_cluster = 0; 3669 empty_cluster = 0;
2617 loop = 1;
2618 }
2619 3670
2620 if (search_start == hint_byte) { 3671 if (search_start == hint_byte) {
2621 block_group = btrfs_lookup_block_group(root->fs_info, 3672 block_group = btrfs_lookup_block_group(root->fs_info,
2622 search_start); 3673 search_start);
2623 if (block_group && block_group_bits(block_group, data)) { 3674 /*
3675 * we don't want to use the block group if it doesn't match our
3676 * allocation bits, or if its not cached.
3677 */
3678 if (block_group && block_group_bits(block_group, data) &&
3679 block_group_cache_done(block_group)) {
2624 down_read(&space_info->groups_sem); 3680 down_read(&space_info->groups_sem);
2625 if (list_empty(&block_group->list) || 3681 if (list_empty(&block_group->list) ||
2626 block_group->ro) { 3682 block_group->ro) {
@@ -2643,21 +3699,35 @@ search:
2643 down_read(&space_info->groups_sem); 3699 down_read(&space_info->groups_sem);
2644 list_for_each_entry(block_group, &space_info->block_groups, list) { 3700 list_for_each_entry(block_group, &space_info->block_groups, list) {
2645 u64 offset; 3701 u64 offset;
3702 int cached;
2646 3703
2647 atomic_inc(&block_group->count); 3704 atomic_inc(&block_group->count);
2648 search_start = block_group->key.objectid; 3705 search_start = block_group->key.objectid;
2649 3706
2650have_block_group: 3707have_block_group:
2651 if (unlikely(!block_group->cached)) { 3708 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
2652 mutex_lock(&block_group->cache_mutex); 3709 /*
2653 ret = cache_block_group(root, block_group); 3710 * we want to start caching kthreads, but not too many
2654 mutex_unlock(&block_group->cache_mutex); 3711 * right off the bat so we don't overwhelm the system,
2655 if (ret) { 3712 * so only start them if there are less than 2 and we're
2656 			btrfs_put_block_group(block_group); 3713 * so only start them if there are fewer than 2 and we're
2657 break; 3714 */
3715 if (loop > LOOP_CACHING_NOWAIT ||
3716 atomic_read(&space_info->caching_threads) < 2) {
3717 ret = cache_block_group(block_group);
3718 BUG_ON(ret);
2658 } 3719 }
2659 } 3720 }
2660 3721
3722 cached = block_group_cache_done(block_group);
3723 if (unlikely(!cached)) {
3724 found_uncached_bg = true;
3725
3726 /* if we only want cached bgs, loop */
3727 if (loop == LOOP_CACHED_ONLY)
3728 goto loop;
3729 }
3730
2661 if (unlikely(block_group->ro)) 3731 if (unlikely(block_group->ro))
2662 goto loop; 3732 goto loop;
2663 3733
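The hunk above also caps how many caching kthreads may run: during the early, non-waiting passes a block group only gets a cacher if fewer than two are already active, so cold mounts don't fan out one thread per block group. That admission check is a small atomic-counter pattern; in standalone C11 it might look like this (start_cacher() is a hypothetical thread starter that decrements the counter when its thread exits):

#include <stdatomic.h>
#include <stdbool.h>

#define MAX_CACHERS 2

static atomic_int caching_threads;

void start_cacher(void);	/* hypothetical background cacher */

/* returns true if a cacher was started; the fetch_add makes the
 * check-and-increment race free, whereas the kernel hunk's plain
 * atomic_read() check simply tolerates a small overshoot */
bool maybe_start_cacher(void)
{
	if (atomic_fetch_add(&caching_threads, 1) >= MAX_CACHERS) {
		atomic_fetch_sub(&caching_threads, 1);
		return false;
	}
	start_cacher();
	return true;
}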
@@ -2719,7 +3789,7 @@ refill_cluster:
2719 last_ptr_loop = 0; 3789 last_ptr_loop = 0;
2720 3790
2721 /* allocate a cluster in this block group */ 3791 /* allocate a cluster in this block group */
2722 ret = btrfs_find_space_cluster(trans, 3792 ret = btrfs_find_space_cluster(trans, root,
2723 block_group, last_ptr, 3793 block_group, last_ptr,
2724 offset, num_bytes, 3794 offset, num_bytes,
2725 empty_cluster + empty_size); 3795 empty_cluster + empty_size);
@@ -2736,14 +3806,21 @@ refill_cluster:
2736 spin_unlock(&last_ptr->refill_lock); 3806 spin_unlock(&last_ptr->refill_lock);
2737 goto checks; 3807 goto checks;
2738 } 3808 }
3809 } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
3810 spin_unlock(&last_ptr->refill_lock);
3811
3812 wait_block_group_cache_progress(block_group,
3813 num_bytes + empty_cluster + empty_size);
3814 goto have_block_group;
2739 } 3815 }
3816
2740 /* 3817 /*
2741 * at this point we either didn't find a cluster 3818 * at this point we either didn't find a cluster
2742 * or we weren't able to allocate a block from our 3819 * or we weren't able to allocate a block from our
2743 * cluster. Free the cluster we've been trying 3820 * cluster. Free the cluster we've been trying
2744 * to use, and go to the next block group 3821 * to use, and go to the next block group
2745 */ 3822 */
2746 if (loop < 2) { 3823 if (loop < LOOP_NO_EMPTY_SIZE) {
2747 btrfs_return_cluster_to_free_space(NULL, 3824 btrfs_return_cluster_to_free_space(NULL,
2748 last_ptr); 3825 last_ptr);
2749 spin_unlock(&last_ptr->refill_lock); 3826 spin_unlock(&last_ptr->refill_lock);
@@ -2754,11 +3831,17 @@ refill_cluster:
2754 3831
2755 offset = btrfs_find_space_for_alloc(block_group, search_start, 3832 offset = btrfs_find_space_for_alloc(block_group, search_start,
2756 num_bytes, empty_size); 3833 num_bytes, empty_size);
2757 if (!offset) 3834 if (!offset && (cached || (!cached &&
3835 loop == LOOP_CACHING_NOWAIT))) {
2758 goto loop; 3836 goto loop;
3837 } else if (!offset && (!cached &&
3838 loop > LOOP_CACHING_NOWAIT)) {
3839 wait_block_group_cache_progress(block_group,
3840 num_bytes + empty_size);
3841 goto have_block_group;
3842 }
2759checks: 3843checks:
2760 search_start = stripe_align(root, offset); 3844 search_start = stripe_align(root, offset);
2761
2762 /* move on to the next group */ 3845 /* move on to the next group */
2763 if (search_start + num_bytes >= search_end) { 3846 if (search_start + num_bytes >= search_end) {
2764 btrfs_add_free_space(block_group, offset, num_bytes); 3847 btrfs_add_free_space(block_group, offset, num_bytes);
@@ -2804,13 +3887,26 @@ loop:
2804 } 3887 }
2805 up_read(&space_info->groups_sem); 3888 up_read(&space_info->groups_sem);
2806 3889
2807 /* loop == 0, try to find a clustered alloc in every block group 3890 /* LOOP_CACHED_ONLY, only search fully cached block groups
2808 * loop == 1, try again after forcing a chunk allocation 3891 * LOOP_CACHING_NOWAIT, search partially cached block groups, but
2809 * loop == 2, set empty_size and empty_cluster to 0 and try again 3892 * don't wait for them to finish caching
3893 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3894 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3895 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
3896 * again
2810 */ 3897 */
2811 if (!ins->objectid && loop < 3 && 3898 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
2812 (empty_size || empty_cluster || allowed_chunk_alloc)) { 3899 (found_uncached_bg || empty_size || empty_cluster ||
2813 if (loop >= 2) { 3900 allowed_chunk_alloc)) {
3901 if (found_uncached_bg) {
3902 found_uncached_bg = false;
3903 if (loop < LOOP_CACHING_WAIT) {
3904 loop++;
3905 goto search;
3906 }
3907 }
3908
3909 if (loop == LOOP_ALLOC_CHUNK) {
2814 empty_size = 0; 3910 empty_size = 0;
2815 empty_cluster = 0; 3911 empty_cluster = 0;
2816 } 3912 }
@@ -2823,7 +3919,7 @@ loop:
2823 space_info->force_alloc = 1; 3919 space_info->force_alloc = 1;
2824 } 3920 }
2825 3921
2826 if (loop < 3) { 3922 if (loop < LOOP_NO_EMPTY_SIZE) {
2827 loop++; 3923 loop++;
2828 goto search; 3924 goto search;
2829 } 3925 }
@@ -2919,7 +4015,7 @@ again:
2919 num_bytes, data, 1); 4015 num_bytes, data, 1);
2920 goto again; 4016 goto again;
2921 } 4017 }
2922 if (ret) { 4018 if (ret == -ENOSPC) {
2923 struct btrfs_space_info *sinfo; 4019 struct btrfs_space_info *sinfo;
2924 4020
2925 sinfo = __find_space_info(root->fs_info, data); 4021 sinfo = __find_space_info(root->fs_info, data);
@@ -2927,7 +4023,6 @@ again:
2927 "wanted %llu\n", (unsigned long long)data, 4023 "wanted %llu\n", (unsigned long long)data,
2928 (unsigned long long)num_bytes); 4024 (unsigned long long)num_bytes);
2929 dump_space_info(sinfo, num_bytes); 4025 dump_space_info(sinfo, num_bytes);
2930 BUG();
2931 } 4026 }
2932 4027
2933 return ret; 4028 return ret;
@@ -2965,103 +4060,153 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2965 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, 4060 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
2966 empty_size, hint_byte, search_end, ins, 4061 empty_size, hint_byte, search_end, ins,
2967 data); 4062 data);
2968 update_reserved_extents(root, ins->objectid, ins->offset, 1); 4063 if (!ret)
4064 update_reserved_extents(root, ins->objectid, ins->offset, 1);
4065
2969 return ret; 4066 return ret;
2970} 4067}
2971 4068
2972static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 4069static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
2973 struct btrfs_root *root, u64 parent, 4070 struct btrfs_root *root,
2974 u64 root_objectid, u64 ref_generation, 4071 u64 parent, u64 root_objectid,
2975 u64 owner, struct btrfs_key *ins, 4072 u64 flags, u64 owner, u64 offset,
2976 int ref_mod) 4073 struct btrfs_key *ins, int ref_mod)
2977{ 4074{
2978 int ret; 4075 int ret;
2979 u64 super_used; 4076 struct btrfs_fs_info *fs_info = root->fs_info;
2980 u64 root_used;
2981 u64 num_bytes = ins->offset;
2982 u32 sizes[2];
2983 struct btrfs_fs_info *info = root->fs_info;
2984 struct btrfs_root *extent_root = info->extent_root;
2985 struct btrfs_extent_item *extent_item; 4077 struct btrfs_extent_item *extent_item;
2986 struct btrfs_extent_ref *ref; 4078 struct btrfs_extent_inline_ref *iref;
2987 struct btrfs_path *path; 4079 struct btrfs_path *path;
2988 struct btrfs_key keys[2]; 4080 struct extent_buffer *leaf;
2989 4081 int type;
2990 if (parent == 0) 4082 u32 size;
2991 parent = ins->objectid;
2992
2993 /* block accounting for super block */
2994 spin_lock(&info->delalloc_lock);
2995 super_used = btrfs_super_bytes_used(&info->super_copy);
2996 btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
2997 4083
2998 /* block accounting for root item */ 4084 if (parent > 0)
2999 root_used = btrfs_root_used(&root->root_item); 4085 type = BTRFS_SHARED_DATA_REF_KEY;
3000 btrfs_set_root_used(&root->root_item, root_used + num_bytes); 4086 else
3001 spin_unlock(&info->delalloc_lock); 4087 type = BTRFS_EXTENT_DATA_REF_KEY;
3002 4088
3003 memcpy(&keys[0], ins, sizeof(*ins)); 4089 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
3004 keys[1].objectid = ins->objectid;
3005 keys[1].type = BTRFS_EXTENT_REF_KEY;
3006 keys[1].offset = parent;
3007 sizes[0] = sizeof(*extent_item);
3008 sizes[1] = sizeof(*ref);
3009 4090
3010 path = btrfs_alloc_path(); 4091 path = btrfs_alloc_path();
3011 BUG_ON(!path); 4092 BUG_ON(!path);
3012 4093
3013 path->leave_spinning = 1; 4094 path->leave_spinning = 1;
3014 ret = btrfs_insert_empty_items(trans, extent_root, path, keys, 4095 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
3015 sizes, 2); 4096 ins, size);
3016 BUG_ON(ret); 4097 BUG_ON(ret);
3017 4098
3018 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4099 leaf = path->nodes[0];
4100 extent_item = btrfs_item_ptr(leaf, path->slots[0],
3019 struct btrfs_extent_item); 4101 struct btrfs_extent_item);
3020 btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod); 4102 btrfs_set_extent_refs(leaf, extent_item, ref_mod);
3021 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 4103 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
3022 struct btrfs_extent_ref); 4104 btrfs_set_extent_flags(leaf, extent_item,
3023 4105 flags | BTRFS_EXTENT_FLAG_DATA);
3024 btrfs_set_ref_root(path->nodes[0], ref, root_objectid); 4106
3025 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); 4107 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
3026 btrfs_set_ref_objectid(path->nodes[0], ref, owner); 4108 btrfs_set_extent_inline_ref_type(leaf, iref, type);
3027 btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod); 4109 if (parent > 0) {
4110 struct btrfs_shared_data_ref *ref;
4111 ref = (struct btrfs_shared_data_ref *)(iref + 1);
4112 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4113 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4114 } else {
4115 struct btrfs_extent_data_ref *ref;
4116 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4117 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4118 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4119 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4120 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4121 }
3028 4122
3029 btrfs_mark_buffer_dirty(path->nodes[0]); 4123 btrfs_mark_buffer_dirty(path->nodes[0]);
3030
3031 trans->alloc_exclude_start = 0;
3032 trans->alloc_exclude_nr = 0;
3033 btrfs_free_path(path); 4124 btrfs_free_path(path);
3034 4125
3035 if (ret) 4126 ret = update_block_group(trans, root, ins->objectid, ins->offset,
3036 goto out; 4127 1, 0);
3037
3038 ret = update_block_group(trans, root, ins->objectid,
3039 ins->offset, 1, 0);
3040 if (ret) { 4128 if (ret) {
3041 printk(KERN_ERR "btrfs update block group failed for %llu " 4129 printk(KERN_ERR "btrfs update block group failed for %llu "
3042 "%llu\n", (unsigned long long)ins->objectid, 4130 "%llu\n", (unsigned long long)ins->objectid,
3043 (unsigned long long)ins->offset); 4131 (unsigned long long)ins->offset);
3044 BUG(); 4132 BUG();
3045 } 4133 }
3046out:
3047 return ret; 4134 return ret;
3048} 4135}
3049 4136
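alloc_reserved_file_extent() above writes the new unified extent-item layout: a btrfs_extent_item followed immediately by one inline reference, whose shape depends on whether the ref is keyed by parent block (shared) or by root/owner/offset. The item-size arithmetic can be checked with simplified packed structs (these mirror the on-disk layout only approximately and are not the real definitions):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct extent_item { uint64_t refs, generation, flags; } __attribute__((packed));
struct inline_ref { uint8_t type; uint64_t offset; } __attribute__((packed));
struct shared_data_ref { uint32_t count; } __attribute__((packed));
struct extent_data_ref {
	uint64_t root, objectid, offset;
	uint32_t count;
} __attribute__((packed));

enum { SHARED_DATA_REF, EXTENT_DATA_REF };

/* models btrfs_extent_inline_ref_size(): a shared ref stores the
 * parent bytenr in the inline ref's own offset field and appends a
 * count, while a data ref overlays its payload starting at that
 * offset field (hence the offsetof), like the (&iref->offset) cast
 * in the hunk above */
static size_t inline_ref_size(int type)
{
	if (type == SHARED_DATA_REF)
		return sizeof(struct inline_ref) +
		       sizeof(struct shared_data_ref);
	return offsetof(struct inline_ref, offset) +
	       sizeof(struct extent_data_ref);
}

int main(void)
{
	printf("shared-ref item: %zu bytes\n",	/* 24 + 13 = 37 */
	       sizeof(struct extent_item) + inline_ref_size(SHARED_DATA_REF));
	printf("data-ref item:   %zu bytes\n",	/* 24 + 29 = 53 */
	       sizeof(struct extent_item) + inline_ref_size(EXTENT_DATA_REF));
	return 0;
}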
3050int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 4137static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
3051 struct btrfs_root *root, u64 parent, 4138 struct btrfs_root *root,
3052 u64 root_objectid, u64 ref_generation, 4139 u64 parent, u64 root_objectid,
3053 u64 owner, struct btrfs_key *ins) 4140 u64 flags, struct btrfs_disk_key *key,
4141 int level, struct btrfs_key *ins)
3054{ 4142{
3055 int ret; 4143 int ret;
4144 struct btrfs_fs_info *fs_info = root->fs_info;
4145 struct btrfs_extent_item *extent_item;
4146 struct btrfs_tree_block_info *block_info;
4147 struct btrfs_extent_inline_ref *iref;
4148 struct btrfs_path *path;
4149 struct extent_buffer *leaf;
4150 u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
3056 4151
3057 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) 4152 path = btrfs_alloc_path();
3058 return 0; 4153 BUG_ON(!path);
3059 4154
3060 ret = btrfs_add_delayed_ref(trans, ins->objectid, 4155 path->leave_spinning = 1;
3061 ins->offset, parent, root_objectid, 4156 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
3062 ref_generation, owner, 4157 ins, size);
3063 BTRFS_ADD_DELAYED_EXTENT, 0);
3064 BUG_ON(ret); 4158 BUG_ON(ret);
4159
4160 leaf = path->nodes[0];
4161 extent_item = btrfs_item_ptr(leaf, path->slots[0],
4162 struct btrfs_extent_item);
4163 btrfs_set_extent_refs(leaf, extent_item, 1);
4164 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4165 btrfs_set_extent_flags(leaf, extent_item,
4166 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4167 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4168
4169 btrfs_set_tree_block_key(leaf, block_info, key);
4170 btrfs_set_tree_block_level(leaf, block_info, level);
4171
4172 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4173 if (parent > 0) {
4174 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
4175 btrfs_set_extent_inline_ref_type(leaf, iref,
4176 BTRFS_SHARED_BLOCK_REF_KEY);
4177 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4178 } else {
4179 btrfs_set_extent_inline_ref_type(leaf, iref,
4180 BTRFS_TREE_BLOCK_REF_KEY);
4181 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
4182 }
4183
4184 btrfs_mark_buffer_dirty(leaf);
4185 btrfs_free_path(path);
4186
4187 ret = update_block_group(trans, root, ins->objectid, ins->offset,
4188 1, 0);
4189 if (ret) {
4190 printk(KERN_ERR "btrfs update block group failed for %llu "
4191 "%llu\n", (unsigned long long)ins->objectid,
4192 (unsigned long long)ins->offset);
4193 BUG();
4194 }
4195 return ret;
4196}
4197
4198int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4199 struct btrfs_root *root,
4200 u64 root_objectid, u64 owner,
4201 u64 offset, struct btrfs_key *ins)
4202{
4203 int ret;
4204
4205 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
4206
4207 ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
4208 0, root_objectid, owner, offset,
4209 BTRFS_ADD_DELAYED_EXTENT, NULL);
3065 return ret; 4210 return ret;
3066} 4211}
3067 4212
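Note how little btrfs_alloc_reserved_file_extent() now does: instead of inserting extent items inline, it queues a BTRFS_ADD_DELAYED_EXTENT record and lets delayed-ref processing batch the b-tree updates later. Stripped of btrfs specifics, that deferral is just a per-transaction list of ref deltas (a hedged sketch; every name below is invented):

#include <stdlib.h>

enum ref_action { ADD_DELAYED_EXTENT, ADD_DELAYED_REF, DROP_DELAYED_REF };

struct delayed_ref {
	unsigned long long bytenr;
	unsigned long long num_bytes;
	enum ref_action action;
	struct delayed_ref *next;
};

struct transaction { struct delayed_ref *head; };

/* record a ref change now; the expensive extent-tree update happens
 * once, later, when the transaction runs its delayed refs */
int add_delayed_ref(struct transaction *t, unsigned long long bytenr,
		    unsigned long long num_bytes, enum ref_action action)
{
	struct delayed_ref *r = malloc(sizeof(*r));

	if (!r)
		return -1;
	*r = (struct delayed_ref){ bytenr, num_bytes, action, t->head };
	t->head = r;
	return 0;
}

A run-delayed-refs step would then merge adds and drops per bytenr and apply only the net change, which is the point of the delayed-ref machinery this patch builds on.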
@@ -3070,25 +4215,25 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3070 * an extent has been allocated and makes sure to clear the free 4215 * an extent has been allocated and makes sure to clear the free
3071 * space cache bits as well 4216 * space cache bits as well
3072 */ 4217 */
3073int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, 4218int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
3074 struct btrfs_root *root, u64 parent, 4219 struct btrfs_root *root,
3075 u64 root_objectid, u64 ref_generation, 4220 u64 root_objectid, u64 owner, u64 offset,
3076 u64 owner, struct btrfs_key *ins) 4221 struct btrfs_key *ins)
3077{ 4222{
3078 int ret; 4223 int ret;
3079 struct btrfs_block_group_cache *block_group; 4224 struct btrfs_block_group_cache *block_group;
3080 4225
3081 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 4226 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
3082 mutex_lock(&block_group->cache_mutex); 4227 cache_block_group(block_group);
3083 cache_block_group(root, block_group); 4228 wait_event(block_group->caching_q,
3084 mutex_unlock(&block_group->cache_mutex); 4229 block_group_cache_done(block_group));
3085 4230
3086 ret = btrfs_remove_free_space(block_group, ins->objectid, 4231 ret = btrfs_remove_free_space(block_group, ins->objectid,
3087 ins->offset); 4232 ins->offset);
3088 BUG_ON(ret); 4233 BUG_ON(ret);
3089 btrfs_put_block_group(block_group); 4234 btrfs_put_block_group(block_group);
3090 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 4235 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
3091 ref_generation, owner, ins, 1); 4236 0, owner, offset, ins, 1);
3092 return ret; 4237 return ret;
3093} 4238}
3094 4239
@@ -3099,26 +4244,49 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3099 * 4244 *
3100 * returns 0 if everything worked, non-zero otherwise. 4245 * returns 0 if everything worked, non-zero otherwise.
3101 */ 4246 */
3102int btrfs_alloc_extent(struct btrfs_trans_handle *trans, 4247static int alloc_tree_block(struct btrfs_trans_handle *trans,
3103 struct btrfs_root *root, 4248 struct btrfs_root *root,
3104 u64 num_bytes, u64 parent, u64 min_alloc_size, 4249 u64 num_bytes, u64 parent, u64 root_objectid,
3105 u64 root_objectid, u64 ref_generation, 4250 struct btrfs_disk_key *key, int level,
3106 u64 owner_objectid, u64 empty_size, u64 hint_byte, 4251 u64 empty_size, u64 hint_byte, u64 search_end,
3107 u64 search_end, struct btrfs_key *ins, u64 data) 4252 struct btrfs_key *ins)
3108{ 4253{
3109 int ret; 4254 int ret;
3110 ret = __btrfs_reserve_extent(trans, root, num_bytes, 4255 u64 flags = 0;
3111 min_alloc_size, empty_size, hint_byte, 4256
3112 search_end, ins, data); 4257 ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
3113 BUG_ON(ret); 4258 empty_size, hint_byte, search_end,
4259 ins, 0);
4260 if (ret)
4261 return ret;
4262
4263 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4264 if (parent == 0)
4265 parent = ins->objectid;
4266 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4267 } else
4268 BUG_ON(parent > 0);
4269
4270 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3114 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 4271 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3115 ret = btrfs_add_delayed_ref(trans, ins->objectid, 4272 struct btrfs_delayed_extent_op *extent_op;
3116 ins->offset, parent, root_objectid, 4273 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3117 ref_generation, owner_objectid, 4274 BUG_ON(!extent_op);
3118 BTRFS_ADD_DELAYED_EXTENT, 0); 4275 if (key)
4276 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4277 else
4278 memset(&extent_op->key, 0, sizeof(extent_op->key));
4279 extent_op->flags_to_set = flags;
4280 extent_op->update_key = 1;
4281 extent_op->update_flags = 1;
4282 extent_op->is_data = 0;
4283
4284 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4285 ins->offset, parent, root_objectid,
4286 level, BTRFS_ADD_DELAYED_EXTENT,
4287 extent_op);
3119 BUG_ON(ret); 4288 BUG_ON(ret);
3120 } 4289 }
3121 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3122 return ret; 4290 return ret;
3123} 4291}
3124 4292
@@ -3157,21 +4325,17 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3157 * returns the tree buffer or NULL. 4325 * returns the tree buffer or NULL.
3158 */ 4326 */
3159struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 4327struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3160 struct btrfs_root *root, 4328 struct btrfs_root *root, u32 blocksize,
3161 u32 blocksize, u64 parent, 4329 u64 parent, u64 root_objectid,
3162 u64 root_objectid, 4330 struct btrfs_disk_key *key, int level,
3163 u64 ref_generation, 4331 u64 hint, u64 empty_size)
3164 int level,
3165 u64 hint,
3166 u64 empty_size)
3167{ 4332{
3168 struct btrfs_key ins; 4333 struct btrfs_key ins;
3169 int ret; 4334 int ret;
3170 struct extent_buffer *buf; 4335 struct extent_buffer *buf;
3171 4336
3172 ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, 4337 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
3173 root_objectid, ref_generation, level, 4338 key, level, empty_size, hint, (u64)-1, &ins);
3174 empty_size, hint, (u64)-1, &ins, 0);
3175 if (ret) { 4339 if (ret) {
3176 BUG_ON(ret > 0); 4340 BUG_ON(ret > 0);
3177 return ERR_PTR(ret); 4341 return ERR_PTR(ret);
@@ -3182,35 +4346,23 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3182 return buf; 4346 return buf;
3183} 4347}
3184 4348
4349#if 0
3185int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 4350int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3186 struct btrfs_root *root, struct extent_buffer *leaf) 4351 struct btrfs_root *root, struct extent_buffer *leaf)
3187{ 4352{
3188 u64 leaf_owner; 4353 u64 disk_bytenr;
3189 u64 leaf_generation; 4354 u64 num_bytes;
3190 struct refsort *sorted;
3191 struct btrfs_key key; 4355 struct btrfs_key key;
3192 struct btrfs_file_extent_item *fi; 4356 struct btrfs_file_extent_item *fi;
4357 u32 nritems;
3193 int i; 4358 int i;
3194 int nritems;
3195 int ret; 4359 int ret;
3196 int refi = 0;
3197 int slot;
3198 4360
3199 BUG_ON(!btrfs_is_leaf(leaf)); 4361 BUG_ON(!btrfs_is_leaf(leaf));
3200 nritems = btrfs_header_nritems(leaf); 4362 nritems = btrfs_header_nritems(leaf);
3201 leaf_owner = btrfs_header_owner(leaf);
3202 leaf_generation = btrfs_header_generation(leaf);
3203 4363
3204 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3205 /* we do this loop twice. The first time we build a list
3206 * of the extents we have a reference on, then we sort the list
3207 * by bytenr. The second time around we actually do the
3208 * extent freeing.
3209 */
3210 for (i = 0; i < nritems; i++) { 4364 for (i = 0; i < nritems; i++) {
3211 u64 disk_bytenr;
3212 cond_resched(); 4365 cond_resched();
3213
3214 btrfs_item_key_to_cpu(leaf, &key, i); 4366 btrfs_item_key_to_cpu(leaf, &key, i);
3215 4367
3216 /* only extents have references, skip everything else */ 4368 /* only extents have references, skip everything else */
@@ -3230,42 +4382,11 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3230 if (disk_bytenr == 0) 4382 if (disk_bytenr == 0)
3231 continue; 4383 continue;
3232 4384
3233 sorted[refi].bytenr = disk_bytenr; 4385 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
3234 sorted[refi].slot = i; 4386 ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
3235 refi++; 4387 leaf->start, 0, key.objectid, 0);
3236 }
3237
3238 if (refi == 0)
3239 goto out;
3240
3241 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3242
3243 for (i = 0; i < refi; i++) {
3244 u64 disk_bytenr;
3245
3246 disk_bytenr = sorted[i].bytenr;
3247 slot = sorted[i].slot;
3248
3249 cond_resched();
3250
3251 btrfs_item_key_to_cpu(leaf, &key, slot);
3252 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3253 continue;
3254
3255 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3256
3257 ret = btrfs_free_extent(trans, root, disk_bytenr,
3258 btrfs_file_extent_disk_num_bytes(leaf, fi),
3259 leaf->start, leaf_owner, leaf_generation,
3260 key.objectid, 0);
3261 BUG_ON(ret); 4388 BUG_ON(ret);
3262
3263 atomic_inc(&root->fs_info->throttle_gen);
3264 wake_up(&root->fs_info->transaction_throttle);
3265 cond_resched();
3266 } 4389 }
3267out:
3268 kfree(sorted);
3269 return 0; 4390 return 0;
3270} 4391}
3271 4392
@@ -3311,13 +4432,14 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3311 return 0; 4432 return 0;
3312} 4433}
3313 4434
4435
3314static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, 4436static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3315 struct btrfs_root *root, u64 start, 4437 struct btrfs_root *root, u64 start,
3316 u64 len, u32 *refs) 4438 u64 len, u32 *refs)
3317{ 4439{
3318 int ret; 4440 int ret;
3319 4441
3320 ret = btrfs_lookup_extent_ref(trans, root, start, len, refs); 4442 ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
3321 BUG_ON(ret); 4443 BUG_ON(ret);
3322 4444
3323#if 0 /* some debugging code in case we see problems here */ 4445#if 0 /* some debugging code in case we see problems here */
@@ -3352,6 +4474,7 @@ static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3352 return ret; 4474 return ret;
3353} 4475}
3354 4476
4477
3355/* 4478/*
3356 * this is used while deleting old snapshots, and it drops the refs 4479 * this is used while deleting old snapshots, and it drops the refs
3357 * on a whole subtree starting from a level 1 node. 4480 * on a whole subtree starting from a level 1 node.
@@ -3645,279 +4768,473 @@ out:
3645 cond_resched(); 4768 cond_resched();
3646 return 0; 4769 return 0;
3647} 4770}
4771#endif
4772
4773struct walk_control {
4774 u64 refs[BTRFS_MAX_LEVEL];
4775 u64 flags[BTRFS_MAX_LEVEL];
4776 struct btrfs_key update_progress;
4777 int stage;
4778 int level;
4779 int shared_level;
4780 int update_ref;
4781 int keep_locks;
4782};
4783
4784#define DROP_REFERENCE 1
4785#define UPDATE_BACKREF 2
3648 4786
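walk_control bundles everything the rewritten walker threads through the traversal: per-level ref counts and flags, a resume key, and a stage flag that flips between DROP_REFERENCE and UPDATE_BACKREF. The two stage transitions can be modeled on their own (an illustrative reduction, not the kernel logic verbatim):

#include <stdbool.h>

#define MAX_LEVEL	8
#define DROP_REFERENCE	1
#define UPDATE_BACKREF	2

struct walk_ctl {
	unsigned long long refs[MAX_LEVEL];	/* refcount per level */
	int stage;
	int level;
	int shared_level;	/* level where UPDATE_BACKREF began */
	bool update_ref;
};

/* going down: a shared block (refs > 1) ends the drop; if backrefs
 * below it must be rewritten first, switch stage instead of stopping */
bool down_should_stop(struct walk_ctl *wc)
{
	if (wc->stage == DROP_REFERENCE && wc->refs[wc->level] > 1) {
		if (wc->update_ref) {
			wc->stage = UPDATE_BACKREF;
			wc->shared_level = wc->level;
			return false;	/* keep walking, new purpose */
		}
		return true;		/* subtree stays alive: prune it */
	}
	return false;
}

/* coming back up past shared_level: backref fixing for that subtree
 * is complete, so resume dropping references */
void up_maybe_resume_drop(struct walk_ctl *wc)
{
	if (wc->stage == UPDATE_BACKREF && wc->level >= wc->shared_level) {
		wc->stage = DROP_REFERENCE;
		wc->shared_level = -1;
	}
}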
3649/* 4787/*
3650 * helper function for drop_subtree, this function is similar to 4788 * helper to process tree block while walking down the tree.
3651 * walk_down_tree. The main difference is that it checks reference 4789 *
3652 * counts while tree blocks are locked. 4790 * when wc->stage == DROP_REFERENCE, this function checks
4791 * reference count of the block. if the block is shared and
 4792 * we need to update back refs for the subtree rooted at the
4793 * block, this function changes wc->stage to UPDATE_BACKREF
4794 *
4795 * when wc->stage == UPDATE_BACKREF, this function updates
4796 * back refs for pointers in the block.
4797 *
4798 * NOTE: return value 1 means we should stop walking down.
3653 */ 4799 */
3654static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, 4800static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
3655 struct btrfs_root *root, 4801 struct btrfs_root *root,
3656 struct btrfs_path *path, int *level) 4802 struct btrfs_path *path,
4803 struct walk_control *wc)
4804{
4805 int level = wc->level;
4806 struct extent_buffer *eb = path->nodes[level];
4807 struct btrfs_key key;
4808 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
4809 int ret;
4810
4811 if (wc->stage == UPDATE_BACKREF &&
4812 btrfs_header_owner(eb) != root->root_key.objectid)
4813 return 1;
4814
4815 /*
4816 * when reference count of tree block is 1, it won't increase
4817 * again. once full backref flag is set, we never clear it.
4818 */
4819 if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
4820 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
4821 BUG_ON(!path->locks[level]);
4822 ret = btrfs_lookup_extent_info(trans, root,
4823 eb->start, eb->len,
4824 &wc->refs[level],
4825 &wc->flags[level]);
4826 BUG_ON(ret);
4827 BUG_ON(wc->refs[level] == 0);
4828 }
4829
4830 if (wc->stage == DROP_REFERENCE &&
4831 wc->update_ref && wc->refs[level] > 1) {
4832 BUG_ON(eb == root->node);
4833 BUG_ON(path->slots[level] > 0);
4834 if (level == 0)
4835 btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
4836 else
4837 btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
4838 if (btrfs_header_owner(eb) == root->root_key.objectid &&
4839 btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
4840 wc->stage = UPDATE_BACKREF;
4841 wc->shared_level = level;
4842 }
4843 }
4844
4845 if (wc->stage == DROP_REFERENCE) {
4846 if (wc->refs[level] > 1)
4847 return 1;
4848
4849 if (path->locks[level] && !wc->keep_locks) {
4850 btrfs_tree_unlock(eb);
4851 path->locks[level] = 0;
4852 }
4853 return 0;
4854 }
4855
4856 /* wc->stage == UPDATE_BACKREF */
4857 if (!(wc->flags[level] & flag)) {
4858 BUG_ON(!path->locks[level]);
4859 ret = btrfs_inc_ref(trans, root, eb, 1);
4860 BUG_ON(ret);
4861 ret = btrfs_dec_ref(trans, root, eb, 0);
4862 BUG_ON(ret);
4863 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
4864 eb->len, flag, 0);
4865 BUG_ON(ret);
4866 wc->flags[level] |= flag;
4867 }
4868
4869 /*
4870 * the block is shared by multiple trees, so it's not good to
4871 * keep the tree lock
4872 */
4873 if (path->locks[level] && level > 0) {
4874 btrfs_tree_unlock(eb);
4875 path->locks[level] = 0;
4876 }
4877 return 0;
4878}
4879
4880/*
 4881 * helper to process tree block while walking up the tree.
4882 *
4883 * when wc->stage == DROP_REFERENCE, this function drops
4884 * reference count on the block.
4885 *
4886 * when wc->stage == UPDATE_BACKREF, this function changes
4887 * wc->stage back to DROP_REFERENCE if we changed wc->stage
4888 * to UPDATE_BACKREF previously while processing the block.
4889 *
4890 * NOTE: return value 1 means we should stop walking up.
4891 */
4892static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
4893 struct btrfs_root *root,
4894 struct btrfs_path *path,
4895 struct walk_control *wc)
4896{
4897 int ret = 0;
4898 int level = wc->level;
4899 struct extent_buffer *eb = path->nodes[level];
4900 u64 parent = 0;
4901
4902 if (wc->stage == UPDATE_BACKREF) {
4903 BUG_ON(wc->shared_level < level);
4904 if (level < wc->shared_level)
4905 goto out;
4906
4907 BUG_ON(wc->refs[level] <= 1);
4908 ret = find_next_key(path, level + 1, &wc->update_progress);
4909 if (ret > 0)
4910 wc->update_ref = 0;
4911
4912 wc->stage = DROP_REFERENCE;
4913 wc->shared_level = -1;
4914 path->slots[level] = 0;
4915
4916 /*
4917 * check reference count again if the block isn't locked.
4918 * we should start walking down the tree again if reference
4919 * count is one.
4920 */
4921 if (!path->locks[level]) {
4922 BUG_ON(level == 0);
4923 btrfs_tree_lock(eb);
4924 btrfs_set_lock_blocking(eb);
4925 path->locks[level] = 1;
4926
4927 ret = btrfs_lookup_extent_info(trans, root,
4928 eb->start, eb->len,
4929 &wc->refs[level],
4930 &wc->flags[level]);
4931 BUG_ON(ret);
4932 BUG_ON(wc->refs[level] == 0);
4933 if (wc->refs[level] == 1) {
4934 btrfs_tree_unlock(eb);
4935 path->locks[level] = 0;
4936 return 1;
4937 }
4938 } else {
4939 BUG_ON(level != 0);
4940 }
4941 }
4942
4943 /* wc->stage == DROP_REFERENCE */
4944 BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
4945
4946 if (wc->refs[level] == 1) {
4947 if (level == 0) {
4948 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
4949 ret = btrfs_dec_ref(trans, root, eb, 1);
4950 else
4951 ret = btrfs_dec_ref(trans, root, eb, 0);
4952 BUG_ON(ret);
4953 }
4954 /* make block locked assertion in clean_tree_block happy */
4955 if (!path->locks[level] &&
4956 btrfs_header_generation(eb) == trans->transid) {
4957 btrfs_tree_lock(eb);
4958 btrfs_set_lock_blocking(eb);
4959 path->locks[level] = 1;
4960 }
4961 clean_tree_block(trans, root, eb);
4962 }
4963
4964 if (eb == root->node) {
4965 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
4966 parent = eb->start;
4967 else
4968 BUG_ON(root->root_key.objectid !=
4969 btrfs_header_owner(eb));
4970 } else {
4971 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
4972 parent = path->nodes[level + 1]->start;
4973 else
4974 BUG_ON(root->root_key.objectid !=
4975 btrfs_header_owner(path->nodes[level + 1]));
4976 }
4977
4978 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
4979 root->root_key.objectid, level, 0);
4980 BUG_ON(ret);
4981out:
4982 wc->refs[level] = 0;
4983 wc->flags[level] = 0;
4984 return ret;
4985}
4986
4987static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4988 struct btrfs_root *root,
4989 struct btrfs_path *path,
4990 struct walk_control *wc)
3657{ 4991{
3658 struct extent_buffer *next; 4992 struct extent_buffer *next;
3659 struct extent_buffer *cur; 4993 struct extent_buffer *cur;
3660 struct extent_buffer *parent;
3661 u64 bytenr; 4994 u64 bytenr;
3662 u64 ptr_gen; 4995 u64 ptr_gen;
3663 u32 blocksize; 4996 u32 blocksize;
3664 u32 refs; 4997 int level = wc->level;
3665 int ret; 4998 int ret;
3666 4999
3667 cur = path->nodes[*level]; 5000 while (level >= 0) {
3668 ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, 5001 cur = path->nodes[level];
3669 &refs); 5002 BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
3670 BUG_ON(ret);
3671 if (refs > 1)
3672 goto out;
3673 5003
3674 while (*level >= 0) { 5004 ret = walk_down_proc(trans, root, path, wc);
3675 cur = path->nodes[*level]; 5005 if (ret > 0)
3676 if (*level == 0) {
3677 ret = btrfs_drop_leaf_ref(trans, root, cur);
3678 BUG_ON(ret);
3679 clean_tree_block(trans, root, cur);
3680 break; 5006 break;
3681 } 5007
3682 if (path->slots[*level] >= btrfs_header_nritems(cur)) { 5008 if (level == 0)
3683 clean_tree_block(trans, root, cur);
3684 break; 5009 break;
3685 }
3686 5010
3687 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 5011 bytenr = btrfs_node_blockptr(cur, path->slots[level]);
3688 blocksize = btrfs_level_size(root, *level - 1); 5012 blocksize = btrfs_level_size(root, level - 1);
3689 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 5013 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
3690 5014
3691 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 5015 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3692 btrfs_tree_lock(next); 5016 btrfs_tree_lock(next);
3693 btrfs_set_lock_blocking(next); 5017 btrfs_set_lock_blocking(next);
3694 5018
3695 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, 5019 level--;
3696 &refs); 5020 BUG_ON(level != btrfs_header_level(next));
3697 BUG_ON(ret); 5021 path->nodes[level] = next;
3698 if (refs > 1) { 5022 path->slots[level] = 0;
3699 parent = path->nodes[*level]; 5023 path->locks[level] = 1;
3700 ret = btrfs_free_extent(trans, root, bytenr, 5024 wc->level = level;
3701 blocksize, parent->start,
3702 btrfs_header_owner(parent),
3703 btrfs_header_generation(parent),
3704 *level - 1, 1);
3705 BUG_ON(ret);
3706 path->slots[*level]++;
3707 btrfs_tree_unlock(next);
3708 free_extent_buffer(next);
3709 continue;
3710 }
3711
3712 *level = btrfs_header_level(next);
3713 path->nodes[*level] = next;
3714 path->slots[*level] = 0;
3715 path->locks[*level] = 1;
3716 cond_resched();
3717 }
3718out:
3719 parent = path->nodes[*level + 1];
3720 bytenr = path->nodes[*level]->start;
3721 blocksize = path->nodes[*level]->len;
3722
3723 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3724 parent->start, btrfs_header_owner(parent),
3725 btrfs_header_generation(parent), *level, 1);
3726 BUG_ON(ret);
3727
3728 if (path->locks[*level]) {
3729 btrfs_tree_unlock(path->nodes[*level]);
3730 path->locks[*level] = 0;
3731 } 5025 }
3732 free_extent_buffer(path->nodes[*level]);
3733 path->nodes[*level] = NULL;
3734 *level += 1;
3735 cond_resched();
3736 return 0; 5026 return 0;
3737} 5027}
3738 5028
3739/*
3740 * helper for dropping snapshots. This walks back up the tree in the path
3741 * to find the first node higher up where we haven't yet gone through
3742 * all the slots
3743 */
3744static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 5029static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3745 struct btrfs_root *root, 5030 struct btrfs_root *root,
3746 struct btrfs_path *path, 5031 struct btrfs_path *path,
3747 int *level, int max_level) 5032 struct walk_control *wc, int max_level)
3748{ 5033{
3749 u64 root_owner; 5034 int level = wc->level;
3750 u64 root_gen;
3751 struct btrfs_root_item *root_item = &root->root_item;
3752 int i;
3753 int slot;
3754 int ret; 5035 int ret;
3755 5036
3756 for (i = *level; i < max_level && path->nodes[i]; i++) { 5037 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
3757 slot = path->slots[i]; 5038 while (level < max_level && path->nodes[level]) {
3758 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 5039 wc->level = level;
3759 struct extent_buffer *node; 5040 if (path->slots[level] + 1 <
3760 struct btrfs_disk_key disk_key; 5041 btrfs_header_nritems(path->nodes[level])) {
3761 5042 path->slots[level]++;
3762 /*
3763 * there is more work to do in this level.
3764 * Update the drop_progress marker to reflect
3765 * the work we've done so far, and then bump
3766 * the slot number
3767 */
3768 node = path->nodes[i];
3769 path->slots[i]++;
3770 *level = i;
3771 WARN_ON(*level == 0);
3772 btrfs_node_key(node, &disk_key, path->slots[i]);
3773 memcpy(&root_item->drop_progress,
3774 &disk_key, sizeof(disk_key));
3775 root_item->drop_level = i;
3776 return 0; 5043 return 0;
3777 } else { 5044 } else {
3778 struct extent_buffer *parent; 5045 ret = walk_up_proc(trans, root, path, wc);
3779 5046 if (ret > 0)
3780 /* 5047 return 0;
3781 * this whole node is done, free our reference
3782 * on it and go up one level
3783 */
3784 if (path->nodes[*level] == root->node)
3785 parent = path->nodes[*level];
3786 else
3787 parent = path->nodes[*level + 1];
3788
3789 root_owner = btrfs_header_owner(parent);
3790 root_gen = btrfs_header_generation(parent);
3791 5048
3792 clean_tree_block(trans, root, path->nodes[*level]); 5049 if (path->locks[level]) {
3793 ret = btrfs_free_extent(trans, root, 5050 btrfs_tree_unlock(path->nodes[level]);
3794 path->nodes[*level]->start, 5051 path->locks[level] = 0;
3795 path->nodes[*level]->len,
3796 parent->start, root_owner,
3797 root_gen, *level, 1);
3798 BUG_ON(ret);
3799 if (path->locks[*level]) {
3800 btrfs_tree_unlock(path->nodes[*level]);
3801 path->locks[*level] = 0;
3802 } 5052 }
3803 free_extent_buffer(path->nodes[*level]); 5053 free_extent_buffer(path->nodes[level]);
3804 path->nodes[*level] = NULL; 5054 path->nodes[level] = NULL;
3805 *level = i + 1; 5055 level++;
3806 } 5056 }
3807 } 5057 }
3808 return 1; 5058 return 1;
3809} 5059}
3810 5060
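Taken together, the new walk_down_tree() and walk_up_tree() are an iterative post-order traversal with path->nodes[] and path->slots[] as the explicit stack: children are visited first, and a block is freed only on the way back up, once everything below it is gone. The control flow in miniature, over a toy heap-shaped tree (runnable; prints nodes in the order they would be dropped):

#include <stdio.h>

#define N 7	/* node i has children 2i+1 and 2i+2 while they are < N */

int main(void)
{
	int node[8] = {0};	/* current node per depth (path->nodes) */
	int slot[8] = {0};	/* next child per depth (path->slots) */
	int depth = 0;

	for (;;) {
		int cur = node[depth];
		int child = 2 * cur + 1 + slot[depth];

		if (slot[depth] < 2 && child < N) {
			/* walk down: descend into the next child */
			slot[depth]++;
			depth++;
			node[depth] = child;
			slot[depth] = 0;
		} else {
			/* walk up: all children done; drop this block */
			printf("drop node %d\n", cur);
			if (depth == 0)
				break;
			depth--;
		}
	}
	return 0;
}

The output is 3 4 1 5 6 2 0 — strictly bottom-up, which is what lets the real walker free each extent safely as it climbs.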
3811/* 5061/*
3812 * drop the reference count on the tree rooted at 'snap'. This traverses 5062 * drop a subvolume tree.
3813 * the tree freeing any blocks that have a ref count of zero after being 5063 *
3814 * decremented. 5064 * this function traverses the tree freeing any blocks that only
5065 * referenced by the tree.
5066 *
5067 * when a shared tree block is found. this function decreases its
5068 * reference count by one. if update_ref is true, this function
5069 * also make sure backrefs for the shared block and all lower level
5070 * blocks are properly updated.
3815 */ 5071 */
3816int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root 5072int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
3817 *root)
3818{ 5073{
3819 int ret = 0;
3820 int wret;
3821 int level;
3822 struct btrfs_path *path; 5074 struct btrfs_path *path;
3823 int i; 5075 struct btrfs_trans_handle *trans;
3824 int orig_level; 5076 struct btrfs_root *tree_root = root->fs_info->tree_root;
3825 int update_count;
3826 struct btrfs_root_item *root_item = &root->root_item; 5077 struct btrfs_root_item *root_item = &root->root_item;
5078 struct walk_control *wc;
5079 struct btrfs_key key;
5080 int err = 0;
5081 int ret;
5082 int level;
3827 5083
3828 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3829 path = btrfs_alloc_path(); 5084 path = btrfs_alloc_path();
3830 BUG_ON(!path); 5085 BUG_ON(!path);
3831 5086
3832 level = btrfs_header_level(root->node); 5087 wc = kzalloc(sizeof(*wc), GFP_NOFS);
3833 orig_level = level; 5088 BUG_ON(!wc);
5089
5090 trans = btrfs_start_transaction(tree_root, 1);
5091
3834 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5092 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3835 path->nodes[level] = root->node; 5093 level = btrfs_header_level(root->node);
3836 extent_buffer_get(root->node); 5094 path->nodes[level] = btrfs_lock_root_node(root);
5095 btrfs_set_lock_blocking(path->nodes[level]);
3837 path->slots[level] = 0; 5096 path->slots[level] = 0;
5097 path->locks[level] = 1;
5098 memset(&wc->update_progress, 0,
5099 sizeof(wc->update_progress));
3838 } else { 5100 } else {
3839 struct btrfs_key key;
3840 struct btrfs_disk_key found_key;
3841 struct extent_buffer *node;
3842
3843 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 5101 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5102 memcpy(&wc->update_progress, &key,
5103 sizeof(wc->update_progress));
5104
3844 level = root_item->drop_level; 5105 level = root_item->drop_level;
5106 BUG_ON(level == 0);
3845 path->lowest_level = level; 5107 path->lowest_level = level;
3846 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5108 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3847 if (wret < 0) { 5109 path->lowest_level = 0;
3848 ret = wret; 5110 if (ret < 0) {
5111 err = ret;
3849 goto out; 5112 goto out;
3850 } 5113 }
3851 node = path->nodes[level]; 5114 btrfs_node_key_to_cpu(path->nodes[level], &key,
3852 btrfs_node_key(node, &found_key, path->slots[level]); 5115 path->slots[level]);
3853 WARN_ON(memcmp(&found_key, &root_item->drop_progress, 5116 WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
3854 sizeof(found_key))); 5117
3855 /* 5118 /*
3856 * unlock our path, this is safe because only this 5119 * unlock our path, this is safe because only this
3857 * function is allowed to delete this snapshot 5120 * function is allowed to delete this snapshot
3858 */ 5121 */
3859 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 5122 btrfs_unlock_up_safe(path, 0);
3860 if (path->nodes[i] && path->locks[i]) { 5123
3861 path->locks[i] = 0; 5124 level = btrfs_header_level(root->node);
3862 btrfs_tree_unlock(path->nodes[i]); 5125 while (1) {
3863 } 5126 btrfs_tree_lock(path->nodes[level]);
5127 btrfs_set_lock_blocking(path->nodes[level]);
5128
5129 ret = btrfs_lookup_extent_info(trans, root,
5130 path->nodes[level]->start,
5131 path->nodes[level]->len,
5132 &wc->refs[level],
5133 &wc->flags[level]);
5134 BUG_ON(ret);
5135 BUG_ON(wc->refs[level] == 0);
5136
5137 if (level == root_item->drop_level)
5138 break;
5139
5140 btrfs_tree_unlock(path->nodes[level]);
5141 WARN_ON(wc->refs[level] != 1);
5142 level--;
3864 } 5143 }
3865 } 5144 }
5145
5146 wc->level = level;
5147 wc->shared_level = -1;
5148 wc->stage = DROP_REFERENCE;
5149 wc->update_ref = update_ref;
5150 wc->keep_locks = 0;
5151
3866 while (1) { 5152 while (1) {
3867 unsigned long update; 5153 ret = walk_down_tree(trans, root, path, wc);
3868 wret = walk_down_tree(trans, root, path, &level); 5154 if (ret < 0) {
3869 if (wret > 0) 5155 err = ret;
3870 break; 5156 break;
3871 if (wret < 0) 5157 }
3872 ret = wret;
3873 5158
3874 wret = walk_up_tree(trans, root, path, &level, 5159 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
3875 BTRFS_MAX_LEVEL); 5160 if (ret < 0) {
3876 if (wret > 0) 5161 err = ret;
3877 break; 5162 break;
3878 if (wret < 0) 5163 }
3879 ret = wret; 5164
3880 if (trans->transaction->in_commit || 5165 if (ret > 0) {
3881 trans->transaction->delayed_refs.flushing) { 5166 BUG_ON(wc->stage != DROP_REFERENCE);
3882 ret = -EAGAIN;
3883 break; 5167 break;
3884 } 5168 }
3885 atomic_inc(&root->fs_info->throttle_gen); 5169
3886 wake_up(&root->fs_info->transaction_throttle); 5170 if (wc->stage == DROP_REFERENCE) {
3887 for (update_count = 0; update_count < 16; update_count++) { 5171 level = wc->level;
5172 btrfs_node_key(path->nodes[level],
5173 &root_item->drop_progress,
5174 path->slots[level]);
5175 root_item->drop_level = level;
5176 }
5177
5178 BUG_ON(wc->level == 0);
5179 if (trans->transaction->in_commit ||
5180 trans->transaction->delayed_refs.flushing) {
5181 ret = btrfs_update_root(trans, tree_root,
5182 &root->root_key,
5183 root_item);
5184 BUG_ON(ret);
5185
5186 btrfs_end_transaction(trans, tree_root);
5187 trans = btrfs_start_transaction(tree_root, 1);
5188 } else {
5189 unsigned long update;
3888 update = trans->delayed_ref_updates; 5190 update = trans->delayed_ref_updates;
3889 trans->delayed_ref_updates = 0; 5191 trans->delayed_ref_updates = 0;
3890 if (update) 5192 if (update)
3891 btrfs_run_delayed_refs(trans, root, update); 5193 btrfs_run_delayed_refs(trans, tree_root,
3892 else 5194 update);
3893 break;
3894 }
3895 }
3896 for (i = 0; i <= orig_level; i++) {
3897 if (path->nodes[i]) {
3898 free_extent_buffer(path->nodes[i]);
3899 path->nodes[i] = NULL;
3900 } 5195 }
3901 } 5196 }
5197 btrfs_release_path(root, path);
5198 BUG_ON(err);
5199
5200 ret = btrfs_del_root(trans, tree_root, &root->root_key);
5201 BUG_ON(ret);
5202
5203 free_extent_buffer(root->node);
5204 free_extent_buffer(root->commit_root);
5205 kfree(root);
3902out: 5206out:
5207 btrfs_end_transaction(trans, tree_root);
5208 kfree(wc);
3903 btrfs_free_path(path); 5209 btrfs_free_path(path);
3904 return ret; 5210 return err;
3905} 5211}
3906 5212
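The rewritten btrfs_drop_snapshot() also changes who owns the transaction: instead of bailing out with -EAGAIN when a commit is pending, it records drop_progress/drop_level in the root item, ends the transaction, and starts a new one, so deleting a huge snapshot can span many transactions and resume after a crash. The checkpoint-and-resume shape, reduced to standalone C (all three helpers are stand-ins: persist() for btrfs_update_root(), the other two hypothetical):

struct progress {
	unsigned long long next_key;	/* models drop_progress */
	int level;			/* models drop_level */
};

void persist(const struct progress *p);	/* durable resume point */
int delete_some(struct progress *p);	/* bounded chunk of work;
					   returns nonzero when done */
int commit_pending(void);		/* transaction wants to commit? */

void drop_snapshot(struct progress *p)
{
	while (!delete_some(p)) {
		if (commit_pending()) {
			persist(p);
			/* the real code ends the transaction here and
			 * starts a fresh one before continuing */
		}
	}
}

After a crash, the caller reloads the persisted progress and calls drop_snapshot() again, which is exactly how the drop_progress key in the root item is used above.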
5213/*
5214 * drop subtree rooted at tree block 'node'.
5215 *
5216 * NOTE: this function will unlock and release tree block 'node'
5217 */
3907int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 5218int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3908 struct btrfs_root *root, 5219 struct btrfs_root *root,
3909 struct extent_buffer *node, 5220 struct extent_buffer *node,
3910 struct extent_buffer *parent) 5221 struct extent_buffer *parent)
3911{ 5222{
3912 struct btrfs_path *path; 5223 struct btrfs_path *path;
5224 struct walk_control *wc;
3913 int level; 5225 int level;
3914 int parent_level; 5226 int parent_level;
3915 int ret = 0; 5227 int ret = 0;
3916 int wret; 5228 int wret;
3917 5229
5230 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5231
3918 path = btrfs_alloc_path(); 5232 path = btrfs_alloc_path();
3919 BUG_ON(!path); 5233 BUG_ON(!path);
3920 5234
5235 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5236 BUG_ON(!wc);
5237
3921 btrfs_assert_tree_locked(parent); 5238 btrfs_assert_tree_locked(parent);
3922 parent_level = btrfs_header_level(parent); 5239 parent_level = btrfs_header_level(parent);
3923 extent_buffer_get(parent); 5240 extent_buffer_get(parent);
@@ -3926,28 +5243,38 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3926 5243
3927 btrfs_assert_tree_locked(node); 5244 btrfs_assert_tree_locked(node);
3928 level = btrfs_header_level(node); 5245 level = btrfs_header_level(node);
3929 extent_buffer_get(node);
3930 path->nodes[level] = node; 5246 path->nodes[level] = node;
3931 path->slots[level] = 0; 5247 path->slots[level] = 0;
5248 path->locks[level] = 1;
5249
5250 wc->refs[parent_level] = 1;
5251 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5252 wc->level = level;
5253 wc->shared_level = -1;
5254 wc->stage = DROP_REFERENCE;
5255 wc->update_ref = 0;
5256 wc->keep_locks = 1;
3932 5257
3933 while (1) { 5258 while (1) {
3934 wret = walk_down_subtree(trans, root, path, &level); 5259 wret = walk_down_tree(trans, root, path, wc);
3935 if (wret < 0) 5260 if (wret < 0) {
3936 ret = wret; 5261 ret = wret;
3937 if (wret != 0)
3938 break; 5262 break;
5263 }
3939 5264
3940 wret = walk_up_tree(trans, root, path, &level, parent_level); 5265 wret = walk_up_tree(trans, root, path, wc, parent_level);
3941 if (wret < 0) 5266 if (wret < 0)
3942 ret = wret; 5267 ret = wret;
3943 if (wret != 0) 5268 if (wret != 0)
3944 break; 5269 break;
3945 } 5270 }
3946 5271
5272 kfree(wc);
3947 btrfs_free_path(path); 5273 btrfs_free_path(path);
3948 return ret; 5274 return ret;
3949} 5275}
3950 5276
5277#if 0
3951static unsigned long calc_ra(unsigned long start, unsigned long last, 5278static unsigned long calc_ra(unsigned long start, unsigned long last,
3952 unsigned long nr) 5279 unsigned long nr)
3953{ 5280{
@@ -5429,6 +6756,7 @@ out:
5429 kfree(ref_path); 6756 kfree(ref_path);
5430 return ret; 6757 return ret;
5431} 6758}
6759#endif
5432 6760
5433static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 6761static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5434{ 6762{
@@ -5477,7 +6805,8 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5477 u64 calc; 6805 u64 calc;
5478 6806
5479 spin_lock(&shrink_block_group->lock); 6807 spin_lock(&shrink_block_group->lock);
5480 if (btrfs_block_group_used(&shrink_block_group->item) > 0) { 6808 if (btrfs_block_group_used(&shrink_block_group->item) +
6809 shrink_block_group->reserved > 0) {
5481 spin_unlock(&shrink_block_group->lock); 6810 spin_unlock(&shrink_block_group->lock);
5482 6811
5483 trans = btrfs_start_transaction(root, 1); 6812 trans = btrfs_start_transaction(root, 1);
@@ -5502,6 +6831,17 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5502 return 0; 6831 return 0;
5503} 6832}
5504 6833
6834
6835int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
6836 struct btrfs_block_group_cache *group)
6837
6838{
6839 __alloc_chunk_for_shrink(root, group, 1);
6840 set_block_group_readonly(group);
6841 return 0;
6842}
6843
6844#if 0
5505static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 6845static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5506 struct btrfs_root *root, 6846 struct btrfs_root *root,
5507 u64 objectid, u64 size) 6847 u64 objectid, u64 size)
@@ -5781,6 +7121,7 @@ out:
5781 btrfs_free_path(path); 7121 btrfs_free_path(path);
5782 return ret; 7122 return ret;
5783} 7123}
7124#endif
5784 7125
5785static int find_first_block_group(struct btrfs_root *root, 7126static int find_first_block_group(struct btrfs_root *root,
5786 struct btrfs_path *path, struct btrfs_key *key) 7127 struct btrfs_path *path, struct btrfs_key *key)
@@ -5833,11 +7174,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
5833 &info->block_group_cache_tree); 7174 &info->block_group_cache_tree);
5834 spin_unlock(&info->block_group_cache_lock); 7175 spin_unlock(&info->block_group_cache_lock);
5835 7176
5836 btrfs_remove_free_space_cache(block_group);
5837 down_write(&block_group->space_info->groups_sem); 7177 down_write(&block_group->space_info->groups_sem);
5838 list_del(&block_group->list); 7178 list_del(&block_group->list);
5839 up_write(&block_group->space_info->groups_sem); 7179 up_write(&block_group->space_info->groups_sem);
5840 7180
7181 if (block_group->cached == BTRFS_CACHE_STARTED)
7182 wait_event(block_group->caching_q,
7183 block_group_cache_done(block_group));
7184
7185 btrfs_remove_free_space_cache(block_group);
7186
5841 WARN_ON(atomic_read(&block_group->count) != 1); 7187 WARN_ON(atomic_read(&block_group->count) != 1);
5842 kfree(block_group); 7188 kfree(block_group);
5843 7189
@@ -5903,9 +7249,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
5903 atomic_set(&cache->count, 1); 7249 atomic_set(&cache->count, 1);
5904 spin_lock_init(&cache->lock); 7250 spin_lock_init(&cache->lock);
5905 spin_lock_init(&cache->tree_lock); 7251 spin_lock_init(&cache->tree_lock);
5906 mutex_init(&cache->cache_mutex); 7252 cache->fs_info = info;
7253 init_waitqueue_head(&cache->caching_q);
5907 INIT_LIST_HEAD(&cache->list); 7254 INIT_LIST_HEAD(&cache->list);
5908 INIT_LIST_HEAD(&cache->cluster_list); 7255 INIT_LIST_HEAD(&cache->cluster_list);
7256
7257 /*
7258 * we only want to have 32k of ram per block group for keeping
7259 * track of free space, and if we pass 1/2 of that we want to
7260 * start converting things over to using bitmaps
7261 */
7262 cache->extents_thresh = ((1024 * 32) / 2) /
7263 sizeof(struct btrfs_free_space);
7264
5909 read_extent_buffer(leaf, &cache->item, 7265 read_extent_buffer(leaf, &cache->item,
5910 btrfs_item_ptr_offset(leaf, path->slots[0]), 7266 btrfs_item_ptr_offset(leaf, path->slots[0]),
5911 sizeof(cache->item)); 7267 sizeof(cache->item));
@@ -5914,6 +7270,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)
5914 key.objectid = found_key.objectid + found_key.offset; 7270 key.objectid = found_key.objectid + found_key.offset;
5915 btrfs_release_path(root, path); 7271 btrfs_release_path(root, path);
5916 cache->flags = btrfs_block_group_flags(&cache->item); 7272 cache->flags = btrfs_block_group_flags(&cache->item);
7273 cache->sectorsize = root->sectorsize;
7274
7275 remove_sb_from_cache(root, cache);
7276
7277 /*
 7278 * check for two cases: either we are full, and therefore
 7279 * don't need to bother with the caching work since we won't
 7280 * find any space, or we are empty, and we can just add all
 7281 * the space in and be done with it. This saves us a lot of
7282 * time, particularly in the full case.
7283 */
7284 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7285 cache->cached = BTRFS_CACHE_FINISHED;
7286 } else if (btrfs_block_group_used(&cache->item) == 0) {
7287 cache->cached = BTRFS_CACHE_FINISHED;
7288 add_new_free_space(cache, root->fs_info,
7289 found_key.objectid,
7290 found_key.objectid +
7291 found_key.offset);
7292 }
5917 7293
5918 ret = update_space_info(info, cache->flags, found_key.offset, 7294 ret = update_space_info(info, cache->flags, found_key.offset,
5919 btrfs_block_group_used(&cache->item), 7295 btrfs_block_group_used(&cache->item),
@@ -5957,10 +7333,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5957 cache->key.objectid = chunk_offset; 7333 cache->key.objectid = chunk_offset;
5958 cache->key.offset = size; 7334 cache->key.offset = size;
5959 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7335 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7336 cache->sectorsize = root->sectorsize;
7337
7338 /*
7339 * we only want to have 32k of ram per block group for keeping track
7340 * of free space, and if we pass 1/2 of that we want to start
7341 * converting things over to using bitmaps
7342 */
7343 cache->extents_thresh = ((1024 * 32) / 2) /
7344 sizeof(struct btrfs_free_space);
5960 atomic_set(&cache->count, 1); 7345 atomic_set(&cache->count, 1);
5961 spin_lock_init(&cache->lock); 7346 spin_lock_init(&cache->lock);
5962 spin_lock_init(&cache->tree_lock); 7347 spin_lock_init(&cache->tree_lock);
5963 mutex_init(&cache->cache_mutex); 7348 init_waitqueue_head(&cache->caching_q);
5964 INIT_LIST_HEAD(&cache->list); 7349 INIT_LIST_HEAD(&cache->list);
5965 INIT_LIST_HEAD(&cache->cluster_list); 7350 INIT_LIST_HEAD(&cache->cluster_list);
5966 7351
@@ -5969,6 +7354,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5969 cache->flags = type; 7354 cache->flags = type;
5970 btrfs_set_block_group_flags(&cache->item, type); 7355 btrfs_set_block_group_flags(&cache->item, type);
5971 7356
7357 cache->cached = BTRFS_CACHE_FINISHED;
7358 remove_sb_from_cache(root, cache);
7359
7360 add_new_free_space(cache, root->fs_info, chunk_offset,
7361 chunk_offset + size);
7362
5972 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7363 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
5973 &cache->space_info); 7364 &cache->space_info);
5974 BUG_ON(ret); 7365 BUG_ON(ret);
@@ -6027,7 +7418,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6027 rb_erase(&block_group->cache_node, 7418 rb_erase(&block_group->cache_node,
6028 &root->fs_info->block_group_cache_tree); 7419 &root->fs_info->block_group_cache_tree);
6029 spin_unlock(&root->fs_info->block_group_cache_lock); 7420 spin_unlock(&root->fs_info->block_group_cache_lock);
6030 btrfs_remove_free_space_cache(block_group); 7421
6031 down_write(&block_group->space_info->groups_sem); 7422 down_write(&block_group->space_info->groups_sem);
6032 /* 7423 /*
6033 * we must use list_del_init so people can check to see if they 7424 * we must use list_del_init so people can check to see if they
@@ -6036,11 +7427,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6036 list_del_init(&block_group->list); 7427 list_del_init(&block_group->list);
6037 up_write(&block_group->space_info->groups_sem); 7428 up_write(&block_group->space_info->groups_sem);
6038 7429
7430 if (block_group->cached == BTRFS_CACHE_STARTED)
7431 wait_event(block_group->caching_q,
7432 block_group_cache_done(block_group));
7433
7434 btrfs_remove_free_space_cache(block_group);
7435
6039 spin_lock(&block_group->space_info->lock); 7436 spin_lock(&block_group->space_info->lock);
6040 block_group->space_info->total_bytes -= block_group->key.offset; 7437 block_group->space_info->total_bytes -= block_group->key.offset;
6041 block_group->space_info->bytes_readonly -= block_group->key.offset; 7438 block_group->space_info->bytes_readonly -= block_group->key.offset;
6042 spin_unlock(&block_group->space_info->lock); 7439 spin_unlock(&block_group->space_info->lock);
6043 block_group->space_info->full = 0; 7440
7441 btrfs_clear_space_info_full(root->fs_info);
6044 7442
6045 btrfs_put_block_group(block_group); 7443 btrfs_put_block_group(block_group);
6046 btrfs_put_block_group(block_group); 7444 btrfs_put_block_group(block_group);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fe9eb990e443..68260180f587 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -476,6 +476,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
476 struct extent_state *state; 476 struct extent_state *state;
477 struct extent_state *prealloc = NULL; 477 struct extent_state *prealloc = NULL;
478 struct rb_node *node; 478 struct rb_node *node;
479 u64 last_end;
479 int err; 480 int err;
480 int set = 0; 481 int set = 0;
481 482
@@ -498,6 +499,7 @@ again:
498 if (state->start > end) 499 if (state->start > end)
499 goto out; 500 goto out;
500 WARN_ON(state->end < start); 501 WARN_ON(state->end < start);
502 last_end = state->end;
501 503
502 /* 504 /*
503 * | ---- desired range ---- | 505 * | ---- desired range ---- |
@@ -524,9 +526,11 @@ again:
524 if (err) 526 if (err)
525 goto out; 527 goto out;
526 if (state->end <= end) { 528 if (state->end <= end) {
527 start = state->end + 1;
528 set |= clear_state_bit(tree, state, bits, 529 set |= clear_state_bit(tree, state, bits,
529 wake, delete); 530 wake, delete);
531 if (last_end == (u64)-1)
532 goto out;
533 start = last_end + 1;
530 } else { 534 } else {
531 start = state->start; 535 start = state->start;
532 } 536 }
@@ -552,8 +556,10 @@ again:
552 goto out; 556 goto out;
553 } 557 }
554 558
555 start = state->end + 1;
556 set |= clear_state_bit(tree, state, bits, wake, delete); 559 set |= clear_state_bit(tree, state, bits, wake, delete);
560 if (last_end == (u64)-1)
561 goto out;
562 start = last_end + 1;
557 goto search_again; 563 goto search_again;
558 564
559out: 565out:
@@ -707,8 +713,10 @@ again:
707 goto out; 713 goto out;
708 } 714 }
709 set_state_bits(tree, state, bits); 715 set_state_bits(tree, state, bits);
710 start = state->end + 1;
711 merge_state(tree, state); 716 merge_state(tree, state);
717 if (last_end == (u64)-1)
718 goto out;
719 start = last_end + 1;
712 goto search_again; 720 goto search_again;
713 } 721 }
714 722
@@ -742,8 +750,10 @@ again:
742 goto out; 750 goto out;
743 if (state->end <= end) { 751 if (state->end <= end) {
744 set_state_bits(tree, state, bits); 752 set_state_bits(tree, state, bits);
745 start = state->end + 1;
746 merge_state(tree, state); 753 merge_state(tree, state);
754 if (last_end == (u64)-1)
755 goto out;
756 start = last_end + 1;
747 } else { 757 } else {
748 start = state->start; 758 start = state->start;
749 } 759 }
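
The recurring change in these extent_io.c hunks is the same defensive move each time: clear_state_bit() (and merge_state() on the set side) can merge or free the extent_state, so state->end must not be read afterwards; the value is snapshotted into last_end first, and the (u64)-1 check stops last_end + 1 from wrapping to zero when a range runs to the end of the address space. A condensed sketch of the pattern, with hypothetical names:

#include <stdlib.h>

struct state { unsigned long long start, end; };

/* stands in for clear_state_bit(): may free the object it is given */
static void clear_and_maybe_free(struct state *s)
{
        free(s);
}

static unsigned long long next_search_start(struct state *s)
{
        unsigned long long last_end = s->end;   /* snapshot before the call */

        clear_and_maybe_free(s);                /* s is dead from here on */
        if (last_end == (unsigned long long)-1)
                return 0;                       /* would wrap; caller goes to out */
        return last_end + 1;                    /* safe: uses the snapshot */
}

int main(void)
{
        struct state *s = malloc(sizeof(*s));

        s->start = 0;
        s->end = 4095;
        return next_search_start(s) == 4096 ? 0 : 1;
}
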
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1d51dc38bb49..4b833972273a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -22,7 +22,6 @@
22#include <linux/time.h> 22#include <linux/time.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/smp_lock.h>
26#include <linux/backing-dev.h> 25#include <linux/backing-dev.h>
27#include <linux/mpage.h> 26#include <linux/mpage.h>
28#include <linux/swap.h> 27#include <linux/swap.h>
@@ -151,7 +150,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
151 } 150 }
152 if (end_pos > isize) { 151 if (end_pos > isize) {
153 i_size_write(inode, end_pos); 152 i_size_write(inode, end_pos);
154 btrfs_update_inode(trans, root, inode); 153 /* we've only changed i_size in ram, and we haven't updated
154 * the disk i_size. There is no need to log the inode
155 * at this time.
156 */
155 } 157 }
156 err = btrfs_end_transaction(trans, root); 158 err = btrfs_end_transaction(trans, root);
157out_unlock: 159out_unlock:
@@ -291,16 +293,12 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
291{ 293{
292 u64 extent_end = 0; 294 u64 extent_end = 0;
293 u64 search_start = start; 295 u64 search_start = start;
294 u64 leaf_start;
295 u64 ram_bytes = 0; 296 u64 ram_bytes = 0;
296 u64 orig_parent = 0;
297 u64 disk_bytenr = 0; 297 u64 disk_bytenr = 0;
298 u64 orig_locked_end = locked_end; 298 u64 orig_locked_end = locked_end;
299 u8 compression; 299 u8 compression;
300 u8 encryption; 300 u8 encryption;
301 u16 other_encoding = 0; 301 u16 other_encoding = 0;
302 u64 root_gen;
303 u64 root_owner;
304 struct extent_buffer *leaf; 302 struct extent_buffer *leaf;
305 struct btrfs_file_extent_item *extent; 303 struct btrfs_file_extent_item *extent;
306 struct btrfs_path *path; 304 struct btrfs_path *path;
@@ -340,9 +338,6 @@ next_slot:
340 bookend = 0; 338 bookend = 0;
341 found_extent = 0; 339 found_extent = 0;
342 found_inline = 0; 340 found_inline = 0;
343 leaf_start = 0;
344 root_gen = 0;
345 root_owner = 0;
346 compression = 0; 341 compression = 0;
347 encryption = 0; 342 encryption = 0;
348 extent = NULL; 343 extent = NULL;
@@ -417,9 +412,6 @@ next_slot:
417 if (found_extent) { 412 if (found_extent) {
418 read_extent_buffer(leaf, &old, (unsigned long)extent, 413 read_extent_buffer(leaf, &old, (unsigned long)extent,
419 sizeof(old)); 414 sizeof(old));
420 root_gen = btrfs_header_generation(leaf);
421 root_owner = btrfs_header_owner(leaf);
422 leaf_start = leaf->start;
423 } 415 }
424 416
425 if (end < extent_end && end >= key.offset) { 417 if (end < extent_end && end >= key.offset) {
@@ -443,14 +435,14 @@ next_slot:
443 } 435 }
444 locked_end = extent_end; 436 locked_end = extent_end;
445 } 437 }
446 orig_parent = path->nodes[0]->start;
447 disk_bytenr = le64_to_cpu(old.disk_bytenr); 438 disk_bytenr = le64_to_cpu(old.disk_bytenr);
448 if (disk_bytenr != 0) { 439 if (disk_bytenr != 0) {
449 ret = btrfs_inc_extent_ref(trans, root, 440 ret = btrfs_inc_extent_ref(trans, root,
450 disk_bytenr, 441 disk_bytenr,
451 le64_to_cpu(old.disk_num_bytes), 442 le64_to_cpu(old.disk_num_bytes), 0,
452 orig_parent, root->root_key.objectid, 443 root->root_key.objectid,
453 trans->transid, inode->i_ino); 444 key.objectid, key.offset -
445 le64_to_cpu(old.offset));
454 BUG_ON(ret); 446 BUG_ON(ret);
455 } 447 }
456 } 448 }
@@ -568,17 +560,6 @@ next_slot:
568 btrfs_mark_buffer_dirty(path->nodes[0]); 560 btrfs_mark_buffer_dirty(path->nodes[0]);
569 btrfs_set_lock_blocking(path->nodes[0]); 561 btrfs_set_lock_blocking(path->nodes[0]);
570 562
571 if (disk_bytenr != 0) {
572 ret = btrfs_update_extent_ref(trans, root,
573 disk_bytenr,
574 le64_to_cpu(old.disk_num_bytes),
575 orig_parent,
576 leaf->start,
577 root->root_key.objectid,
578 trans->transid, ins.objectid);
579
580 BUG_ON(ret);
581 }
582 path->leave_spinning = 0; 563 path->leave_spinning = 0;
583 btrfs_release_path(root, path); 564 btrfs_release_path(root, path);
584 if (disk_bytenr != 0) 565 if (disk_bytenr != 0)
@@ -594,8 +575,9 @@ next_slot:
594 ret = btrfs_free_extent(trans, root, 575 ret = btrfs_free_extent(trans, root,
595 old_disk_bytenr, 576 old_disk_bytenr,
596 le64_to_cpu(old.disk_num_bytes), 577 le64_to_cpu(old.disk_num_bytes),
597 leaf_start, root_owner, 578 0, root->root_key.objectid,
598 root_gen, key.objectid, 0); 579 key.objectid, key.offset -
580 le64_to_cpu(old.offset));
599 BUG_ON(ret); 581 BUG_ON(ret);
600 *hint_byte = old_disk_bytenr; 582 *hint_byte = old_disk_bytenr;
601 } 583 }
@@ -664,12 +646,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
664 u64 bytenr; 646 u64 bytenr;
665 u64 num_bytes; 647 u64 num_bytes;
666 u64 extent_end; 648 u64 extent_end;
667 u64 extent_offset; 649 u64 orig_offset;
668 u64 other_start; 650 u64 other_start;
669 u64 other_end; 651 u64 other_end;
670 u64 split = start; 652 u64 split = start;
671 u64 locked_end = end; 653 u64 locked_end = end;
672 u64 orig_parent;
673 int extent_type; 654 int extent_type;
674 int split_end = 1; 655 int split_end = 1;
675 int ret; 656 int ret;
@@ -703,7 +684,7 @@ again:
703 684
704 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 685 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
705 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 686 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
706 extent_offset = btrfs_file_extent_offset(leaf, fi); 687 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
707 688
708 if (key.offset == start) 689 if (key.offset == start)
709 split = end; 690 split = end;
@@ -711,8 +692,6 @@ again:
711 if (key.offset == start && extent_end == end) { 692 if (key.offset == start && extent_end == end) {
712 int del_nr = 0; 693 int del_nr = 0;
713 int del_slot = 0; 694 int del_slot = 0;
714 u64 leaf_owner = btrfs_header_owner(leaf);
715 u64 leaf_gen = btrfs_header_generation(leaf);
716 other_start = end; 695 other_start = end;
717 other_end = 0; 696 other_end = 0;
718 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, 697 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
@@ -721,8 +700,8 @@ again:
721 del_slot = path->slots[0] + 1; 700 del_slot = path->slots[0] + 1;
722 del_nr++; 701 del_nr++;
723 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 702 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
724 leaf->start, leaf_owner, 703 0, root->root_key.objectid,
725 leaf_gen, inode->i_ino, 0); 704 inode->i_ino, orig_offset);
726 BUG_ON(ret); 705 BUG_ON(ret);
727 } 706 }
728 other_start = 0; 707 other_start = 0;
@@ -733,8 +712,8 @@ again:
733 del_slot = path->slots[0]; 712 del_slot = path->slots[0];
734 del_nr++; 713 del_nr++;
735 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 714 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
736 leaf->start, leaf_owner, 715 0, root->root_key.objectid,
737 leaf_gen, inode->i_ino, 0); 716 inode->i_ino, orig_offset);
738 BUG_ON(ret); 717 BUG_ON(ret);
739 } 718 }
740 split_end = 0; 719 split_end = 0;
@@ -768,13 +747,12 @@ again:
768 locked_end = extent_end; 747 locked_end = extent_end;
769 } 748 }
770 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); 749 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
771 extent_offset += split - key.offset;
772 } else { 750 } else {
773 BUG_ON(key.offset != start); 751 BUG_ON(key.offset != start);
774 btrfs_set_file_extent_offset(leaf, fi, extent_offset +
775 split - key.offset);
776 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
777 key.offset = split; 752 key.offset = split;
753 btrfs_set_file_extent_offset(leaf, fi, key.offset -
754 orig_offset);
755 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
778 btrfs_set_item_key_safe(trans, root, path, &key); 756 btrfs_set_item_key_safe(trans, root, path, &key);
779 extent_end = split; 757 extent_end = split;
780 } 758 }
@@ -793,7 +771,8 @@ again:
793 struct btrfs_file_extent_item); 771 struct btrfs_file_extent_item);
794 key.offset = split; 772 key.offset = split;
795 btrfs_set_item_key_safe(trans, root, path, &key); 773 btrfs_set_item_key_safe(trans, root, path, &key);
796 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 774 btrfs_set_file_extent_offset(leaf, fi, key.offset -
775 orig_offset);
797 btrfs_set_file_extent_num_bytes(leaf, fi, 776 btrfs_set_file_extent_num_bytes(leaf, fi,
798 other_end - split); 777 other_end - split);
799 goto done; 778 goto done;
@@ -815,10 +794,9 @@ again:
815 794
816 btrfs_mark_buffer_dirty(leaf); 795 btrfs_mark_buffer_dirty(leaf);
817 796
818 orig_parent = leaf->start; 797 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
819 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 798 root->root_key.objectid,
820 orig_parent, root->root_key.objectid, 799 inode->i_ino, orig_offset);
821 trans->transid, inode->i_ino);
822 BUG_ON(ret); 800 BUG_ON(ret);
823 btrfs_release_path(root, path); 801 btrfs_release_path(root, path);
824 802
@@ -833,20 +811,12 @@ again:
833 btrfs_set_file_extent_type(leaf, fi, extent_type); 811 btrfs_set_file_extent_type(leaf, fi, extent_type);
834 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); 812 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
835 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); 813 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
836 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 814 btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset);
837 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); 815 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
838 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 816 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
839 btrfs_set_file_extent_compression(leaf, fi, 0); 817 btrfs_set_file_extent_compression(leaf, fi, 0);
840 btrfs_set_file_extent_encryption(leaf, fi, 0); 818 btrfs_set_file_extent_encryption(leaf, fi, 0);
841 btrfs_set_file_extent_other_encoding(leaf, fi, 0); 819 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
842
843 if (orig_parent != leaf->start) {
844 ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
845 orig_parent, leaf->start,
846 root->root_key.objectid,
847 trans->transid, inode->i_ino);
848 BUG_ON(ret);
849 }
850done: 820done:
851 btrfs_mark_buffer_dirty(leaf); 821 btrfs_mark_buffer_dirty(leaf);
852 822
@@ -1189,6 +1159,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1189 btrfs_wait_ordered_range(inode, 0, (u64)-1); 1159 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1190 root->log_batch++; 1160 root->log_batch++;
1191 1161
1162 if (datasync && !(inode->i_state & I_DIRTY_PAGES))
1163 goto out;
1192 /* 1164 /*
1193 * ok we haven't committed the transaction yet, let's do a commit 1165 * ok we haven't committed the transaction yet, let's do a commit
1194 */ 1166 */
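
The two-line datasync check added above is what lets fdatasync(2) return early on btrfs when no data pages are dirty: metadata-only changes such as timestamps do not need a log sync for data integrity. A trivial user-space illustration of the call it speeds up (the file name is arbitrary):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("testfile", O_RDWR | O_CREAT, 0644);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* no writes yet: with the hunk above, this fdatasync can return
         * without forcing a transaction commit, because no data pages
         * are dirty */
        if (fdatasync(fd) != 0)
                perror("fdatasync");
        close(fd);
        return 0;
}
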
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0bc93657b460..5edcee3a617f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -16,45 +16,46 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/pagemap.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/math64.h>
20#include "ctree.h" 22#include "ctree.h"
21#include "free-space-cache.h" 23#include "free-space-cache.h"
22#include "transaction.h" 24#include "transaction.h"
23 25
24struct btrfs_free_space { 26#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
25 struct rb_node bytes_index; 27#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
30 28
31static int tree_insert_offset(struct rb_root *root, u64 offset, 29static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
32 struct rb_node *node) 30 u64 offset)
33{ 31{
34 struct rb_node **p = &root->rb_node; 32 BUG_ON(offset < bitmap_start);
35 struct rb_node *parent = NULL; 33 offset -= bitmap_start;
36 struct btrfs_free_space *info; 34 return (unsigned long)(div64_u64(offset, sectorsize));
35}
37 36
38 while (*p) { 37static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
39 parent = *p; 38{
40 info = rb_entry(parent, struct btrfs_free_space, offset_index); 39 return (unsigned long)(div64_u64(bytes, sectorsize));
40}
41 41
42 if (offset < info->offset) 42static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
43 p = &(*p)->rb_left; 43 u64 offset)
44 else if (offset > info->offset) 44{
45 p = &(*p)->rb_right; 45 u64 bitmap_start;
46 else 46 u64 bytes_per_bitmap;
47 return -EEXIST;
48 }
49 47
50 rb_link_node(node, parent, p); 48 bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
51 rb_insert_color(node, root); 49 bitmap_start = offset - block_group->key.objectid;
50 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
51 bitmap_start *= bytes_per_bitmap;
52 bitmap_start += block_group->key.objectid;
52 53
53 return 0; 54 return bitmap_start;
54} 55}
55 56
56static int tree_insert_bytes(struct rb_root *root, u64 bytes, 57static int tree_insert_offset(struct rb_root *root, u64 offset,
57 struct rb_node *node) 58 struct rb_node *node, int bitmap)
58{ 59{
59 struct rb_node **p = &root->rb_node; 60 struct rb_node **p = &root->rb_node;
60 struct rb_node *parent = NULL; 61 struct rb_node *parent = NULL;
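
The three helpers introduced above pin down the bitmap geometry: with 4 KiB pages a bitmap holds PAGE_CACHE_SIZE * 8 = 32768 bits, and at a 4 KiB sectorsize each bitmap therefore covers 128 MiB of the block group. A small stand-alone check of that arithmetic (4 KiB pages and sectors are assumptions here; the real values come from the running system and the superblock):

#include <stdio.h>
#include <stdint.h>

#define PAGE_CACHE_SIZE 4096ULL
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)  /* 32768 bits per bitmap */

int main(void)
{
        uint64_t sectorsize = 4096;
        uint64_t bg_start = 1024ULL * 1024 * 1024;      /* hypothetical group */
        uint64_t offset = bg_start + 200ULL * 1024 * 1024;
        uint64_t bytes_per_bitmap = BITS_PER_BITMAP * sectorsize; /* 128 MiB */
        uint64_t bitmap_start = bg_start +
                ((offset - bg_start) / bytes_per_bitmap) * bytes_per_bitmap;

        printf("one bitmap covers %llu MiB\n",
               (unsigned long long)(bytes_per_bitmap >> 20));
        printf("offset %llu maps to bitmap at %llu, bit %llu\n",
               (unsigned long long)offset,
               (unsigned long long)bitmap_start,
               (unsigned long long)((offset - bitmap_start) / sectorsize));
        return 0;
}
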
@@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
62 63
63 while (*p) { 64 while (*p) {
64 parent = *p; 65 parent = *p;
65 info = rb_entry(parent, struct btrfs_free_space, bytes_index); 66 info = rb_entry(parent, struct btrfs_free_space, offset_index);
66 67
67 if (bytes < info->bytes) 68 if (offset < info->offset) {
68 p = &(*p)->rb_left; 69 p = &(*p)->rb_left;
69 else 70 } else if (offset > info->offset) {
70 p = &(*p)->rb_right; 71 p = &(*p)->rb_right;
72 } else {
73 /*
74 * we could have a bitmap entry and an extent entry
75 * share the same offset. If this is the case, we want
76 * the extent entry to always be found first if we do a
77 * linear search through the tree, since we want to have
78 * the quickest allocation time, and allocating from an
79 * extent is faster than allocating from a bitmap. So
80 * if we're inserting a bitmap and we find an entry at
81 * this offset, we want to go right, or after this entry
82 * logically. If we are inserting an extent and we've
83 * found a bitmap, we want to go left, or before
84 * logically.
85 */
86 if (bitmap) {
87 WARN_ON(info->bitmap);
88 p = &(*p)->rb_right;
89 } else {
90 WARN_ON(!info->bitmap);
91 p = &(*p)->rb_left;
92 }
93 }
71 } 94 }
72 95
73 rb_link_node(node, parent, p); 96 rb_link_node(node, parent, p);
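
The tie-break in the comment above can be captured as a two-level comparison: offsets order the tree, and at equal offsets an extent entry (bitmap == 0) sorts before a bitmap entry (bitmap == 1), so linear walks reach the cheaper extent allocation first. A reduced, self-contained model of that ordering:

#include <stdio.h>

struct entry { unsigned long long offset; int bitmap; };

/* standalone model of the tie-break: at equal offsets an extent entry
 * sorts before a bitmap entry */
static int entry_cmp(const struct entry *a, const struct entry *b)
{
        if (a->offset != b->offset)
                return a->offset < b->offset ? -1 : 1;
        return a->bitmap - b->bitmap;   /* extent (0) before bitmap (1) */
}

int main(void)
{
        struct entry ext = { 4096, 0 }, bmp = { 4096, 1 };

        printf("extent vs bitmap at same offset: %d\n", entry_cmp(&ext, &bmp));
        return 0;
}
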
@@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
79/* 102/*
80 * searches the tree for the given offset. 103 * searches the tree for the given offset.
81 * 104 *
82 * fuzzy == 1: this is used for allocations where we are given a hint of where 105 * fuzzy - If this is set, then we are trying to make an allocation, and we just
83 * to look for free space. Because the hint may not be completely on an offset 106 * want a section that is at least 'bytes' in size and comes at or after the given
84 * mark, or the hint may no longer point to free space we need to fudge our 107 * offset.
85 * results a bit. So we look for free space starting at or after offset with at
86 * least bytes size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
92 * starts at the given offset which is at least bytes size, and if its not there
93 * return NULL.
94 */ 108 */
95static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 109static struct btrfs_free_space *
96 u64 offset, u64 bytes, 110tree_search_offset(struct btrfs_block_group_cache *block_group,
97 int fuzzy) 111 u64 offset, int bitmap_only, int fuzzy)
98{ 112{
99 struct rb_node *n = root->rb_node; 113 struct rb_node *n = block_group->free_space_offset.rb_node;
100 struct btrfs_free_space *entry, *ret = NULL; 114 struct btrfs_free_space *entry, *prev = NULL;
115
116 /* find entry that is closest to the 'offset' */
117 while (1) {
118 if (!n) {
119 entry = NULL;
120 break;
121 }
101 122
102 while (n) {
103 entry = rb_entry(n, struct btrfs_free_space, offset_index); 123 entry = rb_entry(n, struct btrfs_free_space, offset_index);
124 prev = entry;
104 125
105 if (offset < entry->offset) { 126 if (offset < entry->offset)
106 if (fuzzy &&
107 (!ret || entry->offset < ret->offset) &&
108 (bytes <= entry->bytes))
109 ret = entry;
110 n = n->rb_left; 127 n = n->rb_left;
111 } else if (offset > entry->offset) { 128 else if (offset > entry->offset)
112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
114 bytes <= entry->bytes) {
115 ret = entry;
116 break;
117 }
118 n = n->rb_right; 129 n = n->rb_right;
119 } else { 130 else
120 if (bytes > entry->bytes) {
121 n = n->rb_right;
122 continue;
123 }
124 ret = entry;
125 break; 131 break;
126 }
127 } 132 }
128 133
129 return ret; 134 if (bitmap_only) {
130} 135 if (!entry)
136 return NULL;
137 if (entry->bitmap)
138 return entry;
131 139
132/* 140 /*
133 * return a chunk at least bytes size, as close to offset that we can get. 141 * bitmap entry and extent entry may share same offset,
134 */ 142 * in that case, bitmap entry comes after extent entry.
135static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, 143 */
136 u64 offset, u64 bytes) 144 n = rb_next(n);
137{ 145 if (!n)
138 struct rb_node *n = root->rb_node; 146 return NULL;
139 struct btrfs_free_space *entry, *ret = NULL; 147 entry = rb_entry(n, struct btrfs_free_space, offset_index);
140 148 if (entry->offset != offset)
141 while (n) { 149 return NULL;
142 entry = rb_entry(n, struct btrfs_free_space, bytes_index);
143 150
144 if (bytes < entry->bytes) { 151 WARN_ON(!entry->bitmap);
152 return entry;
153 } else if (entry) {
154 if (entry->bitmap) {
145 /* 155 /*
146 * We prefer to get a hole size as close to the size we 156 * if previous extent entry covers the offset,
147 * are asking for so we don't take small slivers out of 157 * we should return it instead of the bitmap entry
148 * huge holes, but we also want to get as close to the
149 * offset as possible so we don't have a whole lot of
150 * fragmentation.
151 */ 158 */
152 if (offset <= entry->offset) { 159 n = &entry->offset_index;
153 if (!ret) 160 while (1) {
154 ret = entry; 161 n = rb_prev(n);
155 else if (entry->bytes < ret->bytes) 162 if (!n)
156 ret = entry; 163 break;
157 else if (entry->offset < ret->offset) 164 prev = rb_entry(n, struct btrfs_free_space,
158 ret = entry; 165 offset_index);
166 if (!prev->bitmap) {
167 if (prev->offset + prev->bytes > offset)
168 entry = prev;
169 break;
170 }
159 } 171 }
160 n = n->rb_left; 172 }
161 } else if (bytes > entry->bytes) { 173 return entry;
162 n = n->rb_right; 174 }
175
176 if (!prev)
177 return NULL;
178
179 /* find last entry before the 'offset' */
180 entry = prev;
181 if (entry->offset > offset) {
182 n = rb_prev(&entry->offset_index);
183 if (n) {
184 entry = rb_entry(n, struct btrfs_free_space,
185 offset_index);
186 BUG_ON(entry->offset > offset);
163 } else { 187 } else {
164 /* 188 if (fuzzy)
165 * Ok we may have multiple chunks of the wanted size, 189 return entry;
166 * so we don't want to take the first one we find, we 190 else
167 * want to take the one closest to our given offset, so 191 return NULL;
168 * keep searching just in case theres a better match.
169 */
170 n = n->rb_right;
171 if (offset > entry->offset)
172 continue;
173 else if (!ret || entry->offset < ret->offset)
174 ret = entry;
175 } 192 }
176 } 193 }
177 194
178 return ret; 195 if (entry->bitmap) {
196 n = &entry->offset_index;
197 while (1) {
198 n = rb_prev(n);
199 if (!n)
200 break;
201 prev = rb_entry(n, struct btrfs_free_space,
202 offset_index);
203 if (!prev->bitmap) {
204 if (prev->offset + prev->bytes > offset)
205 return prev;
206 break;
207 }
208 }
209 if (entry->offset + BITS_PER_BITMAP *
210 block_group->sectorsize > offset)
211 return entry;
212 } else if (entry->offset + entry->bytes > offset)
213 return entry;
214
215 if (!fuzzy)
216 return NULL;
217
218 while (1) {
219 if (entry->bitmap) {
220 if (entry->offset + BITS_PER_BITMAP *
221 block_group->sectorsize > offset)
222 break;
223 } else {
224 if (entry->offset + entry->bytes > offset)
225 break;
226 }
227
228 n = rb_next(&entry->offset_index);
229 if (!n)
230 return NULL;
231 entry = rb_entry(n, struct btrfs_free_space, offset_index);
232 }
233 return entry;
179} 234}
180 235
181static void unlink_free_space(struct btrfs_block_group_cache *block_group, 236static void unlink_free_space(struct btrfs_block_group_cache *block_group,
182 struct btrfs_free_space *info) 237 struct btrfs_free_space *info)
183{ 238{
184 rb_erase(&info->offset_index, &block_group->free_space_offset); 239 rb_erase(&info->offset_index, &block_group->free_space_offset);
185 rb_erase(&info->bytes_index, &block_group->free_space_bytes); 240 block_group->free_extents--;
241 block_group->free_space -= info->bytes;
186} 242}
187 243
188static int link_free_space(struct btrfs_block_group_cache *block_group, 244static int link_free_space(struct btrfs_block_group_cache *block_group,
@@ -190,17 +246,353 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
190{ 246{
191 int ret = 0; 247 int ret = 0;
192 248
193 249 BUG_ON(!info->bitmap && !info->bytes);
194 BUG_ON(!info->bytes);
195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 250 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
196 &info->offset_index); 251 &info->offset_index, (info->bitmap != NULL));
197 if (ret) 252 if (ret)
198 return ret; 253 return ret;
199 254
200 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, 255 block_group->free_space += info->bytes;
201 &info->bytes_index); 256 block_group->free_extents++;
202 if (ret) 257 return ret;
203 return ret; 258}
259
260static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
261{
262 u64 max_bytes, possible_bytes;
263
264 /*
265 * The goal is to keep the total amount of memory used per 1gb of space
266 * at or below 32k, so we need to adjust how much memory we allow to be
267 * used by extent based free space tracking
268 */
269 max_bytes = MAX_CACHE_BYTES_PER_GIG *
270 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
271
272 possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
273 (sizeof(struct btrfs_free_space) *
274 block_group->extents_thresh);
275
276 if (possible_bytes > max_bytes) {
277 int extent_bytes = max_bytes -
278 (block_group->total_bitmaps * PAGE_CACHE_SIZE);
279
280 if (extent_bytes <= 0) {
281 block_group->extents_thresh = 0;
282 return;
283 }
284
285 block_group->extents_thresh = extent_bytes /
286 (sizeof(struct btrfs_free_space));
287 }
288}
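
recalculate_thresholds() above keeps the combined footprint under MAX_CACHE_BYTES_PER_GIG (32 KiB) per GiB of block group; note that as written, div64_u64(key.offset, 1 GiB) truncates, so a group smaller than 1 GiB gets a zero budget here (tiny groups are also excluded from bitmaps later on). Worked through for a 1 GiB group that has allocated one 4 KiB bitmap page, again assuming ~48-byte extent entries on 64-bit:

#include <stdio.h>

int main(void)
{
        unsigned long long gigs = 1;                    /* 1 GiB block group */
        unsigned long long max_bytes = 32768ULL * gigs; /* memory budget */
        int total_bitmaps = 1;                          /* one bitmap page */
        long long extent_bytes = (long long)max_bytes - total_bitmaps * 4096LL;

        printf("budget %llu, left for extents %lld -> extents_thresh %lld\n",
               max_bytes, extent_bytes, extent_bytes / 48);
        return 0;
}

That prints a threshold of 597 entries: every bitmap page allocated eats directly into the budget for plain extent entries.
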
289
290static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
291 struct btrfs_free_space *info, u64 offset,
292 u64 bytes)
293{
294 unsigned long start, end;
295 unsigned long i;
296
297 start = offset_to_bit(info->offset, block_group->sectorsize, offset);
298 end = start + bytes_to_bits(bytes, block_group->sectorsize);
299 BUG_ON(end > BITS_PER_BITMAP);
300
301 for (i = start; i < end; i++)
302 clear_bit(i, info->bitmap);
303
304 info->bytes -= bytes;
305 block_group->free_space -= bytes;
306}
307
308static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
309 struct btrfs_free_space *info, u64 offset,
310 u64 bytes)
311{
312 unsigned long start, end;
313 unsigned long i;
314
315 start = offset_to_bit(info->offset, block_group->sectorsize, offset);
316 end = start + bytes_to_bits(bytes, block_group->sectorsize);
317 BUG_ON(end > BITS_PER_BITMAP);
318
319 for (i = start; i < end; i++)
320 set_bit(i, info->bitmap);
321
322 info->bytes += bytes;
323 block_group->free_space += bytes;
324}
325
326static int search_bitmap(struct btrfs_block_group_cache *block_group,
327 struct btrfs_free_space *bitmap_info, u64 *offset,
328 u64 *bytes)
329{
330 unsigned long found_bits = 0;
331 unsigned long bits, i;
332 unsigned long next_zero;
333
334 i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
335 max_t(u64, *offset, bitmap_info->offset));
336 bits = bytes_to_bits(*bytes, block_group->sectorsize);
337
338 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
339 i < BITS_PER_BITMAP;
340 i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
341 next_zero = find_next_zero_bit(bitmap_info->bitmap,
342 BITS_PER_BITMAP, i);
343 if ((next_zero - i) >= bits) {
344 found_bits = next_zero - i;
345 break;
346 }
347 i = next_zero;
348 }
349
350 if (found_bits) {
351 *offset = (u64)(i * block_group->sectorsize) +
352 bitmap_info->offset;
353 *bytes = (u64)(found_bits) * block_group->sectorsize;
354 return 0;
355 }
356
357 return -1;
358}
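
search_bitmap() above alternates find_next_bit()/find_next_zero_bit() to find the first run of set bits long enough for the request. The same scan over a plain byte array, condensed (a sketch with hand-rolled bit tests, not the kernel bitops):

#include <stdio.h>

static int find_run(const unsigned char *bits, int nbits, int want,
                    int *run_start)
{
        int i = 0;

        while (i < nbits) {
                if (!((bits[i / 8] >> (i % 8)) & 1)) {  /* find_next_bit */
                        i++;
                        continue;
                }
                int z = i;
                while (z < nbits && ((bits[z / 8] >> (z % 8)) & 1))
                        z++;                            /* find_next_zero_bit */
                if (z - i >= want) {
                        *run_start = i;
                        return z - i;
                }
                i = z;
        }
        return 0;
}

int main(void)
{
        unsigned char bm[2] = { 0xF0, 0x3F };   /* bits 4..13 set */
        int start = 0;
        int len = find_run(bm, 16, 8, &start);

        printf("run of %d bits at bit %d\n", len, start);       /* 10 at 4 */
        return 0;
}
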
359
360static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
361 *block_group, u64 *offset,
362 u64 *bytes, int debug)
363{
364 struct btrfs_free_space *entry;
365 struct rb_node *node;
366 int ret;
367
368 if (!block_group->free_space_offset.rb_node)
369 return NULL;
370
371 entry = tree_search_offset(block_group,
372 offset_to_bitmap(block_group, *offset),
373 0, 1);
374 if (!entry)
375 return NULL;
376
377 for (node = &entry->offset_index; node; node = rb_next(node)) {
378 entry = rb_entry(node, struct btrfs_free_space, offset_index);
379 if (entry->bytes < *bytes)
380 continue;
381
382 if (entry->bitmap) {
383 ret = search_bitmap(block_group, entry, offset, bytes);
384 if (!ret)
385 return entry;
386 continue;
387 }
388
389 *offset = entry->offset;
390 *bytes = entry->bytes;
391 return entry;
392 }
393
394 return NULL;
395}
396
397static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
398 struct btrfs_free_space *info, u64 offset)
399{
400 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
401 int max_bitmaps = (int)div64_u64(block_group->key.offset +
402 bytes_per_bg - 1, bytes_per_bg);
403 BUG_ON(block_group->total_bitmaps >= max_bitmaps);
404
405 info->offset = offset_to_bitmap(block_group, offset);
406 link_free_space(block_group, info);
407 block_group->total_bitmaps++;
408
409 recalculate_thresholds(block_group);
410}
411
412static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
413 struct btrfs_free_space *bitmap_info,
414 u64 *offset, u64 *bytes)
415{
416 u64 end;
417 u64 search_start, search_bytes;
418 int ret;
419
420again:
421 end = bitmap_info->offset +
422 (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
423
424 /*
425 * XXX - this can go away after a few releases.
426 *
427 * since the only user of btrfs_remove_free_space is the tree logging
428 * stuff, and the only way to test that is under crash conditions, we
429 * want to have this debug stuff here just in case something's not
430 * working. Search the bitmap for the space we are trying to use to
431 * make sure it's actually there. If it's not there then we need to stop
432 * because something has gone wrong.
433 */
434 search_start = *offset;
435 search_bytes = *bytes;
436 ret = search_bitmap(block_group, bitmap_info, &search_start,
437 &search_bytes);
438 BUG_ON(ret < 0 || search_start != *offset);
439
440 if (*offset > bitmap_info->offset && *offset + *bytes > end) {
441 bitmap_clear_bits(block_group, bitmap_info, *offset,
442 end - *offset + 1);
443 *bytes -= end - *offset + 1;
444 *offset = end + 1;
445 } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
446 bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
447 *bytes = 0;
448 }
449
450 if (*bytes) {
451 struct rb_node *next = rb_next(&bitmap_info->offset_index);
452 if (!bitmap_info->bytes) {
453 unlink_free_space(block_group, bitmap_info);
454 kfree(bitmap_info->bitmap);
455 kfree(bitmap_info);
456 block_group->total_bitmaps--;
457 recalculate_thresholds(block_group);
458 }
459
460 /*
461 * no entry after this bitmap, but we still have bytes to
462 * remove, so something has gone wrong.
463 */
464 if (!next)
465 return -EINVAL;
466
467 bitmap_info = rb_entry(next, struct btrfs_free_space,
468 offset_index);
469
470 /*
471 * if the next entry isn't a bitmap we need to return to let the
472 * extent stuff do its work.
473 */
474 if (!bitmap_info->bitmap)
475 return -EAGAIN;
476
477 /*
478 * Ok the next item is a bitmap, but it may not actually hold
479 * the information for the rest of this free space stuff, so
480 * look for it, and if we don't find it return so we can try
481 * everything over again.
482 */
483 search_start = *offset;
484 search_bytes = *bytes;
485 ret = search_bitmap(block_group, bitmap_info, &search_start,
486 &search_bytes);
487 if (ret < 0 || search_start != *offset)
488 return -EAGAIN;
489
490 goto again;
491 } else if (!bitmap_info->bytes) {
492 unlink_free_space(block_group, bitmap_info);
493 kfree(bitmap_info->bitmap);
494 kfree(bitmap_info);
495 block_group->total_bitmaps--;
496 recalculate_thresholds(block_group);
497 }
498
499 return 0;
500}
501
502static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
503 struct btrfs_free_space *info)
504{
505 struct btrfs_free_space *bitmap_info;
506 int added = 0;
507 u64 bytes, offset, end;
508 int ret;
509
510 /*
511 * If we are below the extents threshold then we can add this as an
512 * extent, and don't have to deal with the bitmap
513 */
514 if (block_group->free_extents < block_group->extents_thresh &&
515 info->bytes > block_group->sectorsize * 4)
516 return 0;
517
518 /*
519 * some block groups are so tiny they can't be enveloped by a bitmap, so
520 * don't even bother to create a bitmap for this
521 */
522 if (BITS_PER_BITMAP * block_group->sectorsize >
523 block_group->key.offset)
524 return 0;
525
526 bytes = info->bytes;
527 offset = info->offset;
528
529again:
530 bitmap_info = tree_search_offset(block_group,
531 offset_to_bitmap(block_group, offset),
532 1, 0);
533 if (!bitmap_info) {
534 BUG_ON(added);
535 goto new_bitmap;
536 }
537
538 end = bitmap_info->offset +
539 (u64)(BITS_PER_BITMAP * block_group->sectorsize);
540
541 if (offset >= bitmap_info->offset && offset + bytes > end) {
542 bitmap_set_bits(block_group, bitmap_info, offset,
543 end - offset);
544 bytes -= end - offset;
545 offset = end;
546 added = 0;
547 } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
548 bitmap_set_bits(block_group, bitmap_info, offset, bytes);
549 bytes = 0;
550 } else {
551 BUG();
552 }
553
554 if (!bytes) {
555 ret = 1;
556 goto out;
557 } else
558 goto again;
559
560new_bitmap:
561 if (info && info->bitmap) {
562 add_new_bitmap(block_group, info, offset);
563 added = 1;
564 info = NULL;
565 goto again;
566 } else {
567 spin_unlock(&block_group->tree_lock);
568
569 /* no pre-allocated info, allocate a new one */
570 if (!info) {
571 info = kzalloc(sizeof(struct btrfs_free_space),
572 GFP_NOFS);
573 if (!info) {
574 spin_lock(&block_group->tree_lock);
575 ret = -ENOMEM;
576 goto out;
577 }
578 }
579
580 /* allocate the bitmap */
581 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
582 spin_lock(&block_group->tree_lock);
583 if (!info->bitmap) {
584 ret = -ENOMEM;
585 goto out;
586 }
587 goto again;
588 }
589
590out:
591 if (info) {
592 if (info->bitmap)
593 kfree(info->bitmap);
594 kfree(info);
595 }
204 596
205 return ret; 597 return ret;
206} 598}
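
One subtlety in insert_into_bitmap() above: kzalloc(GFP_NOFS) can sleep, and tree_lock is a spinlock, so the allocation happens with the lock dropped, and the search restarts ("goto again") once it is retaken, since the tree may have changed in between. The shape of that dance in user-space terms (illustrative names only):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* drop the lock to allocate, retake it, and let the caller revalidate;
 * mirrors spin_unlock / kzalloc / spin_lock / goto again */
static void *alloc_outside_lock(size_t sz)
{
        void *p;

        pthread_mutex_unlock(&tree_lock);       /* spin_unlock(&tree_lock) */
        p = calloc(1, sz);                      /* kzalloc(..., GFP_NOFS) */
        pthread_mutex_lock(&tree_lock);         /* spin_lock(&tree_lock) */
        return p;                               /* caller must re-check state */
}

int main(void)
{
        void *bitmap;
        int ok;

        pthread_mutex_lock(&tree_lock);
        bitmap = alloc_outside_lock(4096);
        ok = bitmap != NULL;
        pthread_mutex_unlock(&tree_lock);
        free(bitmap);
        return ok ? 0 : 1;
}
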
@@ -208,8 +600,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 600int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
209 u64 offset, u64 bytes) 601 u64 offset, u64 bytes)
210{ 602{
211 struct btrfs_free_space *right_info; 603 struct btrfs_free_space *right_info = NULL;
212 struct btrfs_free_space *left_info; 604 struct btrfs_free_space *left_info = NULL;
213 struct btrfs_free_space *info = NULL; 605 struct btrfs_free_space *info = NULL;
214 int ret = 0; 606 int ret = 0;
215 607
@@ -227,18 +619,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
227 * are adding, if there is remove that struct and add a new one to 619 * are adding, if there is remove that struct and add a new one to
228 * cover the entire range 620 * cover the entire range
229 */ 621 */
230 right_info = tree_search_offset(&block_group->free_space_offset, 622 right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
231 offset+bytes, 0, 0); 623 if (right_info && rb_prev(&right_info->offset_index))
232 left_info = tree_search_offset(&block_group->free_space_offset, 624 left_info = rb_entry(rb_prev(&right_info->offset_index),
233 offset-1, 0, 1); 625 struct btrfs_free_space, offset_index);
626 else
627 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
628
629 /*
630 * If there was no extent directly to the left or right of this new
631 * extent then we know we're going to have to allocate a new extent, so
632 * before we do that see if we need to drop this into a bitmap
633 */
634 if ((!left_info || left_info->bitmap) &&
635 (!right_info || right_info->bitmap)) {
636 ret = insert_into_bitmap(block_group, info);
637
638 if (ret < 0) {
639 goto out;
640 } else if (ret) {
641 ret = 0;
642 goto out;
643 }
644 }
234 645
235 if (right_info) { 646 if (right_info && !right_info->bitmap) {
236 unlink_free_space(block_group, right_info); 647 unlink_free_space(block_group, right_info);
237 info->bytes += right_info->bytes; 648 info->bytes += right_info->bytes;
238 kfree(right_info); 649 kfree(right_info);
239 } 650 }
240 651
241 if (left_info && left_info->offset + left_info->bytes == offset) { 652 if (left_info && !left_info->bitmap &&
653 left_info->offset + left_info->bytes == offset) {
242 unlink_free_space(block_group, left_info); 654 unlink_free_space(block_group, left_info);
243 info->offset = left_info->offset; 655 info->offset = left_info->offset;
244 info->bytes += left_info->bytes; 656 info->bytes += left_info->bytes;
@@ -248,11 +660,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
248 ret = link_free_space(block_group, info); 660 ret = link_free_space(block_group, info);
249 if (ret) 661 if (ret)
250 kfree(info); 662 kfree(info);
251 663out:
252 spin_unlock(&block_group->tree_lock); 664 spin_unlock(&block_group->tree_lock);
253 665
254 if (ret) { 666 if (ret) {
255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 667 printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
256 BUG_ON(ret == -EEXIST); 668 BUG_ON(ret == -EEXIST);
257 } 669 }
258 670
@@ -263,40 +675,74 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
263 u64 offset, u64 bytes) 675 u64 offset, u64 bytes)
264{ 676{
265 struct btrfs_free_space *info; 677 struct btrfs_free_space *info;
678 struct btrfs_free_space *next_info = NULL;
266 int ret = 0; 679 int ret = 0;
267 680
268 spin_lock(&block_group->tree_lock); 681 spin_lock(&block_group->tree_lock);
269 682
270 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 683again:
271 1); 684 info = tree_search_offset(block_group, offset, 0, 0);
272 if (info && info->offset == offset) { 685 if (!info) {
273 if (info->bytes < bytes) { 686 /*
274 printk(KERN_ERR "Found free space at %llu, size %llu," 687 * oops didn't find an extent that matched the space we wanted
275 "trying to use %llu\n", 688 * to remove, look for a bitmap instead
276 (unsigned long long)info->offset, 689 */
277 (unsigned long long)info->bytes, 690 info = tree_search_offset(block_group,
278 (unsigned long long)bytes); 691 offset_to_bitmap(block_group, offset),
692 1, 0);
693 if (!info) {
694 WARN_ON(1);
695 goto out_lock;
696 }
697 }
698
699 if (info->bytes < bytes && rb_next(&info->offset_index)) {
700 u64 end;
701 next_info = rb_entry(rb_next(&info->offset_index),
702 struct btrfs_free_space,
703 offset_index);
704
705 if (next_info->bitmap)
706 end = next_info->offset + BITS_PER_BITMAP *
707 block_group->sectorsize - 1;
708 else
709 end = next_info->offset + next_info->bytes;
710
711 if (next_info->bytes < bytes ||
712 next_info->offset > offset || offset > end) {
713 printk(KERN_CRIT "Found free space at %llu, size %llu,"
714 " trying to use %llu\n",
715 (unsigned long long)info->offset,
716 (unsigned long long)info->bytes,
717 (unsigned long long)bytes);
279 WARN_ON(1); 718 WARN_ON(1);
280 ret = -EINVAL; 719 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock); 720 goto out_lock;
282 goto out;
283 } 721 }
284 unlink_free_space(block_group, info);
285 722
286 if (info->bytes == bytes) { 723 info = next_info;
287 kfree(info); 724 }
288 spin_unlock(&block_group->tree_lock); 725
289 goto out; 726 if (info->bytes == bytes) {
727 unlink_free_space(block_group, info);
728 if (info->bitmap) {
729 kfree(info->bitmap);
730 block_group->total_bitmaps--;
290 } 731 }
732 kfree(info);
733 goto out_lock;
734 }
291 735
736 if (!info->bitmap && info->offset == offset) {
737 unlink_free_space(block_group, info);
292 info->offset += bytes; 738 info->offset += bytes;
293 info->bytes -= bytes; 739 info->bytes -= bytes;
740 link_free_space(block_group, info);
741 goto out_lock;
742 }
294 743
295 ret = link_free_space(block_group, info); 744 if (!info->bitmap && info->offset <= offset &&
296 spin_unlock(&block_group->tree_lock); 745 info->offset + info->bytes >= offset + bytes) {
297 BUG_ON(ret);
298 } else if (info && info->offset < offset &&
299 info->offset + info->bytes >= offset + bytes) {
300 u64 old_start = info->offset; 746 u64 old_start = info->offset;
301 /* 747 /*
302 * we're freeing space in the middle of the info, 748 * we're freeing space in the middle of the info,
@@ -312,7 +758,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
312 info->offset = offset + bytes; 758 info->offset = offset + bytes;
313 info->bytes = old_end - info->offset; 759 info->bytes = old_end - info->offset;
314 ret = link_free_space(block_group, info); 760 ret = link_free_space(block_group, info);
315 BUG_ON(ret); 761 WARN_ON(ret);
762 if (ret)
763 goto out_lock;
316 } else { 764 } else {
317 /* the hole we're creating ends at the end 765 /* the hole we're creating ends at the end
318 * of the info struct, just free the info 766 * of the info struct, just free the info
@@ -320,32 +768,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
320 kfree(info); 768 kfree(info);
321 } 769 }
322 spin_unlock(&block_group->tree_lock); 770 spin_unlock(&block_group->tree_lock);
323 /* step two, insert a new info struct to cover anything 771
324 * before the hole 772 /* step two, insert a new info struct to cover
773 * anything before the hole
325 */ 774 */
326 ret = btrfs_add_free_space(block_group, old_start, 775 ret = btrfs_add_free_space(block_group, old_start,
327 offset - old_start); 776 offset - old_start);
328 BUG_ON(ret); 777 WARN_ON(ret);
329 } else { 778 goto out;
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached,
336 (unsigned long long)block_group->key.objectid,
337 (unsigned long long)block_group->key.offset);
338 btrfs_dump_free_space(block_group, bytes);
339 } else if (info) {
340 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
341 "but wanted offset=%llu bytes=%llu\n",
342 (unsigned long long)info->offset,
343 (unsigned long long)info->bytes,
344 (unsigned long long)offset,
345 (unsigned long long)bytes);
346 }
347 WARN_ON(1);
348 } 779 }
780
781 ret = remove_from_bitmap(block_group, info, &offset, &bytes);
782 if (ret == -EAGAIN)
783 goto again;
784 BUG_ON(ret);
785out_lock:
786 spin_unlock(&block_group->tree_lock);
349out: 787out:
350 return ret; 788 return ret;
351} 789}
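
The "step one / step two" path above handles freeing a hole out of the middle of an extent entry: the entry is trimmed down to the tail after the hole, and the head before the hole is re-added through btrfs_add_free_space(). Worked with concrete numbers:

#include <stdio.h>

int main(void)
{
        unsigned long long info_off = 0, info_bytes = 16384;    /* free extent */
        unsigned long long off = 4096, bytes = 4096;            /* hole to punch */
        unsigned long long old_start = info_off;
        unsigned long long old_end = info_off + info_bytes;

        info_off = off + bytes;                 /* step one: keep the tail */
        info_bytes = old_end - info_off;
        printf("tail: %llu+%llu\n", info_off, info_bytes);       /* 8192+8192 */
        printf("head: %llu+%llu\n", old_start, off - old_start); /* 0+4096 */
        return 0;
}
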
@@ -361,10 +799,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
361 info = rb_entry(n, struct btrfs_free_space, offset_index); 799 info = rb_entry(n, struct btrfs_free_space, offset_index);
362 if (info->bytes >= bytes) 800 if (info->bytes >= bytes)
363 count++; 801 count++;
364 printk(KERN_ERR "entry offset %llu, bytes %llu\n", 802 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
365 (unsigned long long)info->offset, 803 (unsigned long long)info->offset,
366 (unsigned long long)info->bytes); 804 (unsigned long long)info->bytes,
805 (info->bitmap) ? "yes" : "no");
367 } 806 }
807 printk(KERN_INFO "block group has cluster?: %s\n",
808 list_empty(&block_group->cluster_list) ? "no" : "yes");
368 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 809 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
369 "\n", count); 810 "\n", count);
370} 811}
@@ -397,26 +838,35 @@ __btrfs_return_cluster_to_free_space(
397{ 838{
398 struct btrfs_free_space *entry; 839 struct btrfs_free_space *entry;
399 struct rb_node *node; 840 struct rb_node *node;
841 bool bitmap;
400 842
401 spin_lock(&cluster->lock); 843 spin_lock(&cluster->lock);
402 if (cluster->block_group != block_group) 844 if (cluster->block_group != block_group)
403 goto out; 845 goto out;
404 846
847 bitmap = cluster->points_to_bitmap;
848 cluster->block_group = NULL;
405 cluster->window_start = 0; 849 cluster->window_start = 0;
850 list_del_init(&cluster->block_group_list);
851 cluster->points_to_bitmap = false;
852
853 if (bitmap)
854 goto out;
855
406 node = rb_first(&cluster->root); 856 node = rb_first(&cluster->root);
407 while(node) { 857 while (node) {
408 entry = rb_entry(node, struct btrfs_free_space, offset_index); 858 entry = rb_entry(node, struct btrfs_free_space, offset_index);
409 node = rb_next(&entry->offset_index); 859 node = rb_next(&entry->offset_index);
410 rb_erase(&entry->offset_index, &cluster->root); 860 rb_erase(&entry->offset_index, &cluster->root);
411 link_free_space(block_group, entry); 861 BUG_ON(entry->bitmap);
862 tree_insert_offset(&block_group->free_space_offset,
863 entry->offset, &entry->offset_index, 0);
412 } 864 }
413 list_del_init(&cluster->block_group_list);
414
415 btrfs_put_block_group(cluster->block_group);
416 cluster->block_group = NULL;
417 cluster->root.rb_node = NULL; 865 cluster->root.rb_node = NULL;
866
418out: 867out:
419 spin_unlock(&cluster->lock); 868 spin_unlock(&cluster->lock);
869 btrfs_put_block_group(block_group);
420 return 0; 870 return 0;
421} 871}
422 872
@@ -425,20 +875,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
425 struct btrfs_free_space *info; 875 struct btrfs_free_space *info;
426 struct rb_node *node; 876 struct rb_node *node;
427 struct btrfs_free_cluster *cluster; 877 struct btrfs_free_cluster *cluster;
428 struct btrfs_free_cluster *safe; 878 struct list_head *head;
429 879
430 spin_lock(&block_group->tree_lock); 880 spin_lock(&block_group->tree_lock);
431 881 while ((head = block_group->cluster_list.next) !=
432 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, 882 &block_group->cluster_list) {
433 block_group_list) { 883 cluster = list_entry(head, struct btrfs_free_cluster,
884 block_group_list);
434 885
435 WARN_ON(cluster->block_group != block_group); 886 WARN_ON(cluster->block_group != block_group);
436 __btrfs_return_cluster_to_free_space(block_group, cluster); 887 __btrfs_return_cluster_to_free_space(block_group, cluster);
888 if (need_resched()) {
889 spin_unlock(&block_group->tree_lock);
890 cond_resched();
891 spin_lock(&block_group->tree_lock);
892 }
437 } 893 }
438 894
439 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 895 while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
440 info = rb_entry(node, struct btrfs_free_space, bytes_index); 896 info = rb_entry(node, struct btrfs_free_space, offset_index);
441 unlink_free_space(block_group, info); 897 unlink_free_space(block_group, info);
898 if (info->bitmap)
899 kfree(info->bitmap);
442 kfree(info); 900 kfree(info);
443 if (need_resched()) { 901 if (need_resched()) {
444 spin_unlock(&block_group->tree_lock); 902 spin_unlock(&block_group->tree_lock);
@@ -446,6 +904,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
446 spin_lock(&block_group->tree_lock); 904 spin_lock(&block_group->tree_lock);
447 } 905 }
448 } 906 }
907
449 spin_unlock(&block_group->tree_lock); 908 spin_unlock(&block_group->tree_lock);
450} 909}
451 910
@@ -453,25 +912,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
453 u64 offset, u64 bytes, u64 empty_size) 912 u64 offset, u64 bytes, u64 empty_size)
454{ 913{
455 struct btrfs_free_space *entry = NULL; 914 struct btrfs_free_space *entry = NULL;
915 u64 bytes_search = bytes + empty_size;
456 u64 ret = 0; 916 u64 ret = 0;
457 917
458 spin_lock(&block_group->tree_lock); 918 spin_lock(&block_group->tree_lock);
459 entry = tree_search_offset(&block_group->free_space_offset, offset, 919 entry = find_free_space(block_group, &offset, &bytes_search, 0);
460 bytes + empty_size, 1);
461 if (!entry) 920 if (!entry)
462 entry = tree_search_bytes(&block_group->free_space_bytes, 921 goto out;
463 offset, bytes + empty_size); 922
464 if (entry) { 923 ret = offset;
924 if (entry->bitmap) {
925 bitmap_clear_bits(block_group, entry, offset, bytes);
926 if (!entry->bytes) {
927 unlink_free_space(block_group, entry);
928 kfree(entry->bitmap);
929 kfree(entry);
930 block_group->total_bitmaps--;
931 recalculate_thresholds(block_group);
932 }
933 } else {
465 unlink_free_space(block_group, entry); 934 unlink_free_space(block_group, entry);
466 ret = entry->offset;
467 entry->offset += bytes; 935 entry->offset += bytes;
468 entry->bytes -= bytes; 936 entry->bytes -= bytes;
469
470 if (!entry->bytes) 937 if (!entry->bytes)
471 kfree(entry); 938 kfree(entry);
472 else 939 else
473 link_free_space(block_group, entry); 940 link_free_space(block_group, entry);
474 } 941 }
942
943out:
475 spin_unlock(&block_group->tree_lock); 944 spin_unlock(&block_group->tree_lock);
476 945
477 return ret; 946 return ret;
@@ -517,6 +986,54 @@ int btrfs_return_cluster_to_free_space(
517 return ret; 986 return ret;
518} 987}
519 988
989static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
990 struct btrfs_free_cluster *cluster,
991 u64 bytes, u64 min_start)
992{
993 struct btrfs_free_space *entry;
994 int err;
995 u64 search_start = cluster->window_start;
996 u64 search_bytes = bytes;
997 u64 ret = 0;
998
999 spin_lock(&block_group->tree_lock);
1000 spin_lock(&cluster->lock);
1001
1002 if (!cluster->points_to_bitmap)
1003 goto out;
1004
1005 if (cluster->block_group != block_group)
1006 goto out;
1007
1008 /*
1009 * search_start is the beginning of the bitmap, but at some point it may
1010 * be a good idea to point to the actual start of the free area in the
1011 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
1012 * to 1 to make sure we get the bitmap entry
1013 */
1014 entry = tree_search_offset(block_group,
1015 offset_to_bitmap(block_group, search_start),
1016 1, 0);
1017 if (!entry || !entry->bitmap)
1018 goto out;
1019
1020 search_start = min_start;
1021 search_bytes = bytes;
1022
1023 err = search_bitmap(block_group, entry, &search_start,
1024 &search_bytes);
1025 if (err)
1026 goto out;
1027
1028 ret = search_start;
1029 bitmap_clear_bits(block_group, entry, ret, bytes);
1030out:
1031 spin_unlock(&cluster->lock);
1032 spin_unlock(&block_group->tree_lock);
1033
1034 return ret;
1035}
1036
520/* 1037/*
521 * given a cluster, try to allocate 'bytes' from it, returns 0 1038 * given a cluster, try to allocate 'bytes' from it, returns 0
522 * if it couldn't find anything suitably large, or a logical disk offset 1039 * if it couldn't find anything suitably large, or a logical disk offset
@@ -530,6 +1047,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
530 struct rb_node *node; 1047 struct rb_node *node;
531 u64 ret = 0; 1048 u64 ret = 0;
532 1049
1050 if (cluster->points_to_bitmap)
1051 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
1052 min_start);
1053
533 spin_lock(&cluster->lock); 1054 spin_lock(&cluster->lock);
534 if (bytes > cluster->max_size) 1055 if (bytes > cluster->max_size)
535 goto out; 1056 goto out;
@@ -567,9 +1088,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
567 } 1088 }
568out: 1089out:
569 spin_unlock(&cluster->lock); 1090 spin_unlock(&cluster->lock);
1091
570 return ret; 1092 return ret;
571} 1093}
572 1094
1095static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1096 struct btrfs_free_space *entry,
1097 struct btrfs_free_cluster *cluster,
1098 u64 offset, u64 bytes, u64 min_bytes)
1099{
1100 unsigned long next_zero;
1101 unsigned long i;
1102 unsigned long search_bits;
1103 unsigned long total_bits;
1104 unsigned long found_bits;
1105 unsigned long start = 0;
1106 unsigned long total_found = 0;
1107 bool found = false;
1108
1109 i = offset_to_bit(entry->offset, block_group->sectorsize,
1110 max_t(u64, offset, entry->offset));
1111 search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1112 total_bits = bytes_to_bits(bytes, block_group->sectorsize);
1113
1114again:
1115 found_bits = 0;
1116 for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
1117 i < BITS_PER_BITMAP;
1118 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
1119 next_zero = find_next_zero_bit(entry->bitmap,
1120 BITS_PER_BITMAP, i);
1121 if (next_zero - i >= search_bits) {
1122 found_bits = next_zero - i;
1123 break;
1124 }
1125 i = next_zero;
1126 }
1127
1128 if (!found_bits)
1129 return -1;
1130
1131 if (!found) {
1132 start = i;
1133 found = true;
1134 }
1135
1136 total_found += found_bits;
1137
1138 if (cluster->max_size < found_bits * block_group->sectorsize)
1139 cluster->max_size = found_bits * block_group->sectorsize;
1140
1141 if (total_found < total_bits) {
1142 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
1143 if (i - start > total_bits * 2) {
1144 total_found = 0;
1145 cluster->max_size = 0;
1146 found = false;
1147 }
1148 goto again;
1149 }
1150
1151 cluster->window_start = start * block_group->sectorsize +
1152 entry->offset;
1153 cluster->points_to_bitmap = true;
1154
1155 return 0;
1156}
1157
573/* 1158/*
574 * here we try to find a cluster of blocks in a block group. The goal 1159 * here we try to find a cluster of blocks in a block group. The goal
575 * is to find at least bytes free and up to empty_size + bytes free. 1160 * is to find at least bytes free and up to empty_size + bytes free.
@@ -579,6 +1164,7 @@ out:
579 * it returns -enospc 1164 * it returns -enospc
580 */ 1165 */
581int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 1166int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
1167 struct btrfs_root *root,
582 struct btrfs_block_group_cache *block_group, 1168 struct btrfs_block_group_cache *block_group,
583 struct btrfs_free_cluster *cluster, 1169 struct btrfs_free_cluster *cluster,
584 u64 offset, u64 bytes, u64 empty_size) 1170 u64 offset, u64 bytes, u64 empty_size)
@@ -586,16 +1172,18 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
586 struct btrfs_free_space *entry = NULL; 1172 struct btrfs_free_space *entry = NULL;
587 struct rb_node *node; 1173 struct rb_node *node;
588 struct btrfs_free_space *next; 1174 struct btrfs_free_space *next;
589 struct btrfs_free_space *last; 1175 struct btrfs_free_space *last = NULL;
590 u64 min_bytes; 1176 u64 min_bytes;
591 u64 window_start; 1177 u64 window_start;
592 u64 window_free; 1178 u64 window_free;
593 u64 max_extent = 0; 1179 u64 max_extent = 0;
594 int total_retries = 0; 1180 bool found_bitmap = false;
595 int ret; 1181 int ret;
596 1182
597 /* for metadata, allow allocates with more holes */ 1183 /* for metadata, allow allocates with more holes */
598 if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 1184 if (btrfs_test_opt(root, SSD_SPREAD)) {
1185 min_bytes = bytes + empty_size;
1186 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
599 /* 1187 /*
600 * we want to do larger allocations when we are 1188 * we want to do larger allocations when we are
601 * flushing out the delayed refs, it helps prevent 1189 * flushing out the delayed refs, it helps prevent
@@ -617,53 +1205,90 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
617 goto out; 1205 goto out;
618 } 1206 }
619again: 1207again:
620 min_bytes = min(min_bytes, bytes + empty_size); 1208 entry = tree_search_offset(block_group, offset, found_bitmap, 1);
621 entry = tree_search_bytes(&block_group->free_space_bytes,
622 offset, min_bytes);
623 if (!entry) { 1209 if (!entry) {
624 ret = -ENOSPC; 1210 ret = -ENOSPC;
625 goto out; 1211 goto out;
626 } 1212 }
1213
1214 /*
1215 * If found_bitmap is true, we exhausted our search for extent entries,
1216 * and we just want to search all of the bitmaps that we can find, and
1217 * ignore any extent entries we find.
1218 */
1219 while (entry->bitmap || found_bitmap ||
1220 (!entry->bitmap && entry->bytes < min_bytes)) {
1221 struct rb_node *node = rb_next(&entry->offset_index);
1222
1223 if (entry->bitmap && entry->bytes > bytes + empty_size) {
1224 ret = btrfs_bitmap_cluster(block_group, entry, cluster,
1225 offset, bytes + empty_size,
1226 min_bytes);
1227 if (!ret)
1228 goto got_it;
1229 }
1230
1231 if (!node) {
1232 ret = -ENOSPC;
1233 goto out;
1234 }
1235 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1236 }
1237
1238 /*
1239 * We already searched all the extent entries from the passed in offset
1240 * to the end and didn't find enough space for the cluster, and we also
1241 * didn't find any bitmaps that met our criteria, so just go ahead and exit
1242 */
1243 if (found_bitmap) {
1244 ret = -ENOSPC;
1245 goto out;
1246 }
1247
1248 cluster->points_to_bitmap = false;
627 window_start = entry->offset; 1249 window_start = entry->offset;
628 window_free = entry->bytes; 1250 window_free = entry->bytes;
629 last = entry; 1251 last = entry;
630 max_extent = entry->bytes; 1252 max_extent = entry->bytes;
631 1253
632 while(1) { 1254 while (1) {
633 /* our window is just right, let's fill it */ 1255 /* our window is just right, let's fill it */
634 if (window_free >= bytes + empty_size) 1256 if (window_free >= bytes + empty_size)
635 break; 1257 break;
636 1258
637 node = rb_next(&last->offset_index); 1259 node = rb_next(&last->offset_index);
638 if (!node) { 1260 if (!node) {
1261 if (found_bitmap)
1262 goto again;
639 ret = -ENOSPC; 1263 ret = -ENOSPC;
640 goto out; 1264 goto out;
641 } 1265 }
642 next = rb_entry(node, struct btrfs_free_space, offset_index); 1266 next = rb_entry(node, struct btrfs_free_space, offset_index);
643 1267
644 /* 1268 /*
1269 * we found a bitmap, so if this search doesn't result in a
 1270 * cluster, we know to go back and search the bitmaps and
1271 * start looking for space there
1272 */
1273 if (next->bitmap) {
1274 if (!found_bitmap)
1275 offset = next->offset;
1276 found_bitmap = true;
1277 last = next;
1278 continue;
1279 }
1280
1281 /*
645 * we haven't filled the empty size and the window is 1282 * we haven't filled the empty size and the window is
646 * very large. reset and try again 1283 * very large. reset and try again
647 */ 1284 */
648 if (next->offset - window_start > (bytes + empty_size) * 2) { 1285 if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
1286 next->offset - window_start > (bytes + empty_size) * 2) {
649 entry = next; 1287 entry = next;
650 window_start = entry->offset; 1288 window_start = entry->offset;
651 window_free = entry->bytes; 1289 window_free = entry->bytes;
652 last = entry; 1290 last = entry;
653 max_extent = 0; 1291 max_extent = 0;
654 total_retries++;
655 if (total_retries % 256 == 0) {
656 if (min_bytes >= (bytes + empty_size)) {
657 ret = -ENOSPC;
658 goto out;
659 }
660 /*
661 * grow our allocation a bit, we're not having
662 * much luck
663 */
664 min_bytes *= 2;
665 goto again;
666 }
667 } else { 1292 } else {
668 last = next; 1293 last = next;
669 window_free += next->bytes; 1294 window_free += next->bytes;
@@ -681,11 +1306,19 @@ again:
681 * The cluster includes an rbtree, but only uses the offset index 1306 * The cluster includes an rbtree, but only uses the offset index
682 * of each free space cache entry. 1307 * of each free space cache entry.
683 */ 1308 */
684 while(1) { 1309 while (1) {
685 node = rb_next(&entry->offset_index); 1310 node = rb_next(&entry->offset_index);
686 unlink_free_space(block_group, entry); 1311 if (entry->bitmap && node) {
1312 entry = rb_entry(node, struct btrfs_free_space,
1313 offset_index);
1314 continue;
1315 } else if (entry->bitmap && !node) {
1316 break;
1317 }
1318
1319 rb_erase(&entry->offset_index, &block_group->free_space_offset);
687 ret = tree_insert_offset(&cluster->root, entry->offset, 1320 ret = tree_insert_offset(&cluster->root, entry->offset,
688 &entry->offset_index); 1321 &entry->offset_index, 0);
689 BUG_ON(ret); 1322 BUG_ON(ret);
690 1323
691 if (!node || entry == last) 1324 if (!node || entry == last)
@@ -693,8 +1326,10 @@ again:
693 1326
694 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1327 entry = rb_entry(node, struct btrfs_free_space, offset_index);
695 } 1328 }
696 ret = 0; 1329
697 cluster->max_size = max_extent; 1330 cluster->max_size = max_extent;
1331got_it:
1332 ret = 0;
698 atomic_inc(&block_group->count); 1333 atomic_inc(&block_group->count);
699 list_add_tail(&cluster->block_group_list, &block_group->cluster_list); 1334 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
700 cluster->block_group = block_group; 1335 cluster->block_group = block_group;
@@ -714,6 +1349,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
714 spin_lock_init(&cluster->refill_lock); 1349 spin_lock_init(&cluster->refill_lock);
715 cluster->root.rb_node = NULL; 1350 cluster->root.rb_node = NULL;
716 cluster->max_size = 0; 1351 cluster->max_size = 0;
1352 cluster->points_to_bitmap = false;
717 INIT_LIST_HEAD(&cluster->block_group_list); 1353 INIT_LIST_HEAD(&cluster->block_group_list);
718 cluster->block_group = NULL; 1354 cluster->block_group = NULL;
719} 1355}
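A note on the updated interface: btrfs_find_space_cluster() now takes the btrfs_root so the allocator can consult mount options; with ssd_spread, min_bytes is forced to bytes + empty_size, so only a fully contiguous window (or a large enough bitmap) can satisfy the cluster. A minimal caller sketch, assuming the usual allocator locals (search_start, num_bytes and empty_cluster are illustrative names, not quoted from this patch):

	ret = btrfs_find_space_cluster(trans, root, block_group, cluster,
				       search_start, num_bytes, empty_cluster);
	if (ret == -ENOSPC) {
		/* no window or bitmap was big enough; fall back to a
		 * plain, non-clustered allocation */
	}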
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index ab0bdc0a63ce..890a8e79011b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -19,6 +19,14 @@
19#ifndef __BTRFS_FREE_SPACE_CACHE 19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE 20#define __BTRFS_FREE_SPACE_CACHE
21 21
22struct btrfs_free_space {
23 struct rb_node offset_index;
24 u64 offset;
25 u64 bytes;
26 unsigned long *bitmap;
27 struct list_head list;
28};
29
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 30int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size); 31 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 32int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
@@ -31,6 +39,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
31 u64 bytes); 39 u64 bytes);
32u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); 40u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
33int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 41int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
42 struct btrfs_root *root,
34 struct btrfs_block_group_cache *block_group, 43 struct btrfs_block_group_cache *block_group,
35 struct btrfs_free_cluster *cluster, 44 struct btrfs_free_cluster *cluster,
36 u64 offset, u64 bytes, u64 empty_size); 45 u64 offset, u64 bytes, u64 empty_size);
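struct btrfs_free_space moves into the header because entries now come in two forms: a plain extent (bitmap == NULL, with offset/bytes describing one contiguous run) and a bitmap entry, where each set bit marks one free sectorsize block starting at offset. A hedged illustration of the bit arithmetic; mark_free_bits is a hypothetical helper, not the patch's own:

	static void mark_free_bits(struct btrfs_free_space *info,
				   u64 start, u64 bytes, u32 sectorsize)
	{
		unsigned long i = (start - info->offset) / sectorsize;
		unsigned long nr = bytes / sectorsize;

		/* non-atomic __set_bit: the caller holds the tree lock */
		for (; nr; nr--, i++)
			__set_bit(i, info->bitmap);
		info->bytes += bytes;
	}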
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index 2a020b276768..db2ff9773b99 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -19,9 +19,9 @@
19#ifndef __HASH__ 19#ifndef __HASH__
20#define __HASH__ 20#define __HASH__
21 21
22#include "crc32c.h" 22#include <linux/crc32c.h>
23static inline u64 btrfs_name_hash(const char *name, int len) 23static inline u64 btrfs_name_hash(const char *name, int len)
24{ 24{
25 return btrfs_crc32c((u32)~1, name, len); 25 return crc32c((u32)~1, name, len);
26} 26}
27#endif 27#endif
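With the private wrapper gone, the name hash is plain CRC32C seeded with (u32)~1 and no final inversion, matching the kernel's crc32c(seed, data, len) semantics. A userspace sketch that reproduces the same values bit-by-bit with the Castagnoli polynomial (crc32c_sw and name_hash are illustrative names):

	#include <stdint.h>
	#include <stddef.h>

	static uint32_t crc32c_sw(uint32_t crc, const void *buf, size_t len)
	{
		const unsigned char *p = buf;

		while (len--) {
			crc ^= *p++;
			for (int k = 0; k < 8; k++)
				/* reflected Castagnoli polynomial */
				crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78 : 0);
		}
		return crc;
	}

	static uint64_t name_hash(const char *name, int len)
	{
		return crc32c_sw((uint32_t)~1, name, len);
	}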
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1c8b0190d031..59cba180fe83 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -26,7 +26,6 @@
26#include <linux/time.h> 26#include <linux/time.h>
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
31#include <linux/mpage.h> 30#include <linux/mpage.h>
32#include <linux/swap.h> 31#include <linux/swap.h>
@@ -48,7 +47,6 @@
48#include "ordered-data.h" 47#include "ordered-data.h"
49#include "xattr.h" 48#include "xattr.h"
50#include "tree-log.h" 49#include "tree-log.h"
51#include "ref-cache.h"
52#include "compression.h" 50#include "compression.h"
53#include "locking.h" 51#include "locking.h"
54 52
@@ -369,7 +367,7 @@ again:
369 * inode has not been flagged as nocompress. This flag can 367 * inode has not been flagged as nocompress. This flag can
370 * change at any time if we discover bad compression ratios. 368 * change at any time if we discover bad compression ratios.
371 */ 369 */
372 if (!btrfs_test_flag(inode, NOCOMPRESS) && 370 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
373 btrfs_test_opt(root, COMPRESS)) { 371 btrfs_test_opt(root, COMPRESS)) {
374 WARN_ON(pages); 372 WARN_ON(pages);
375 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 373 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
@@ -470,7 +468,7 @@ again:
470 nr_pages_ret = 0; 468 nr_pages_ret = 0;
471 469
472 /* flag the file so we don't compress in the future */ 470 /* flag the file so we don't compress in the future */
473 btrfs_set_flag(inode, NOCOMPRESS); 471 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
474 } 472 }
475 if (will_compress) { 473 if (will_compress) {
476 *num_added += 1; 474 *num_added += 1;
@@ -863,7 +861,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
863 async_cow->locked_page = locked_page; 861 async_cow->locked_page = locked_page;
864 async_cow->start = start; 862 async_cow->start = start;
865 863
866 if (btrfs_test_flag(inode, NOCOMPRESS)) 864 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
867 cur_end = end; 865 cur_end = end;
868 else 866 else
869 cur_end = min(end, start + 512 * 1024 - 1); 867 cur_end = min(end, start + 512 * 1024 - 1);
@@ -944,6 +942,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
944 u64 cow_start; 942 u64 cow_start;
945 u64 cur_offset; 943 u64 cur_offset;
946 u64 extent_end; 944 u64 extent_end;
945 u64 extent_offset;
947 u64 disk_bytenr; 946 u64 disk_bytenr;
948 u64 num_bytes; 947 u64 num_bytes;
949 int extent_type; 948 int extent_type;
@@ -1005,6 +1004,7 @@ next_slot:
1005 if (extent_type == BTRFS_FILE_EXTENT_REG || 1004 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1006 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1005 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1007 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1006 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1007 extent_offset = btrfs_file_extent_offset(leaf, fi);
1008 extent_end = found_key.offset + 1008 extent_end = found_key.offset +
1009 btrfs_file_extent_num_bytes(leaf, fi); 1009 btrfs_file_extent_num_bytes(leaf, fi);
1010 if (extent_end <= start) { 1010 if (extent_end <= start) {
@@ -1022,9 +1022,10 @@ next_slot:
1022 if (btrfs_extent_readonly(root, disk_bytenr)) 1022 if (btrfs_extent_readonly(root, disk_bytenr))
1023 goto out_check; 1023 goto out_check;
1024 if (btrfs_cross_ref_exist(trans, root, inode->i_ino, 1024 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
1025 disk_bytenr)) 1025 found_key.offset -
1026 extent_offset, disk_bytenr))
1026 goto out_check; 1027 goto out_check;
1027 disk_bytenr += btrfs_file_extent_offset(leaf, fi); 1028 disk_bytenr += extent_offset;
1028 disk_bytenr += cur_offset - found_key.offset; 1029 disk_bytenr += cur_offset - found_key.offset;
1029 num_bytes = min(end + 1, extent_end) - cur_offset; 1030 num_bytes = min(end + 1, extent_end) - cur_offset;
1030 /* 1031 /*
@@ -1131,10 +1132,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1131 int ret; 1132 int ret;
1132 struct btrfs_root *root = BTRFS_I(inode)->root; 1133 struct btrfs_root *root = BTRFS_I(inode)->root;
1133 1134
1134 if (btrfs_test_flag(inode, NODATACOW)) 1135 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
1135 ret = run_delalloc_nocow(inode, locked_page, start, end, 1136 ret = run_delalloc_nocow(inode, locked_page, start, end,
1136 page_started, 1, nr_written); 1137 page_started, 1, nr_written);
1137 else if (btrfs_test_flag(inode, PREALLOC)) 1138 else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
1138 ret = run_delalloc_nocow(inode, locked_page, start, end, 1139 ret = run_delalloc_nocow(inode, locked_page, start, end,
1139 page_started, 0, nr_written); 1140 page_started, 0, nr_written);
1140 else if (!btrfs_test_opt(root, COMPRESS)) 1141 else if (!btrfs_test_opt(root, COMPRESS))
@@ -1288,7 +1289,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1288 int ret = 0; 1289 int ret = 0;
1289 int skip_sum; 1290 int skip_sum;
1290 1291
1291 skip_sum = btrfs_test_flag(inode, NODATASUM); 1292 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1292 1293
1293 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1294 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1294 BUG_ON(ret); 1295 BUG_ON(ret);
@@ -1489,9 +1490,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1489 ins.objectid = disk_bytenr; 1490 ins.objectid = disk_bytenr;
1490 ins.offset = disk_num_bytes; 1491 ins.offset = disk_num_bytes;
1491 ins.type = BTRFS_EXTENT_ITEM_KEY; 1492 ins.type = BTRFS_EXTENT_ITEM_KEY;
1492 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, 1493 ret = btrfs_alloc_reserved_file_extent(trans, root,
1493 root->root_key.objectid, 1494 root->root_key.objectid,
1494 trans->transid, inode->i_ino, &ins); 1495 inode->i_ino, file_pos, &ins);
1495 BUG_ON(ret); 1496 BUG_ON(ret);
1496 btrfs_free_path(path); 1497 btrfs_free_path(path);
1497 1498
@@ -1788,7 +1789,8 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1788 ClearPageChecked(page); 1789 ClearPageChecked(page);
1789 goto good; 1790 goto good;
1790 } 1791 }
1791 if (btrfs_test_flag(inode, NODATASUM)) 1792
1793 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
1792 return 0; 1794 return 0;
1793 1795
1794 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 1796 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
@@ -1956,23 +1958,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
1956 * crossing root thing. we store the inode number in the 1958 * crossing root thing. we store the inode number in the
1957 * offset of the orphan item. 1959 * offset of the orphan item.
1958 */ 1960 */
1959 inode = btrfs_iget_locked(root->fs_info->sb, 1961 found_key.objectid = found_key.offset;
1960 found_key.offset, root); 1962 found_key.type = BTRFS_INODE_ITEM_KEY;
1961 if (!inode) 1963 found_key.offset = 0;
1964 inode = btrfs_iget(root->fs_info->sb, &found_key, root);
1965 if (IS_ERR(inode))
1962 break; 1966 break;
1963 1967
1964 if (inode->i_state & I_NEW) {
1965 BTRFS_I(inode)->root = root;
1966
1967 /* have to set the location manually */
1968 BTRFS_I(inode)->location.objectid = inode->i_ino;
1969 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
1970 BTRFS_I(inode)->location.offset = 0;
1971
1972 btrfs_read_locked_inode(inode);
1973 unlock_new_inode(inode);
1974 }
1975
1976 /* 1968 /*
1977 * add this inode to the orphan list so btrfs_orphan_del does 1969 * add this inode to the orphan list so btrfs_orphan_del does
1978 * the proper thing when we hit it 1970 * the proper thing when we hit it
@@ -2069,7 +2061,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
2069/* 2061/*
2070 * read an inode from the btree into the in-memory inode 2062 * read an inode from the btree into the in-memory inode
2071 */ 2063 */
2072void btrfs_read_locked_inode(struct inode *inode) 2064static void btrfs_read_locked_inode(struct inode *inode)
2073{ 2065{
2074 struct btrfs_path *path; 2066 struct btrfs_path *path;
2075 struct extent_buffer *leaf; 2067 struct extent_buffer *leaf;
@@ -2129,10 +2121,8 @@ void btrfs_read_locked_inode(struct inode *inode)
2129 * any xattrs or acls 2121 * any xattrs or acls
2130 */ 2122 */
2131 maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); 2123 maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino);
2132 if (!maybe_acls) { 2124 if (!maybe_acls)
2133 BTRFS_I(inode)->i_acl = NULL; 2125 cache_no_acl(inode);
2134 BTRFS_I(inode)->i_default_acl = NULL;
2135 }
2136 2126
2137 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2127 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2138 alloc_group_block, 0); 2128 alloc_group_block, 0);
@@ -2164,6 +2154,8 @@ void btrfs_read_locked_inode(struct inode *inode)
2164 init_special_inode(inode, inode->i_mode, rdev); 2154 init_special_inode(inode, inode->i_mode, rdev);
2165 break; 2155 break;
2166 } 2156 }
2157
2158 btrfs_update_iflags(inode);
2167 return; 2159 return;
2168 2160
2169make_bad: 2161make_bad:
@@ -2327,7 +2319,6 @@ err:
2327 btrfs_update_inode(trans, root, dir); 2319 btrfs_update_inode(trans, root, dir);
2328 btrfs_drop_nlink(inode); 2320 btrfs_drop_nlink(inode);
2329 ret = btrfs_update_inode(trans, root, inode); 2321 ret = btrfs_update_inode(trans, root, inode);
2330 dir->i_sb->s_dirt = 1;
2331out: 2322out:
2332 return ret; 2323 return ret;
2333} 2324}
@@ -2599,9 +2590,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2599 struct btrfs_file_extent_item *fi; 2590 struct btrfs_file_extent_item *fi;
2600 u64 extent_start = 0; 2591 u64 extent_start = 0;
2601 u64 extent_num_bytes = 0; 2592 u64 extent_num_bytes = 0;
2593 u64 extent_offset = 0;
2602 u64 item_end = 0; 2594 u64 item_end = 0;
2603 u64 root_gen = 0;
2604 u64 root_owner = 0;
2605 int found_extent; 2595 int found_extent;
2606 int del_item; 2596 int del_item;
2607 int pending_del_nr = 0; 2597 int pending_del_nr = 0;
@@ -2613,8 +2603,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2613 if (root->ref_cows) 2603 if (root->ref_cows)
2614 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 2604 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2615 path = btrfs_alloc_path(); 2605 path = btrfs_alloc_path();
2616 path->reada = -1;
2617 BUG_ON(!path); 2606 BUG_ON(!path);
2607 path->reada = -1;
2618 2608
2619 /* FIXME, add redo link to tree so we don't leak on crash */ 2609 /* FIXME, add redo link to tree so we don't leak on crash */
2620 key.objectid = inode->i_ino; 2610 key.objectid = inode->i_ino;
@@ -2716,6 +2706,9 @@ search_again:
2716 extent_num_bytes = 2706 extent_num_bytes =
2717 btrfs_file_extent_disk_num_bytes(leaf, 2707 btrfs_file_extent_disk_num_bytes(leaf,
2718 fi); 2708 fi);
2709 extent_offset = found_key.offset -
2710 btrfs_file_extent_offset(leaf, fi);
2711
2719 /* FIXME blocksize != 4096 */ 2712 /* FIXME blocksize != 4096 */
2720 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 2713 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
2721 if (extent_start != 0) { 2714 if (extent_start != 0) {
@@ -2723,8 +2716,6 @@ search_again:
2723 if (root->ref_cows) 2716 if (root->ref_cows)
2724 inode_sub_bytes(inode, num_dec); 2717 inode_sub_bytes(inode, num_dec);
2725 } 2718 }
2726 root_gen = btrfs_header_generation(leaf);
2727 root_owner = btrfs_header_owner(leaf);
2728 } 2719 }
2729 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2720 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2730 /* 2721 /*
@@ -2768,12 +2759,12 @@ delete:
2768 } else { 2759 } else {
2769 break; 2760 break;
2770 } 2761 }
2771 if (found_extent) { 2762 if (found_extent && root->ref_cows) {
2772 btrfs_set_path_blocking(path); 2763 btrfs_set_path_blocking(path);
2773 ret = btrfs_free_extent(trans, root, extent_start, 2764 ret = btrfs_free_extent(trans, root, extent_start,
2774 extent_num_bytes, 2765 extent_num_bytes, 0,
2775 leaf->start, root_owner, 2766 btrfs_header_owner(leaf),
2776 root_gen, inode->i_ino, 0); 2767 inode->i_ino, extent_offset);
2777 BUG_ON(ret); 2768 BUG_ON(ret);
2778 } 2769 }
2779next: 2770next:
@@ -2811,7 +2802,6 @@ error:
2811 pending_del_nr); 2802 pending_del_nr);
2812 } 2803 }
2813 btrfs_free_path(path); 2804 btrfs_free_path(path);
2814 inode->i_sb->s_dirt = 1;
2815 return ret; 2805 return ret;
2816} 2806}
2817 2807
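The btrfs_free_extent() call in the hunk above follows the new backref addressing: parent is 0 for a keyed (non-shared) reference, and a data ref is identified by the owning tree, the inode number, and the file offset minus the extent item's internal offset. The same call with the arguments spelled out (a sketch; local names are illustrative):

	u64 parent = 0;				/* keyed ref, not shared */
	u64 ref_root = btrfs_header_owner(leaf);
	u64 owner = inode->i_ino;
	u64 offset = found_key.offset - btrfs_file_extent_offset(leaf, fi);

	ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes,
				parent, ref_root, owner, offset);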
@@ -3105,13 +3095,56 @@ static int fixup_tree_root_location(struct btrfs_root *root,
3105 return 0; 3095 return 0;
3106} 3096}
3107 3097
3098static void inode_tree_add(struct inode *inode)
3099{
3100 struct btrfs_root *root = BTRFS_I(inode)->root;
3101 struct btrfs_inode *entry;
3102 struct rb_node **p;
3103 struct rb_node *parent;
3104
3105again:
3106 p = &root->inode_tree.rb_node;
3107 parent = NULL;
3108
3109 spin_lock(&root->inode_lock);
3110 while (*p) {
3111 parent = *p;
3112 entry = rb_entry(parent, struct btrfs_inode, rb_node);
3113
3114 if (inode->i_ino < entry->vfs_inode.i_ino)
3115 p = &parent->rb_left;
3116 else if (inode->i_ino > entry->vfs_inode.i_ino)
3117 p = &parent->rb_right;
3118 else {
3119 WARN_ON(!(entry->vfs_inode.i_state &
3120 (I_WILL_FREE | I_FREEING | I_CLEAR)));
3121 rb_erase(parent, &root->inode_tree);
3122 RB_CLEAR_NODE(parent);
3123 spin_unlock(&root->inode_lock);
3124 goto again;
3125 }
3126 }
3127 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
3128 rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3129 spin_unlock(&root->inode_lock);
3130}
3131
3132static void inode_tree_del(struct inode *inode)
3133{
3134 struct btrfs_root *root = BTRFS_I(inode)->root;
3135
3136 spin_lock(&root->inode_lock);
3137 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
3138 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3139 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3140 }
3141 spin_unlock(&root->inode_lock);
3142}
3143
3108static noinline void init_btrfs_i(struct inode *inode) 3144static noinline void init_btrfs_i(struct inode *inode)
3109{ 3145{
3110 struct btrfs_inode *bi = BTRFS_I(inode); 3146 struct btrfs_inode *bi = BTRFS_I(inode);
3111 3147
3112 bi->i_acl = BTRFS_ACL_NOT_CACHED;
3113 bi->i_default_acl = BTRFS_ACL_NOT_CACHED;
3114
3115 bi->generation = 0; 3148 bi->generation = 0;
3116 bi->sequence = 0; 3149 bi->sequence = 0;
3117 bi->last_trans = 0; 3150 bi->last_trans = 0;
@@ -3130,6 +3163,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3130 inode->i_mapping, GFP_NOFS); 3163 inode->i_mapping, GFP_NOFS);
3131 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); 3164 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3132 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); 3165 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3166 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3133 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3167 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3134 mutex_init(&BTRFS_I(inode)->extent_mutex); 3168 mutex_init(&BTRFS_I(inode)->extent_mutex);
3135 mutex_init(&BTRFS_I(inode)->log_mutex); 3169 mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3152,26 +3186,9 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
3152 args->root == BTRFS_I(inode)->root; 3186 args->root == BTRFS_I(inode)->root;
3153} 3187}
3154 3188
3155struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, 3189static struct inode *btrfs_iget_locked(struct super_block *s,
3156 struct btrfs_root *root, int wait) 3190 u64 objectid,
3157{ 3191 struct btrfs_root *root)
3158 struct inode *inode;
3159 struct btrfs_iget_args args;
3160 args.ino = objectid;
3161 args.root = root;
3162
3163 if (wait) {
3164 inode = ilookup5(s, objectid, btrfs_find_actor,
3165 (void *)&args);
3166 } else {
3167 inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
3168 (void *)&args);
3169 }
3170 return inode;
3171}
3172
3173struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3174 struct btrfs_root *root)
3175{ 3192{
3176 struct inode *inode; 3193 struct inode *inode;
3177 struct btrfs_iget_args args; 3194 struct btrfs_iget_args args;
@@ -3188,24 +3205,21 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3188 * Returns in *is_new if the inode was read from disk 3205 * Returns in *is_new if the inode was read from disk
3189 */ 3206 */
3190struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 3207struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3191 struct btrfs_root *root, int *is_new) 3208 struct btrfs_root *root)
3192{ 3209{
3193 struct inode *inode; 3210 struct inode *inode;
3194 3211
3195 inode = btrfs_iget_locked(s, location->objectid, root); 3212 inode = btrfs_iget_locked(s, location->objectid, root);
3196 if (!inode) 3213 if (!inode)
3197 return ERR_PTR(-EACCES); 3214 return ERR_PTR(-ENOMEM);
3198 3215
3199 if (inode->i_state & I_NEW) { 3216 if (inode->i_state & I_NEW) {
3200 BTRFS_I(inode)->root = root; 3217 BTRFS_I(inode)->root = root;
3201 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 3218 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3202 btrfs_read_locked_inode(inode); 3219 btrfs_read_locked_inode(inode);
3220
3221 inode_tree_add(inode);
3203 unlock_new_inode(inode); 3222 unlock_new_inode(inode);
3204 if (is_new)
3205 *is_new = 1;
3206 } else {
3207 if (is_new)
3208 *is_new = 0;
3209 } 3223 }
3210 3224
3211 return inode; 3225 return inode;
@@ -3218,7 +3232,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3218 struct btrfs_root *root = bi->root; 3232 struct btrfs_root *root = bi->root;
3219 struct btrfs_root *sub_root = root; 3233 struct btrfs_root *sub_root = root;
3220 struct btrfs_key location; 3234 struct btrfs_key location;
3221 int ret, new; 3235 int ret;
3222 3236
3223 if (dentry->d_name.len > BTRFS_NAME_LEN) 3237 if (dentry->d_name.len > BTRFS_NAME_LEN)
3224 return ERR_PTR(-ENAMETOOLONG); 3238 return ERR_PTR(-ENAMETOOLONG);
@@ -3236,7 +3250,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3236 return ERR_PTR(ret); 3250 return ERR_PTR(ret);
3237 if (ret > 0) 3251 if (ret > 0)
3238 return ERR_PTR(-ENOENT); 3252 return ERR_PTR(-ENOENT);
3239 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); 3253 inode = btrfs_iget(dir->i_sb, &location, sub_root);
3240 if (IS_ERR(inode)) 3254 if (IS_ERR(inode))
3241 return ERR_CAST(inode); 3255 return ERR_CAST(inode);
3242 } 3256 }
@@ -3572,12 +3586,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3572 owner = 1; 3586 owner = 1;
3573 BTRFS_I(inode)->block_group = 3587 BTRFS_I(inode)->block_group =
3574 btrfs_find_block_group(root, 0, alloc_hint, owner); 3588 btrfs_find_block_group(root, 0, alloc_hint, owner);
3575 if ((mode & S_IFREG)) {
3576 if (btrfs_test_opt(root, NODATASUM))
3577 btrfs_set_flag(inode, NODATASUM);
3578 if (btrfs_test_opt(root, NODATACOW))
3579 btrfs_set_flag(inode, NODATACOW);
3580 }
3581 3589
3582 key[0].objectid = objectid; 3590 key[0].objectid = objectid;
3583 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 3591 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -3630,7 +3638,17 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3630 location->offset = 0; 3638 location->offset = 0;
3631 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 3639 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
3632 3640
3641 btrfs_inherit_iflags(inode, dir);
3642
3643 if ((mode & S_IFREG)) {
3644 if (btrfs_test_opt(root, NODATASUM))
3645 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
3646 if (btrfs_test_opt(root, NODATACOW))
3647 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
3648 }
3649
3633 insert_inode_hash(inode); 3650 insert_inode_hash(inode);
3651 inode_tree_add(inode);
3634 return inode; 3652 return inode;
3635fail: 3653fail:
3636 if (dir) 3654 if (dir)
@@ -3750,7 +3768,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3750 init_special_inode(inode, inode->i_mode, rdev); 3768 init_special_inode(inode, inode->i_mode, rdev);
3751 btrfs_update_inode(trans, root, inode); 3769 btrfs_update_inode(trans, root, inode);
3752 } 3770 }
3753 dir->i_sb->s_dirt = 1;
3754 btrfs_update_inode_block_group(trans, inode); 3771 btrfs_update_inode_block_group(trans, inode);
3755 btrfs_update_inode_block_group(trans, dir); 3772 btrfs_update_inode_block_group(trans, dir);
3756out_unlock: 3773out_unlock:
@@ -3815,7 +3832,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
3815 inode->i_op = &btrfs_file_inode_operations; 3832 inode->i_op = &btrfs_file_inode_operations;
3816 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 3833 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3817 } 3834 }
3818 dir->i_sb->s_dirt = 1;
3819 btrfs_update_inode_block_group(trans, inode); 3835 btrfs_update_inode_block_group(trans, inode);
3820 btrfs_update_inode_block_group(trans, dir); 3836 btrfs_update_inode_block_group(trans, dir);
3821out_unlock: 3837out_unlock:
@@ -3862,7 +3878,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3862 if (err) 3878 if (err)
3863 drop_inode = 1; 3879 drop_inode = 1;
3864 3880
3865 dir->i_sb->s_dirt = 1;
3866 btrfs_update_inode_block_group(trans, dir); 3881 btrfs_update_inode_block_group(trans, dir);
3867 err = btrfs_update_inode(trans, root, inode); 3882 err = btrfs_update_inode(trans, root, inode);
3868 3883
@@ -3944,7 +3959,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3944 3959
3945 d_instantiate(dentry, inode); 3960 d_instantiate(dentry, inode);
3946 drop_on_err = 0; 3961 drop_on_err = 0;
3947 dir->i_sb->s_dirt = 1;
3948 btrfs_update_inode_block_group(trans, inode); 3962 btrfs_update_inode_block_group(trans, inode);
3949 btrfs_update_inode_block_group(trans, dir); 3963 btrfs_update_inode_block_group(trans, dir);
3950 3964
@@ -4628,8 +4642,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4628 ei->last_trans = 0; 4642 ei->last_trans = 0;
4629 ei->logged_trans = 0; 4643 ei->logged_trans = 0;
4630 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 4644 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4631 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4632 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4633 INIT_LIST_HEAD(&ei->i_orphan); 4645 INIT_LIST_HEAD(&ei->i_orphan);
4634 INIT_LIST_HEAD(&ei->ordered_operations); 4646 INIT_LIST_HEAD(&ei->ordered_operations);
4635 return &ei->vfs_inode; 4647 return &ei->vfs_inode;
@@ -4643,13 +4655,6 @@ void btrfs_destroy_inode(struct inode *inode)
4643 WARN_ON(!list_empty(&inode->i_dentry)); 4655 WARN_ON(!list_empty(&inode->i_dentry));
4644 WARN_ON(inode->i_data.nrpages); 4656 WARN_ON(inode->i_data.nrpages);
4645 4657
4646 if (BTRFS_I(inode)->i_acl &&
4647 BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
4648 posix_acl_release(BTRFS_I(inode)->i_acl);
4649 if (BTRFS_I(inode)->i_default_acl &&
4650 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4651 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4652
4653 /* 4658 /*
4654 * Make sure we're properly removed from the ordered operation 4659 * Make sure we're properly removed from the ordered operation
4655 * lists. 4660 * lists.
@@ -4683,6 +4688,7 @@ void btrfs_destroy_inode(struct inode *inode)
4683 btrfs_put_ordered_extent(ordered); 4688 btrfs_put_ordered_extent(ordered);
4684 } 4689 }
4685 } 4690 }
4691 inode_tree_del(inode);
4686 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 4692 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
4687 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 4693 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4688} 4694}
@@ -4786,8 +4792,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4786 * and the replacement file is large. Start IO on it now so 4792 * and the replacement file is large. Start IO on it now so
4787 * we don't add too much work to the end of the transaction 4793 * we don't add too much work to the end of the transaction
4788 */ 4794 */
4789 if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && 4795 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
4790 new_inode->i_size &&
4791 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 4796 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
4792 filemap_flush(old_inode->i_mapping); 4797 filemap_flush(old_inode->i_mapping);
4793 4798
@@ -4972,7 +4977,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4972 inode->i_op = &btrfs_file_inode_operations; 4977 inode->i_op = &btrfs_file_inode_operations;
4973 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4978 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4974 } 4979 }
4975 dir->i_sb->s_dirt = 1;
4976 btrfs_update_inode_block_group(trans, inode); 4980 btrfs_update_inode_block_group(trans, inode);
4977 btrfs_update_inode_block_group(trans, dir); 4981 btrfs_update_inode_block_group(trans, dir);
4978 if (drop_inode) 4982 if (drop_inode)
@@ -5061,7 +5065,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5061out: 5065out:
5062 if (cur_offset > start) { 5066 if (cur_offset > start) {
5063 inode->i_ctime = CURRENT_TIME; 5067 inode->i_ctime = CURRENT_TIME;
5064 btrfs_set_flag(inode, PREALLOC); 5068 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
5065 if (!(mode & FALLOC_FL_KEEP_SIZE) && 5069 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
5066 cur_offset > i_size_read(inode)) 5070 cur_offset > i_size_read(inode))
5067 btrfs_i_size_write(inode, cur_offset); 5071 btrfs_i_size_write(inode, cur_offset);
@@ -5084,6 +5088,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5084 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 5088 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
5085 struct extent_map *em; 5089 struct extent_map *em;
5086 struct btrfs_trans_handle *trans; 5090 struct btrfs_trans_handle *trans;
5091 struct btrfs_root *root;
5087 int ret; 5092 int ret;
5088 5093
5089 alloc_start = offset & ~mask; 5094 alloc_start = offset & ~mask;
@@ -5102,6 +5107,13 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5102 goto out; 5107 goto out;
5103 } 5108 }
5104 5109
5110 root = BTRFS_I(inode)->root;
5111
5112 ret = btrfs_check_data_free_space(root, inode,
5113 alloc_end - alloc_start);
5114 if (ret)
5115 goto out;
5116
5105 locked_end = alloc_end - 1; 5117 locked_end = alloc_end - 1;
5106 while (1) { 5118 while (1) {
5107 struct btrfs_ordered_extent *ordered; 5119 struct btrfs_ordered_extent *ordered;
@@ -5109,7 +5121,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5109 trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); 5121 trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
5110 if (!trans) { 5122 if (!trans) {
5111 ret = -EIO; 5123 ret = -EIO;
5112 goto out; 5124 goto out_free;
5113 } 5125 }
5114 5126
5115 /* the extent lock is ordered inside the running 5127 /* the extent lock is ordered inside the running
@@ -5170,6 +5182,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5170 GFP_NOFS); 5182 GFP_NOFS);
5171 5183
5172 btrfs_end_transaction(trans, BTRFS_I(inode)->root); 5184 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
5185out_free:
5186 btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start);
5173out: 5187out:
5174 mutex_unlock(&inode->i_mutex); 5188 mutex_unlock(&inode->i_mutex);
5175 return ret; 5189 return ret;
@@ -5182,7 +5196,7 @@ static int btrfs_set_page_dirty(struct page *page)
5182 5196
5183static int btrfs_permission(struct inode *inode, int mask) 5197static int btrfs_permission(struct inode *inode, int mask)
5184{ 5198{
5185 if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) 5199 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
5186 return -EACCES; 5200 return -EACCES;
5187 return generic_permission(inode, mask, btrfs_check_acl); 5201 return generic_permission(inode, mask, btrfs_check_acl);
5188} 5202}
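inode_tree_add()/inode_tree_del() maintain a new per-root red-black tree of in-memory inodes keyed by inode number. A hedged sketch of the matching lookup; inode_tree_search is a hypothetical helper mirroring the insert logic, and a real caller would also hold root->inode_lock and igrab() the result:

	static struct inode *inode_tree_search(struct btrfs_root *root, u64 ino)
	{
		struct rb_node *n = root->inode_tree.rb_node;
		struct btrfs_inode *entry;

		while (n) {
			entry = rb_entry(n, struct btrfs_inode, rb_node);

			if (ino < entry->vfs_inode.i_ino)
				n = n->rb_left;
			else if (ino > entry->vfs_inode.i_ino)
				n = n->rb_right;
			else
				return &entry->vfs_inode;
		}
		return NULL;
	}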
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2624b53ea783..bd88f25889f7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -27,7 +27,6 @@
27#include <linux/time.h> 27#include <linux/time.h>
28#include <linux/init.h> 28#include <linux/init.h>
29#include <linux/string.h> 29#include <linux/string.h>
30#include <linux/smp_lock.h>
31#include <linux/backing-dev.h> 30#include <linux/backing-dev.h>
32#include <linux/mount.h> 31#include <linux/mount.h>
33#include <linux/mpage.h> 32#include <linux/mpage.h>
@@ -50,7 +49,177 @@
50#include "volumes.h" 49#include "volumes.h"
51#include "locking.h" 50#include "locking.h"
52 51
52/* Mask out flags that are inappropriate for the given type of inode. */
53static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
54{
55 if (S_ISDIR(mode))
56 return flags;
57 else if (S_ISREG(mode))
58 return flags & ~FS_DIRSYNC_FL;
59 else
60 return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
61}
62
63/*
64 * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
65 */
66static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
67{
68 unsigned int iflags = 0;
69
70 if (flags & BTRFS_INODE_SYNC)
71 iflags |= FS_SYNC_FL;
72 if (flags & BTRFS_INODE_IMMUTABLE)
73 iflags |= FS_IMMUTABLE_FL;
74 if (flags & BTRFS_INODE_APPEND)
75 iflags |= FS_APPEND_FL;
76 if (flags & BTRFS_INODE_NODUMP)
77 iflags |= FS_NODUMP_FL;
78 if (flags & BTRFS_INODE_NOATIME)
79 iflags |= FS_NOATIME_FL;
80 if (flags & BTRFS_INODE_DIRSYNC)
81 iflags |= FS_DIRSYNC_FL;
82
83 return iflags;
84}
85
86/*
87 * Update inode->i_flags based on the btrfs internal flags.
88 */
89void btrfs_update_iflags(struct inode *inode)
90{
91 struct btrfs_inode *ip = BTRFS_I(inode);
92
93 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
94
95 if (ip->flags & BTRFS_INODE_SYNC)
96 inode->i_flags |= S_SYNC;
97 if (ip->flags & BTRFS_INODE_IMMUTABLE)
98 inode->i_flags |= S_IMMUTABLE;
99 if (ip->flags & BTRFS_INODE_APPEND)
100 inode->i_flags |= S_APPEND;
101 if (ip->flags & BTRFS_INODE_NOATIME)
102 inode->i_flags |= S_NOATIME;
103 if (ip->flags & BTRFS_INODE_DIRSYNC)
104 inode->i_flags |= S_DIRSYNC;
105}
106
107/*
108 * Inherit flags from the parent inode.
109 *
 110 * Unlike extN, there are currently no flags we don't want to inherit.
111 */
112void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
113{
114 unsigned int flags;
115
116 if (!dir)
117 return;
118
119 flags = BTRFS_I(dir)->flags;
120
121 if (S_ISREG(inode->i_mode))
122 flags &= ~BTRFS_INODE_DIRSYNC;
123 else if (!S_ISDIR(inode->i_mode))
124 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME);
125
126 BTRFS_I(inode)->flags = flags;
127 btrfs_update_iflags(inode);
128}
129
130static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
131{
132 struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode);
133 unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
134
135 if (copy_to_user(arg, &flags, sizeof(flags)))
136 return -EFAULT;
137 return 0;
138}
139
140static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
141{
142 struct inode *inode = file->f_path.dentry->d_inode;
143 struct btrfs_inode *ip = BTRFS_I(inode);
144 struct btrfs_root *root = ip->root;
145 struct btrfs_trans_handle *trans;
146 unsigned int flags, oldflags;
147 int ret;
148
149 if (copy_from_user(&flags, arg, sizeof(flags)))
150 return -EFAULT;
151
152 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
153 FS_NOATIME_FL | FS_NODUMP_FL | \
154 FS_SYNC_FL | FS_DIRSYNC_FL))
155 return -EOPNOTSUPP;
53 156
157 if (!is_owner_or_cap(inode))
158 return -EACCES;
159
160 mutex_lock(&inode->i_mutex);
161
162 flags = btrfs_mask_flags(inode->i_mode, flags);
163 oldflags = btrfs_flags_to_ioctl(ip->flags);
164 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
165 if (!capable(CAP_LINUX_IMMUTABLE)) {
166 ret = -EPERM;
167 goto out_unlock;
168 }
169 }
170
171 ret = mnt_want_write(file->f_path.mnt);
172 if (ret)
173 goto out_unlock;
174
175 if (flags & FS_SYNC_FL)
176 ip->flags |= BTRFS_INODE_SYNC;
177 else
178 ip->flags &= ~BTRFS_INODE_SYNC;
179 if (flags & FS_IMMUTABLE_FL)
180 ip->flags |= BTRFS_INODE_IMMUTABLE;
181 else
182 ip->flags &= ~BTRFS_INODE_IMMUTABLE;
183 if (flags & FS_APPEND_FL)
184 ip->flags |= BTRFS_INODE_APPEND;
185 else
186 ip->flags &= ~BTRFS_INODE_APPEND;
187 if (flags & FS_NODUMP_FL)
188 ip->flags |= BTRFS_INODE_NODUMP;
189 else
190 ip->flags &= ~BTRFS_INODE_NODUMP;
191 if (flags & FS_NOATIME_FL)
192 ip->flags |= BTRFS_INODE_NOATIME;
193 else
194 ip->flags &= ~BTRFS_INODE_NOATIME;
195 if (flags & FS_DIRSYNC_FL)
196 ip->flags |= BTRFS_INODE_DIRSYNC;
197 else
198 ip->flags &= ~BTRFS_INODE_DIRSYNC;
199
200
201 trans = btrfs_join_transaction(root, 1);
202 BUG_ON(!trans);
203
204 ret = btrfs_update_inode(trans, root, inode);
205 BUG_ON(ret);
206
207 btrfs_update_iflags(inode);
208 inode->i_ctime = CURRENT_TIME;
209 btrfs_end_transaction(trans, root);
210
211 mnt_drop_write(file->f_path.mnt);
212 out_unlock:
213 mutex_unlock(&inode->i_mutex);
 214 return ret;
215}
216
217static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
218{
219 struct inode *inode = file->f_path.dentry->d_inode;
220
221 return put_user(inode->i_generation, arg);
222}
54 223
55static noinline int create_subvol(struct btrfs_root *root, 224static noinline int create_subvol(struct btrfs_root *root,
56 struct dentry *dentry, 225 struct dentry *dentry,
@@ -82,22 +251,25 @@ static noinline int create_subvol(struct btrfs_root *root,
82 if (ret) 251 if (ret)
83 goto fail; 252 goto fail;
84 253
85 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 254 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
86 objectid, trans->transid, 0, 0, 0); 255 0, objectid, NULL, 0, 0, 0);
87 if (IS_ERR(leaf)) { 256 if (IS_ERR(leaf)) {
88 ret = PTR_ERR(leaf); 257 ret = PTR_ERR(leaf);
89 goto fail; 258 goto fail;
90 } 259 }
91 260
92 btrfs_set_header_nritems(leaf, 0); 261 memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
93 btrfs_set_header_level(leaf, 0);
94 btrfs_set_header_bytenr(leaf, leaf->start); 262 btrfs_set_header_bytenr(leaf, leaf->start);
95 btrfs_set_header_generation(leaf, trans->transid); 263 btrfs_set_header_generation(leaf, trans->transid);
264 btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
96 btrfs_set_header_owner(leaf, objectid); 265 btrfs_set_header_owner(leaf, objectid);
97 266
98 write_extent_buffer(leaf, root->fs_info->fsid, 267 write_extent_buffer(leaf, root->fs_info->fsid,
99 (unsigned long)btrfs_header_fsid(leaf), 268 (unsigned long)btrfs_header_fsid(leaf),
100 BTRFS_FSID_SIZE); 269 BTRFS_FSID_SIZE);
270 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
271 (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
272 BTRFS_UUID_SIZE);
101 btrfs_mark_buffer_dirty(leaf); 273 btrfs_mark_buffer_dirty(leaf);
102 274
103 inode_item = &root_item.inode; 275 inode_item = &root_item.inode;
@@ -125,7 +297,7 @@ static noinline int create_subvol(struct btrfs_root *root,
125 btrfs_set_root_dirid(&root_item, new_dirid); 297 btrfs_set_root_dirid(&root_item, new_dirid);
126 298
127 key.objectid = objectid; 299 key.objectid = objectid;
128 key.offset = 1; 300 key.offset = 0;
129 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 301 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
130 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 302 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
131 &root_item); 303 &root_item);
@@ -855,7 +1027,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
855 struct btrfs_file_extent_item); 1027 struct btrfs_file_extent_item);
856 comp = btrfs_file_extent_compression(leaf, extent); 1028 comp = btrfs_file_extent_compression(leaf, extent);
857 type = btrfs_file_extent_type(leaf, extent); 1029 type = btrfs_file_extent_type(leaf, extent);
858 if (type == BTRFS_FILE_EXTENT_REG) { 1030 if (type == BTRFS_FILE_EXTENT_REG ||
1031 type == BTRFS_FILE_EXTENT_PREALLOC) {
859 disko = btrfs_file_extent_disk_bytenr(leaf, 1032 disko = btrfs_file_extent_disk_bytenr(leaf,
860 extent); 1033 extent);
861 diskl = btrfs_file_extent_disk_num_bytes(leaf, 1034 diskl = btrfs_file_extent_disk_num_bytes(leaf,
@@ -878,7 +1051,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
878 new_key.objectid = inode->i_ino; 1051 new_key.objectid = inode->i_ino;
879 new_key.offset = key.offset + destoff - off; 1052 new_key.offset = key.offset + destoff - off;
880 1053
881 if (type == BTRFS_FILE_EXTENT_REG) { 1054 if (type == BTRFS_FILE_EXTENT_REG ||
1055 type == BTRFS_FILE_EXTENT_PREALLOC) {
882 ret = btrfs_insert_empty_item(trans, root, path, 1056 ret = btrfs_insert_empty_item(trans, root, path,
883 &new_key, size); 1057 &new_key, size);
884 if (ret) 1058 if (ret)
@@ -911,10 +1085,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
911 if (disko) { 1085 if (disko) {
912 inode_add_bytes(inode, datal); 1086 inode_add_bytes(inode, datal);
913 ret = btrfs_inc_extent_ref(trans, root, 1087 ret = btrfs_inc_extent_ref(trans, root,
914 disko, diskl, leaf->start, 1088 disko, diskl, 0,
915 root->root_key.objectid, 1089 root->root_key.objectid,
916 trans->transid, 1090 inode->i_ino,
917 inode->i_ino); 1091 new_key.offset - datao);
918 BUG_ON(ret); 1092 BUG_ON(ret);
919 } 1093 }
920 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 1094 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
@@ -1074,6 +1248,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1074 void __user *argp = (void __user *)arg; 1248 void __user *argp = (void __user *)arg;
1075 1249
1076 switch (cmd) { 1250 switch (cmd) {
1251 case FS_IOC_GETFLAGS:
1252 return btrfs_ioctl_getflags(file, argp);
1253 case FS_IOC_SETFLAGS:
1254 return btrfs_ioctl_setflags(file, argp);
1255 case FS_IOC_GETVERSION:
1256 return btrfs_ioctl_getversion(file, argp);
1077 case BTRFS_IOC_SNAP_CREATE: 1257 case BTRFS_IOC_SNAP_CREATE:
1078 return btrfs_ioctl_snap_create(file, argp, 0); 1258 return btrfs_ioctl_snap_create(file, argp, 0);
1079 case BTRFS_IOC_SUBVOL_CREATE: 1259 case BTRFS_IOC_SUBVOL_CREATE:
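Wiring up FS_IOC_GETFLAGS/SETFLAGS/GETVERSION makes btrfs work with chattr(1) and lsattr(1). A small userspace check of the new plumbing (the handler copies a 32-bit flags word; error handling trimmed):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(int argc, char **argv)
	{
		unsigned int flags = 0;
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
			perror(argv[1]);
			return 1;
		}
		flags |= FS_NOATIME_FL;	/* one of the flags btrfs now accepts */
		if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0) {
			perror("FS_IOC_SETFLAGS");
			return 1;
		}
		close(fd);
		return 0;
	}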
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 5f8f218c1005..0d126be22b63 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -45,22 +45,132 @@ static void print_dev_item(struct extent_buffer *eb,
45 (unsigned long long)btrfs_device_total_bytes(eb, dev_item), 45 (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
46 (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); 46 (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
47} 47}
48static void print_extent_data_ref(struct extent_buffer *eb,
49 struct btrfs_extent_data_ref *ref)
50{
51 printk(KERN_INFO "\t\textent data backref root %llu "
52 "objectid %llu offset %llu count %u\n",
53 (unsigned long long)btrfs_extent_data_ref_root(eb, ref),
54 (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref),
55 (unsigned long long)btrfs_extent_data_ref_offset(eb, ref),
56 btrfs_extent_data_ref_count(eb, ref));
57}
58
59static void print_extent_item(struct extent_buffer *eb, int slot)
60{
61 struct btrfs_extent_item *ei;
62 struct btrfs_extent_inline_ref *iref;
63 struct btrfs_extent_data_ref *dref;
64 struct btrfs_shared_data_ref *sref;
65 struct btrfs_disk_key key;
66 unsigned long end;
67 unsigned long ptr;
68 int type;
69 u32 item_size = btrfs_item_size_nr(eb, slot);
70 u64 flags;
71 u64 offset;
72
73 if (item_size < sizeof(*ei)) {
74#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
75 struct btrfs_extent_item_v0 *ei0;
76 BUG_ON(item_size != sizeof(*ei0));
77 ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0);
78 printk(KERN_INFO "\t\textent refs %u\n",
79 btrfs_extent_refs_v0(eb, ei0));
80 return;
81#else
82 BUG();
83#endif
84 }
85
86 ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
87 flags = btrfs_extent_flags(eb, ei);
88
89 printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n",
90 (unsigned long long)btrfs_extent_refs(eb, ei),
91 (unsigned long long)btrfs_extent_generation(eb, ei),
92 (unsigned long long)flags);
93
94 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
95 struct btrfs_tree_block_info *info;
96 info = (struct btrfs_tree_block_info *)(ei + 1);
97 btrfs_tree_block_key(eb, info, &key);
98 printk(KERN_INFO "\t\ttree block key (%llu %x %llu) "
99 "level %d\n",
100 (unsigned long long)btrfs_disk_key_objectid(&key),
101 key.type,
102 (unsigned long long)btrfs_disk_key_offset(&key),
103 btrfs_tree_block_level(eb, info));
104 iref = (struct btrfs_extent_inline_ref *)(info + 1);
105 } else {
106 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
107 }
108
109 ptr = (unsigned long)iref;
110 end = (unsigned long)ei + item_size;
111 while (ptr < end) {
112 iref = (struct btrfs_extent_inline_ref *)ptr;
113 type = btrfs_extent_inline_ref_type(eb, iref);
114 offset = btrfs_extent_inline_ref_offset(eb, iref);
115 switch (type) {
116 case BTRFS_TREE_BLOCK_REF_KEY:
117 printk(KERN_INFO "\t\ttree block backref "
118 "root %llu\n", (unsigned long long)offset);
119 break;
120 case BTRFS_SHARED_BLOCK_REF_KEY:
121 printk(KERN_INFO "\t\tshared block backref "
122 "parent %llu\n", (unsigned long long)offset);
123 break;
124 case BTRFS_EXTENT_DATA_REF_KEY:
125 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
126 print_extent_data_ref(eb, dref);
127 break;
128 case BTRFS_SHARED_DATA_REF_KEY:
129 sref = (struct btrfs_shared_data_ref *)(iref + 1);
130 printk(KERN_INFO "\t\tshared data backref "
131 "parent %llu count %u\n",
132 (unsigned long long)offset,
133 btrfs_shared_data_ref_count(eb, sref));
134 break;
135 default:
136 BUG();
137 }
138 ptr += btrfs_extent_inline_ref_size(type);
139 }
140 WARN_ON(ptr > end);
141}
142
143#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
144static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
145{
146 struct btrfs_extent_ref_v0 *ref0;
147
148 ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0);
149 printk("\t\textent back ref root %llu gen %llu "
150 "owner %llu num_refs %lu\n",
151 (unsigned long long)btrfs_ref_root_v0(eb, ref0),
152 (unsigned long long)btrfs_ref_generation_v0(eb, ref0),
153 (unsigned long long)btrfs_ref_objectid_v0(eb, ref0),
154 (unsigned long)btrfs_ref_count_v0(eb, ref0));
155}
156#endif
157
48void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) 158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
49{ 159{
50 int i; 160 int i;
161 u32 type;
51 u32 nr = btrfs_header_nritems(l); 162 u32 nr = btrfs_header_nritems(l);
52 struct btrfs_item *item; 163 struct btrfs_item *item;
53 struct btrfs_extent_item *ei;
54 struct btrfs_root_item *ri; 164 struct btrfs_root_item *ri;
55 struct btrfs_dir_item *di; 165 struct btrfs_dir_item *di;
56 struct btrfs_inode_item *ii; 166 struct btrfs_inode_item *ii;
57 struct btrfs_block_group_item *bi; 167 struct btrfs_block_group_item *bi;
58 struct btrfs_file_extent_item *fi; 168 struct btrfs_file_extent_item *fi;
169 struct btrfs_extent_data_ref *dref;
170 struct btrfs_shared_data_ref *sref;
171 struct btrfs_dev_extent *dev_extent;
59 struct btrfs_key key; 172 struct btrfs_key key;
60 struct btrfs_key found_key; 173 struct btrfs_key found_key;
61 struct btrfs_extent_ref *ref;
62 struct btrfs_dev_extent *dev_extent;
63 u32 type;
64 174
65 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 175 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
66 (unsigned long long)btrfs_header_bytenr(l), nr, 176 (unsigned long long)btrfs_header_bytenr(l), nr,
@@ -100,20 +210,25 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
100 btrfs_disk_root_refs(l, ri)); 210 btrfs_disk_root_refs(l, ri));
101 break; 211 break;
102 case BTRFS_EXTENT_ITEM_KEY: 212 case BTRFS_EXTENT_ITEM_KEY:
103 ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); 213 print_extent_item(l, i);
104 printk(KERN_INFO "\t\textent data refs %u\n", 214 break;
105 btrfs_extent_refs(l, ei)); 215 case BTRFS_TREE_BLOCK_REF_KEY:
106 break; 216 printk(KERN_INFO "\t\ttree block backref\n");
107 case BTRFS_EXTENT_REF_KEY: 217 break;
108 ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref); 218 case BTRFS_SHARED_BLOCK_REF_KEY:
109 printk(KERN_INFO "\t\textent back ref root %llu " 219 printk(KERN_INFO "\t\tshared block backref\n");
110 "gen %llu owner %llu num_refs %lu\n", 220 break;
111 (unsigned long long)btrfs_ref_root(l, ref), 221 case BTRFS_EXTENT_DATA_REF_KEY:
112 (unsigned long long)btrfs_ref_generation(l, ref), 222 dref = btrfs_item_ptr(l, i,
113 (unsigned long long)btrfs_ref_objectid(l, ref), 223 struct btrfs_extent_data_ref);
114 (unsigned long)btrfs_ref_num_refs(l, ref)); 224 print_extent_data_ref(l, dref);
225 break;
226 case BTRFS_SHARED_DATA_REF_KEY:
227 sref = btrfs_item_ptr(l, i,
228 struct btrfs_shared_data_ref);
229 printk(KERN_INFO "\t\tshared data backref count %u\n",
230 btrfs_shared_data_ref_count(l, sref));
115 break; 231 break;
116
117 case BTRFS_EXTENT_DATA_KEY: 232 case BTRFS_EXTENT_DATA_KEY:
118 fi = btrfs_item_ptr(l, i, 233 fi = btrfs_item_ptr(l, i,
119 struct btrfs_file_extent_item); 234 struct btrfs_file_extent_item);
@@ -139,6 +254,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
139 (unsigned long long) 254 (unsigned long long)
140 btrfs_file_extent_ram_bytes(l, fi)); 255 btrfs_file_extent_ram_bytes(l, fi));
141 break; 256 break;
257 case BTRFS_EXTENT_REF_V0_KEY:
258#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
259 print_extent_ref_v0(l, i);
260#else
261 BUG();
262#endif
142 case BTRFS_BLOCK_GROUP_ITEM_KEY: 263 case BTRFS_BLOCK_GROUP_ITEM_KEY:
143 bi = btrfs_item_ptr(l, i, 264 bi = btrfs_item_ptr(l, i,
144 struct btrfs_block_group_item); 265 struct btrfs_block_group_item);
@@ -188,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
188 } 309 }
189 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", 310 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
190 (unsigned long long)btrfs_header_bytenr(c), 311 (unsigned long long)btrfs_header_bytenr(c),
191 btrfs_header_level(c), nr, 312 level, nr,
192 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); 313 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
193 for (i = 0; i < nr; i++) { 314 for (i = 0; i < nr; i++) {
194 btrfs_node_key_to_cpu(c, &key, i); 315 btrfs_node_key_to_cpu(c, &key, i);
@@ -205,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
205 btrfs_level_size(root, level - 1), 326 btrfs_level_size(root, level - 1),
206 btrfs_node_ptr_generation(c, i)); 327 btrfs_node_ptr_generation(c, i));
207 if (btrfs_is_leaf(next) && 328 if (btrfs_is_leaf(next) &&
208 btrfs_header_level(c) != 1) 329 level != 1)
209 BUG(); 330 BUG();
210 if (btrfs_header_level(next) != 331 if (btrfs_header_level(next) !=
211 btrfs_header_level(c) - 1) 332 level - 1)
212 BUG(); 333 BUG();
213 btrfs_print_tree(root, next); 334 btrfs_print_tree(root, next);
214 free_extent_buffer(next); 335 free_extent_buffer(next);
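print_extent_item() steps through the inline references by adding the per-type size to ptr. A sketch of that calculation, consistent with the casts above (&iref->offset for a data ref, iref + 1 for a shared data ref); the real helper lives in ctree.h of this series:

	static u32 inline_ref_size(int type)
	{
		if (type == BTRFS_TREE_BLOCK_REF_KEY ||
		    type == BTRFS_SHARED_BLOCK_REF_KEY)
			return sizeof(struct btrfs_extent_inline_ref);
		if (type == BTRFS_SHARED_DATA_REF_KEY)
			return sizeof(struct btrfs_extent_inline_ref) +
			       sizeof(struct btrfs_shared_data_ref);
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			/* the data ref overlays the inline ref's offset field */
			return offsetof(struct btrfs_extent_inline_ref, offset) +
			       sizeof(struct btrfs_extent_data_ref);
		return 0;	/* unknown type; the printer BUG()s */
	}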
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
new file mode 100644
index 000000000000..c04f7f212602
--- /dev/null
+++ b/fs/btrfs/relocation.c
@@ -0,0 +1,3716 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include "ctree.h"
25#include "disk-io.h"
26#include "transaction.h"
27#include "volumes.h"
28#include "locking.h"
29#include "btrfs_inode.h"
30#include "async-thread.h"
31
32/*
33 * backref_node, mapping_node and tree_block start with this
34 */
35struct tree_entry {
36 struct rb_node rb_node;
37 u64 bytenr;
38};
39
40/*
 41 * represent a tree block in the backref cache
42 */
43struct backref_node {
44 struct rb_node rb_node;
45 u64 bytenr;
 46 /* objectid of the tree block owner */
47 u64 owner;
 48 /* list of upper level blocks that reference this block */
49 struct list_head upper;
50 /* list of child blocks in the cache */
51 struct list_head lower;
 52 /* NULL if this node is not a tree root */
53 struct btrfs_root *root;
 54 /* extent buffer obtained by COWing the block */
55 struct extent_buffer *eb;
56 /* level of tree block */
57 unsigned int level:8;
 58 /* 1 if the block is the root of an old snapshot */
59 unsigned int old_root:1;
60 /* 1 if no child blocks in the cache */
61 unsigned int lowest:1;
62 /* is the extent buffer locked */
63 unsigned int locked:1;
64 /* has the block been processed */
65 unsigned int processed:1;
66 /* have backrefs of this block been checked */
67 unsigned int checked:1;
68};
69
70/*
 71 * represent a block pointer in the backref cache
72 */
73struct backref_edge {
74 struct list_head list[2];
75 struct backref_node *node[2];
76 u64 blockptr;
77};
78
79#define LOWER 0
80#define UPPER 1
81
82struct backref_cache {
83 /* red black tree of all backref nodes in the cache */
84 struct rb_root rb_root;
85 /* list of backref nodes with no child block in the cache */
86 struct list_head pending[BTRFS_MAX_LEVEL];
87 spinlock_t lock;
88};
89
90/*
 91 * map the address of a tree root to its tree
92 */
93struct mapping_node {
94 struct rb_node rb_node;
95 u64 bytenr;
96 void *data;
97};
98
99struct mapping_tree {
100 struct rb_root rb_root;
101 spinlock_t lock;
102};
103
104/*
 105 * represent a tree block to process
106 */
107struct tree_block {
108 struct rb_node rb_node;
109 u64 bytenr;
110 struct btrfs_key key;
111 unsigned int level:8;
112 unsigned int key_ready:1;
113};
114
115/* inode vector */
116#define INODEVEC_SIZE 16
117
118struct inodevec {
119 struct list_head list;
120 struct inode *inode[INODEVEC_SIZE];
121 int nr;
122};
123
124struct reloc_control {
125 /* block group to relocate */
126 struct btrfs_block_group_cache *block_group;
127 /* extent tree */
128 struct btrfs_root *extent_root;
129 /* inode for moving data */
130 struct inode *data_inode;
131 struct btrfs_workers workers;
 132 /* tree blocks that have been processed */
133 struct extent_io_tree processed_blocks;
134 /* map start of tree root to corresponding reloc tree */
135 struct mapping_tree reloc_root_tree;
136 /* list of reloc trees */
137 struct list_head reloc_roots;
138 u64 search_start;
139 u64 extents_found;
140 u64 extents_skipped;
141 int stage;
142 int create_reloc_root;
143 unsigned int found_file_extent:1;
144 unsigned int found_old_snapshot:1;
145};
146
147/* stages of data relocation */
148#define MOVE_DATA_EXTENTS 0
149#define UPDATE_DATA_PTRS 1
150
151/*
152 * merge the reloc tree into the corresponding fs tree in worker threads
153 */
154struct async_merge {
155 struct btrfs_work work;
156 struct reloc_control *rc;
157 struct btrfs_root *root;
158 struct completion *done;
159 atomic_t *num_pending;
160};
161
162static void mapping_tree_init(struct mapping_tree *tree)
163{
164 tree->rb_root.rb_node = NULL;
165 spin_lock_init(&tree->lock);
166}
167
168static void backref_cache_init(struct backref_cache *cache)
169{
170 int i;
171 cache->rb_root.rb_node = NULL;
172 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
173 INIT_LIST_HEAD(&cache->pending[i]);
174 spin_lock_init(&cache->lock);
175}
176
177static void backref_node_init(struct backref_node *node)
178{
179 memset(node, 0, sizeof(*node));
180 INIT_LIST_HEAD(&node->upper);
181 INIT_LIST_HEAD(&node->lower);
182 RB_CLEAR_NODE(&node->rb_node);
183}
184
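/*
 * insert a node keyed by bytenr. returns the existing node if the
 * bytenr is already in the tree, NULL on successful insertion.
 */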
185static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
186 struct rb_node *node)
187{
188 struct rb_node **p = &root->rb_node;
189 struct rb_node *parent = NULL;
190 struct tree_entry *entry;
191
192 while (*p) {
193 parent = *p;
194 entry = rb_entry(parent, struct tree_entry, rb_node);
195
196 if (bytenr < entry->bytenr)
197 p = &(*p)->rb_left;
198 else if (bytenr > entry->bytenr)
199 p = &(*p)->rb_right;
200 else
201 return parent;
202 }
203
204 rb_link_node(node, parent, p);
205 rb_insert_color(node, root);
206 return NULL;
207}
208
209static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
210{
211 struct rb_node *n = root->rb_node;
212 struct tree_entry *entry;
213
214 while (n) {
215 entry = rb_entry(n, struct tree_entry, rb_node);
216
217 if (bytenr < entry->bytenr)
218 n = n->rb_left;
219 else if (bytenr > entry->bytenr)
220 n = n->rb_right;
221 else
222 return n;
223 }
224 return NULL;
225}
226
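/*
 * the walk_up/walk_down pair below implement a depth-first traversal
 * of all reference paths: edges[0..*index-1] is the stack of edges
 * from the starting node up to the current position. walk_up pushes
 * edges until a root is reached, walk_down backtracks to the next
 * untried sibling edge.
 */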
227/*
228 * walk up backref nodes until we reach the node that represents the tree root
229 */
230static struct backref_node *walk_up_backref(struct backref_node *node,
231 struct backref_edge *edges[],
232 int *index)
233{
234 struct backref_edge *edge;
235 int idx = *index;
236
237 while (!list_empty(&node->upper)) {
238 edge = list_entry(node->upper.next,
239 struct backref_edge, list[LOWER]);
240 edges[idx++] = edge;
241 node = edge->node[UPPER];
242 }
243 *index = idx;
244 return node;
245}
246
247/*
248 * walk down backref nodes to find the start of the next reference path
249 */
250static struct backref_node *walk_down_backref(struct backref_edge *edges[],
251 int *index)
252{
253 struct backref_edge *edge;
254 struct backref_node *lower;
255 int idx = *index;
256
257 while (idx > 0) {
258 edge = edges[idx - 1];
259 lower = edge->node[LOWER];
260 if (list_is_last(&edge->list[LOWER], &lower->upper)) {
261 idx--;
262 continue;
263 }
264 edge = list_entry(edge->list[LOWER].next,
265 struct backref_edge, list[LOWER]);
266 edges[idx - 1] = edge;
267 *index = idx;
268 return edge->node[UPPER];
269 }
270 *index = 0;
271 return NULL;
272}
273
274static void drop_node_buffer(struct backref_node *node)
275{
276 if (node->eb) {
277 if (node->locked) {
278 btrfs_tree_unlock(node->eb);
279 node->locked = 0;
280 }
281 free_extent_buffer(node->eb);
282 node->eb = NULL;
283 }
284}
285
286static void drop_backref_node(struct backref_cache *tree,
287 struct backref_node *node)
288{
289 BUG_ON(!node->lowest);
290 BUG_ON(!list_empty(&node->upper));
291
292 drop_node_buffer(node);
293 list_del(&node->lower);
294
295 rb_erase(&node->rb_node, &tree->rb_root);
296 kfree(node);
297}
298
299/*
300 * remove a backref node from the backref cache
301 */
302static void remove_backref_node(struct backref_cache *cache,
303 struct backref_node *node)
304{
305 struct backref_node *upper;
306 struct backref_edge *edge;
307
308 if (!node)
309 return;
310
311 BUG_ON(!node->lowest);
312 while (!list_empty(&node->upper)) {
313 edge = list_entry(node->upper.next, struct backref_edge,
314 list[LOWER]);
315 upper = edge->node[UPPER];
316 list_del(&edge->list[LOWER]);
317 list_del(&edge->list[UPPER]);
318 kfree(edge);
319 /*
320 * add the node to the pending list if no other
321 * child block is cached.
322 */
323 if (list_empty(&upper->lower)) {
324 list_add_tail(&upper->lower,
325 &cache->pending[upper->level]);
326 upper->lowest = 1;
327 }
328 }
329 drop_backref_node(cache, node);
330}
331
332/*
333 * find reloc tree by address of tree root
334 */
335static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
336 u64 bytenr)
337{
338 struct rb_node *rb_node;
339 struct mapping_node *node;
340 struct btrfs_root *root = NULL;
341
342 spin_lock(&rc->reloc_root_tree.lock);
343 rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
344 if (rb_node) {
345 node = rb_entry(rb_node, struct mapping_node, rb_node);
346 root = (struct btrfs_root *)node->data;
347 }
348 spin_unlock(&rc->reloc_root_tree.lock);
349 return root;
350}
351
352static int is_cowonly_root(u64 root_objectid)
353{
354 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
355 root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
356 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
357 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
358 root_objectid == BTRFS_TREE_LOG_OBJECTID ||
359 root_objectid == BTRFS_CSUM_TREE_OBJECTID)
360 return 1;
361 return 0;
362}
363
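/*
 * read a tree root. cow-only roots have a single root item with
 * offset 0, while reference counted (subvolume) roots are looked up
 * with offset (u64)-1 to find the latest root item.
 */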
364static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
365 u64 root_objectid)
366{
367 struct btrfs_key key;
368
369 key.objectid = root_objectid;
370 key.type = BTRFS_ROOT_ITEM_KEY;
371 if (is_cowonly_root(root_objectid))
372 key.offset = 0;
373 else
374 key.offset = (u64)-1;
375
376 return btrfs_read_fs_root_no_name(fs_info, &key);
377}
378
379#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
380static noinline_for_stack
381struct btrfs_root *find_tree_root(struct reloc_control *rc,
382 struct extent_buffer *leaf,
383 struct btrfs_extent_ref_v0 *ref0)
384{
385 struct btrfs_root *root;
386 u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
387 u64 generation = btrfs_ref_generation_v0(leaf, ref0);
388
389 BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
390
391 root = read_fs_root(rc->extent_root->fs_info, root_objectid);
392 BUG_ON(IS_ERR(root));
393
394 if (root->ref_cows &&
395 generation != btrfs_root_generation(&root->root_item))
396 return NULL;
397
398 return root;
399}
400#endif
401
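/*
 * locate the inline backrefs of a tree block's extent item.
 * returns 0 with *ptr/*end delimiting the inline ref area, or 1 if
 * all backrefs are stored as separate items (or the item is in the
 * v0 format).
 */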
402static noinline_for_stack
403int find_inline_backref(struct extent_buffer *leaf, int slot,
404 unsigned long *ptr, unsigned long *end)
405{
406 struct btrfs_extent_item *ei;
407 struct btrfs_tree_block_info *bi;
408 u32 item_size;
409
410 item_size = btrfs_item_size_nr(leaf, slot);
411#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
412 if (item_size < sizeof(*ei)) {
413 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
414 return 1;
415 }
416#endif
417 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
418 WARN_ON(!(btrfs_extent_flags(leaf, ei) &
419 BTRFS_EXTENT_FLAG_TREE_BLOCK));
420
421 if (item_size <= sizeof(*ei) + sizeof(*bi)) {
422 WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
423 return 1;
424 }
425
426 bi = (struct btrfs_tree_block_info *)(ei + 1);
427 *ptr = (unsigned long)(bi + 1);
428 *end = (unsigned long)ei + item_size;
429 return 0;
430}
431
432/*
433 * build backref tree for a given tree block. root of the backref tree
434 * corresponds to the tree block, leaves of the backref tree
435 * correspond to roots of b-trees that reference the tree block.
436 *
437 * the basic idea of this function is to check backrefs of a given block
438 * to find upper level blocks that reference the block, and then check
439 * backrefs of these upper level blocks recursively. the recursion stops
440 * when the tree root is reached or backrefs for the block are cached.
441 *
442 * NOTE: if we find backrefs for a block are cached, we know backrefs
443 * for all upper level blocks that directly/indirectly reference the
444 * block are also cached.
445 */
446static struct backref_node *build_backref_tree(struct reloc_control *rc,
447 struct backref_cache *cache,
448 struct btrfs_key *node_key,
449 int level, u64 bytenr)
450{
451 struct btrfs_path *path1;
452 struct btrfs_path *path2;
453 struct extent_buffer *eb;
454 struct btrfs_root *root;
455 struct backref_node *cur;
456 struct backref_node *upper;
457 struct backref_node *lower;
458 struct backref_node *node = NULL;
459 struct backref_node *exist = NULL;
460 struct backref_edge *edge;
461 struct rb_node *rb_node;
462 struct btrfs_key key;
463 unsigned long end;
464 unsigned long ptr;
465 LIST_HEAD(list);
466 int ret;
467 int err = 0;
468
469 path1 = btrfs_alloc_path();
470 path2 = btrfs_alloc_path();
471 if (!path1 || !path2) {
472 err = -ENOMEM;
473 goto out;
474 }
475
476 node = kmalloc(sizeof(*node), GFP_NOFS);
477 if (!node) {
478 err = -ENOMEM;
479 goto out;
480 }
481
482 backref_node_init(node);
483 node->bytenr = bytenr;
484 node->owner = 0;
485 node->level = level;
486 node->lowest = 1;
487 cur = node;
488again:
489 end = 0;
490 ptr = 0;
491 key.objectid = cur->bytenr;
492 key.type = BTRFS_EXTENT_ITEM_KEY;
493 key.offset = (u64)-1;
494
495 path1->search_commit_root = 1;
496 path1->skip_locking = 1;
497 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
498 0, 0);
499 if (ret < 0) {
500 err = ret;
501 goto out;
502 }
503 BUG_ON(!ret || !path1->slots[0]);
504
505 path1->slots[0]--;
506
507 WARN_ON(cur->checked);
508 if (!list_empty(&cur->upper)) {
509 /*
510 * the backref was added previously when processing
511 * backref of type BTRFS_TREE_BLOCK_REF_KEY
512 */
513 BUG_ON(!list_is_singular(&cur->upper));
514 edge = list_entry(cur->upper.next, struct backref_edge,
515 list[LOWER]);
516 BUG_ON(!list_empty(&edge->list[UPPER]));
517 exist = edge->node[UPPER];
518 /*
519 * add the upper level block to the pending list if we
520 * need to check its backrefs
521 */
522 if (!exist->checked)
523 list_add_tail(&edge->list[UPPER], &list);
524 } else {
525 exist = NULL;
526 }
527
528 while (1) {
529 cond_resched();
530 eb = path1->nodes[0];
531
532 if (ptr >= end) {
533 if (path1->slots[0] >= btrfs_header_nritems(eb)) {
534 ret = btrfs_next_leaf(rc->extent_root, path1);
535 if (ret < 0) {
536 err = ret;
537 goto out;
538 }
539 if (ret > 0)
540 break;
541 eb = path1->nodes[0];
542 }
543
544 btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
545 if (key.objectid != cur->bytenr) {
546 WARN_ON(exist);
547 break;
548 }
549
550 if (key.type == BTRFS_EXTENT_ITEM_KEY) {
551 ret = find_inline_backref(eb, path1->slots[0],
552 &ptr, &end);
553 if (ret)
554 goto next;
555 }
556 }
557
558 if (ptr < end) {
559 /* update key for inline back ref */
560 struct btrfs_extent_inline_ref *iref;
561 iref = (struct btrfs_extent_inline_ref *)ptr;
562 key.type = btrfs_extent_inline_ref_type(eb, iref);
563 key.offset = btrfs_extent_inline_ref_offset(eb, iref);
564 WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
565 key.type != BTRFS_SHARED_BLOCK_REF_KEY);
566 }
567
568 if (exist &&
569 ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
570 exist->owner == key.offset) ||
571 (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
572 exist->bytenr == key.offset))) {
573 exist = NULL;
574 goto next;
575 }
576
577#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
578 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
579 key.type == BTRFS_EXTENT_REF_V0_KEY) {
580 if (key.objectid == key.offset &&
581 key.type == BTRFS_EXTENT_REF_V0_KEY) {
582 struct btrfs_extent_ref_v0 *ref0;
583 ref0 = btrfs_item_ptr(eb, path1->slots[0],
584 struct btrfs_extent_ref_v0);
585 root = find_tree_root(rc, eb, ref0);
586 if (root)
587 cur->root = root;
588 else
589 cur->old_root = 1;
590 break;
591 }
592#else
593 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
594 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
595#endif
596 if (key.objectid == key.offset) {
597 /*
598 * only root blocks of reloc trees use
599 * backrefs of this type.
600 */
601 root = find_reloc_root(rc, cur->bytenr);
602 BUG_ON(!root);
603 cur->root = root;
604 break;
605 }
606
607 edge = kzalloc(sizeof(*edge), GFP_NOFS);
608 if (!edge) {
609 err = -ENOMEM;
610 goto out;
611 }
612 rb_node = tree_search(&cache->rb_root, key.offset);
613 if (!rb_node) {
614 upper = kmalloc(sizeof(*upper), GFP_NOFS);
615 if (!upper) {
616 kfree(edge);
617 err = -ENOMEM;
618 goto out;
619 }
620 backref_node_init(upper);
621 upper->bytenr = key.offset;
622 upper->owner = 0;
623 upper->level = cur->level + 1;
624 /*
625 * backrefs for the upper level block aren't
626 * cached, add the block to the pending list
627 */
628 list_add_tail(&edge->list[UPPER], &list);
629 } else {
630 upper = rb_entry(rb_node, struct backref_node,
631 rb_node);
632 INIT_LIST_HEAD(&edge->list[UPPER]);
633 }
634 list_add(&edge->list[LOWER], &cur->upper);
635 edge->node[UPPER] = upper;
636 edge->node[LOWER] = cur;
637
638 goto next;
639 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
640 goto next;
641 }
642
643 /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
644 root = read_fs_root(rc->extent_root->fs_info, key.offset);
645 if (IS_ERR(root)) {
646 err = PTR_ERR(root);
647 goto out;
648 }
649
650 if (btrfs_root_level(&root->root_item) == cur->level) {
651 /* tree root */
652 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
653 cur->bytenr);
654 cur->root = root;
655 break;
656 }
657
658 level = cur->level + 1;
659
660 /*
661 * search the tree to find upper level blocks
662 * that reference the block.
663 */
664 path2->search_commit_root = 1;
665 path2->skip_locking = 1;
666 path2->lowest_level = level;
667 ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
668 path2->lowest_level = 0;
669 if (ret < 0) {
670 err = ret;
671 goto out;
672 }
673 if (ret > 0 && path2->slots[level] > 0)
674 path2->slots[level]--;
675
676 eb = path2->nodes[level];
677 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
678 cur->bytenr);
679
680 lower = cur;
681 for (; level < BTRFS_MAX_LEVEL; level++) {
682 if (!path2->nodes[level]) {
683 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
684 lower->bytenr);
685 lower->root = root;
686 break;
687 }
688
689 edge = kzalloc(sizeof(*edge), GFP_NOFS);
690 if (!edge) {
691 err = -ENOMEM;
692 goto out;
693 }
694
695 eb = path2->nodes[level];
696 rb_node = tree_search(&cache->rb_root, eb->start);
697 if (!rb_node) {
698 upper = kmalloc(sizeof(*upper), GFP_NOFS);
699 if (!upper) {
700 kfree(edge);
701 err = -ENOMEM;
702 goto out;
703 }
704 backref_node_init(upper);
705 upper->bytenr = eb->start;
706 upper->owner = btrfs_header_owner(eb);
707 upper->level = lower->level + 1;
708
709 /*
710 * if we know the block isn't shared
711 * we can avoid checking its backrefs.
712 */
713 if (btrfs_block_can_be_shared(root, eb))
714 upper->checked = 0;
715 else
716 upper->checked = 1;
717
718 /*
719 * add the block to the pending list if we
720 * need to check its backrefs. only blocks
721 * at 'cur->level + 1' are added to the
722 * tail of the pending list. this guarantees
723 * we check backrefs from lower level
724 * blocks to upper level blocks.
725 */
726 if (!upper->checked &&
727 level == cur->level + 1) {
728 list_add_tail(&edge->list[UPPER],
729 &list);
730 } else
731 INIT_LIST_HEAD(&edge->list[UPPER]);
732 } else {
733 upper = rb_entry(rb_node, struct backref_node,
734 rb_node);
735 BUG_ON(!upper->checked);
736 INIT_LIST_HEAD(&edge->list[UPPER]);
737 }
738 list_add_tail(&edge->list[LOWER], &lower->upper);
739 edge->node[UPPER] = upper;
740 edge->node[LOWER] = lower;
741
742 if (rb_node)
743 break;
744 lower = upper;
745 upper = NULL;
746 }
747 btrfs_release_path(root, path2);
748next:
749 if (ptr < end) {
750 ptr += btrfs_extent_inline_ref_size(key.type);
751 if (ptr >= end) {
752 WARN_ON(ptr > end);
753 ptr = 0;
754 end = 0;
755 }
756 }
757 if (ptr >= end)
758 path1->slots[0]++;
759 }
760 btrfs_release_path(rc->extent_root, path1);
761
762 cur->checked = 1;
763 WARN_ON(exist);
764
765 /* the pending list isn't empty, take the first block to process */
766 if (!list_empty(&list)) {
767 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
768 list_del_init(&edge->list[UPPER]);
769 cur = edge->node[UPPER];
770 goto again;
771 }
772
773 /*
774 * everything went well, connect the backref nodes and insert
775 * them into the cache.
776 */
777 BUG_ON(!node->checked);
778 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
779 BUG_ON(rb_node);
780
781 list_for_each_entry(edge, &node->upper, list[LOWER])
782 list_add_tail(&edge->list[UPPER], &list);
783
784 while (!list_empty(&list)) {
785 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
786 list_del_init(&edge->list[UPPER]);
787 upper = edge->node[UPPER];
788
789 if (!RB_EMPTY_NODE(&upper->rb_node)) {
790 if (upper->lowest) {
791 list_del_init(&upper->lower);
792 upper->lowest = 0;
793 }
794
795 list_add_tail(&edge->list[UPPER], &upper->lower);
796 continue;
797 }
798
799 BUG_ON(!upper->checked);
800 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
801 &upper->rb_node);
802 BUG_ON(rb_node);
803
804 list_add_tail(&edge->list[UPPER], &upper->lower);
805
806 list_for_each_entry(edge, &upper->upper, list[LOWER])
807 list_add_tail(&edge->list[UPPER], &list);
808 }
809out:
810 btrfs_free_path(path1);
811 btrfs_free_path(path2);
812 if (err) {
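/*
 * error path: walk the chain of half-built nodes that were never
 * inserted into the cache and free them together with their edges.
 */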
813 INIT_LIST_HEAD(&list);
814 upper = node;
815 while (upper) {
816 if (RB_EMPTY_NODE(&upper->rb_node)) {
817 list_splice_tail(&upper->upper, &list);
818 kfree(upper);
819 }
820
821 if (list_empty(&list))
822 break;
823
824 edge = list_entry(list.next, struct backref_edge,
825 list[LOWER]);
826 upper = edge->node[UPPER];
827 kfree(edge);
828 }
829 return ERR_PTR(err);
830 }
831 return node;
832}
833
834/*
835 * helper to add 'address of tree root -> reloc tree' mapping
836 */
837static int __add_reloc_root(struct btrfs_root *root)
838{
839 struct rb_node *rb_node;
840 struct mapping_node *node;
841 struct reloc_control *rc = root->fs_info->reloc_ctl;
842
843 node = kmalloc(sizeof(*node), GFP_NOFS);
844 BUG_ON(!node);
845
846 node->bytenr = root->node->start;
847 node->data = root;
848
849 spin_lock(&rc->reloc_root_tree.lock);
850 rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
851 node->bytenr, &node->rb_node);
852 spin_unlock(&rc->reloc_root_tree.lock);
853 BUG_ON(rb_node);
854
855 list_add_tail(&root->root_list, &rc->reloc_roots);
856 return 0;
857}
858
859/*
860 * helper to update/delete the 'address of tree root -> reloc tree'
861 * mapping
862 */
863static int __update_reloc_root(struct btrfs_root *root, int del)
864{
865 struct rb_node *rb_node;
866 struct mapping_node *node = NULL;
867 struct reloc_control *rc = root->fs_info->reloc_ctl;
868
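/*
 * the mapping is keyed by the address the root node had when the
 * mapping was last recorded, which is the current commit root.
 */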
869 spin_lock(&rc->reloc_root_tree.lock);
870 rb_node = tree_search(&rc->reloc_root_tree.rb_root,
871 root->commit_root->start);
872 if (rb_node) {
873 node = rb_entry(rb_node, struct mapping_node, rb_node);
874 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
875 }
876 spin_unlock(&rc->reloc_root_tree.lock);
877
878 BUG_ON((struct btrfs_root *)node->data != root);
879
880 if (!del) {
881 spin_lock(&rc->reloc_root_tree.lock);
882 node->bytenr = root->node->start;
883 rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
884 node->bytenr, &node->rb_node);
885 spin_unlock(&rc->reloc_root_tree.lock);
886 BUG_ON(rb_node);
887 } else {
888 list_del_init(&root->root_list);
889 kfree(node);
890 }
891 return 0;
892}
893
894/*
895 * create reloc tree for a given fs tree. reloc tree is just a
896 * snapshot of the fs tree with a special root objectid.
897 */
898int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
899 struct btrfs_root *root)
900{
901 struct btrfs_root *reloc_root;
902 struct extent_buffer *eb;
903 struct btrfs_root_item *root_item;
904 struct btrfs_key root_key;
905 int ret;
906
907 if (root->reloc_root) {
908 reloc_root = root->reloc_root;
909 reloc_root->last_trans = trans->transid;
910 return 0;
911 }
912
913 if (!root->fs_info->reloc_ctl ||
914 !root->fs_info->reloc_ctl->create_reloc_root ||
915 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
916 return 0;
917
918 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
919 BUG_ON(!root_item);
920
921 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
922 root_key.type = BTRFS_ROOT_ITEM_KEY;
923 root_key.offset = root->root_key.objectid;
924
925 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
926 BTRFS_TREE_RELOC_OBJECTID);
927 BUG_ON(ret);
928
929 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
930 memcpy(root_item, &root->root_item, sizeof(*root_item));
931 btrfs_set_root_refs(root_item, 1);
932 btrfs_set_root_bytenr(root_item, eb->start);
933 btrfs_set_root_level(root_item, btrfs_header_level(eb));
934 btrfs_set_root_generation(root_item, trans->transid);
935 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
936 root_item->drop_level = 0;
937
938 btrfs_tree_unlock(eb);
939 free_extent_buffer(eb);
940
941 ret = btrfs_insert_root(trans, root->fs_info->tree_root,
942 &root_key, root_item);
943 BUG_ON(ret);
944 kfree(root_item);
945
946 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
947 &root_key);
948 BUG_ON(IS_ERR(reloc_root));
949 reloc_root->last_trans = trans->transid;
950
951 __add_reloc_root(reloc_root);
952 root->reloc_root = reloc_root;
953 return 0;
954}
955
956/*
957 * update root item of reloc tree
958 */
959int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
960 struct btrfs_root *root)
961{
962 struct btrfs_root *reloc_root;
963 struct btrfs_root_item *root_item;
964 int del = 0;
965 int ret;
966
967 if (!root->reloc_root)
968 return 0;
969
970 reloc_root = root->reloc_root;
971 root_item = &reloc_root->root_item;
972
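/*
 * a reloc tree whose root item refs have dropped to zero is no
 * longer needed: delete its mapping instead of updating it.
 */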
973 if (btrfs_root_refs(root_item) == 0) {
974 root->reloc_root = NULL;
975 del = 1;
976 }
977
978 __update_reloc_root(reloc_root, del);
979
980 if (reloc_root->commit_root != reloc_root->node) {
981 btrfs_set_root_node(root_item, reloc_root->node);
982 free_extent_buffer(reloc_root->commit_root);
983 reloc_root->commit_root = btrfs_root_node(reloc_root);
984 }
985
986 ret = btrfs_update_root(trans, root->fs_info->tree_root,
987 &reloc_root->root_key, root_item);
988 BUG_ON(ret);
989 return 0;
990}
991
992/*
993 * helper to find the first cached inode with inode number >= objectid
994 * in a subvolume
995 */
996static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
997{
998 struct rb_node *node;
999 struct rb_node *prev;
1000 struct btrfs_inode *entry;
1001 struct inode *inode;
1002
1003 spin_lock(&root->inode_lock);
1004again:
1005 node = root->inode_tree.rb_node;
1006 prev = NULL;
1007 while (node) {
1008 prev = node;
1009 entry = rb_entry(node, struct btrfs_inode, rb_node);
1010
1011 if (objectid < entry->vfs_inode.i_ino)
1012 node = node->rb_left;
1013 else if (objectid > entry->vfs_inode.i_ino)
1014 node = node->rb_right;
1015 else
1016 break;
1017 }
1018 if (!node) {
1019 while (prev) {
1020 entry = rb_entry(prev, struct btrfs_inode, rb_node);
1021 if (objectid <= entry->vfs_inode.i_ino) {
1022 node = prev;
1023 break;
1024 }
1025 prev = rb_next(prev);
1026 }
1027 }
1028 while (node) {
1029 entry = rb_entry(node, struct btrfs_inode, rb_node);
1030 inode = igrab(&entry->vfs_inode);
1031 if (inode) {
1032 spin_unlock(&root->inode_lock);
1033 return inode;
1034 }
1035
1036 objectid = entry->vfs_inode.i_ino + 1;
1037 if (cond_resched_lock(&root->inode_lock))
1038 goto again;
1039
1040 node = rb_next(node);
1041 }
1042 spin_unlock(&root->inode_lock);
1043 return NULL;
1044}
1045
1046static int in_block_group(u64 bytenr,
1047 struct btrfs_block_group_cache *block_group)
1048{
1049 if (bytenr >= block_group->key.objectid &&
1050 bytenr < block_group->key.objectid + block_group->key.offset)
1051 return 1;
1052 return 0;
1053}
1054
1055/*
1056 * get new location of data
1057 */
1058static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1059 u64 bytenr, u64 num_bytes)
1060{
1061 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1062 struct btrfs_path *path;
1063 struct btrfs_file_extent_item *fi;
1064 struct extent_buffer *leaf;
1065 int ret;
1066
1067 path = btrfs_alloc_path();
1068 if (!path)
1069 return -ENOMEM;
1070
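/*
 * index_cnt of the relocation inode is expected to hold the start
 * offset of the block group, so this maps the disk bytenr to the
 * file offset the data was copied to.
 */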
1071 bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1072 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
1073 bytenr, 0);
1074 if (ret < 0)
1075 goto out;
1076 if (ret > 0) {
1077 ret = -ENOENT;
1078 goto out;
1079 }
1080
1081 leaf = path->nodes[0];
1082 fi = btrfs_item_ptr(leaf, path->slots[0],
1083 struct btrfs_file_extent_item);
1084
1085 BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1086 btrfs_file_extent_compression(leaf, fi) ||
1087 btrfs_file_extent_encryption(leaf, fi) ||
1088 btrfs_file_extent_other_encoding(leaf, fi));
1089
1090 if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1091 ret = 1;
1092 goto out;
1093 }
1094
1095 if (new_bytenr)
1096 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1097 ret = 0;
1098out:
1099 btrfs_free_path(path);
1100 return ret;
1101}
1102
1103/*
1104 * update file extent items in the tree leaf to point to
1105 * the new locations.
1106 */
1107static int replace_file_extents(struct btrfs_trans_handle *trans,
1108 struct reloc_control *rc,
1109 struct btrfs_root *root,
1110 struct extent_buffer *leaf,
1111 struct list_head *inode_list)
1112{
1113 struct btrfs_key key;
1114 struct btrfs_file_extent_item *fi;
1115 struct inode *inode = NULL;
1116 struct inodevec *ivec = NULL;
1117 u64 parent;
1118 u64 bytenr;
1119 u64 new_bytenr;
1120 u64 num_bytes;
1121 u64 end;
1122 u32 nritems;
1123 u32 i;
1124 int ret;
1125 int first = 1;
1126 int dirty = 0;
1127
1128 if (rc->stage != UPDATE_DATA_PTRS)
1129 return 0;
1130
1131 /* reloc trees always use full backref */
1132 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1133 parent = leaf->start;
1134 else
1135 parent = 0;
1136
1137 nritems = btrfs_header_nritems(leaf);
1138 for (i = 0; i < nritems; i++) {
1139 cond_resched();
1140 btrfs_item_key_to_cpu(leaf, &key, i);
1141 if (key.type != BTRFS_EXTENT_DATA_KEY)
1142 continue;
1143 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1144 if (btrfs_file_extent_type(leaf, fi) ==
1145 BTRFS_FILE_EXTENT_INLINE)
1146 continue;
1147 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1148 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1149 if (bytenr == 0)
1150 continue;
1151 if (!in_block_group(bytenr, rc->block_group))
1152 continue;
1153
1154 /*
1155 * if we are modifying a block in the fs tree, wait for readpage
1156 * to complete and drop the extent cache
1157 */
1158 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1159 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1160 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1161 BUG_ON(!ivec);
1162 ivec->nr = 0;
1163 list_add_tail(&ivec->list, inode_list);
1164 }
1165 if (first) {
1166 inode = find_next_inode(root, key.objectid);
1167 if (inode)
1168 ivec->inode[ivec->nr++] = inode;
1169 first = 0;
1170 } else if (inode && inode->i_ino < key.objectid) {
1171 inode = find_next_inode(root, key.objectid);
1172 if (inode)
1173 ivec->inode[ivec->nr++] = inode;
1174 }
1175 if (inode && inode->i_ino == key.objectid) {
1176 end = key.offset +
1177 btrfs_file_extent_num_bytes(leaf, fi);
1178 WARN_ON(!IS_ALIGNED(key.offset,
1179 root->sectorsize));
1180 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1181 end--;
1182 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1183 key.offset, end,
1184 GFP_NOFS);
1185 if (!ret)
1186 continue;
1187
1188 btrfs_drop_extent_cache(inode, key.offset, end,
1189 1);
1190 unlock_extent(&BTRFS_I(inode)->io_tree,
1191 key.offset, end, GFP_NOFS);
1192 }
1193 }
1194
1195 ret = get_new_location(rc->data_inode, &new_bytenr,
1196 bytenr, num_bytes);
1197 if (ret > 0)
1198 continue;
1199 BUG_ON(ret < 0);
1200
1201 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1202 dirty = 1;
1203
1204 key.offset -= btrfs_file_extent_offset(leaf, fi);
1205 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1206 num_bytes, parent,
1207 btrfs_header_owner(leaf),
1208 key.objectid, key.offset);
1209 BUG_ON(ret);
1210
1211 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1212 parent, btrfs_header_owner(leaf),
1213 key.objectid, key.offset);
1214 BUG_ON(ret);
1215 }
1216 if (dirty)
1217 btrfs_mark_buffer_dirty(leaf);
1218 return 0;
1219}
1220
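/*
 * compare the node key at 'slot' in eb with the key at the current
 * position of 'path' at the given level.
 */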
1221static noinline_for_stack
1222int memcmp_node_keys(struct extent_buffer *eb, int slot,
1223 struct btrfs_path *path, int level)
1224{
1225 struct btrfs_disk_key key1;
1226 struct btrfs_disk_key key2;
1227 btrfs_node_key(eb, &key1, slot);
1228 btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1229 return memcmp(&key1, &key2, sizeof(key1));
1230}
1231
1232/*
1233 * try to replace tree blocks in fs tree with the new blocks
1234 * in the reloc tree. tree blocks that haven't been modified since
1235 * the reloc tree was created can be replaced.
1236 *
1237 * if a block was replaced, level of the block + 1 is returned.
1238 * if no block got replaced, 0 is returned. if there are other
1239 * errors, a negative error number is returned.
1240 */
1241static int replace_path(struct btrfs_trans_handle *trans,
1242 struct btrfs_root *dest, struct btrfs_root *src,
1243 struct btrfs_path *path, struct btrfs_key *next_key,
1244 struct extent_buffer **leaf,
1245 int lowest_level, int max_level)
1246{
1247 struct extent_buffer *eb;
1248 struct extent_buffer *parent;
1249 struct btrfs_key key;
1250 u64 old_bytenr;
1251 u64 new_bytenr;
1252 u64 old_ptr_gen;
1253 u64 new_ptr_gen;
1254 u64 last_snapshot;
1255 u32 blocksize;
1256 int level;
1257 int ret;
1258 int slot;
1259
1260 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1261 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1262 BUG_ON(lowest_level > 1 && leaf);
1263
1264 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1265
1266 slot = path->slots[lowest_level];
1267 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1268
1269 eb = btrfs_lock_root_node(dest);
1270 btrfs_set_lock_blocking(eb);
1271 level = btrfs_header_level(eb);
1272
1273 if (level < lowest_level) {
1274 btrfs_tree_unlock(eb);
1275 free_extent_buffer(eb);
1276 return 0;
1277 }
1278
1279 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1280 BUG_ON(ret);
1281 btrfs_set_lock_blocking(eb);
1282
1283 if (next_key) {
1284 next_key->objectid = (u64)-1;
1285 next_key->type = (u8)-1;
1286 next_key->offset = (u64)-1;
1287 }
1288
1289 parent = eb;
1290 while (1) {
1291 level = btrfs_header_level(parent);
1292 BUG_ON(level < lowest_level);
1293
1294 ret = btrfs_bin_search(parent, &key, level, &slot);
1295 if (ret && slot > 0)
1296 slot--;
1297
1298 if (next_key && slot + 1 < btrfs_header_nritems(parent))
1299 btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1300
1301 old_bytenr = btrfs_node_blockptr(parent, slot);
1302 blocksize = btrfs_level_size(dest, level - 1);
1303 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1304
1305 if (level <= max_level) {
1306 eb = path->nodes[level];
1307 new_bytenr = btrfs_node_blockptr(eb,
1308 path->slots[level]);
1309 new_ptr_gen = btrfs_node_ptr_generation(eb,
1310 path->slots[level]);
1311 } else {
1312 new_bytenr = 0;
1313 new_ptr_gen = 0;
1314 }
1315
1316 if (new_bytenr > 0 && new_bytenr == old_bytenr) {
1317 WARN_ON(1);
1318 ret = level;
1319 break;
1320 }
1321
1322 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1323 memcmp_node_keys(parent, slot, path, level)) {
1324 if (level <= lowest_level && !leaf) {
1325 ret = 0;
1326 break;
1327 }
1328
1329 eb = read_tree_block(dest, old_bytenr, blocksize,
1330 old_ptr_gen);
1331 btrfs_tree_lock(eb);
1332 ret = btrfs_cow_block(trans, dest, eb, parent,
1333 slot, &eb);
1334 BUG_ON(ret);
1335 btrfs_set_lock_blocking(eb);
1336
1337 if (level <= lowest_level) {
1338 *leaf = eb;
1339 ret = 0;
1340 break;
1341 }
1342
1343 btrfs_tree_unlock(parent);
1344 free_extent_buffer(parent);
1345
1346 parent = eb;
1347 continue;
1348 }
1349
1350 btrfs_node_key_to_cpu(path->nodes[level], &key,
1351 path->slots[level]);
1352 btrfs_release_path(src, path);
1353
1354 path->lowest_level = level;
1355 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1356 path->lowest_level = 0;
1357 BUG_ON(ret);
1358
1359 /*
1360 * swap blocks in fs tree and reloc tree.
1361 */
1362 btrfs_set_node_blockptr(parent, slot, new_bytenr);
1363 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1364 btrfs_mark_buffer_dirty(parent);
1365
1366 btrfs_set_node_blockptr(path->nodes[level],
1367 path->slots[level], old_bytenr);
1368 btrfs_set_node_ptr_generation(path->nodes[level],
1369 path->slots[level], old_ptr_gen);
1370 btrfs_mark_buffer_dirty(path->nodes[level]);
1371
1372 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1373 path->nodes[level]->start,
1374 src->root_key.objectid, level - 1, 0);
1375 BUG_ON(ret);
1376 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1377 0, dest->root_key.objectid, level - 1,
1378 0);
1379 BUG_ON(ret);
1380
1381 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1382 path->nodes[level]->start,
1383 src->root_key.objectid, level - 1, 0);
1384 BUG_ON(ret);
1385
1386 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1387 0, dest->root_key.objectid, level - 1,
1388 0);
1389 BUG_ON(ret);
1390
1391 btrfs_unlock_up_safe(path, 0);
1392
1393 ret = level;
1394 break;
1395 }
1396 btrfs_tree_unlock(parent);
1397 free_extent_buffer(parent);
1398 return ret;
1399}
1400
1401/*
1402 * helper to find the next relocated block in the reloc tree
1403 */
1404static noinline_for_stack
1405int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1406 int *level)
1407{
1408 struct extent_buffer *eb;
1409 int i;
1410 u64 last_snapshot;
1411 u32 nritems;
1412
1413 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1414
1415 for (i = 0; i < *level; i++) {
1416 free_extent_buffer(path->nodes[i]);
1417 path->nodes[i] = NULL;
1418 }
1419
1420 for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1421 eb = path->nodes[i];
1422 nritems = btrfs_header_nritems(eb);
1423 while (path->slots[i] + 1 < nritems) {
1424 path->slots[i]++;
1425 if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1426 last_snapshot)
1427 continue;
1428
1429 *level = i;
1430 return 0;
1431 }
1432 free_extent_buffer(path->nodes[i]);
1433 path->nodes[i] = NULL;
1434 }
1435 return 1;
1436}
1437
1438/*
1439 * walk down the reloc tree to find the relocated block of the lowest level
1440 */
1441static noinline_for_stack
1442int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1443 int *level)
1444{
1445 struct extent_buffer *eb = NULL;
1446 int i;
1447 u64 bytenr;
1448 u64 ptr_gen = 0;
1449 u64 last_snapshot;
1450 u32 blocksize;
1451 u32 nritems;
1452
1453 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1454
1455 for (i = *level; i > 0; i--) {
1456 eb = path->nodes[i];
1457 nritems = btrfs_header_nritems(eb);
1458 while (path->slots[i] < nritems) {
1459 ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1460 if (ptr_gen > last_snapshot)
1461 break;
1462 path->slots[i]++;
1463 }
1464 if (path->slots[i] >= nritems) {
1465 if (i == *level)
1466 break;
1467 *level = i + 1;
1468 return 0;
1469 }
1470 if (i == 1) {
1471 *level = i;
1472 return 0;
1473 }
1474
1475 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1476 blocksize = btrfs_level_size(root, i - 1);
1477 eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1478 BUG_ON(btrfs_header_level(eb) != i - 1);
1479 path->nodes[i - 1] = eb;
1480 path->slots[i - 1] = 0;
1481 }
1482 return 1;
1483}
1484
1485/*
1486 * invalidate the extent cache for file extents whose keys are in the range of
1487 * [min_key, max_key)
1488 */
1489static int invalidate_extent_cache(struct btrfs_root *root,
1490 struct btrfs_key *min_key,
1491 struct btrfs_key *max_key)
1492{
1493 struct inode *inode = NULL;
1494 u64 objectid;
1495 u64 start, end;
1496
1497 objectid = min_key->objectid;
1498 while (1) {
1499 cond_resched();
1500 iput(inode);
1501
1502 if (objectid > max_key->objectid)
1503 break;
1504
1505 inode = find_next_inode(root, objectid);
1506 if (!inode)
1507 break;
1508
1509 if (inode->i_ino > max_key->objectid) {
1510 iput(inode);
1511 break;
1512 }
1513
1514 objectid = inode->i_ino + 1;
1515 if (!S_ISREG(inode->i_mode))
1516 continue;
1517
1518 if (unlikely(min_key->objectid == inode->i_ino)) {
1519 if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1520 continue;
1521 if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1522 start = 0;
1523 else {
1524 start = min_key->offset;
1525 WARN_ON(!IS_ALIGNED(start, root->sectorsize));
1526 }
1527 } else {
1528 start = 0;
1529 }
1530
1531 if (unlikely(max_key->objectid == inode->i_ino)) {
1532 if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1533 continue;
1534 if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1535 end = (u64)-1;
1536 } else {
1537 if (max_key->offset == 0)
1538 continue;
1539 end = max_key->offset;
1540 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1541 end--;
1542 }
1543 } else {
1544 end = (u64)-1;
1545 }
1546
1547 /* the lock_extent waits for readpage to complete */
1548 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1549 btrfs_drop_extent_cache(inode, start, end, 1);
1550 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1551 }
1552 return 0;
1553}
1554
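/*
 * find the first key after the current path position, searching
 * upward through the levels. returns 1 if the path is at the end
 * of the tree.
 */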
1555static int find_next_key(struct btrfs_path *path, int level,
1556 struct btrfs_key *key)
1557
1558{
1559 while (level < BTRFS_MAX_LEVEL) {
1560 if (!path->nodes[level])
1561 break;
1562 if (path->slots[level] + 1 <
1563 btrfs_header_nritems(path->nodes[level])) {
1564 btrfs_node_key_to_cpu(path->nodes[level], key,
1565 path->slots[level] + 1);
1566 return 0;
1567 }
1568 level++;
1569 }
1570 return 1;
1571}
1572
1573/*
1574 * merge the relocated tree blocks in the reloc tree with the corresponding
1575 * fs tree.
1576 */
1577static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1578 struct btrfs_root *root)
1579{
1580 LIST_HEAD(inode_list);
1581 struct btrfs_key key;
1582 struct btrfs_key next_key;
1583 struct btrfs_trans_handle *trans;
1584 struct btrfs_root *reloc_root;
1585 struct btrfs_root_item *root_item;
1586 struct btrfs_path *path;
1587 struct extent_buffer *leaf = NULL;
1588 unsigned long nr;
1589 int level;
1590 int max_level;
1591 int replaced = 0;
1592 int ret;
1593 int err = 0;
1594
1595 path = btrfs_alloc_path();
1596 if (!path)
1597 return -ENOMEM;
1598
1599 reloc_root = root->reloc_root;
1600 root_item = &reloc_root->root_item;
1601
1602 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1603 level = btrfs_root_level(root_item);
1604 extent_buffer_get(reloc_root->node);
1605 path->nodes[level] = reloc_root->node;
1606 path->slots[level] = 0;
1607 } else {
1608 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1609
1610 level = root_item->drop_level;
1611 BUG_ON(level == 0);
1612 path->lowest_level = level;
1613 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1614 path->lowest_level = 0;
1615 if (ret < 0) {
1616 btrfs_free_path(path);
1617 return ret;
1618 }
1619
1620 btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1621 path->slots[level]);
1622 WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1623
1624 btrfs_unlock_up_safe(path, 0);
1625 }
1626
1627 if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
1628 trans = btrfs_start_transaction(root, 1);
1629
1630 leaf = path->nodes[0];
1631 btrfs_item_key_to_cpu(leaf, &key, 0);
1632 btrfs_release_path(reloc_root, path);
1633
1634 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1635 if (ret < 0) {
1636 err = ret;
1637 goto out;
1638 }
1639
1640 leaf = path->nodes[0];
1641 btrfs_unlock_up_safe(path, 1);
1642 ret = replace_file_extents(trans, rc, root, leaf,
1643 &inode_list);
1644 if (ret < 0)
1645 err = ret;
1646 goto out;
1647 }
1648
1649 memset(&next_key, 0, sizeof(next_key));
1650
1651 while (1) {
1652 leaf = NULL;
1653 replaced = 0;
1654 trans = btrfs_start_transaction(root, 1);
1655 max_level = level;
1656
1657 ret = walk_down_reloc_tree(reloc_root, path, &level);
1658 if (ret < 0) {
1659 err = ret;
1660 goto out;
1661 }
1662 if (ret > 0)
1663 break;
1664
1665 if (!find_next_key(path, level, &key) &&
1666 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1667 ret = 0;
1668 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1669 ret = replace_path(trans, root, reloc_root,
1670 path, &next_key, &leaf,
1671 level, max_level);
1672 } else {
1673 ret = replace_path(trans, root, reloc_root,
1674 path, &next_key, NULL,
1675 level, max_level);
1676 }
1677 if (ret < 0) {
1678 err = ret;
1679 goto out;
1680 }
1681
1682 if (ret > 0) {
1683 level = ret;
1684 btrfs_node_key_to_cpu(path->nodes[level], &key,
1685 path->slots[level]);
1686 replaced = 1;
1687 } else if (leaf) {
1688 /*
1689 * no block got replaced, try replacing file extents
1690 */
1691 btrfs_item_key_to_cpu(leaf, &key, 0);
1692 ret = replace_file_extents(trans, rc, root, leaf,
1693 &inode_list);
1694 btrfs_tree_unlock(leaf);
1695 free_extent_buffer(leaf);
1696 BUG_ON(ret < 0);
1697 }
1698
1699 ret = walk_up_reloc_tree(reloc_root, path, &level);
1700 if (ret > 0)
1701 break;
1702
1703 BUG_ON(level == 0);
1704 /*
1705 * save the merging progress in the drop_progress.
1706 * this is OK since root refs == 1 in this case.
1707 */
1708 btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1709 path->slots[level]);
1710 root_item->drop_level = level;
1711
1712 nr = trans->blocks_used;
1713 btrfs_end_transaction(trans, root);
1714
1715 btrfs_btree_balance_dirty(root, nr);
1716
1717 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1718 invalidate_extent_cache(root, &key, &next_key);
1719 }
1720
1721 /*
1722 * handle the case where only one block in the fs tree needs to be
1723 * relocated and that block is the tree root.
1724 */
1725 leaf = btrfs_lock_root_node(root);
1726 ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
1727 btrfs_tree_unlock(leaf);
1728 free_extent_buffer(leaf);
1729 if (ret < 0)
1730 err = ret;
1731out:
1732 btrfs_free_path(path);
1733
1734 if (err == 0) {
1735 memset(&root_item->drop_progress, 0,
1736 sizeof(root_item->drop_progress));
1737 root_item->drop_level = 0;
1738 btrfs_set_root_refs(root_item, 0);
1739 }
1740
1741 nr = trans->blocks_used;
1742 btrfs_end_transaction(trans, root);
1743
1744 btrfs_btree_balance_dirty(root, nr);
1745
1746 /*
1747 * put inodes while we aren't holding the tree locks
1748 */
1749 while (!list_empty(&inode_list)) {
1750 struct inodevec *ivec;
1751 ivec = list_entry(inode_list.next, struct inodevec, list);
1752 list_del(&ivec->list);
1753 while (ivec->nr > 0) {
1754 ivec->nr--;
1755 iput(ivec->inode[ivec->nr]);
1756 }
1757 kfree(ivec);
1758 }
1759
1760 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1761 invalidate_extent_cache(root, &key, &next_key);
1762
1763 return err;
1764}
1765
1766/*
1767 * callback for the work threads.
1768 * this function merges the reloc tree with the corresponding fs tree,
1769 * and then drops the reloc tree.
1770 */
1771static void merge_func(struct btrfs_work *work)
1772{
1773 struct btrfs_trans_handle *trans;
1774 struct btrfs_root *root;
1775 struct btrfs_root *reloc_root;
1776 struct async_merge *async;
1777
1778 async = container_of(work, struct async_merge, work);
1779 reloc_root = async->root;
1780
1781 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1782 root = read_fs_root(reloc_root->fs_info,
1783 reloc_root->root_key.offset);
1784 BUG_ON(IS_ERR(root));
1785 BUG_ON(root->reloc_root != reloc_root);
1786
1787 merge_reloc_root(async->rc, root);
1788
1789 trans = btrfs_start_transaction(root, 1);
1790 btrfs_update_reloc_root(trans, root);
1791 btrfs_end_transaction(trans, root);
1792 }
1793
1794 btrfs_drop_snapshot(reloc_root, 0);
1795
1796 if (atomic_dec_and_test(async->num_pending))
1797 complete(async->done);
1798
1799 kfree(async);
1800}
1801
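/*
 * merge all reloc trees tracked by the reloc control, one async work
 * item per tree. num_pending starts at 1 so the completion can only
 * fire after the queueing loop has finished.
 */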
1802static int merge_reloc_roots(struct reloc_control *rc)
1803{
1804 struct async_merge *async;
1805 struct btrfs_root *root;
1806 struct completion done;
1807 atomic_t num_pending;
1808
1809 init_completion(&done);
1810 atomic_set(&num_pending, 1);
1811
1812 while (!list_empty(&rc->reloc_roots)) {
1813 root = list_entry(rc->reloc_roots.next,
1814 struct btrfs_root, root_list);
1815 list_del_init(&root->root_list);
1816
1817 async = kmalloc(sizeof(*async), GFP_NOFS);
1818 BUG_ON(!async);
1819 async->work.func = merge_func;
1820 async->work.flags = 0;
1821 async->rc = rc;
1822 async->root = root;
1823 async->done = &done;
1824 async->num_pending = &num_pending;
1825 atomic_inc(&num_pending);
1826 btrfs_queue_worker(&rc->workers, &async->work);
1827 }
1828
1829 if (!atomic_dec_and_test(&num_pending))
1830 wait_for_completion(&done);
1831
1832 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1833 return 0;
1834}
1835
1836static void free_block_list(struct rb_root *blocks)
1837{
1838 struct tree_block *block;
1839 struct rb_node *rb_node;
1840 while ((rb_node = rb_first(blocks))) {
1841 block = rb_entry(rb_node, struct tree_block, rb_node);
1842 rb_erase(rb_node, blocks);
1843 kfree(block);
1844 }
1845}
1846
1847static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1848 struct btrfs_root *reloc_root)
1849{
1850 struct btrfs_root *root;
1851
1852 if (reloc_root->last_trans == trans->transid)
1853 return 0;
1854
1855 root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
1856 BUG_ON(IS_ERR(root));
1857 BUG_ON(root->reloc_root != reloc_root);
1858
1859 return btrfs_record_root_in_trans(trans, root);
1860}
1861
1862/*
1863 * select one tree from the trees that reference the block.
1864 * for blocks in reference counted trees, we prefer the reloc tree.
1865 * if no reloc tree is found and reloc_only is true, NULL is returned.
1866 */
1867static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1868 struct backref_node *node,
1869 struct backref_edge *edges[],
1870 int *nr, int reloc_only)
1871{
1872 struct backref_node *next;
1873 struct btrfs_root *root;
1874 int index;
1875 int loop = 0;
1876again:
1877 index = 0;
1878 next = node;
1879 while (1) {
1880 cond_resched();
1881 next = walk_up_backref(next, edges, &index);
1882 root = next->root;
1883 if (!root) {
1884 BUG_ON(!node->old_root);
1885 goto skip;
1886 }
1887
1888 /* no other choice for a non-reference counted tree */
1889 if (!root->ref_cows) {
1890 BUG_ON(reloc_only);
1891 break;
1892 }
1893
1894 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1895 record_reloc_root_in_trans(trans, root);
1896 break;
1897 }
1898
1899 if (loop) {
1900 btrfs_record_root_in_trans(trans, root);
1901 break;
1902 }
1903
1904 if (reloc_only || next != node) {
1905 if (!root->reloc_root)
1906 btrfs_record_root_in_trans(trans, root);
1907 root = root->reloc_root;
1908 /*
1909 * if the reloc tree was created in the current
1910 * transaction, there is no node in the backref
1911 * tree that corresponds to the root of the reloc tree.
1912 */
1913 if (btrfs_root_last_snapshot(&root->root_item) ==
1914 trans->transid - 1)
1915 break;
1916 }
1917skip:
1918 root = NULL;
1919 next = walk_down_backref(edges, &index);
1920 if (!next || next->level <= node->level)
1921 break;
1922 }
1923
1924 if (!root && !loop && !reloc_only) {
1925 loop = 1;
1926 goto again;
1927 }
1928
1929 if (root)
1930 *nr = index;
1931 else
1932 *nr = 0;
1933
1934 return root;
1935}
1936
1937static noinline_for_stack
1938struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1939 struct backref_node *node)
1940{
1941 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1942 int nr;
1943 return __select_one_root(trans, node, edges, &nr, 0);
1944}
1945
1946static noinline_for_stack
1947struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1948 struct backref_node *node,
1949 struct backref_edge *edges[], int *nr)
1950{
1951 return __select_one_root(trans, node, edges, nr, 1);
1952}
1953
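/*
 * hand the buffers and locks held in 'path' over to the backref
 * nodes along the chain of 'nr' edges starting at 'node'.
 */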
1954static void grab_path_buffers(struct btrfs_path *path,
1955 struct backref_node *node,
1956 struct backref_edge *edges[], int nr)
1957{
1958 int i = 0;
1959 while (1) {
1960 drop_node_buffer(node);
1961 node->eb = path->nodes[node->level];
1962 BUG_ON(!node->eb);
1963 if (path->locks[node->level])
1964 node->locked = 1;
1965 path->nodes[node->level] = NULL;
1966 path->locks[node->level] = 0;
1967
1968 if (i >= nr)
1969 break;
1970
1971 edges[i]->blockptr = node->eb->start;
1972 node = edges[i]->node[UPPER];
1973 i++;
1974 }
1975}
1976
1977/*
1978 * relocate a tree block, and then update pointers in upper level
1979 * blocks that reference the block to point to the new location.
1980 *
1981 * if called by link_to_upper, the block has already been relocated.
1982 * in that case this function just updates pointers.
1983 */
1984static int do_relocation(struct btrfs_trans_handle *trans,
1985 struct backref_node *node,
1986 struct btrfs_key *key,
1987 struct btrfs_path *path, int lowest)
1988{
1989 struct backref_node *upper;
1990 struct backref_edge *edge;
1991 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1992 struct btrfs_root *root;
1993 struct extent_buffer *eb;
1994 u32 blocksize;
1995 u64 bytenr;
1996 u64 generation;
1997 int nr;
1998 int slot;
1999 int ret;
2000 int err = 0;
2001
2002 BUG_ON(lowest && node->eb);
2003
2004 path->lowest_level = node->level + 1;
2005 list_for_each_entry(edge, &node->upper, list[LOWER]) {
2006 cond_resched();
2007 if (node->eb && node->eb->start == edge->blockptr)
2008 continue;
2009
2010 upper = edge->node[UPPER];
2011 root = select_reloc_root(trans, upper, edges, &nr);
2012 if (!root)
2013 continue;
2014
2015 if (upper->eb && !upper->locked)
2016 drop_node_buffer(upper);
2017
2018 if (!upper->eb) {
2019 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2020 if (ret < 0) {
2021 err = ret;
2022 break;
2023 }
2024 BUG_ON(ret > 0);
2025
2026 slot = path->slots[upper->level];
2027
2028 btrfs_unlock_up_safe(path, upper->level + 1);
2029 grab_path_buffers(path, upper, edges, nr);
2030
2031 btrfs_release_path(NULL, path);
2032 } else {
2033 ret = btrfs_bin_search(upper->eb, key, upper->level,
2034 &slot);
2035 BUG_ON(ret);
2036 }
2037
2038 bytenr = btrfs_node_blockptr(upper->eb, slot);
2039 if (!lowest) {
2040 if (node->eb->start == bytenr) {
2041 btrfs_tree_unlock(upper->eb);
2042 upper->locked = 0;
2043 continue;
2044 }
2045 } else {
2046 BUG_ON(node->bytenr != bytenr);
2047 }
2048
2049 blocksize = btrfs_level_size(root, node->level);
2050 generation = btrfs_node_ptr_generation(upper->eb, slot);
2051 eb = read_tree_block(root, bytenr, blocksize, generation);
2052 btrfs_tree_lock(eb);
2053 btrfs_set_lock_blocking(eb);
2054
2055 if (!node->eb) {
2056 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2057 slot, &eb);
2058 if (ret < 0) {
2059 err = ret;
2060 break;
2061 }
2062 btrfs_set_lock_blocking(eb);
2063 node->eb = eb;
2064 node->locked = 1;
2065 } else {
2066 btrfs_set_node_blockptr(upper->eb, slot,
2067 node->eb->start);
2068 btrfs_set_node_ptr_generation(upper->eb, slot,
2069 trans->transid);
2070 btrfs_mark_buffer_dirty(upper->eb);
2071
2072 ret = btrfs_inc_extent_ref(trans, root,
2073 node->eb->start, blocksize,
2074 upper->eb->start,
2075 btrfs_header_owner(upper->eb),
2076 node->level, 0);
2077 BUG_ON(ret);
2078
2079 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2080 BUG_ON(ret);
2081 }
2082 if (!lowest) {
2083 btrfs_tree_unlock(upper->eb);
2084 upper->locked = 0;
2085 }
2086 }
2087 path->lowest_level = 0;
2088 return err;
2089}
2090
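/*
 * relink a block that has already been relocated: node->eb points at
 * the new copy, so do_relocation only needs to update the pointers
 * in the upper level blocks.
 */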
2091static int link_to_upper(struct btrfs_trans_handle *trans,
2092 struct backref_node *node,
2093 struct btrfs_path *path)
2094{
2095 struct btrfs_key key;
2096 if (!node->eb || list_empty(&node->upper))
2097 return 0;
2098
2099 btrfs_node_key_to_cpu(node->eb, &key, 0);
2100 return do_relocation(trans, node, &key, path, 0);
2101}
2102
2103static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2104 struct backref_cache *cache,
2105 struct btrfs_path *path)
2106{
2107 struct backref_node *node;
2108 int level;
2109 int ret;
2110 int err = 0;
2111
2112 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2113 while (!list_empty(&cache->pending[level])) {
2114 node = list_entry(cache->pending[level].next,
2115 struct backref_node, lower);
2116 BUG_ON(node->level != level);
2117
2118 ret = link_to_upper(trans, node, path);
2119 if (ret < 0)
2120 err = ret;
2121 /*
2122 * this removes the node from the pending list and
2123 * may add some other nodes to the level + 1
2124 * pending list
2125 */
2126 remove_backref_node(cache, node);
2127 }
2128 }
2129 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2130 return err;
2131}
2132
2133static void mark_block_processed(struct reloc_control *rc,
2134 struct backref_node *node)
2135{
2136 u32 blocksize;
2137 if (node->level == 0 ||
2138 in_block_group(node->bytenr, rc->block_group)) {
2139 blocksize = btrfs_level_size(rc->extent_root, node->level);
2140 set_extent_bits(&rc->processed_blocks, node->bytenr,
2141 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2142 GFP_NOFS);
2143 }
2144 node->processed = 1;
2145}
2146
2147/*
2148 * mark a block and all blocks that directly/indirectly reference it
2149 * as processed.
2150 */
2151static void update_processed_blocks(struct reloc_control *rc,
2152 struct backref_node *node)
2153{
2154 struct backref_node *next = node;
2155 struct backref_edge *edge;
2156 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2157 int index = 0;
2158
2159 while (next) {
2160 cond_resched();
2161 while (1) {
2162 if (next->processed)
2163 break;
2164
2165 mark_block_processed(rc, next);
2166
2167 if (list_empty(&next->upper))
2168 break;
2169
2170 edge = list_entry(next->upper.next,
2171 struct backref_edge, list[LOWER]);
2172 edges[index++] = edge;
2173 next = edge->node[UPPER];
2174 }
2175 next = walk_down_backref(edges, &index);
2176 }
2177}
2178
2179static int tree_block_processed(u64 bytenr, u32 blocksize,
2180 struct reloc_control *rc)
2181{
2182 if (test_range_bit(&rc->processed_blocks, bytenr,
2183 bytenr + blocksize - 1, EXTENT_DIRTY, 1))
2184 return 1;
2185 return 0;
2186}
2187
2188/*
2189 * check if any file extent pointers in the leaf point to
2190 * data that requires processing
2191 */
2192static int check_file_extents(struct reloc_control *rc,
2193 u64 bytenr, u32 blocksize, u64 ptr_gen)
2194{
2195 struct btrfs_key found_key;
2196 struct btrfs_file_extent_item *fi;
2197 struct extent_buffer *leaf;
2198 u32 nritems;
2199 int i;
2200 int ret = 0;
2201
2202 leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2203
2204 nritems = btrfs_header_nritems(leaf);
2205 for (i = 0; i < nritems; i++) {
2206 cond_resched();
2207 btrfs_item_key_to_cpu(leaf, &found_key, i);
2208 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2209 continue;
2210 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2211 if (btrfs_file_extent_type(leaf, fi) ==
2212 BTRFS_FILE_EXTENT_INLINE)
2213 continue;
2214 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2215 if (bytenr == 0)
2216 continue;
2217 if (in_block_group(bytenr, rc->block_group)) {
2218 ret = 1;
2219 break;
2220 }
2221 }
2222 free_extent_buffer(leaf);
2223 return ret;
2224}
2225
2226/*
2227 * scan child blocks of a given block to find blocks that require processing
2228 */
2229static int add_child_blocks(struct btrfs_trans_handle *trans,
2230 struct reloc_control *rc,
2231 struct backref_node *node,
2232 struct rb_root *blocks)
2233{
2234 struct tree_block *block;
2235 struct rb_node *rb_node;
2236 u64 bytenr;
2237 u64 ptr_gen;
2238 u32 blocksize;
2239 u32 nritems;
2240 int i;
2241 int err = 0;
2242
2243 nritems = btrfs_header_nritems(node->eb);
2244 blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2245 for (i = 0; i < nritems; i++) {
2246 cond_resched();
2247 bytenr = btrfs_node_blockptr(node->eb, i);
2248 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2249 if (ptr_gen == trans->transid)
2250 continue;
2251 if (!in_block_group(bytenr, rc->block_group) &&
2252 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2253 continue;
2254 if (tree_block_processed(bytenr, blocksize, rc))
2255 continue;
2256
2257 readahead_tree_block(rc->extent_root,
2258 bytenr, blocksize, ptr_gen);
2259 }
2260
2261 for (i = 0; i < nritems; i++) {
2262 cond_resched();
2263 bytenr = btrfs_node_blockptr(node->eb, i);
2264 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2265 if (ptr_gen == trans->transid)
2266 continue;
2267 if (!in_block_group(bytenr, rc->block_group) &&
2268 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2269 continue;
2270 if (tree_block_processed(bytenr, blocksize, rc))
2271 continue;
2272 if (!in_block_group(bytenr, rc->block_group) &&
2273 !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2274 continue;
2275
2276 block = kmalloc(sizeof(*block), GFP_NOFS);
2277 if (!block) {
2278 err = -ENOMEM;
2279 break;
2280 }
2281 block->bytenr = bytenr;
2282 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2283 block->level = node->level - 1;
2284 block->key_ready = 1;
2285 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2286 BUG_ON(rb_node);
2287 }
2288 if (err)
2289 free_block_list(blocks);
2290 return err;
2291}
2292
2293/*
2294 * find adjacent blocks that require processing
2295 */
2296static noinline_for_stack
2297int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2298 struct reloc_control *rc,
2299 struct backref_cache *cache,
2300 struct rb_root *blocks, int level,
2301 struct backref_node **upper)
2302{
2303 struct backref_node *node;
2304 int ret = 0;
2305
2306 WARN_ON(!list_empty(&cache->pending[level]));
2307
2308 if (list_empty(&cache->pending[level + 1]))
2309 return 1;
2310
2311 node = list_entry(cache->pending[level + 1].next,
2312 struct backref_node, lower);
2313 if (node->eb)
2314 ret = add_child_blocks(trans, rc, node, blocks);
2315
2316 *upper = node;
2317 return ret;
2318}
2319
2320static int get_tree_block_key(struct reloc_control *rc,
2321 struct tree_block *block)
2322{
2323 struct extent_buffer *eb;
2324
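	/*
	 * while key_ready is 0, block->key.objectid holds the blocksize
	 * and block->key.offset holds the generation (see add_tree_block),
	 * so they double as the read_tree_block() arguments below; read
	 * the block to recover its real first key
	 */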
2325 BUG_ON(block->key_ready);
2326 eb = read_tree_block(rc->extent_root, block->bytenr,
2327 block->key.objectid, block->key.offset);
2328 WARN_ON(btrfs_header_level(eb) != block->level);
2329 if (block->level == 0)
2330 btrfs_item_key_to_cpu(eb, &block->key, 0);
2331 else
2332 btrfs_node_key_to_cpu(eb, &block->key, 0);
2333 free_extent_buffer(eb);
2334 block->key_ready = 1;
2335 return 0;
2336}
2337
2338static int reada_tree_block(struct reloc_control *rc,
2339 struct tree_block *block)
2340{
2341 BUG_ON(block->key_ready);
2342 readahead_tree_block(rc->extent_root, block->bytenr,
2343 block->key.objectid, block->key.offset);
2344 return 0;
2345}
2346
2347/*
2348 * helper function to relocate a tree block
2349 */
2350static int relocate_tree_block(struct btrfs_trans_handle *trans,
2351 struct reloc_control *rc,
2352 struct backref_node *node,
2353 struct btrfs_key *key,
2354 struct btrfs_path *path)
2355{
2356 struct btrfs_root *root;
2357 int ret;
2358
2359 root = select_one_root(trans, node);
2360 if (unlikely(!root)) {
2361 rc->found_old_snapshot = 1;
2362 update_processed_blocks(rc, node);
2363 return 0;
2364 }
2365
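	/*
	 * three cases: the block is owned by a reloc tree and is COWed
	 * to its new location by do_relocation(); the owning tree does
	 * not do reference-counted COW, so a COWing tree search down to
	 * the block's level relocates it in place; otherwise the
	 * backref is stale and only a warning is emitted
	 */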
2366 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2367 ret = do_relocation(trans, node, key, path, 1);
2368 if (ret < 0)
2369 goto out;
2370 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2371 ret = replace_file_extents(trans, rc, root,
2372 node->eb, NULL);
2373 if (ret < 0)
2374 goto out;
2375 }
2376 drop_node_buffer(node);
2377 } else if (!root->ref_cows) {
2378 path->lowest_level = node->level;
2379 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2380 btrfs_release_path(root, path);
2381 if (ret < 0)
2382 goto out;
2383 } else if (root != node->root) {
2384 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2385 }
2386
2387 update_processed_blocks(rc, node);
2388 ret = 0;
2389out:
2390 drop_node_buffer(node);
2391 return ret;
2392}
2393
2394/*
2395 * relocate a list of blocks
2396 */
2397static noinline_for_stack
2398int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2399 struct reloc_control *rc, struct rb_root *blocks)
2400{
2401 struct backref_cache *cache;
2402 struct backref_node *node;
2403 struct btrfs_path *path;
2404 struct tree_block *block;
2405 struct rb_node *rb_node;
2406 int level = -1;
2407 int ret;
2408 int err = 0;
2409
2410 path = btrfs_alloc_path();
2411 if (!path)
2412 return -ENOMEM;
2413
2414 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2415 if (!cache) {
2416 btrfs_free_path(path);
2417 return -ENOMEM;
2418 }
2419
2420 backref_cache_init(cache);
2421
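	/*
	 * three passes over the rb-tree: readahead the blocks whose
	 * keys are not cached yet, read those keys in, then build
	 * backref trees and relocate each block
	 */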
2422 rb_node = rb_first(blocks);
2423 while (rb_node) {
2424 block = rb_entry(rb_node, struct tree_block, rb_node);
2425 if (level == -1)
2426 level = block->level;
2427 else
2428 BUG_ON(level != block->level);
2429 if (!block->key_ready)
2430 reada_tree_block(rc, block);
2431 rb_node = rb_next(rb_node);
2432 }
2433
2434 rb_node = rb_first(blocks);
2435 while (rb_node) {
2436 block = rb_entry(rb_node, struct tree_block, rb_node);
2437 if (!block->key_ready)
2438 get_tree_block_key(rc, block);
2439 rb_node = rb_next(rb_node);
2440 }
2441
2442 rb_node = rb_first(blocks);
2443 while (rb_node) {
2444 block = rb_entry(rb_node, struct tree_block, rb_node);
2445
2446 node = build_backref_tree(rc, cache, &block->key,
2447 block->level, block->bytenr);
2448 if (IS_ERR(node)) {
2449 err = PTR_ERR(node);
2450 goto out;
2451 }
2452
2453 ret = relocate_tree_block(trans, rc, node, &block->key,
2454 path);
2455 if (ret < 0) {
2456 err = ret;
2457 goto out;
2458 }
2459 remove_backref_node(cache, node);
2460 rb_node = rb_next(rb_node);
2461 }
2462
2463 if (level > 0)
2464 goto out;
2465
2466 free_block_list(blocks);
2467
2468 /*
2469 * now backrefs of some upper level tree blocks have been cached,
2470 * try relocating blocks referenced by these upper level blocks.
2471 */
2472 while (1) {
2473 struct backref_node *upper = NULL;
2474 if (trans->transaction->in_commit ||
2475 trans->transaction->delayed_refs.flushing)
2476 break;
2477
2478 ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
2479 &upper);
2480 if (ret < 0)
2481 err = ret;
2482 if (ret != 0)
2483 break;
2484
2485 rb_node = rb_first(blocks);
2486 while (rb_node) {
2487 block = rb_entry(rb_node, struct tree_block, rb_node);
2488 if (trans->transaction->in_commit ||
2489 trans->transaction->delayed_refs.flushing)
2490 goto out;
2491 BUG_ON(!block->key_ready);
2492 node = build_backref_tree(rc, cache, &block->key,
2493 level, block->bytenr);
2494 if (IS_ERR(node)) {
2495 err = PTR_ERR(node);
2496 goto out;
2497 }
2498
2499 ret = relocate_tree_block(trans, rc, node,
2500 &block->key, path);
2501 if (ret < 0) {
2502 err = ret;
2503 goto out;
2504 }
2505 remove_backref_node(cache, node);
2506 rb_node = rb_next(rb_node);
2507 }
2508 free_block_list(blocks);
2509
2510 if (upper) {
2511 ret = link_to_upper(trans, upper, path);
2512 if (ret < 0) {
2513 err = ret;
2514 break;
2515 }
2516 remove_backref_node(cache, upper);
2517 }
2518 }
2519out:
2520 free_block_list(blocks);
2521
2522 ret = finish_pending_nodes(trans, cache, path);
2523 if (ret < 0)
2524 err = ret;
2525
2526 kfree(cache);
2527 btrfs_free_path(path);
2528 return err;
2529}
2530
2531static noinline_for_stack
2532int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
2533{
2534 u64 page_start;
2535 u64 page_end;
2536 unsigned long i;
2537 unsigned long first_index;
2538 unsigned long last_index;
2539 unsigned int total_read = 0;
2540 unsigned int total_dirty = 0;
2541 struct page *page;
2542 struct file_ra_state *ra;
2543 struct btrfs_ordered_extent *ordered;
2544 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2545 int ret = 0;
2546
2547 ra = kzalloc(sizeof(*ra), GFP_NOFS);
2548 if (!ra)
2549 return -ENOMEM;
2550
2551 mutex_lock(&inode->i_mutex);
2552 first_index = start >> PAGE_CACHE_SHIFT;
2553 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
2554
2555 /* make sure the dirty trick played by the caller works */
2556 while (1) {
2557 ret = invalidate_inode_pages2_range(inode->i_mapping,
2558 first_index, last_index);
2559 if (ret != -EBUSY)
2560 break;
2561 schedule_timeout(HZ/10);
2562 }
2563 if (ret)
2564 goto out_unlock;
2565
2566 file_ra_state_init(ra, inode->i_mapping);
2567
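	/*
	 * read every page in the range, then mark it delalloc and
	 * dirty; writeback will rewrite the data through the pinned
	 * extent mapping installed by relocate_data_extent()
	 */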
2568 for (i = first_index; i <= last_index; i++) {
2569 if (total_read % ra->ra_pages == 0) {
2570 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
2571 min(last_index, ra->ra_pages + i - 1));
2572 }
2573 total_read++;
2574again:
2575 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
2576 BUG_ON(1);
2577 page = grab_cache_page(inode->i_mapping, i);
2578 if (!page) {
2579 ret = -ENOMEM;
2580 goto out_unlock;
2581 }
2582 if (!PageUptodate(page)) {
2583 btrfs_readpage(NULL, page);
2584 lock_page(page);
2585 if (!PageUptodate(page)) {
2586 unlock_page(page);
2587 page_cache_release(page);
2588 ret = -EIO;
2589 goto out_unlock;
2590 }
2591 }
2592 wait_on_page_writeback(page);
2593
2594 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2595 page_end = page_start + PAGE_CACHE_SIZE - 1;
2596 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2597
2598 ordered = btrfs_lookup_ordered_extent(inode, page_start);
2599 if (ordered) {
2600 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2601 unlock_page(page);
2602 page_cache_release(page);
2603 btrfs_start_ordered_extent(inode, ordered, 1);
2604 btrfs_put_ordered_extent(ordered);
2605 goto again;
2606 }
2607 set_page_extent_mapped(page);
2608
2609 if (i == first_index)
2610 set_extent_bits(io_tree, page_start, page_end,
2611 EXTENT_BOUNDARY, GFP_NOFS);
2612 btrfs_set_extent_delalloc(inode, page_start, page_end);
2613
2614 set_page_dirty(page);
2615 total_dirty++;
2616
2617 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2618 unlock_page(page);
2619 page_cache_release(page);
2620 }
2621out_unlock:
2622 mutex_unlock(&inode->i_mutex);
2623 kfree(ra);
2624 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
2625 return ret;
2626}
2627
2628static noinline_for_stack
2629int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
2630{
2631 struct btrfs_root *root = BTRFS_I(inode)->root;
2632 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2633 struct extent_map *em;
2634 u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
2635 u64 end = start + extent_key->offset - 1;
2636
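	/*
	 * index_cnt holds the block group start, so inside the reloc
	 * inode: file_offset = disk_bytenr - block_group_start (see
	 * create_reloc_inode() and btrfs_reloc_clone_csums())
	 */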
2637 em = alloc_extent_map(GFP_NOFS);
2638 em->start = start;
2639 em->len = extent_key->offset;
2640 em->block_len = extent_key->offset;
2641 em->block_start = extent_key->objectid;
2642 em->bdev = root->fs_info->fs_devices->latest_bdev;
2643 set_bit(EXTENT_FLAG_PINNED, &em->flags);
2644
2645 /* set up an extent map to cheat btrfs_readpage */
2646 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2647 while (1) {
2648 int ret;
2649 spin_lock(&em_tree->lock);
2650 ret = add_extent_mapping(em_tree, em);
2651 spin_unlock(&em_tree->lock);
2652 if (ret != -EEXIST) {
2653 free_extent_map(em);
2654 break;
2655 }
2656 btrfs_drop_extent_cache(inode, start, end, 0);
2657 }
2658 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2659
2660 return relocate_inode_pages(inode, start, extent_key->offset);
2661}
2662
2663#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2664static int get_ref_objectid_v0(struct reloc_control *rc,
2665 struct btrfs_path *path,
2666 struct btrfs_key *extent_key,
2667 u64 *ref_objectid, int *path_change)
2668{
2669 struct btrfs_key key;
2670 struct extent_buffer *leaf;
2671 struct btrfs_extent_ref_v0 *ref0;
2672 int ret;
2673 int slot;
2674
2675 leaf = path->nodes[0];
2676 slot = path->slots[0];
2677 while (1) {
2678 if (slot >= btrfs_header_nritems(leaf)) {
2679 ret = btrfs_next_leaf(rc->extent_root, path);
2680 if (ret < 0)
2681 return ret;
2682 BUG_ON(ret > 0);
2683 leaf = path->nodes[0];
2684 slot = path->slots[0];
2685 if (path_change)
2686 *path_change = 1;
2687 }
2688 btrfs_item_key_to_cpu(leaf, &key, slot);
2689 if (key.objectid != extent_key->objectid)
2690 return -ENOENT;
2691
2692 if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
2693 slot++;
2694 continue;
2695 }
2696 ref0 = btrfs_item_ptr(leaf, slot,
2697 struct btrfs_extent_ref_v0);
2698 *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
2699 break;
2700 }
2701 return 0;
2702}
2703#endif
2704
2705/*
2706 * helper to add a tree block to the list.
2707 * the major work is getting the generation and level of the block
2708 */
2709static int add_tree_block(struct reloc_control *rc,
2710 struct btrfs_key *extent_key,
2711 struct btrfs_path *path,
2712 struct rb_root *blocks)
2713{
2714 struct extent_buffer *eb;
2715 struct btrfs_extent_item *ei;
2716 struct btrfs_tree_block_info *bi;
2717 struct tree_block *block;
2718 struct rb_node *rb_node;
2719 u32 item_size;
2720 int level = -1;
2721 int generation;
2722
2723 eb = path->nodes[0];
2724 item_size = btrfs_item_size_nr(eb, path->slots[0]);
2725
2726 if (item_size >= sizeof(*ei) + sizeof(*bi)) {
2727 ei = btrfs_item_ptr(eb, path->slots[0],
2728 struct btrfs_extent_item);
2729 bi = (struct btrfs_tree_block_info *)(ei + 1);
2730 generation = btrfs_extent_generation(eb, ei);
2731 level = btrfs_tree_block_level(eb, bi);
2732 } else {
2733#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2734 u64 ref_owner;
2735 int ret;
2736
2737 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2738 ret = get_ref_objectid_v0(rc, path, extent_key,
2739 &ref_owner, NULL);
2740 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
2741 level = (int)ref_owner;
2742 /* FIXME: get real generation */
2743 generation = 0;
2744#else
2745 BUG();
2746#endif
2747 }
2748
2749 btrfs_release_path(rc->extent_root, path);
2750
2751 BUG_ON(level == -1);
2752
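	/*
	 * the real block key is unknown here; stash the blocksize in
	 * key.objectid and the generation in key.offset until
	 * get_tree_block_key() reads the block
	 */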
2753 block = kmalloc(sizeof(*block), GFP_NOFS);
2754 if (!block)
2755 return -ENOMEM;
2756
2757 block->bytenr = extent_key->objectid;
2758 block->key.objectid = extent_key->offset;
2759 block->key.offset = generation;
2760 block->level = level;
2761 block->key_ready = 0;
2762
2763 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2764 BUG_ON(rb_node);
2765
2766 return 0;
2767}
2768
2769/*
2770 * helper to add tree blocks for backrefs of type BTRFS_SHARED_DATA_REF_KEY
2771 */
2772static int __add_tree_block(struct reloc_control *rc,
2773 u64 bytenr, u32 blocksize,
2774 struct rb_root *blocks)
2775{
2776 struct btrfs_path *path;
2777 struct btrfs_key key;
2778 int ret;
2779
2780 if (tree_block_processed(bytenr, blocksize, rc))
2781 return 0;
2782
2783 if (tree_search(blocks, bytenr))
2784 return 0;
2785
2786 path = btrfs_alloc_path();
2787 if (!path)
2788 return -ENOMEM;
2789
2790 key.objectid = bytenr;
2791 key.type = BTRFS_EXTENT_ITEM_KEY;
2792 key.offset = blocksize;
2793
2794 path->search_commit_root = 1;
2795 path->skip_locking = 1;
2796 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
2797 if (ret < 0)
2798 goto out;
2799 BUG_ON(ret);
2800
2801 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2802 ret = add_tree_block(rc, &key, path, blocks);
2803out:
2804 btrfs_free_path(path);
2805 return ret;
2806}
2807
2808/*
2809 * helper to check if the block uses full backrefs for the pointers in it
2810 */
2811static int block_use_full_backref(struct reloc_control *rc,
2812 struct extent_buffer *eb)
2813{
2814 struct btrfs_path *path;
2815 struct btrfs_extent_item *ei;
2816 struct btrfs_key key;
2817 u64 flags;
2818 int ret;
2819
2820 if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
2821 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2822 return 1;
2823
2824 path = btrfs_alloc_path();
2825 BUG_ON(!path);
2826
2827 key.objectid = eb->start;
2828 key.type = BTRFS_EXTENT_ITEM_KEY;
2829 key.offset = eb->len;
2830
2831 path->search_commit_root = 1;
2832 path->skip_locking = 1;
2833 ret = btrfs_search_slot(NULL, rc->extent_root,
2834 &key, path, 0, 0);
2835 BUG_ON(ret);
2836
2837 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2838 struct btrfs_extent_item);
2839 flags = btrfs_extent_flags(path->nodes[0], ei);
2840 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2841 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2842 ret = 1;
2843 else
2844 ret = 0;
2845 btrfs_free_path(path);
2846 return ret;
2847}
2848
2849/*
2850 * helper to add tree blocks for backrefs of type BTRFS_EXTENT_DATA_REF_KEY.
2851 * this function scans the fs tree to find blocks that reference the data extent
2852 */
2853static int find_data_references(struct reloc_control *rc,
2854 struct btrfs_key *extent_key,
2855 struct extent_buffer *leaf,
2856 struct btrfs_extent_data_ref *ref,
2857 struct rb_root *blocks)
2858{
2859 struct btrfs_path *path;
2860 struct tree_block *block;
2861 struct btrfs_root *root;
2862 struct btrfs_file_extent_item *fi;
2863 struct rb_node *rb_node;
2864 struct btrfs_key key;
2865 u64 ref_root;
2866 u64 ref_objectid;
2867 u64 ref_offset;
2868 u32 ref_count;
2869 u32 nritems;
2870 int err = 0;
2871 int added = 0;
2872 int counted;
2873 int ret;
2874
2875 path = btrfs_alloc_path();
2876 if (!path)
2877 return -ENOMEM;
2878
2879 ref_root = btrfs_extent_data_ref_root(leaf, ref);
2880 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
2881 ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
2882 ref_count = btrfs_extent_data_ref_count(leaf, ref);
2883
2884 root = read_fs_root(rc->extent_root->fs_info, ref_root);
2885 if (IS_ERR(root)) {
2886 err = PTR_ERR(root);
2887 goto out;
2888 }
2889
2890 key.objectid = ref_objectid;
2891 key.offset = ref_offset;
2892 key.type = BTRFS_EXTENT_DATA_KEY;
2893
2894 path->search_commit_root = 1;
2895 path->skip_locking = 1;
2896 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2897 if (ret < 0) {
2898 err = ret;
2899 goto out;
2900 }
2901
2902 leaf = path->nodes[0];
2903 nritems = btrfs_header_nritems(leaf);
2904 /*
2905 * the references in tree blocks that use full backrefs
2906 * are not counted here
2907 */
2908 if (block_use_full_backref(rc, leaf))
2909 counted = 0;
2910 else
2911 counted = 1;
2912 rb_node = tree_search(blocks, leaf->start);
2913 if (rb_node) {
2914 if (counted)
2915 added = 1;
2916 else
2917 path->slots[0] = nritems;
2918 }
2919
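	/*
	 * scan the owner inode's file extents, decrementing ref_count
	 * for each reference to the extent, and queue each leaf that
	 * holds such a reference once.  leaves that use full backrefs
	 * are reached through SHARED_DATA_REF backrefs instead, so
	 * their references are not counted here
	 */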
2920 while (ref_count > 0) {
2921 while (path->slots[0] >= nritems) {
2922 ret = btrfs_next_leaf(root, path);
2923 if (ret < 0) {
2924 err = ret;
2925 goto out;
2926 }
2927 if (ret > 0) {
2928 WARN_ON(1);
2929 goto out;
2930 }
2931
2932 leaf = path->nodes[0];
2933 nritems = btrfs_header_nritems(leaf);
2934 added = 0;
2935
2936 if (block_use_full_backref(rc, leaf))
2937 counted = 0;
2938 else
2939 counted = 1;
2940 rb_node = tree_search(blocks, leaf->start);
2941 if (rb_node) {
2942 if (counted)
2943 added = 1;
2944 else
2945 path->slots[0] = nritems;
2946 }
2947 }
2948
2949 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2950 if (key.objectid != ref_objectid ||
2951 key.type != BTRFS_EXTENT_DATA_KEY) {
2952 WARN_ON(1);
2953 break;
2954 }
2955
2956 fi = btrfs_item_ptr(leaf, path->slots[0],
2957 struct btrfs_file_extent_item);
2958
2959 if (btrfs_file_extent_type(leaf, fi) ==
2960 BTRFS_FILE_EXTENT_INLINE)
2961 goto next;
2962
2963 if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
2964 extent_key->objectid)
2965 goto next;
2966
2967 key.offset -= btrfs_file_extent_offset(leaf, fi);
2968 if (key.offset != ref_offset)
2969 goto next;
2970
2971 if (counted)
2972 ref_count--;
2973 if (added)
2974 goto next;
2975
2976 if (!tree_block_processed(leaf->start, leaf->len, rc)) {
2977 block = kmalloc(sizeof(*block), GFP_NOFS);
2978 if (!block) {
2979 err = -ENOMEM;
2980 break;
2981 }
2982 block->bytenr = leaf->start;
2983 btrfs_item_key_to_cpu(leaf, &block->key, 0);
2984 block->level = 0;
2985 block->key_ready = 1;
2986 rb_node = tree_insert(blocks, block->bytenr,
2987 &block->rb_node);
2988 BUG_ON(rb_node);
2989 }
2990 if (counted)
2991 added = 1;
2992 else
2993 path->slots[0] = nritems;
2994next:
2995 path->slots[0]++;
2996
2997 }
2998out:
2999 btrfs_free_path(path);
3000 return err;
3001}
3002
3003/*
3004 * helper to find all tree blocks that reference a given data extent
3005 */
3006static noinline_for_stack
3007int add_data_references(struct reloc_control *rc,
3008 struct btrfs_key *extent_key,
3009 struct btrfs_path *path,
3010 struct rb_root *blocks)
3011{
3012 struct btrfs_key key;
3013 struct extent_buffer *eb;
3014 struct btrfs_extent_data_ref *dref;
3015 struct btrfs_extent_inline_ref *iref;
3016 unsigned long ptr;
3017 unsigned long end;
3018 u32 blocksize;
3019 int ret;
3020 int err = 0;
3021
3022 ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3023 extent_key->offset);
3024 BUG_ON(ret < 0);
3025 if (ret > 0) {
3026 /* the relocated data is fragmented */
3027 rc->extents_skipped++;
3028 btrfs_release_path(rc->extent_root, path);
3029 return 0;
3030 }
3031
3032 blocksize = btrfs_level_size(rc->extent_root, 0);
3033
3034 eb = path->nodes[0];
3035 ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3036 end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3037#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3038 if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3039 ptr = end;
3040 else
3041#endif
3042 ptr += sizeof(struct btrfs_extent_item);
3043
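	/* walk the inline backrefs first, then the keyed backref items */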
3044 while (ptr < end) {
3045 iref = (struct btrfs_extent_inline_ref *)ptr;
3046 key.type = btrfs_extent_inline_ref_type(eb, iref);
3047 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3048 key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3049 ret = __add_tree_block(rc, key.offset, blocksize,
3050 blocks);
3051 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3052 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3053 ret = find_data_references(rc, extent_key,
3054 eb, dref, blocks);
3055 } else {
3056 BUG();
3057 }
3058 ptr += btrfs_extent_inline_ref_size(key.type);
3059 }
3060 WARN_ON(ptr > end);
3061
3062 while (1) {
3063 cond_resched();
3064 eb = path->nodes[0];
3065 if (path->slots[0] >= btrfs_header_nritems(eb)) {
3066 ret = btrfs_next_leaf(rc->extent_root, path);
3067 if (ret < 0) {
3068 err = ret;
3069 break;
3070 }
3071 if (ret > 0)
3072 break;
3073 eb = path->nodes[0];
3074 }
3075
3076 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3077 if (key.objectid != extent_key->objectid)
3078 break;
3079
3080#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3081 if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3082 key.type == BTRFS_EXTENT_REF_V0_KEY) {
3083#else
3084 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3085 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3086#endif
3087 ret = __add_tree_block(rc, key.offset, blocksize,
3088 blocks);
3089 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3090 dref = btrfs_item_ptr(eb, path->slots[0],
3091 struct btrfs_extent_data_ref);
3092 ret = find_data_references(rc, extent_key,
3093 eb, dref, blocks);
3094 } else {
3095 ret = 0;
3096 }
3097 if (ret) {
3098 err = ret;
3099 break;
3100 }
3101 path->slots[0]++;
3102 }
3103 btrfs_release_path(rc->extent_root, path);
3104 if (err)
3105 free_block_list(blocks);
3106 return err;
3107}
3108
3109/*
3110 * helper to find the next unprocessed extent
3111 */
3112static noinline_for_stack
3113int find_next_extent(struct btrfs_trans_handle *trans,
3114 struct reloc_control *rc, struct btrfs_path *path)
3115{
3116 struct btrfs_key key;
3117 struct extent_buffer *leaf;
3118 u64 start, end, last;
3119 int ret;
3120
3121 last = rc->block_group->key.objectid + rc->block_group->key.offset;
3122 while (1) {
3123 cond_resched();
3124 if (rc->search_start >= last) {
3125 ret = 1;
3126 break;
3127 }
3128
3129 key.objectid = rc->search_start;
3130 key.type = BTRFS_EXTENT_ITEM_KEY;
3131 key.offset = 0;
3132
3133 path->search_commit_root = 1;
3134 path->skip_locking = 1;
3135 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3136 0, 0);
3137 if (ret < 0)
3138 break;
3139next:
3140 leaf = path->nodes[0];
3141 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3142 ret = btrfs_next_leaf(rc->extent_root, path);
3143 if (ret != 0)
3144 break;
3145 leaf = path->nodes[0];
3146 }
3147
3148 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3149 if (key.objectid >= last) {
3150 ret = 1;
3151 break;
3152 }
3153
3154 if (key.type != BTRFS_EXTENT_ITEM_KEY ||
3155 key.objectid + key.offset <= rc->search_start) {
3156 path->slots[0]++;
3157 goto next;
3158 }
3159
3160 ret = find_first_extent_bit(&rc->processed_blocks,
3161 key.objectid, &start, &end,
3162 EXTENT_DIRTY);
3163
3164 if (ret == 0 && start <= key.objectid) {
3165 btrfs_release_path(rc->extent_root, path);
3166 rc->search_start = end + 1;
3167 } else {
3168 rc->search_start = key.objectid + key.offset;
3169 return 0;
3170 }
3171 }
3172 btrfs_release_path(rc->extent_root, path);
3173 return ret;
3174}
3175
3176static void set_reloc_control(struct reloc_control *rc)
3177{
3178 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3179 mutex_lock(&fs_info->trans_mutex);
3180 fs_info->reloc_ctl = rc;
3181 mutex_unlock(&fs_info->trans_mutex);
3182}
3183
3184static void unset_reloc_control(struct reloc_control *rc)
3185{
3186 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3187 mutex_lock(&fs_info->trans_mutex);
3188 fs_info->reloc_ctl = NULL;
3189 mutex_unlock(&fs_info->trans_mutex);
3190}
3191
3192static int check_extent_flags(u64 flags)
3193{
3194 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3195 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3196 return 1;
3197 if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3198 !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3199 return 1;
3200 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3201 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3202 return 1;
3203 return 0;
3204}
3205
3206static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3207{
3208 struct rb_root blocks = RB_ROOT;
3209 struct btrfs_key key;
3210 struct btrfs_trans_handle *trans = NULL;
3211 struct btrfs_path *path;
3212 struct btrfs_extent_item *ei;
3213 unsigned long nr;
3214 u64 flags;
3215 u32 item_size;
3216 int ret;
3217 int err = 0;
3218
3219 path = btrfs_alloc_path();
3220 if (!path)
3221 return -ENOMEM;
3222
3223 rc->search_start = rc->block_group->key.objectid;
3224 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3225 GFP_NOFS);
3226
3227 rc->create_reloc_root = 1;
3228 set_reloc_control(rc);
3229
3230 trans = btrfs_start_transaction(rc->extent_root, 1);
3231 btrfs_commit_transaction(trans, rc->extent_root);
3232
3233 while (1) {
3234 trans = btrfs_start_transaction(rc->extent_root, 1);
3235
3236 ret = find_next_extent(trans, rc, path);
3237 if (ret < 0)
3238 err = ret;
3239 if (ret != 0)
3240 break;
3241
3242 rc->extents_found++;
3243
3244 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3245 struct btrfs_extent_item);
3246 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3247 item_size = btrfs_item_size_nr(path->nodes[0],
3248 path->slots[0]);
3249 if (item_size >= sizeof(*ei)) {
3250 flags = btrfs_extent_flags(path->nodes[0], ei);
3251 ret = check_extent_flags(flags);
3252 BUG_ON(ret);
3253
3254 } else {
3255#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3256 u64 ref_owner;
3257 int path_change = 0;
3258
3259 BUG_ON(item_size !=
3260 sizeof(struct btrfs_extent_item_v0));
3261 ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3262 &path_change);
3263 if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3264 flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3265 else
3266 flags = BTRFS_EXTENT_FLAG_DATA;
3267
3268 if (path_change) {
3269 btrfs_release_path(rc->extent_root, path);
3270
3271 path->search_commit_root = 1;
3272 path->skip_locking = 1;
3273 ret = btrfs_search_slot(NULL, rc->extent_root,
3274 &key, path, 0, 0);
3275 if (ret < 0) {
3276 err = ret;
3277 break;
3278 }
3279 BUG_ON(ret > 0);
3280 }
3281#else
3282 BUG();
3283#endif
3284 }
3285
3286 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3287 ret = add_tree_block(rc, &key, path, &blocks);
3288 } else if (rc->stage == UPDATE_DATA_PTRS &&
3289 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3290 ret = add_data_references(rc, &key, path, &blocks);
3291 } else {
3292 btrfs_release_path(rc->extent_root, path);
3293 ret = 0;
3294 }
3295 if (ret < 0) {
3296 err = ret;
3297 break;
3298 }
3299
3300 if (!RB_EMPTY_ROOT(&blocks)) {
3301 ret = relocate_tree_blocks(trans, rc, &blocks);
3302 if (ret < 0) {
3303 err = ret;
3304 break;
3305 }
3306 }
3307
3308 nr = trans->blocks_used;
3309 btrfs_end_transaction_throttle(trans, rc->extent_root);
3310 trans = NULL;
3311 btrfs_btree_balance_dirty(rc->extent_root, nr);
3312
3313 if (rc->stage == MOVE_DATA_EXTENTS &&
3314 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3315 rc->found_file_extent = 1;
3316 ret = relocate_data_extent(rc->data_inode, &key);
3317 if (ret < 0) {
3318 err = ret;
3319 break;
3320 }
3321 }
3322 }
3323 btrfs_free_path(path);
3324
3325 if (trans) {
3326 nr = trans->blocks_used;
3327 btrfs_end_transaction(trans, rc->extent_root);
3328 btrfs_btree_balance_dirty(rc->extent_root, nr);
3329 }
3330
3331 rc->create_reloc_root = 0;
3332 smp_mb();
3333
3334 if (rc->extents_found > 0) {
3335 trans = btrfs_start_transaction(rc->extent_root, 1);
3336 btrfs_commit_transaction(trans, rc->extent_root);
3337 }
3338
3339 merge_reloc_roots(rc);
3340
3341 unset_reloc_control(rc);
3342
3343 /* get rid of pinned extents */
3344 trans = btrfs_start_transaction(rc->extent_root, 1);
3345 btrfs_commit_transaction(trans, rc->extent_root);
3346
3347 return err;
3348}
3349
3350static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3351 struct btrfs_root *root,
3352 u64 objectid, u64 size)
3353{
3354 struct btrfs_path *path;
3355 struct btrfs_inode_item *item;
3356 struct extent_buffer *leaf;
3357 int ret;
3358
3359 path = btrfs_alloc_path();
3360 if (!path)
3361 return -ENOMEM;
3362
3363 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3364 if (ret)
3365 goto out;
3366
3367 leaf = path->nodes[0];
3368 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3369 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
3370 btrfs_set_inode_generation(leaf, item, 1);
3371 btrfs_set_inode_size(leaf, item, size);
3372 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3373 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
3374 btrfs_mark_buffer_dirty(leaf);
3375 btrfs_release_path(root, path);
3376out:
3377 btrfs_free_path(path);
3378 return ret;
3379}
3380
3381/*
3382 * helper to create an inode for data relocation.
3383 * the inode lives in the data relocation tree and its link count is 0
3384 */
3385static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3386 struct btrfs_block_group_cache *group)
3387{
3388 struct inode *inode = NULL;
3389 struct btrfs_trans_handle *trans;
3390 struct btrfs_root *root;
3391 struct btrfs_key key;
3392 unsigned long nr;
3393 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3394 int err = 0;
3395
3396 root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
3397 if (IS_ERR(root))
3398 return ERR_CAST(root);
3399
3400 trans = btrfs_start_transaction(root, 1);
3401 BUG_ON(!trans);
3402
3403 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3404 if (err)
3405 goto out;
3406
3407 err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
3408 BUG_ON(err);
3409
3410 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
3411 group->key.offset, 0, group->key.offset,
3412 0, 0, 0);
3413 BUG_ON(err);
3414
3415 key.objectid = objectid;
3416 key.type = BTRFS_INODE_ITEM_KEY;
3417 key.offset = 0;
3418 inode = btrfs_iget(root->fs_info->sb, &key, root);
3419 BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3420 BTRFS_I(inode)->index_cnt = group->key.objectid;
3421
3422 err = btrfs_orphan_add(trans, inode);
3423out:
3424 nr = trans->blocks_used;
3425 btrfs_end_transaction(trans, root);
3426
3427 btrfs_btree_balance_dirty(root, nr);
3428 if (err) {
3429 if (inode)
3430 iput(inode);
3431 inode = ERR_PTR(err);
3432 }
3433 return inode;
3434}
3435
3436/*
3437 * function to relocate all extents in a block group.
3438 */
3439int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3440{
3441 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3442 struct reloc_control *rc;
3443 int ret;
3444 int err = 0;
3445
3446 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3447 if (!rc)
3448 return -ENOMEM;
3449
3450 mapping_tree_init(&rc->reloc_root_tree);
3451 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3452 INIT_LIST_HEAD(&rc->reloc_roots);
3453
3454 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3455 BUG_ON(!rc->block_group);
3456
3457 btrfs_init_workers(&rc->workers, "relocate",
3458 fs_info->thread_pool_size);
3459
3460 rc->extent_root = extent_root;
3461 btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
3462
3463 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3464 if (IS_ERR(rc->data_inode)) {
3465 err = PTR_ERR(rc->data_inode);
3466 rc->data_inode = NULL;
3467 goto out;
3468 }
3469
3470 printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
3471 (unsigned long long)rc->block_group->key.objectid,
3472 (unsigned long long)rc->block_group->flags);
3473
3474 btrfs_start_delalloc_inodes(fs_info->tree_root);
3475 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
3476
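	/*
	 * relocation alternates between two stages: MOVE_DATA_EXTENTS
	 * copies data into the reloc inode, UPDATE_DATA_PTRS makes
	 * file extent pointers reference the new copies.  the loop
	 * repeats until no unprocessed extents remain in the group
	 */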
3477 while (1) {
3478 mutex_lock(&fs_info->cleaner_mutex);
3479 btrfs_clean_old_snapshots(fs_info->tree_root);
3480 mutex_unlock(&fs_info->cleaner_mutex);
3481
3482 rc->extents_found = 0;
3483 rc->extents_skipped = 0;
3484
3485 ret = relocate_block_group(rc);
3486 if (ret < 0) {
3487 err = ret;
3488 break;
3489 }
3490
3491 if (rc->extents_found == 0)
3492 break;
3493
3494 printk(KERN_INFO "btrfs: found %llu extents\n",
3495 (unsigned long long)rc->extents_found);
3496
3497 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
3498 btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
3499 invalidate_mapping_pages(rc->data_inode->i_mapping,
3500 0, -1);
3501 rc->stage = UPDATE_DATA_PTRS;
3502 } else if (rc->stage == UPDATE_DATA_PTRS &&
3503 rc->extents_skipped >= rc->extents_found) {
3504 iput(rc->data_inode);
3505 rc->data_inode = create_reloc_inode(fs_info,
3506 rc->block_group);
3507 if (IS_ERR(rc->data_inode)) {
3508 err = PTR_ERR(rc->data_inode);
3509 rc->data_inode = NULL;
3510 break;
3511 }
3512 rc->stage = MOVE_DATA_EXTENTS;
3513 rc->found_file_extent = 0;
3514 }
3515 }
3516
3517 filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
3518 rc->block_group->key.objectid,
3519 rc->block_group->key.objectid +
3520 rc->block_group->key.offset - 1);
3521
3522 WARN_ON(rc->block_group->pinned > 0);
3523 WARN_ON(rc->block_group->reserved > 0);
3524 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3525out:
3526 iput(rc->data_inode);
3527 btrfs_stop_workers(&rc->workers);
3528 btrfs_put_block_group(rc->block_group);
3529 kfree(rc);
3530 return err;
3531}
3532
3533/*
3534 * recover relocation interrupted by a system crash.
3535 *
3536 * this function resumes merging reloc trees with their corresponding fs trees.
3537 * this is important for preserving the sharing of tree blocks
3538 */
3539int btrfs_recover_relocation(struct btrfs_root *root)
3540{
3541 LIST_HEAD(reloc_roots);
3542 struct btrfs_key key;
3543 struct btrfs_root *fs_root;
3544 struct btrfs_root *reloc_root;
3545 struct btrfs_path *path;
3546 struct extent_buffer *leaf;
3547 struct reloc_control *rc = NULL;
3548 struct btrfs_trans_handle *trans;
3549 int ret;
3550 int err = 0;
3551
3552 path = btrfs_alloc_path();
3553 if (!path)
3554 return -ENOMEM;
3555
3556 key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3557 key.type = BTRFS_ROOT_ITEM_KEY;
3558 key.offset = (u64)-1;
3559
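	/*
	 * walk the reloc tree root items from the highest key offset
	 * down; each item's offset is the objectid of the fs tree the
	 * reloc tree belongs to
	 */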
3560 while (1) {
3561 ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
3562 path, 0, 0);
3563 if (ret < 0) {
3564 err = ret;
3565 goto out;
3566 }
3567 if (ret > 0) {
3568 if (path->slots[0] == 0)
3569 break;
3570 path->slots[0]--;
3571 }
3572 leaf = path->nodes[0];
3573 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3574 btrfs_release_path(root->fs_info->tree_root, path);
3575
3576 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
3577 key.type != BTRFS_ROOT_ITEM_KEY)
3578 break;
3579
3580 reloc_root = btrfs_read_fs_root_no_radix(root, &key);
3581 if (IS_ERR(reloc_root)) {
3582 err = PTR_ERR(reloc_root);
3583 goto out;
3584 }
3585
3586 list_add(&reloc_root->root_list, &reloc_roots);
3587
3588 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
3589 fs_root = read_fs_root(root->fs_info,
3590 reloc_root->root_key.offset);
3591 if (IS_ERR(fs_root)) {
3592 err = PTR_ERR(fs_root);
3593 goto out;
3594 }
3595 }
3596
3597 if (key.offset == 0)
3598 break;
3599
3600 key.offset--;
3601 }
3602 btrfs_release_path(root->fs_info->tree_root, path);
3603
3604 if (list_empty(&reloc_roots))
3605 goto out;
3606
3607 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3608 if (!rc) {
3609 err = -ENOMEM;
3610 goto out;
3611 }
3612
3613 mapping_tree_init(&rc->reloc_root_tree);
3614 INIT_LIST_HEAD(&rc->reloc_roots);
3615 btrfs_init_workers(&rc->workers, "relocate",
3616 root->fs_info->thread_pool_size);
3617 rc->extent_root = root->fs_info->extent_root;
3618
3619 set_reloc_control(rc);
3620
3621 while (!list_empty(&reloc_roots)) {
3622 reloc_root = list_entry(reloc_roots.next,
3623 struct btrfs_root, root_list);
3624 list_del(&reloc_root->root_list);
3625
3626 if (btrfs_root_refs(&reloc_root->root_item) == 0) {
3627 list_add_tail(&reloc_root->root_list,
3628 &rc->reloc_roots);
3629 continue;
3630 }
3631
3632 fs_root = read_fs_root(root->fs_info,
3633 reloc_root->root_key.offset);
3634 BUG_ON(IS_ERR(fs_root));
3635
3636 __add_reloc_root(reloc_root);
3637 fs_root->reloc_root = reloc_root;
3638 }
3639
3640 trans = btrfs_start_transaction(rc->extent_root, 1);
3641 btrfs_commit_transaction(trans, rc->extent_root);
3642
3643 merge_reloc_roots(rc);
3644
3645 unset_reloc_control(rc);
3646
3647 trans = btrfs_start_transaction(rc->extent_root, 1);
3648 btrfs_commit_transaction(trans, rc->extent_root);
3649out:
3650 if (rc) {
3651 btrfs_stop_workers(&rc->workers);
3652 kfree(rc);
3653 }
3654 while (!list_empty(&reloc_roots)) {
3655 reloc_root = list_entry(reloc_roots.next,
3656 struct btrfs_root, root_list);
3657 list_del(&reloc_root->root_list);
3658 free_extent_buffer(reloc_root->node);
3659 free_extent_buffer(reloc_root->commit_root);
3660 kfree(reloc_root);
3661 }
3662 btrfs_free_path(path);
3663
3664 if (err == 0) {
3665 /* cleanup orphan inode in data relocation tree */
3666 fs_root = read_fs_root(root->fs_info,
3667 BTRFS_DATA_RELOC_TREE_OBJECTID);
3668 if (IS_ERR(fs_root))
3669 err = PTR_ERR(fs_root);
3670 }
3671 return err;
3672}
3673
3674/*
3675 * helper to add ordered checksums for data relocation.
3676 *
3677 * cloning the checksums properly handles nodatasum extents.
3678 * it also saves the CPU time of re-calculating the checksums.
3679 */
3680int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3681{
3682 struct btrfs_ordered_sum *sums;
3683 struct btrfs_sector_sum *sector_sum;
3684 struct btrfs_ordered_extent *ordered;
3685 struct btrfs_root *root = BTRFS_I(inode)->root;
3686 size_t offset;
3687 int ret;
3688 u64 disk_bytenr;
3689 LIST_HEAD(list);
3690
3691 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
3692 BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
3693
3694 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
3695 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
3696 disk_bytenr + len - 1, &list);
3697
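	/*
	 * shift each checksum's bytenr from the old data location to
	 * the new location inside the ordered extent
	 */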
3698 while (!list_empty(&list)) {
3699 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
3700 list_del_init(&sums->list);
3701
3702 sector_sum = sums->sums;
3703 sums->bytenr = ordered->start;
3704
3705 offset = 0;
3706 while (offset < sums->len) {
3707 sector_sum->bytenr += ordered->start - disk_bytenr;
3708 sector_sum++;
3709 offset += root->sectorsize;
3710 }
3711
3712 btrfs_add_ordered_sum(inode, ordered, sums);
3713 }
3714 btrfs_put_ordered_extent(ordered);
3715 return 0;
3716}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index b48650de4472..0ddc6d61c55a 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -111,6 +111,15 @@ out:
111 return ret; 111 return ret;
112} 112}
113 113
114int btrfs_set_root_node(struct btrfs_root_item *item,
115 struct extent_buffer *node)
116{
117 btrfs_set_root_bytenr(item, node->start);
118 btrfs_set_root_level(item, btrfs_header_level(node));
119 btrfs_set_root_generation(item, btrfs_header_generation(node));
120 return 0;
121}
122
114/* 123/*
115 * copy the data in 'item' into the btree 124 * copy the data in 'item' into the btree
116 */ 125 */
@@ -164,8 +173,7 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
164 * offset lower than the latest root. They need to be queued for deletion to 173 * offset lower than the latest root. They need to be queued for deletion to
165 * finish what was happening when we crashed. 174 * finish what was happening when we crashed.
166 */ 175 */
167int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, 176int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
168 struct btrfs_root *latest)
169{ 177{
170 struct btrfs_root *dead_root; 178 struct btrfs_root *dead_root;
171 struct btrfs_item *item; 179 struct btrfs_item *item;
@@ -227,10 +235,7 @@ again:
227 goto err; 235 goto err;
228 } 236 }
229 237
230 if (objectid == BTRFS_TREE_RELOC_OBJECTID) 238 ret = btrfs_add_dead_root(dead_root);
231 ret = btrfs_add_dead_reloc_root(dead_root);
232 else
233 ret = btrfs_add_dead_root(dead_root, latest);
234 if (ret) 239 if (ret)
235 goto err; 240 goto err;
236 goto again; 241 goto again;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2ff7cd2db25f..6d6d06cb6dfc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/seq_file.h> 27#include <linux/seq_file.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
31#include <linux/mount.h> 30#include <linux/mount.h>
32#include <linux/mpage.h> 31#include <linux/mpage.h>
@@ -52,7 +51,6 @@
52#include "export.h" 51#include "export.h"
53#include "compression.h" 52#include "compression.h"
54 53
55
56static struct super_operations btrfs_super_ops; 54static struct super_operations btrfs_super_ops;
57 55
58static void btrfs_put_super(struct super_block *sb) 56static void btrfs_put_super(struct super_block *sb)
@@ -67,8 +65,8 @@ static void btrfs_put_super(struct super_block *sb)
67enum { 65enum {
68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 66 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 67 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, 68 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
71 Opt_ratio, Opt_flushoncommit, Opt_err, 69 Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err,
72}; 70};
73 71
74static match_table_t tokens = { 72static match_table_t tokens = {
@@ -84,6 +82,8 @@ static match_table_t tokens = {
84 {Opt_thread_pool, "thread_pool=%d"}, 82 {Opt_thread_pool, "thread_pool=%d"},
85 {Opt_compress, "compress"}, 83 {Opt_compress, "compress"},
86 {Opt_ssd, "ssd"}, 84 {Opt_ssd, "ssd"},
85 {Opt_ssd_spread, "ssd_spread"},
86 {Opt_nossd, "nossd"},
87 {Opt_noacl, "noacl"}, 87 {Opt_noacl, "noacl"},
88 {Opt_notreelog, "notreelog"}, 88 {Opt_notreelog, "notreelog"},
89 {Opt_flushoncommit, "flushoncommit"}, 89 {Opt_flushoncommit, "flushoncommit"},
@@ -158,7 +158,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
158 */ 158 */
159 break; 159 break;
160 case Opt_nodatasum: 160 case Opt_nodatasum:
161 printk(KERN_INFO "btrfs: setting nodatacsum\n"); 161 printk(KERN_INFO "btrfs: setting nodatasum\n");
162 btrfs_set_opt(info->mount_opt, NODATASUM); 162 btrfs_set_opt(info->mount_opt, NODATASUM);
163 break; 163 break;
164 case Opt_nodatacow: 164 case Opt_nodatacow:
@@ -174,6 +174,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
174 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 174 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
175 btrfs_set_opt(info->mount_opt, SSD); 175 btrfs_set_opt(info->mount_opt, SSD);
176 break; 176 break;
177 case Opt_ssd_spread:
178 printk(KERN_INFO "btrfs: use spread ssd "
179 "allocation scheme\n");
180 btrfs_set_opt(info->mount_opt, SSD);
181 btrfs_set_opt(info->mount_opt, SSD_SPREAD);
182 break;
183 case Opt_nossd:
184 printk(KERN_INFO "btrfs: not using ssd allocation "
185 "scheme\n");
186 btrfs_set_opt(info->mount_opt, NOSSD);
187 btrfs_clear_opt(info->mount_opt, SSD);
188 btrfs_clear_opt(info->mount_opt, SSD_SPREAD);
189 break;
177 case Opt_nobarrier: 190 case Opt_nobarrier:
178 printk(KERN_INFO "btrfs: turning off barriers\n"); 191 printk(KERN_INFO "btrfs: turning off barriers\n");
179 btrfs_set_opt(info->mount_opt, NOBARRIER); 192 btrfs_set_opt(info->mount_opt, NOBARRIER);
@@ -322,7 +335,7 @@ static int btrfs_fill_super(struct super_block *sb,
322 struct dentry *root_dentry; 335 struct dentry *root_dentry;
323 struct btrfs_super_block *disk_super; 336 struct btrfs_super_block *disk_super;
324 struct btrfs_root *tree_root; 337 struct btrfs_root *tree_root;
325 struct btrfs_inode *bi; 338 struct btrfs_key key;
326 int err; 339 int err;
327 340
328 sb->s_maxbytes = MAX_LFS_FILESIZE; 341 sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -341,23 +354,15 @@ static int btrfs_fill_super(struct super_block *sb,
341 } 354 }
342 sb->s_fs_info = tree_root; 355 sb->s_fs_info = tree_root;
343 disk_super = &tree_root->fs_info->super_copy; 356 disk_super = &tree_root->fs_info->super_copy;
344 inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID,
345 tree_root->fs_info->fs_root);
346 bi = BTRFS_I(inode);
347 bi->location.objectid = inode->i_ino;
348 bi->location.offset = 0;
349 bi->root = tree_root->fs_info->fs_root;
350
351 btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
352 357
353 if (!inode) { 358 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
354 err = -ENOMEM; 359 key.type = BTRFS_INODE_ITEM_KEY;
360 key.offset = 0;
361 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root);
362 if (IS_ERR(inode)) {
363 err = PTR_ERR(inode);
355 goto fail_close; 364 goto fail_close;
356 } 365 }
357 if (inode->i_state & I_NEW) {
358 btrfs_read_locked_inode(inode);
359 unlock_new_inode(inode);
360 }
361 366
362 root_dentry = d_alloc_root(inode); 367 root_dentry = d_alloc_root(inode);
363 if (!root_dentry) { 368 if (!root_dentry) {
@@ -388,10 +393,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
388 struct btrfs_root *root = btrfs_sb(sb); 393 struct btrfs_root *root = btrfs_sb(sb);
389 int ret; 394 int ret;
390 395
391 if (sb->s_flags & MS_RDONLY)
392 return 0;
393
394 sb->s_dirt = 0;
395 if (!wait) { 396 if (!wait) {
396 filemap_flush(root->fs_info->btree_inode->i_mapping); 397 filemap_flush(root->fs_info->btree_inode->i_mapping);
397 return 0; 398 return 0;
@@ -402,7 +403,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
402 403
403 trans = btrfs_start_transaction(root, 1); 404 trans = btrfs_start_transaction(root, 1);
404 ret = btrfs_commit_transaction(trans, root); 405 ret = btrfs_commit_transaction(trans, root);
405 sb->s_dirt = 0;
406 return ret; 406 return ret;
407} 407}
408 408
@@ -433,7 +433,11 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
433 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); 433 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
434 if (btrfs_test_opt(root, COMPRESS)) 434 if (btrfs_test_opt(root, COMPRESS))
435 seq_puts(seq, ",compress"); 435 seq_puts(seq, ",compress");
436 if (btrfs_test_opt(root, SSD)) 436 if (btrfs_test_opt(root, NOSSD))
437 seq_puts(seq, ",nossd");
438 if (btrfs_test_opt(root, SSD_SPREAD))
439 seq_puts(seq, ",ssd_spread");
440 else if (btrfs_test_opt(root, SSD))
437 seq_puts(seq, ",ssd"); 441 seq_puts(seq, ",ssd");
438 if (btrfs_test_opt(root, NOTREELOG)) 442 if (btrfs_test_opt(root, NOTREELOG))
439 seq_puts(seq, ",notreelog"); 443 seq_puts(seq, ",notreelog");
@@ -444,11 +448,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
444 return 0; 448 return 0;
445} 449}
446 450
447static void btrfs_write_super(struct super_block *sb)
448{
449 sb->s_dirt = 0;
450}
451
452static int btrfs_test_super(struct super_block *s, void *data) 451static int btrfs_test_super(struct super_block *s, void *data)
453{ 452{
454 struct btrfs_fs_devices *test_fs_devices = data; 453 struct btrfs_fs_devices *test_fs_devices = data;
@@ -584,7 +583,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
584 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 583 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
585 return -EINVAL; 584 return -EINVAL;
586 585
587 ret = btrfs_cleanup_reloc_trees(root); 586 /* recover relocation */
587 ret = btrfs_recover_relocation(root);
588 WARN_ON(ret); 588 WARN_ON(ret);
589 589
590 ret = btrfs_cleanup_fs_roots(root->fs_info); 590 ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -678,7 +678,6 @@ static int btrfs_unfreeze(struct super_block *sb)
678static struct super_operations btrfs_super_ops = { 678static struct super_operations btrfs_super_ops = {
679 .delete_inode = btrfs_delete_inode, 679 .delete_inode = btrfs_delete_inode,
680 .put_super = btrfs_put_super, 680 .put_super = btrfs_put_super,
681 .write_super = btrfs_write_super,
682 .sync_fs = btrfs_sync_fs, 681 .sync_fs = btrfs_sync_fs,
683 .show_options = btrfs_show_options, 682 .show_options = btrfs_show_options,
684 .write_inode = btrfs_write_inode, 683 .write_inode = btrfs_write_inode,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 01b143605ec1..cdbb5022da52 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -25,7 +25,6 @@
25#include "disk-io.h" 25#include "disk-io.h"
26#include "transaction.h" 26#include "transaction.h"
27#include "locking.h" 27#include "locking.h"
28#include "ref-cache.h"
29#include "tree-log.h" 28#include "tree-log.h"
30 29
31#define BTRFS_ROOT_TRANS_TAG 0 30#define BTRFS_ROOT_TRANS_TAG 0
@@ -41,6 +40,12 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
41 } 40 }
42} 41}
43 42
43static noinline void switch_commit_root(struct btrfs_root *root)
44{
45 free_extent_buffer(root->commit_root);
46 root->commit_root = btrfs_root_node(root);
47}
48
44/* 49/*
45 * either allocate a new transaction or hop into the existing one 50 * either allocate a new transaction or hop into the existing one
46 */ 51 */
@@ -94,45 +99,37 @@ static noinline int join_transaction(struct btrfs_root *root)
94 * to make sure the old root from before we joined the transaction is deleted 99 * to make sure the old root from before we joined the transaction is deleted
95 * when the transaction commits 100 * when the transaction commits
96 */ 101 */
97noinline int btrfs_record_root_in_trans(struct btrfs_root *root) 102static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root)
98{ 104{
99 struct btrfs_dirty_root *dirty; 105 if (root->ref_cows && root->last_trans < trans->transid) {
100 u64 running_trans_id = root->fs_info->running_transaction->transid;
101 if (root->ref_cows && root->last_trans < running_trans_id) {
102 WARN_ON(root == root->fs_info->extent_root); 106 WARN_ON(root == root->fs_info->extent_root);
103 if (root->root_item.refs != 0) { 107 WARN_ON(root->root_item.refs == 0);
104 radix_tree_tag_set(&root->fs_info->fs_roots_radix, 108 WARN_ON(root->commit_root != root->node);
105 (unsigned long)root->root_key.objectid, 109
106 BTRFS_ROOT_TRANS_TAG); 110 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
107 111 (unsigned long)root->root_key.objectid,
108 dirty = kmalloc(sizeof(*dirty), GFP_NOFS); 112 BTRFS_ROOT_TRANS_TAG);
109 BUG_ON(!dirty); 113 root->last_trans = trans->transid;
110 dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); 114 btrfs_init_reloc_root(trans, root);
111 BUG_ON(!dirty->root); 115 }
112 dirty->latest_root = root; 116 return 0;
113 INIT_LIST_HEAD(&dirty->list); 117}
114 118
115 root->commit_root = btrfs_root_node(root); 119int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
116 120 struct btrfs_root *root)
117 memcpy(dirty->root, root, sizeof(*root)); 121{
118 spin_lock_init(&dirty->root->node_lock); 122 if (!root->ref_cows)
119 spin_lock_init(&dirty->root->list_lock); 123 return 0;
120 mutex_init(&dirty->root->objectid_mutex); 124
121 mutex_init(&dirty->root->log_mutex); 125 mutex_lock(&root->fs_info->trans_mutex);
122 INIT_LIST_HEAD(&dirty->root->dead_list); 126 if (root->last_trans == trans->transid) {
123 dirty->root->node = root->commit_root; 127 mutex_unlock(&root->fs_info->trans_mutex);
124 dirty->root->commit_root = NULL; 128 return 0;
125
126 spin_lock(&root->list_lock);
127 list_add(&dirty->root->dead_list, &root->dead_list);
128 spin_unlock(&root->list_lock);
129
130 root->dirty_root = dirty;
131 } else {
132 WARN_ON(1);
133 }
134 root->last_trans = running_trans_id;
135 } 129 }
130
131 record_root_in_trans(trans, root);
132 mutex_unlock(&root->fs_info->trans_mutex);
136 return 0; 133 return 0;
137} 134}
138 135
@@ -181,7 +178,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
181 ret = join_transaction(root); 178 ret = join_transaction(root);
182 BUG_ON(ret); 179 BUG_ON(ret);
183 180
184 btrfs_record_root_in_trans(root);
185 h->transid = root->fs_info->running_transaction->transid; 181 h->transid = root->fs_info->running_transaction->transid;
186 h->transaction = root->fs_info->running_transaction; 182 h->transaction = root->fs_info->running_transaction;
187 h->blocks_reserved = num_blocks; 183 h->blocks_reserved = num_blocks;
@@ -192,6 +188,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
192 h->delayed_ref_updates = 0; 188 h->delayed_ref_updates = 0;
193 189
194 root->fs_info->running_transaction->use_count++; 190 root->fs_info->running_transaction->use_count++;
191 record_root_in_trans(h, root);
195 mutex_unlock(&root->fs_info->trans_mutex); 192 mutex_unlock(&root->fs_info->trans_mutex);
196 return h; 193 return h;
197} 194}
@@ -233,6 +230,7 @@ static noinline int wait_for_commit(struct btrfs_root *root,
233 return 0; 230 return 0;
234} 231}
235 232
233#if 0
236/* 234/*
237 * rate limit against the drop_snapshot code. This helps to slow down new 235 * rate limit against the drop_snapshot code. This helps to slow down new
238 * operations if the drop_snapshot code isn't able to keep up. 236 * operations if the drop_snapshot code isn't able to keep up.
@@ -273,6 +271,7 @@ harder:
273 goto harder; 271 goto harder;
274 } 272 }
275} 273}
274#endif
276 275
277void btrfs_throttle(struct btrfs_root *root) 276void btrfs_throttle(struct btrfs_root *root)
278{ 277{
@@ -280,7 +279,6 @@ void btrfs_throttle(struct btrfs_root *root)
280 if (!root->fs_info->open_ioctl_trans) 279 if (!root->fs_info->open_ioctl_trans)
281 wait_current_trans(root); 280 wait_current_trans(root);
282 mutex_unlock(&root->fs_info->trans_mutex); 281 mutex_unlock(&root->fs_info->trans_mutex);
283 throttle_on_drops(root);
284} 282}
285 283
286static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 284static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -323,9 +321,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
323 memset(trans, 0, sizeof(*trans)); 321 memset(trans, 0, sizeof(*trans));
324 kmem_cache_free(btrfs_trans_handle_cachep, trans); 322 kmem_cache_free(btrfs_trans_handle_cachep, trans);
325 323
326 if (throttle)
327 throttle_on_drops(root);
328
329 return 0; 324 return 0;
330} 325}
331 326
@@ -455,36 +450,32 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
455 450
456 btrfs_write_dirty_block_groups(trans, root); 451 btrfs_write_dirty_block_groups(trans, root);
457 452
458 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
459 BUG_ON(ret);
460
461 while (1) { 453 while (1) {
462 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 454 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
463 if (old_root_bytenr == root->node->start) 455 if (old_root_bytenr == root->node->start)
464 break; 456 break;
465 btrfs_set_root_bytenr(&root->root_item,
466 root->node->start);
467 btrfs_set_root_level(&root->root_item,
468 btrfs_header_level(root->node));
469 btrfs_set_root_generation(&root->root_item, trans->transid);
470 457
458 btrfs_set_root_node(&root->root_item, root->node);
471 ret = btrfs_update_root(trans, tree_root, 459 ret = btrfs_update_root(trans, tree_root,
472 &root->root_key, 460 &root->root_key,
473 &root->root_item); 461 &root->root_item);
474 BUG_ON(ret); 462 BUG_ON(ret);
475 btrfs_write_dirty_block_groups(trans, root);
476 463
477 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 464 ret = btrfs_write_dirty_block_groups(trans, root);
478 BUG_ON(ret); 465 BUG_ON(ret);
479 } 466 }
467
468 if (root != root->fs_info->extent_root)
469 switch_commit_root(root);
470
480 return 0; 471 return 0;
481} 472}
482 473
483/* 474/*
484 * update all the cowonly tree roots on disk 475 * update all the cowonly tree roots on disk
485 */ 476 */
486int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 477static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
487 struct btrfs_root *root) 478 struct btrfs_root *root)
488{ 479{
489 struct btrfs_fs_info *fs_info = root->fs_info; 480 struct btrfs_fs_info *fs_info = root->fs_info;
490 struct list_head *next; 481 struct list_head *next;
@@ -508,10 +499,12 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
508 root = list_entry(next, struct btrfs_root, dirty_list); 499 root = list_entry(next, struct btrfs_root, dirty_list);
509 500
510 update_cowonly_root(trans, root); 501 update_cowonly_root(trans, root);
511
512 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
513 BUG_ON(ret);
514 } 502 }
503
504 down_write(&fs_info->extent_commit_sem);
505 switch_commit_root(fs_info->extent_root);
506 up_write(&fs_info->extent_commit_sem);
507
515 return 0; 508 return 0;
516} 509}
517 510
@@ -520,118 +513,53 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
520 * a dirty root struct and adds it into the list of dead roots that need to 513 * a dirty root struct and adds it into the list of dead roots that need to
521 * be deleted 514 * be deleted
522 */ 515 */
523int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) 516int btrfs_add_dead_root(struct btrfs_root *root)
524{ 517{
525 struct btrfs_dirty_root *dirty;
526
527 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
528 if (!dirty)
529 return -ENOMEM;
530 dirty->root = root;
531 dirty->latest_root = latest;
532
533 mutex_lock(&root->fs_info->trans_mutex); 518 mutex_lock(&root->fs_info->trans_mutex);
534 list_add(&dirty->list, &latest->fs_info->dead_roots); 519 list_add(&root->root_list, &root->fs_info->dead_roots);
535 mutex_unlock(&root->fs_info->trans_mutex); 520 mutex_unlock(&root->fs_info->trans_mutex);
536 return 0; 521 return 0;
537} 522}
538 523
539/* 524/*
540 * at transaction commit time we need to schedule the old roots for 525 * update all the dirty fs tree roots on disk
541 * deletion via btrfs_drop_snapshot. This runs through all the
542 * reference counted roots that were modified in the current
543 * transaction and puts them into the drop list
544 */ 526 */
545static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, 527static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
546 struct radix_tree_root *radix, 528 struct btrfs_root *root)
547 struct list_head *list)
548{ 529{
549 struct btrfs_dirty_root *dirty;
550 struct btrfs_root *gang[8]; 530 struct btrfs_root *gang[8];
551 struct btrfs_root *root; 531 struct btrfs_fs_info *fs_info = root->fs_info;
552 int i; 532 int i;
553 int ret; 533 int ret;
554 int err = 0; 534 int err = 0;
555 u32 refs;
556 535
557 while (1) { 536 while (1) {
558 ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0, 537 ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
538 (void **)gang, 0,
559 ARRAY_SIZE(gang), 539 ARRAY_SIZE(gang),
560 BTRFS_ROOT_TRANS_TAG); 540 BTRFS_ROOT_TRANS_TAG);
561 if (ret == 0) 541 if (ret == 0)
562 break; 542 break;
563 for (i = 0; i < ret; i++) { 543 for (i = 0; i < ret; i++) {
564 root = gang[i]; 544 root = gang[i];
565 radix_tree_tag_clear(radix, 545 radix_tree_tag_clear(&fs_info->fs_roots_radix,
566 (unsigned long)root->root_key.objectid, 546 (unsigned long)root->root_key.objectid,
567 BTRFS_ROOT_TRANS_TAG); 547 BTRFS_ROOT_TRANS_TAG);
568
569 BUG_ON(!root->ref_tree);
570 dirty = root->dirty_root;
571 548
572 btrfs_free_log(trans, root); 549 btrfs_free_log(trans, root);
573 btrfs_free_reloc_root(trans, root); 550 btrfs_update_reloc_root(trans, root);
574
575 if (root->commit_root == root->node) {
576 WARN_ON(root->node->start !=
577 btrfs_root_bytenr(&root->root_item));
578
579 free_extent_buffer(root->commit_root);
580 root->commit_root = NULL;
581 root->dirty_root = NULL;
582
583 spin_lock(&root->list_lock);
584 list_del_init(&dirty->root->dead_list);
585 spin_unlock(&root->list_lock);
586
587 kfree(dirty->root);
588 kfree(dirty);
589 551
590 /* make sure to update the root on disk 552 if (root->commit_root != root->node) {
591 * so we get any updates to the block used 553 switch_commit_root(root);
592 * counts 554 btrfs_set_root_node(&root->root_item,
593 */ 555 root->node);
594 err = btrfs_update_root(trans,
595 root->fs_info->tree_root,
596 &root->root_key,
597 &root->root_item);
598 continue;
599 } 556 }
600 557
601 memset(&root->root_item.drop_progress, 0, 558 err = btrfs_update_root(trans, fs_info->tree_root,
602 sizeof(struct btrfs_disk_key));
603 root->root_item.drop_level = 0;
604 root->commit_root = NULL;
605 root->dirty_root = NULL;
606 root->root_key.offset = root->fs_info->generation;
607 btrfs_set_root_bytenr(&root->root_item,
608 root->node->start);
609 btrfs_set_root_level(&root->root_item,
610 btrfs_header_level(root->node));
611 btrfs_set_root_generation(&root->root_item,
612 root->root_key.offset);
613
614 err = btrfs_insert_root(trans, root->fs_info->tree_root,
615 &root->root_key, 559 &root->root_key,
616 &root->root_item); 560 &root->root_item);
617 if (err) 561 if (err)
618 break; 562 break;
619
620 refs = btrfs_root_refs(&dirty->root->root_item);
621 btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
622 err = btrfs_update_root(trans, root->fs_info->tree_root,
623 &dirty->root->root_key,
624 &dirty->root->root_item);
625
626 BUG_ON(err);
627 if (refs == 1) {
628 list_add(&dirty->list, list);
629 } else {
630 WARN_ON(1);
631 free_extent_buffer(dirty->root->node);
632 kfree(dirty->root);
633 kfree(dirty);
634 }
635 } 563 }
636 } 564 }
637 return err; 565 return err;
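
Editor's note: the gang-lookup loop in commit_fs_roots() above is the stock idiom for draining tagged entries out of a radix tree in batches: look up a gang, clear each entry's tag so it is not returned again, process it, and repeat until the lookup comes back empty. A minimal self-contained sketch of the same idiom (the item type, tag value, and per-item work are illustrative, not btrfs code):

	#include <linux/radix-tree.h>
	#include <linux/kernel.h>

	struct tagged_item {
		unsigned long index;	/* where the item lives in the tree */
		/* ... payload ... */
	};

	static void drain_tagged(struct radix_tree_root *tree, unsigned int tag)
	{
		struct tagged_item *gang[8];
		int i, nr;

		while ((nr = radix_tree_gang_lookup_tag(tree, (void **)gang, 0,
							ARRAY_SIZE(gang), tag)) > 0) {
			for (i = 0; i < nr; i++) {
				/* clear the tag first so the next gang lookup
				 * does not hand the same item back */
				radix_tree_tag_clear(tree, gang[i]->index, tag);
				/* ... process gang[i] ... */
			}
		}
	}
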
@@ -670,6 +598,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
670 return 0; 598 return 0;
671} 599}
672 600
601#if 0
673/* 602/*
674 * when dropping snapshots, we generate a ton of delayed refs, and it makes 603 * when dropping snapshots, we generate a ton of delayed refs, and it makes
675 * sense not to join the transaction while it is trying to flush the current 604 * sense not to join the transaction while it is trying to flush the current
@@ -688,12 +617,8 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
688 TASK_UNINTERRUPTIBLE); 617 TASK_UNINTERRUPTIBLE);
689 mutex_unlock(&info->trans_mutex); 618 mutex_unlock(&info->trans_mutex);
690 619
691 atomic_dec(&info->throttles);
692 wake_up(&info->transaction_throttle);
693
694 schedule(); 620 schedule();
695 621
696 atomic_inc(&info->throttles);
697 mutex_lock(&info->trans_mutex); 622 mutex_lock(&info->trans_mutex);
698 finish_wait(&info->transaction_wait, &wait); 623 finish_wait(&info->transaction_wait, &wait);
699 } 624 }
@@ -705,113 +630,64 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
705 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on 630 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
706 * all of them 631 * all of them
707 */ 632 */
708static noinline int drop_dirty_roots(struct btrfs_root *tree_root, 633int btrfs_drop_dead_root(struct btrfs_root *root)
709 struct list_head *list)
710{ 634{
711 struct btrfs_dirty_root *dirty;
712 struct btrfs_trans_handle *trans; 635 struct btrfs_trans_handle *trans;
636 struct btrfs_root *tree_root = root->fs_info->tree_root;
713 unsigned long nr; 637 unsigned long nr;
714 u64 num_bytes; 638 int ret;
715 u64 bytes_used;
716 u64 max_useless;
717 int ret = 0;
718 int err;
719
720 while (!list_empty(list)) {
721 struct btrfs_root *root;
722
723 dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
724 list_del_init(&dirty->list);
725
726 num_bytes = btrfs_root_used(&dirty->root->root_item);
727 root = dirty->latest_root;
728 atomic_inc(&root->fs_info->throttles);
729
730 while (1) {
731 /*
732 * we don't want to jump in and create a bunch of
733 * delayed refs if the transaction is starting to close
734 */
735 wait_transaction_pre_flush(tree_root->fs_info);
736 trans = btrfs_start_transaction(tree_root, 1);
737
738 /*
739 * we've joined a transaction, make sure it isn't
740 * closing right now
741 */
742 if (trans->transaction->delayed_refs.flushing) {
743 btrfs_end_transaction(trans, tree_root);
744 continue;
745 }
746
747 mutex_lock(&root->fs_info->drop_mutex);
748 ret = btrfs_drop_snapshot(trans, dirty->root);
749 if (ret != -EAGAIN)
750 break;
751 mutex_unlock(&root->fs_info->drop_mutex);
752 639
753 err = btrfs_update_root(trans, 640 while (1) {
754 tree_root, 641 /*
755 &dirty->root->root_key, 642 * we don't want to jump in and create a bunch of
756 &dirty->root->root_item); 643 * delayed refs if the transaction is starting to close
757 if (err) 644 */
758 ret = err; 645 wait_transaction_pre_flush(tree_root->fs_info);
759 nr = trans->blocks_used; 646 trans = btrfs_start_transaction(tree_root, 1);
760 ret = btrfs_end_transaction(trans, tree_root);
761 BUG_ON(ret);
762 647
763 btrfs_btree_balance_dirty(tree_root, nr); 648 /*
764 cond_resched(); 649 * we've joined a transaction, make sure it isn't
650 * closing right now
651 */
652 if (trans->transaction->delayed_refs.flushing) {
653 btrfs_end_transaction(trans, tree_root);
654 continue;
765 } 655 }
766 BUG_ON(ret);
767 atomic_dec(&root->fs_info->throttles);
768 wake_up(&root->fs_info->transaction_throttle);
769 656
770 num_bytes -= btrfs_root_used(&dirty->root->root_item); 657 ret = btrfs_drop_snapshot(trans, root);
771 bytes_used = btrfs_root_used(&root->root_item); 658 if (ret != -EAGAIN)
772 if (num_bytes) { 659 break;
773 mutex_lock(&root->fs_info->trans_mutex);
774 btrfs_record_root_in_trans(root);
775 mutex_unlock(&root->fs_info->trans_mutex);
776 btrfs_set_root_used(&root->root_item,
777 bytes_used - num_bytes);
778 }
779 660
780 ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); 661 ret = btrfs_update_root(trans, tree_root,
781 if (ret) { 662 &root->root_key,
782 BUG(); 663 &root->root_item);
664 if (ret)
783 break; 665 break;
784 }
785 mutex_unlock(&root->fs_info->drop_mutex);
786
787 spin_lock(&root->list_lock);
788 list_del_init(&dirty->root->dead_list);
789 if (!list_empty(&root->dead_list)) {
790 struct btrfs_root *oldest;
791 oldest = list_entry(root->dead_list.prev,
792 struct btrfs_root, dead_list);
793 max_useless = oldest->root_key.offset - 1;
794 } else {
795 max_useless = root->root_key.offset - 1;
796 }
797 spin_unlock(&root->list_lock);
798 666
799 nr = trans->blocks_used; 667 nr = trans->blocks_used;
800 ret = btrfs_end_transaction(trans, tree_root); 668 ret = btrfs_end_transaction(trans, tree_root);
801 BUG_ON(ret); 669 BUG_ON(ret);
802 670
803 ret = btrfs_remove_leaf_refs(root, max_useless, 0);
804 BUG_ON(ret);
805
806 free_extent_buffer(dirty->root->node);
807 kfree(dirty->root);
808 kfree(dirty);
809
810 btrfs_btree_balance_dirty(tree_root, nr); 671 btrfs_btree_balance_dirty(tree_root, nr);
811 cond_resched(); 672 cond_resched();
812 } 673 }
674 BUG_ON(ret);
675
676 ret = btrfs_del_root(trans, tree_root, &root->root_key);
677 BUG_ON(ret);
678
679 nr = trans->blocks_used;
680 ret = btrfs_end_transaction(trans, tree_root);
681 BUG_ON(ret);
682
683 free_extent_buffer(root->node);
684 free_extent_buffer(root->commit_root);
685 kfree(root);
686
687 btrfs_btree_balance_dirty(tree_root, nr);
813 return ret; 688 return ret;
814} 689}
690#endif
815 691
816/* 692/*
817 * new snapshots need to be created at a very specific time in the 693 * new snapshots need to be created at a very specific time in the
@@ -839,24 +715,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
839 if (ret) 715 if (ret)
840 goto fail; 716 goto fail;
841 717
842 btrfs_record_root_in_trans(root); 718 record_root_in_trans(trans, root);
843 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 719 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
844 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 720 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
845 721
846 key.objectid = objectid; 722 key.objectid = objectid;
847 key.offset = trans->transid; 723 key.offset = 0;
848 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 724 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
849 725
850 old = btrfs_lock_root_node(root); 726 old = btrfs_lock_root_node(root);
851 btrfs_cow_block(trans, root, old, NULL, 0, &old); 727 btrfs_cow_block(trans, root, old, NULL, 0, &old);
728 btrfs_set_lock_blocking(old);
852 729
853 btrfs_copy_root(trans, root, old, &tmp, objectid); 730 btrfs_copy_root(trans, root, old, &tmp, objectid);
854 btrfs_tree_unlock(old); 731 btrfs_tree_unlock(old);
855 free_extent_buffer(old); 732 free_extent_buffer(old);
856 733
857 btrfs_set_root_bytenr(new_root_item, tmp->start); 734 btrfs_set_root_node(new_root_item, tmp);
858 btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
859 btrfs_set_root_generation(new_root_item, trans->transid);
860 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 735 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
861 new_root_item); 736 new_root_item);
862 btrfs_tree_unlock(tmp); 737 btrfs_tree_unlock(tmp);
@@ -964,6 +839,34 @@ static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
964 return 0; 839 return 0;
965} 840}
966 841
842static void update_super_roots(struct btrfs_root *root)
843{
844 struct btrfs_root_item *root_item;
845 struct btrfs_super_block *super;
846
847 super = &root->fs_info->super_copy;
848
849 root_item = &root->fs_info->chunk_root->root_item;
850 super->chunk_root = root_item->bytenr;
851 super->chunk_root_generation = root_item->generation;
852 super->chunk_root_level = root_item->level;
853
854 root_item = &root->fs_info->tree_root->root_item;
855 super->root = root_item->bytenr;
856 super->generation = root_item->generation;
857 super->root_level = root_item->level;
858}
859
860int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
861{
862 int ret = 0;
863 spin_lock(&info->new_trans_lock);
864 if (info->running_transaction)
865 ret = info->running_transaction->in_commit;
866 spin_unlock(&info->new_trans_lock);
867 return ret;
868}
869
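
Editor's note: btrfs_transaction_in_commit() above gives callers a spinlock-protected peek at the running transaction's in_commit flag without joining the transaction. A hedged sketch of how a writeback-style caller might use it to back off while a commit is in flight (the caller and its backoff policy are hypothetical, not part of this patch):

	/* hypothetical caller: yield instead of piling work onto a commit */
	static void wait_out_commit(struct btrfs_fs_info *info)
	{
		while (btrfs_transaction_in_commit(info))
			cond_resched();
	}
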
967int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 870int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
968 struct btrfs_root *root) 871 struct btrfs_root *root)
969{ 872{
@@ -971,8 +874,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
971 unsigned long timeout = 1; 874 unsigned long timeout = 1;
972 struct btrfs_transaction *cur_trans; 875 struct btrfs_transaction *cur_trans;
973 struct btrfs_transaction *prev_trans = NULL; 876 struct btrfs_transaction *prev_trans = NULL;
974 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
975 struct list_head dirty_fs_roots;
976 struct extent_io_tree *pinned_copy; 877 struct extent_io_tree *pinned_copy;
977 DEFINE_WAIT(wait); 878 DEFINE_WAIT(wait);
978 int ret; 879 int ret;
@@ -999,7 +900,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
999 BUG_ON(ret); 900 BUG_ON(ret);
1000 901
1001 mutex_lock(&root->fs_info->trans_mutex); 902 mutex_lock(&root->fs_info->trans_mutex);
1002 INIT_LIST_HEAD(&dirty_fs_roots);
1003 if (cur_trans->in_commit) { 903 if (cur_trans->in_commit) {
1004 cur_trans->use_count++; 904 cur_trans->use_count++;
1005 mutex_unlock(&root->fs_info->trans_mutex); 905 mutex_unlock(&root->fs_info->trans_mutex);
@@ -1058,9 +958,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1058 958
1059 mutex_unlock(&root->fs_info->trans_mutex); 959 mutex_unlock(&root->fs_info->trans_mutex);
1060 960
1061 if (flush_on_commit || snap_pending) { 961 if (flush_on_commit) {
1062 if (flush_on_commit) 962 btrfs_start_delalloc_inodes(root);
1063 btrfs_start_delalloc_inodes(root); 963 ret = btrfs_wait_ordered_extents(root, 0);
964 BUG_ON(ret);
965 } else if (snap_pending) {
1064 ret = btrfs_wait_ordered_extents(root, 1); 966 ret = btrfs_wait_ordered_extents(root, 1);
1065 BUG_ON(ret); 967 BUG_ON(ret);
1066 } 968 }
@@ -1105,41 +1007,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1105 * with the tree-log code. 1007 * with the tree-log code.
1106 */ 1008 */
1107 mutex_lock(&root->fs_info->tree_log_mutex); 1009 mutex_lock(&root->fs_info->tree_log_mutex);
1108 /*
1109 * keep tree reloc code from adding new reloc trees
1110 */
1111 mutex_lock(&root->fs_info->tree_reloc_mutex);
1112 1010
1113 1011 ret = commit_fs_roots(trans, root);
1114 ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
1115 &dirty_fs_roots);
1116 BUG_ON(ret); 1012 BUG_ON(ret);
1117 1013
1118 /* add_dirty_roots gets rid of all the tree log roots, it is now 1014 /* commit_fs_roots gets rid of all the tree log roots, it is now
1119 * safe to free the root of tree log roots 1015 * safe to free the root of tree log roots
1120 */ 1016 */
1121 btrfs_free_log_root_tree(trans, root->fs_info); 1017 btrfs_free_log_root_tree(trans, root->fs_info);
1122 1018
1123 ret = btrfs_commit_tree_roots(trans, root); 1019 ret = commit_cowonly_roots(trans, root);
1124 BUG_ON(ret); 1020 BUG_ON(ret);
1125 1021
1126 cur_trans = root->fs_info->running_transaction; 1022 cur_trans = root->fs_info->running_transaction;
1127 spin_lock(&root->fs_info->new_trans_lock); 1023 spin_lock(&root->fs_info->new_trans_lock);
1128 root->fs_info->running_transaction = NULL; 1024 root->fs_info->running_transaction = NULL;
1129 spin_unlock(&root->fs_info->new_trans_lock); 1025 spin_unlock(&root->fs_info->new_trans_lock);
1130 btrfs_set_super_generation(&root->fs_info->super_copy, 1026
1131 cur_trans->transid); 1027 btrfs_set_root_node(&root->fs_info->tree_root->root_item,
1132 btrfs_set_super_root(&root->fs_info->super_copy, 1028 root->fs_info->tree_root->node);
1133 root->fs_info->tree_root->node->start); 1029 switch_commit_root(root->fs_info->tree_root);
1134 btrfs_set_super_root_level(&root->fs_info->super_copy, 1030
1135 btrfs_header_level(root->fs_info->tree_root->node)); 1031 btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
1136 1032 root->fs_info->chunk_root->node);
1137 btrfs_set_super_chunk_root(&root->fs_info->super_copy, 1033 switch_commit_root(root->fs_info->chunk_root);
1138 chunk_root->node->start); 1034
1139 btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, 1035 update_super_roots(root);
1140 btrfs_header_level(chunk_root->node));
1141 btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
1142 btrfs_header_generation(chunk_root->node));
1143 1036
1144 if (!root->fs_info->log_root_recovering) { 1037 if (!root->fs_info->log_root_recovering) {
1145 btrfs_set_super_log_root(&root->fs_info->super_copy, 0); 1038 btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
@@ -1153,7 +1046,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1153 1046
1154 trans->transaction->blocked = 0; 1047 trans->transaction->blocked = 0;
1155 1048
1156 wake_up(&root->fs_info->transaction_throttle);
1157 wake_up(&root->fs_info->transaction_wait); 1049 wake_up(&root->fs_info->transaction_wait);
1158 1050
1159 mutex_unlock(&root->fs_info->trans_mutex); 1051 mutex_unlock(&root->fs_info->trans_mutex);
@@ -1170,9 +1062,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1170 btrfs_finish_extent_commit(trans, root, pinned_copy); 1062 btrfs_finish_extent_commit(trans, root, pinned_copy);
1171 kfree(pinned_copy); 1063 kfree(pinned_copy);
1172 1064
1173 btrfs_drop_dead_reloc_roots(root);
1174 mutex_unlock(&root->fs_info->tree_reloc_mutex);
1175
1176 /* do the directory inserts of any pending snapshot creations */ 1065 /* do the directory inserts of any pending snapshot creations */
1177 finish_pending_snapshots(trans, root->fs_info); 1066 finish_pending_snapshots(trans, root->fs_info);
1178 1067
@@ -1181,21 +1070,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1181 cur_trans->commit_done = 1; 1070 cur_trans->commit_done = 1;
1182 1071
1183 root->fs_info->last_trans_committed = cur_trans->transid; 1072 root->fs_info->last_trans_committed = cur_trans->transid;
1073
1184 wake_up(&cur_trans->commit_wait); 1074 wake_up(&cur_trans->commit_wait);
1185 1075
1186 put_transaction(cur_trans); 1076 put_transaction(cur_trans);
1187 put_transaction(cur_trans); 1077 put_transaction(cur_trans);
1188 1078
1189 list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
1190 if (root->fs_info->closing)
1191 list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
1192
1193 mutex_unlock(&root->fs_info->trans_mutex); 1079 mutex_unlock(&root->fs_info->trans_mutex);
1194 1080
1195 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1081 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1196
1197 if (root->fs_info->closing)
1198 drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
1199 return ret; 1082 return ret;
1200} 1083}
1201 1084
@@ -1204,16 +1087,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1204 */ 1087 */
1205int btrfs_clean_old_snapshots(struct btrfs_root *root) 1088int btrfs_clean_old_snapshots(struct btrfs_root *root)
1206{ 1089{
1207 struct list_head dirty_roots; 1090 LIST_HEAD(list);
1208 INIT_LIST_HEAD(&dirty_roots); 1091 struct btrfs_fs_info *fs_info = root->fs_info;
1209again: 1092
1210 mutex_lock(&root->fs_info->trans_mutex); 1093 mutex_lock(&fs_info->trans_mutex);
1211 list_splice_init(&root->fs_info->dead_roots, &dirty_roots); 1094 list_splice_init(&fs_info->dead_roots, &list);
1212 mutex_unlock(&root->fs_info->trans_mutex); 1095 mutex_unlock(&fs_info->trans_mutex);
1213 1096
1214 if (!list_empty(&dirty_roots)) { 1097 while (!list_empty(&list)) {
1215 drop_dirty_roots(root, &dirty_roots); 1098 root = list_entry(list.next, struct btrfs_root, root_list);
1216 goto again; 1099 list_del_init(&root->root_list);
1100 btrfs_drop_snapshot(root, 0);
1217 } 1101 }
1218 return 0; 1102 return 0;
1219} 1103}
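
Editor's note: the rewritten btrfs_clean_old_snapshots() is an instance of the splice-under-lock, drain-outside-lock idiom: move the whole shared dead_roots list onto a private list while holding trans_mutex, then walk the private copy with no lock held. A generic sketch of that pattern (the entry type and the per-entry work are illustrative):

	#include <linux/list.h>
	#include <linux/mutex.h>

	struct dead_entry {
		struct list_head link;
	};

	static void drain_dead_list(struct list_head *shared, struct mutex *lock)
	{
		LIST_HEAD(local);
		struct dead_entry *e;

		/* take everything off the shared list in one shot */
		mutex_lock(lock);
		list_splice_init(shared, &local);
		mutex_unlock(lock);

		/* process privately, with no lock held */
		while (!list_empty(&local)) {
			e = list_entry(local.next, struct dead_entry, link);
			list_del_init(&e->link);
			/* ... expensive per-entry work ... */
		}
	}
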
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 94f5bde2b58d..663c67404918 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -62,12 +62,6 @@ struct btrfs_pending_snapshot {
62 struct list_head list; 62 struct list_head list;
63}; 63};
64 64
65struct btrfs_dirty_root {
66 struct list_head list;
67 struct btrfs_root *root;
68 struct btrfs_root *latest_root;
69};
70
71static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, 65static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
72 struct inode *inode) 66 struct inode *inode)
73{ 67{
@@ -100,7 +94,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
100int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 94int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
101 struct btrfs_root *root); 95 struct btrfs_root *root);
102 96
103int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest); 97int btrfs_add_dead_root(struct btrfs_root *root);
98int btrfs_drop_dead_root(struct btrfs_root *root);
104int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); 99int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
105int btrfs_clean_old_snapshots(struct btrfs_root *root); 100int btrfs_clean_old_snapshots(struct btrfs_root *root);
106int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 101int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -108,7 +103,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
108int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 103int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
109 struct btrfs_root *root); 104 struct btrfs_root *root);
110void btrfs_throttle(struct btrfs_root *root); 105void btrfs_throttle(struct btrfs_root *root);
111int btrfs_record_root_in_trans(struct btrfs_root *root); 106int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
107 struct btrfs_root *root);
112int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, 108int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
113 struct extent_io_tree *dirty_pages); 109 struct extent_io_tree *dirty_pages);
110int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
114#endif 111#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index db5e212e8445..d91b0de7c502 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -430,18 +430,16 @@ no_copy:
430static noinline struct inode *read_one_inode(struct btrfs_root *root, 430static noinline struct inode *read_one_inode(struct btrfs_root *root,
431 u64 objectid) 431 u64 objectid)
432{ 432{
433 struct btrfs_key key;
433 struct inode *inode; 434 struct inode *inode;
434 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
435 if (inode->i_state & I_NEW) {
436 BTRFS_I(inode)->root = root;
437 BTRFS_I(inode)->location.objectid = objectid;
438 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
439 BTRFS_I(inode)->location.offset = 0;
440 btrfs_read_locked_inode(inode);
441 unlock_new_inode(inode);
442 435
443 } 436 key.objectid = objectid;
444 if (is_bad_inode(inode)) { 437 key.type = BTRFS_INODE_ITEM_KEY;
438 key.offset = 0;
439 inode = btrfs_iget(root->fs_info->sb, &key, root);
440 if (IS_ERR(inode)) {
441 inode = NULL;
442 } else if (is_bad_inode(inode)) {
445 iput(inode); 443 iput(inode);
446 inode = NULL; 444 inode = NULL;
447 } 445 }
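
Editor's note: the new read_one_inode() delegates the lookup to btrfs_iget(), so failure arrives either as an ERR_PTR or as a bad inode, and both are folded into the NULL that callers already test for. The contract, condensed into a standalone sketch that mirrors the hunk above without adding behavior:

	static struct inode *lookup_or_null(struct btrfs_root *root, u64 objectid)
	{
		struct btrfs_key key;
		struct inode *inode;

		key.objectid = objectid;
		key.type = BTRFS_INODE_ITEM_KEY;
		key.offset = 0;
		inode = btrfs_iget(root->fs_info->sb, &key, root);
		if (IS_ERR(inode))
			return NULL;		/* callers only check for NULL */
		if (is_bad_inode(inode)) {
			iput(inode);
			return NULL;
		}
		return inode;
	}
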
@@ -541,6 +539,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
541 539
542 if (found_type == BTRFS_FILE_EXTENT_REG || 540 if (found_type == BTRFS_FILE_EXTENT_REG ||
543 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 541 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
542 u64 offset;
544 unsigned long dest_offset; 543 unsigned long dest_offset;
545 struct btrfs_key ins; 544 struct btrfs_key ins;
546 545
@@ -555,6 +554,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
555 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 554 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
556 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 555 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
557 ins.type = BTRFS_EXTENT_ITEM_KEY; 556 ins.type = BTRFS_EXTENT_ITEM_KEY;
557 offset = key->offset - btrfs_file_extent_offset(eb, item);
558 558
559 if (ins.objectid > 0) { 559 if (ins.objectid > 0) {
560 u64 csum_start; 560 u64 csum_start;
@@ -569,19 +569,16 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
569 if (ret == 0) { 569 if (ret == 0) {
570 ret = btrfs_inc_extent_ref(trans, root, 570 ret = btrfs_inc_extent_ref(trans, root,
571 ins.objectid, ins.offset, 571 ins.objectid, ins.offset,
572 path->nodes[0]->start, 572 0, root->root_key.objectid,
573 root->root_key.objectid, 573 key->objectid, offset);
574 trans->transid, key->objectid);
575 } else { 574 } else {
576 /* 575 /*
577 * insert the extent pointer in the extent 576 * insert the extent pointer in the extent
578 * allocation tree 577 * allocation tree
579 */ 578 */
580 ret = btrfs_alloc_logged_extent(trans, root, 579 ret = btrfs_alloc_logged_file_extent(trans,
581 path->nodes[0]->start, 580 root, root->root_key.objectid,
582 root->root_key.objectid, 581 key->objectid, offset, &ins);
583 trans->transid, key->objectid,
584 &ins);
585 BUG_ON(ret); 582 BUG_ON(ret);
586 } 583 }
587 btrfs_release_path(root, path); 584 btrfs_release_path(root, path);
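
Editor's note: the new offset variable records the logical file position at which the disk extent begins: the item's key offset minus how far into the extent the item points. That pair (owner inode, file position) is what the reworked back references key on, replacing the old (block, generation) pair. A worked micro-example under those definitions: an extent item at file offset 1 MiB whose btrfs_file_extent_offset is 256 KiB gives

	offset = 1048576 - 262144 = 786432;	/* 768 KiB */

so both the ref-increment and the logged-extent insert above describe the extent by where it logically starts in the file.
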
@@ -800,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
800 return -ENOENT; 797 return -ENOENT;
801 798
802 inode = read_one_inode(root, key->objectid); 799 inode = read_one_inode(root, key->objectid);
803 BUG_ON(!dir); 800 BUG_ON(!inode);
804 801
805 ref_ptr = btrfs_item_ptr_offset(eb, slot); 802 ref_ptr = btrfs_item_ptr_offset(eb, slot);
806 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 803 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
@@ -1706,9 +1703,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1706 btrfs_wait_tree_block_writeback(next); 1703 btrfs_wait_tree_block_writeback(next);
1707 btrfs_tree_unlock(next); 1704 btrfs_tree_unlock(next);
1708 1705
1709 ret = btrfs_drop_leaf_ref(trans, root, next);
1710 BUG_ON(ret);
1711
1712 WARN_ON(root_owner != 1706 WARN_ON(root_owner !=
1713 BTRFS_TREE_LOG_OBJECTID); 1707 BTRFS_TREE_LOG_OBJECTID);
1714 ret = btrfs_free_reserved_extent(root, 1708 ret = btrfs_free_reserved_extent(root,
@@ -1753,10 +1747,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1753 btrfs_wait_tree_block_writeback(next); 1747 btrfs_wait_tree_block_writeback(next);
1754 btrfs_tree_unlock(next); 1748 btrfs_tree_unlock(next);
1755 1749
1756 if (*level == 0) {
1757 ret = btrfs_drop_leaf_ref(trans, root, next);
1758 BUG_ON(ret);
1759 }
1760 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1750 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1761 ret = btrfs_free_reserved_extent(root, bytenr, blocksize); 1751 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1762 BUG_ON(ret); 1752 BUG_ON(ret);
@@ -1811,12 +1801,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1811 btrfs_wait_tree_block_writeback(next); 1801 btrfs_wait_tree_block_writeback(next);
1812 btrfs_tree_unlock(next); 1802 btrfs_tree_unlock(next);
1813 1803
1814 if (*level == 0) {
1815 ret = btrfs_drop_leaf_ref(trans, root,
1816 next);
1817 BUG_ON(ret);
1818 }
1819
1820 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1804 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1821 ret = btrfs_free_reserved_extent(root, 1805 ret = btrfs_free_reserved_extent(root,
1822 path->nodes[*level]->start, 1806 path->nodes[*level]->start,
@@ -1884,11 +1868,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1884 btrfs_wait_tree_block_writeback(next); 1868 btrfs_wait_tree_block_writeback(next);
1885 btrfs_tree_unlock(next); 1869 btrfs_tree_unlock(next);
1886 1870
1887 if (orig_level == 0) {
1888 ret = btrfs_drop_leaf_ref(trans, log,
1889 next);
1890 BUG_ON(ret);
1891 }
1892 WARN_ON(log->root_key.objectid != 1871 WARN_ON(log->root_key.objectid !=
1893 BTRFS_TREE_LOG_OBJECTID); 1872 BTRFS_TREE_LOG_OBJECTID);
1894 ret = btrfs_free_reserved_extent(log, next->start, 1873 ret = btrfs_free_reserved_extent(log, next->start,
@@ -2027,9 +2006,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2027 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2006 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
2028 BUG_ON(ret); 2007 BUG_ON(ret);
2029 2008
2030 btrfs_set_root_bytenr(&log->root_item, log->node->start); 2009 btrfs_set_root_node(&log->root_item, log->node);
2031 btrfs_set_root_generation(&log->root_item, trans->transid);
2032 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2033 2010
2034 root->log_batch = 0; 2011 root->log_batch = 0;
2035 root->log_transid++; 2012 root->log_transid++;
@@ -2581,7 +2558,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2581 ins_keys, ins_sizes, nr); 2558 ins_keys, ins_sizes, nr);
2582 BUG_ON(ret); 2559 BUG_ON(ret);
2583 2560
2584 for (i = 0; i < nr; i++) { 2561 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2585 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2562 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2586 dst_path->slots[0]); 2563 dst_path->slots[0]);
2587 2564
@@ -2617,36 +2594,31 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2617 found_type = btrfs_file_extent_type(src, extent); 2594 found_type = btrfs_file_extent_type(src, extent);
2618 if (found_type == BTRFS_FILE_EXTENT_REG || 2595 if (found_type == BTRFS_FILE_EXTENT_REG ||
2619 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 2596 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2620 u64 ds = btrfs_file_extent_disk_bytenr(src, 2597 u64 ds, dl, cs, cl;
2621 extent); 2598 ds = btrfs_file_extent_disk_bytenr(src,
2622 u64 dl = btrfs_file_extent_disk_num_bytes(src, 2599 extent);
2623 extent); 2600 /* ds == 0 is a hole */
2624 u64 cs = btrfs_file_extent_offset(src, extent); 2601 if (ds == 0)
2625 u64 cl = btrfs_file_extent_num_bytes(src, 2602 continue;
2626 extent); 2603
2604 dl = btrfs_file_extent_disk_num_bytes(src,
2605 extent);
2606 cs = btrfs_file_extent_offset(src, extent);
2607 cl = btrfs_file_extent_num_bytes(src,
2608 extent);
2627 if (btrfs_file_extent_compression(src, 2609 if (btrfs_file_extent_compression(src,
2628 extent)) { 2610 extent)) {
2629 cs = 0; 2611 cs = 0;
2630 cl = dl; 2612 cl = dl;
2631 } 2613 }
2632 /* ds == 0 is a hole */ 2614
2633 if (ds != 0) { 2615 ret = btrfs_lookup_csums_range(
2634 ret = btrfs_inc_extent_ref(trans, log, 2616 log->fs_info->csum_root,
2635 ds, dl, 2617 ds + cs, ds + cs + cl - 1,
2636 dst_path->nodes[0]->start, 2618 &ordered_sums);
2637 BTRFS_TREE_LOG_OBJECTID, 2619 BUG_ON(ret);
2638 trans->transid,
2639 ins_keys[i].objectid);
2640 BUG_ON(ret);
2641 ret = btrfs_lookup_csums_range(
2642 log->fs_info->csum_root,
2643 ds + cs, ds + cs + cl - 1,
2644 &ordered_sums);
2645 BUG_ON(ret);
2646 }
2647 } 2620 }
2648 } 2621 }
2649 dst_path->slots[0]++;
2650 } 2622 }
2651 2623
2652 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 2624 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
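
Editor's note: two coupled changes in this hunk: dst_path->slots[0]++ moved from the loop tail into the for header, which is what makes the new early continue on holes (ds == 0) safe: the slot still advances even when the csum lookup is skipped. The bug class being avoided, in miniature (is_hole() and use_slot() are stand-ins):

	/* buggy shape: 'continue' would skip a tail increment */
	static void walk_buggy(int nr, int (*is_hole)(int), void (*use_slot)(int))
	{
		int i, slot = 0;

		for (i = 0; i < nr; i++) {
			if (is_hole(i))
				continue;	/* slot never advances here */
			use_slot(slot);
			slot++;
		}
	}

	/* fixed shape, as in the hunk: the header increment always runs */
	static void walk_fixed(int nr, int (*is_hole)(int), void (*use_slot)(int))
	{
		int i, slot;

		for (i = 0, slot = 0; i < nr; i++, slot++) {
			if (is_hole(i))
				continue;
			use_slot(slot);
		}
	}
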
@@ -3029,9 +3001,7 @@ again:
3029 BUG_ON(!wc.replay_dest); 3001 BUG_ON(!wc.replay_dest);
3030 3002
3031 wc.replay_dest->log_root = log; 3003 wc.replay_dest->log_root = log;
3032 mutex_lock(&fs_info->trans_mutex); 3004 btrfs_record_root_in_trans(trans, wc.replay_dest);
3033 btrfs_record_root_in_trans(wc.replay_dest);
3034 mutex_unlock(&fs_info->trans_mutex);
3035 ret = walk_log_tree(trans, log, &wc); 3005 ret = walk_log_tree(trans, log, &wc);
3036 BUG_ON(ret); 3006 BUG_ON(ret);
3037 3007
@@ -3049,6 +3019,7 @@ again:
3049 key.offset = found_key.offset - 1; 3019 key.offset = found_key.offset - 1;
3050 wc.replay_dest->log_root = NULL; 3020 wc.replay_dest->log_root = NULL;
3051 free_extent_buffer(log->node); 3021 free_extent_buffer(log->node);
3022 free_extent_buffer(log->commit_root);
3052 kfree(log); 3023 kfree(log);
3053 3024
3054 if (found_key.offset == 0) 3025 if (found_key.offset == 0)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a6d35b0054ca..5dbefd11b4af 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -161,8 +161,10 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
161 int again = 0; 161 int again = 0;
162 unsigned long num_run; 162 unsigned long num_run;
163 unsigned long num_sync_run; 163 unsigned long num_sync_run;
164 unsigned long batch_run = 0;
164 unsigned long limit; 165 unsigned long limit;
165 unsigned long last_waited = 0; 166 unsigned long last_waited = 0;
167 int force_reg = 0;
166 168
167 bdi = blk_get_backing_dev_info(device->bdev); 169 bdi = blk_get_backing_dev_info(device->bdev);
168 fs_info = device->dev_root->fs_info; 170 fs_info = device->dev_root->fs_info;
@@ -176,19 +178,22 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
176 178
177loop: 179loop:
178 spin_lock(&device->io_lock); 180 spin_lock(&device->io_lock);
179 num_run = 0;
180 181
181loop_lock: 182loop_lock:
183 num_run = 0;
182 184
183 /* take all the bios off the list at once and process them 185 /* take all the bios off the list at once and process them
184 * later on (without the lock held). But, remember the 186 * later on (without the lock held). But, remember the
185 * tail and other pointers so the bios can be properly reinserted 187 * tail and other pointers so the bios can be properly reinserted
186 * into the list if we hit congestion 188 * into the list if we hit congestion
187 */ 189 */
188 if (device->pending_sync_bios.head) 190 if (!force_reg && device->pending_sync_bios.head) {
189 pending_bios = &device->pending_sync_bios; 191 pending_bios = &device->pending_sync_bios;
190 else 192 force_reg = 1;
193 } else {
191 pending_bios = &device->pending_bios; 194 pending_bios = &device->pending_bios;
195 force_reg = 0;
196 }
192 197
193 pending = pending_bios->head; 198 pending = pending_bios->head;
194 tail = pending_bios->tail; 199 tail = pending_bios->tail;
@@ -228,10 +233,14 @@ loop_lock:
228 while (pending) { 233 while (pending) {
229 234
230 rmb(); 235 rmb();
231 if (pending_bios != &device->pending_sync_bios && 236 /* we want to work on both lists, but do more bios on the
232 device->pending_sync_bios.head && 237 * sync list than the regular list
233 num_run > 16) { 238 */
234 cond_resched(); 239 if ((num_run > 32 &&
240 pending_bios != &device->pending_sync_bios &&
241 device->pending_sync_bios.head) ||
242 (num_run > 64 && pending_bios == &device->pending_sync_bios &&
243 device->pending_bios.head)) {
235 spin_lock(&device->io_lock); 244 spin_lock(&device->io_lock);
236 requeue_list(pending_bios, pending, tail); 245 requeue_list(pending_bios, pending, tail);
237 goto loop_lock; 246 goto loop_lock;
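
Editor's note: together, force_reg and batch_run implement a small two-queue fairness policy: service the sync queue first, but after a burst, requeue and force a pass over the other queue, with the sync queue allowed roughly twice the burst length. The decision predicate, distilled under those assumptions (thresholds mirror the hunk above):

	/* illustrative distillation of the requeue test */
	static int should_requeue(int on_sync_queue, unsigned long num_run,
				  int other_queue_has_work)
	{
		if (!other_queue_has_work)
			return 0;
		/* regular queue yields after 32 bios, sync queue after 64 */
		return on_sync_queue ? num_run > 64 : num_run > 32;
	}
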
@@ -249,6 +258,8 @@ loop_lock:
249 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 258 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
250 submit_bio(cur->bi_rw, cur); 259 submit_bio(cur->bi_rw, cur);
251 num_run++; 260 num_run++;
261 batch_run++;
262
252 if (bio_sync(cur)) 263 if (bio_sync(cur))
253 num_sync_run++; 264 num_sync_run++;
254 265
@@ -265,7 +276,7 @@ loop_lock:
265 * is now congested. Back off and let other work structs 276 * is now congested. Back off and let other work structs
266 * run instead 277 * run instead
267 */ 278 */
268 if (pending && bdi_write_congested(bdi) && num_run > 16 && 279 if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
269 fs_info->fs_devices->open_devices > 1) { 280 fs_info->fs_devices->open_devices > 1) {
270 struct io_context *ioc; 281 struct io_context *ioc;
271 282
@@ -366,6 +377,7 @@ static noinline int device_list_add(const char *path,
366 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 377 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
367 fs_devices->latest_devid = devid; 378 fs_devices->latest_devid = devid;
368 fs_devices->latest_trans = found_transid; 379 fs_devices->latest_trans = found_transid;
380 mutex_init(&fs_devices->device_list_mutex);
369 device = NULL; 381 device = NULL;
370 } else { 382 } else {
371 device = __find_device(&fs_devices->devices, devid, 383 device = __find_device(&fs_devices->devices, devid,
@@ -392,7 +404,11 @@ static noinline int device_list_add(const char *path,
392 return -ENOMEM; 404 return -ENOMEM;
393 } 405 }
394 INIT_LIST_HEAD(&device->dev_alloc_list); 406 INIT_LIST_HEAD(&device->dev_alloc_list);
407
408 mutex_lock(&fs_devices->device_list_mutex);
395 list_add(&device->dev_list, &fs_devices->devices); 409 list_add(&device->dev_list, &fs_devices->devices);
410 mutex_unlock(&fs_devices->device_list_mutex);
411
396 device->fs_devices = fs_devices; 412 device->fs_devices = fs_devices;
397 fs_devices->num_devices++; 413 fs_devices->num_devices++;
398 } 414 }
@@ -418,10 +434,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
418 INIT_LIST_HEAD(&fs_devices->devices); 434 INIT_LIST_HEAD(&fs_devices->devices);
419 INIT_LIST_HEAD(&fs_devices->alloc_list); 435 INIT_LIST_HEAD(&fs_devices->alloc_list);
420 INIT_LIST_HEAD(&fs_devices->list); 436 INIT_LIST_HEAD(&fs_devices->list);
437 mutex_init(&fs_devices->device_list_mutex);
421 fs_devices->latest_devid = orig->latest_devid; 438 fs_devices->latest_devid = orig->latest_devid;
422 fs_devices->latest_trans = orig->latest_trans; 439 fs_devices->latest_trans = orig->latest_trans;
423 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 440 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
424 441
442 mutex_lock(&orig->device_list_mutex);
425 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 443 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
426 device = kzalloc(sizeof(*device), GFP_NOFS); 444 device = kzalloc(sizeof(*device), GFP_NOFS);
427 if (!device) 445 if (!device)
@@ -443,8 +461,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
443 device->fs_devices = fs_devices; 461 device->fs_devices = fs_devices;
444 fs_devices->num_devices++; 462 fs_devices->num_devices++;
445 } 463 }
464 mutex_unlock(&orig->device_list_mutex);
446 return fs_devices; 465 return fs_devices;
447error: 466error:
467 mutex_unlock(&orig->device_list_mutex);
448 free_fs_devices(fs_devices); 468 free_fs_devices(fs_devices);
449 return ERR_PTR(-ENOMEM); 469 return ERR_PTR(-ENOMEM);
450} 470}
@@ -455,6 +475,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
455 475
456 mutex_lock(&uuid_mutex); 476 mutex_lock(&uuid_mutex);
457again: 477again:
478 mutex_lock(&fs_devices->device_list_mutex);
458 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 479 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
459 if (device->in_fs_metadata) 480 if (device->in_fs_metadata)
460 continue; 481 continue;
@@ -474,6 +495,7 @@ again:
474 kfree(device->name); 495 kfree(device->name);
475 kfree(device); 496 kfree(device);
476 } 497 }
498 mutex_unlock(&fs_devices->device_list_mutex);
477 499
478 if (fs_devices->seed) { 500 if (fs_devices->seed) {
479 fs_devices = fs_devices->seed; 501 fs_devices = fs_devices->seed;
@@ -594,6 +616,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
594 device->in_fs_metadata = 0; 616 device->in_fs_metadata = 0;
595 device->mode = flags; 617 device->mode = flags;
596 618
619 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
620 fs_devices->rotating = 1;
621
597 fs_devices->open_devices++; 622 fs_devices->open_devices++;
598 if (device->writeable) { 623 if (device->writeable) {
599 fs_devices->rw_devices++; 624 fs_devices->rw_devices++;
@@ -696,7 +721,8 @@ error:
696 */ 721 */
697static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, 722static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
698 struct btrfs_device *device, 723 struct btrfs_device *device,
699 u64 num_bytes, u64 *start) 724 u64 num_bytes, u64 *start,
725 u64 *max_avail)
700{ 726{
701 struct btrfs_key key; 727 struct btrfs_key key;
702 struct btrfs_root *root = device->dev_root; 728 struct btrfs_root *root = device->dev_root;
@@ -733,9 +759,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
733 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 759 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
734 if (ret < 0) 760 if (ret < 0)
735 goto error; 761 goto error;
736 ret = btrfs_previous_item(root, path, 0, key.type); 762 if (ret > 0) {
737 if (ret < 0) 763 ret = btrfs_previous_item(root, path, key.objectid, key.type);
738 goto error; 764 if (ret < 0)
765 goto error;
766 if (ret > 0)
767 start_found = 1;
768 }
739 l = path->nodes[0]; 769 l = path->nodes[0];
740 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 770 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
741 while (1) { 771 while (1) {
@@ -778,6 +808,10 @@ no_more_items:
778 if (last_byte < search_start) 808 if (last_byte < search_start)
779 last_byte = search_start; 809 last_byte = search_start;
780 hole_size = key.offset - last_byte; 810 hole_size = key.offset - last_byte;
811
812 if (hole_size > *max_avail)
813 *max_avail = hole_size;
814
781 if (key.offset > last_byte && 815 if (key.offset > last_byte &&
782 hole_size >= num_bytes) { 816 hole_size >= num_bytes) {
783 *start = last_byte; 817 *start = last_byte;
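
Editor's note: find_free_dev_extent() now also reports, via *max_avail, the largest hole it saw even when no hole satisfies the request, so the chunk allocator can retry with a smaller target instead of guessing. The bookkeeping, reduced to a self-contained sketch (the array-of-holes form is illustrative):

	#include <linux/errno.h>
	#include <linux/types.h>

	static int scan_holes(const u64 *hole, int n, u64 need,
			      int *pick, u64 *max_avail)
	{
		int i;

		*max_avail = 0;
		for (i = 0; i < n; i++) {
			if (hole[i] > *max_avail)
				*max_avail = hole[i];	/* remember best hole */
			if (hole[i] >= need) {
				*pick = i;
				return 0;
			}
		}
		return -ENOSPC;	/* caller may retry with a smaller 'need' */
	}
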
@@ -1121,12 +1155,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1121 1155
1122 device = NULL; 1156 device = NULL;
1123 devices = &root->fs_info->fs_devices->devices; 1157 devices = &root->fs_info->fs_devices->devices;
1158 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1124 list_for_each_entry(tmp, devices, dev_list) { 1159 list_for_each_entry(tmp, devices, dev_list) {
1125 if (tmp->in_fs_metadata && !tmp->bdev) { 1160 if (tmp->in_fs_metadata && !tmp->bdev) {
1126 device = tmp; 1161 device = tmp;
1127 break; 1162 break;
1128 } 1163 }
1129 } 1164 }
1165 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1130 bdev = NULL; 1166 bdev = NULL;
1131 bh = NULL; 1167 bh = NULL;
1132 disk_super = NULL; 1168 disk_super = NULL;
@@ -1181,7 +1217,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1181 goto error_brelse; 1217 goto error_brelse;
1182 1218
1183 device->in_fs_metadata = 0; 1219 device->in_fs_metadata = 0;
1220
1221 /*
1222 * the device list mutex makes sure that we don't change
1223 * the device list while someone else is writing out all
1224 * the device supers.
1225 */
1226 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1184 list_del_init(&device->dev_list); 1227 list_del_init(&device->dev_list);
1228 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1229
1185 device->fs_devices->num_devices--; 1230 device->fs_devices->num_devices--;
1186 1231
1187 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1232 next_device = list_entry(root->fs_info->fs_devices->devices.next,
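
Editor's note: the comment above states the invariant: device_list_mutex serializes list membership changes against the super-block writer that walks the same list. A hedged sketch of the walker side of that contract (the function name is hypothetical; the real writer lives in disk-io.c):

	static void write_supers_sketch(struct btrfs_fs_devices *fs_devices)
	{
		struct btrfs_device *dev;

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry(dev, &fs_devices->devices, dev_list) {
			if (!dev->bdev || !dev->in_fs_metadata)
				continue;	/* skip missing devices */
			/* ... write this device's superblock copies ... */
		}
		mutex_unlock(&fs_devices->device_list_mutex);
	}
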
@@ -1275,6 +1320,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1275 seed_devices->opened = 1; 1320 seed_devices->opened = 1;
1276 INIT_LIST_HEAD(&seed_devices->devices); 1321 INIT_LIST_HEAD(&seed_devices->devices);
1277 INIT_LIST_HEAD(&seed_devices->alloc_list); 1322 INIT_LIST_HEAD(&seed_devices->alloc_list);
1323 mutex_init(&seed_devices->device_list_mutex);
1278 list_splice_init(&fs_devices->devices, &seed_devices->devices); 1324 list_splice_init(&fs_devices->devices, &seed_devices->devices);
1279 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1325 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1280 list_for_each_entry(device, &seed_devices->devices, dev_list) { 1326 list_for_each_entry(device, &seed_devices->devices, dev_list) {
@@ -1400,6 +1446,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1400 mutex_lock(&root->fs_info->volume_mutex); 1446 mutex_lock(&root->fs_info->volume_mutex);
1401 1447
1402 devices = &root->fs_info->fs_devices->devices; 1448 devices = &root->fs_info->fs_devices->devices;
1449 /*
1450 * we have the volume lock, so we don't need the extra
1451 * device list mutex while reading the list here.
1452 */
1403 list_for_each_entry(device, devices, dev_list) { 1453 list_for_each_entry(device, devices, dev_list) {
1404 if (device->bdev == bdev) { 1454 if (device->bdev == bdev) {
1405 ret = -EEXIST; 1455 ret = -EEXIST;
@@ -1454,6 +1504,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1454 } 1504 }
1455 1505
1456 device->fs_devices = root->fs_info->fs_devices; 1506 device->fs_devices = root->fs_info->fs_devices;
1507
1508 /*
1509 * we don't want write_supers to jump in here with our device
1510 * half setup
1511 */
1512 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1457 list_add(&device->dev_list, &root->fs_info->fs_devices->devices); 1513 list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1458 list_add(&device->dev_alloc_list, 1514 list_add(&device->dev_alloc_list,
1459 &root->fs_info->fs_devices->alloc_list); 1515 &root->fs_info->fs_devices->alloc_list);
@@ -1462,6 +1518,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1462 root->fs_info->fs_devices->rw_devices++; 1518 root->fs_info->fs_devices->rw_devices++;
1463 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1519 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1464 1520
1521 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1522 root->fs_info->fs_devices->rotating = 1;
1523
1465 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1524 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1466 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1525 btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1467 total_bytes + device->total_bytes); 1526 total_bytes + device->total_bytes);
@@ -1469,6 +1528,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1469 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1528 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1470 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1529 btrfs_set_super_num_devices(&root->fs_info->super_copy,
1471 total_bytes + 1); 1530 total_bytes + 1);
1531 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1472 1532
1473 if (seeding_dev) { 1533 if (seeding_dev) {
1474 ret = init_first_rw_device(trans, root, device); 1534 ret = init_first_rw_device(trans, root, device);
@@ -1570,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1570 device->fs_devices->total_rw_bytes += diff; 1630 device->fs_devices->total_rw_bytes += diff;
1571 1631
1572 device->total_bytes = new_size; 1632 device->total_bytes = new_size;
1633 device->disk_total_bytes = new_size;
1573 btrfs_clear_space_info_full(device->dev_root->fs_info); 1634 btrfs_clear_space_info_full(device->dev_root->fs_info);
1574 1635
1575 return btrfs_update_device(trans, device); 1636 return btrfs_update_device(trans, device);
@@ -1671,8 +1732,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1671 int ret; 1732 int ret;
1672 int i; 1733 int i;
1673 1734
1674 printk(KERN_INFO "btrfs relocating chunk %llu\n",
1675 (unsigned long long)chunk_offset);
1676 root = root->fs_info->chunk_root; 1735 root = root->fs_info->chunk_root;
1677 extent_root = root->fs_info->extent_root; 1736 extent_root = root->fs_info->extent_root;
1678 em_tree = &root->fs_info->mapping_tree.map_tree; 1737 em_tree = &root->fs_info->mapping_tree.map_tree;
@@ -1958,7 +2017,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1958 goto done; 2017 goto done;
1959 if (ret) { 2018 if (ret) {
1960 ret = 0; 2019 ret = 0;
1961 goto done; 2020 break;
1962 } 2021 }
1963 2022
1964 l = path->nodes[0]; 2023 l = path->nodes[0];
@@ -1966,7 +2025,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1966 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2025 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1967 2026
1968 if (key.objectid != device->devid) 2027 if (key.objectid != device->devid)
1969 goto done; 2028 break;
1970 2029
1971 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2030 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1972 length = btrfs_dev_extent_length(l, dev_extent); 2031 length = btrfs_dev_extent_length(l, dev_extent);
@@ -2122,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2122 max_chunk_size); 2181 max_chunk_size);
2123 2182
2124again: 2183again:
2184 max_avail = 0;
2125 if (!map || map->num_stripes != num_stripes) { 2185 if (!map || map->num_stripes != num_stripes) {
2126 kfree(map); 2186 kfree(map);
2127 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2187 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -2170,7 +2230,8 @@ again:
2170 2230
2171 if (device->in_fs_metadata && avail >= min_free) { 2231 if (device->in_fs_metadata && avail >= min_free) {
2172 ret = find_free_dev_extent(trans, device, 2232 ret = find_free_dev_extent(trans, device,
2173 min_free, &dev_offset); 2233 min_free, &dev_offset,
2234 &max_avail);
2174 if (ret == 0) { 2235 if (ret == 0) {
2175 list_move_tail(&device->dev_alloc_list, 2236 list_move_tail(&device->dev_alloc_list,
2176 &private_devs); 2237 &private_devs);
@@ -2746,26 +2807,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2746 } 2807 }
2747 } 2808 }
2748 2809
2749 for (i = 0; i > nr; i++) {
2750 struct btrfs_multi_bio *multi;
2751 struct btrfs_bio_stripe *stripe;
2752 int ret;
2753
2754 length = 1;
2755 ret = btrfs_map_block(map_tree, WRITE, buf[i],
2756 &length, &multi, 0);
2757 BUG_ON(ret);
2758
2759 stripe = multi->stripes;
2760 for (j = 0; j < multi->num_stripes; j++) {
2761 if (stripe->physical >= physical &&
2762 physical < stripe->physical + length)
2763 break;
2764 }
2765 BUG_ON(j >= multi->num_stripes);
2766 kfree(multi);
2767 }
2768
2769 *logical = buf; 2810 *logical = buf;
2770 *naddrs = nr; 2811 *naddrs = nr;
2771 *stripe_len = map->stripe_len; 2812 *stripe_len = map->stripe_len;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5c3ff6d02fd7..5139a833f721 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -96,7 +96,12 @@ struct btrfs_fs_devices {
96 u64 rw_devices; 96 u64 rw_devices;
97 u64 total_rw_bytes; 97 u64 total_rw_bytes;
98 struct block_device *latest_bdev; 98 struct block_device *latest_bdev;
99 /* all of the devices in the FS */ 99
100 /* all of the devices in the FS, protected by a mutex
101 * so we can safely walk it to write out the supers without
102 * worrying about add/remove by the multi-device code
103 */
104 struct mutex device_list_mutex;
100 struct list_head devices; 105 struct list_head devices;
101 106
102 /* devices not currently being allocated */ 107 /* devices not currently being allocated */
@@ -107,6 +112,11 @@ struct btrfs_fs_devices {
107 int seeding; 112 int seeding;
108 113
109 int opened; 114 int opened;
115
116 /* set when we find or add a device that doesn't have the
117 * nonrot flag set
118 */
119 int rotating;
110}; 120};
111 121
112struct btrfs_bio_stripe { 122struct btrfs_bio_stripe {
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ecfbce836d32..3e2b90eaa239 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -208,7 +208,7 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
208 *total_in = 0; 208 *total_in = 0;
209 209
210 workspace = find_zlib_workspace(); 210 workspace = find_zlib_workspace();
211 if (!workspace) 211 if (IS_ERR(workspace))
212 return -1; 212 return -1;
213 213
214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
@@ -366,7 +366,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
366 char *kaddr; 366 char *kaddr;
367 367
368 workspace = find_zlib_workspace(); 368 workspace = find_zlib_workspace();
369 if (!workspace) 369 if (IS_ERR(workspace))
370 return -ENOMEM; 370 return -ENOMEM;
371 371
372 data_in = kmap(pages_in[page_in_index]); 372 data_in = kmap(pages_in[page_in_index]);
@@ -547,7 +547,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
547 return -ENOMEM; 547 return -ENOMEM;
548 548
549 workspace = find_zlib_workspace(); 549 workspace = find_zlib_workspace();
550 if (!workspace) 550 if (IS_ERR(workspace))
551 return -ENOMEM; 551 return -ENOMEM;
552 552
553 workspace->inf_strm.next_in = data_in; 553 workspace->inf_strm.next_in = data_in;
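
Editor's note: all three zlib call sites had the same latent bug: find_zlib_workspace() signals failure with ERR_PTR(), for which a NULL test is always false. The general idiom the fix restores, as a standalone sketch (the allocator here is hypothetical):

	#include <linux/err.h>
	#include <linux/slab.h>

	struct ws { int scratch; };

	static struct ws *get_ws(void)
	{
		struct ws *w = kzalloc(sizeof(*w), GFP_NOFS);

		if (!w)
			return ERR_PTR(-ENOMEM);	/* not NULL! */
		return w;
	}

	static int use_ws(void)
	{
		struct ws *w = get_ws();

		if (IS_ERR(w))		/* a NULL check would miss this */
			return PTR_ERR(w);
		/* ... use w ... */
		kfree(w);
		return 0;
	}
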
diff --git a/fs/buffer.c b/fs/buffer.c
index 49106127a4aa..28f320fac4d4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1085,12 +1085,12 @@ static struct buffer_head *
1085__getblk_slow(struct block_device *bdev, sector_t block, int size) 1085__getblk_slow(struct block_device *bdev, sector_t block, int size)
1086{ 1086{
1087 /* Size must be multiple of hard sectorsize */ 1087 /* Size must be multiple of hard sectorsize */
1088 if (unlikely(size & (bdev_hardsect_size(bdev)-1) || 1088 if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1089 (size < 512 || size > PAGE_SIZE))) { 1089 (size < 512 || size > PAGE_SIZE))) {
1090 printk(KERN_ERR "getblk(): invalid block size %d requested\n", 1090 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1091 size); 1091 size);
1092 printk(KERN_ERR "hardsect size: %d\n", 1092 printk(KERN_ERR "logical block size: %d\n",
1093 bdev_hardsect_size(bdev)); 1093 bdev_logical_block_size(bdev));
1094 1094
1095 dump_stack(); 1095 dump_stack();
1096 return NULL; 1096 return NULL;
@@ -1165,8 +1165,11 @@ void mark_buffer_dirty(struct buffer_head *bh)
1165 1165
1166 if (!test_set_buffer_dirty(bh)) { 1166 if (!test_set_buffer_dirty(bh)) {
1167 struct page *page = bh->b_page; 1167 struct page *page = bh->b_page;
1168 if (!TestSetPageDirty(page)) 1168 if (!TestSetPageDirty(page)) {
1169 __set_page_dirty(page, page_mapping(page), 0); 1169 struct address_space *mapping = page_mapping(page);
1170 if (mapping)
1171 __set_page_dirty(page, mapping, 0);
1172 }
1170 } 1173 }
1171} 1174}
1172 1175
@@ -2935,6 +2938,8 @@ int submit_bh(int rw, struct buffer_head * bh)
2935 BUG_ON(!buffer_locked(bh)); 2938 BUG_ON(!buffer_locked(bh));
2936 BUG_ON(!buffer_mapped(bh)); 2939 BUG_ON(!buffer_mapped(bh));
2937 BUG_ON(!bh->b_end_io); 2940 BUG_ON(!bh->b_end_io);
2941 BUG_ON(buffer_delay(bh));
2942 BUG_ON(buffer_unwritten(bh));
2938 2943
2939 /* 2944 /*
2940 * Mask in barrier bit for a write (could be either a WRITE or a 2945 * Mask in barrier bit for a write (could be either a WRITE or a
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 1e962348d111..431accd475a7 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -354,7 +354,9 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache)
354 /* make sure all pages pinned by operations on behalf of the netfs are 354 /* make sure all pages pinned by operations on behalf of the netfs are
355 * written to disc */ 355 * written to disc */
356 cachefiles_begin_secure(cache, &saved_cred); 356 cachefiles_begin_secure(cache, &saved_cred);
357 ret = fsync_super(cache->mnt->mnt_sb); 357 down_read(&cache->mnt->mnt_sb->s_umount);
358 ret = sync_filesystem(cache->mnt->mnt_sb);
359 up_read(&cache->mnt->mnt_sb->s_umount);
358 cachefiles_end_secure(cache, saved_cred); 360 cachefiles_end_secure(cache, saved_cred);
359 361
360 if (ret == -EIO) 362 if (ret == -EIO)
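
Editor's note: fsync_super() is gone; its replacement, sync_filesystem(), expects the caller to hold the superblock's s_umount, which is why the call is now bracketed by down_read/up_read. The locking contract, condensed (the helper name is illustrative):

	static int sync_sb(struct super_block *sb)
	{
		int ret;

		/* sync_filesystem() must run with s_umount held */
		down_read(&sb->s_umount);
		ret = sync_filesystem(sb);
		up_read(&sb->s_umount);
		return ret;
	}
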
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 38f71222a552..a173551e19d7 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -13,7 +13,6 @@
13#include <linux/major.h> 13#include <linux/major.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/smp_lock.h>
17#include <linux/seq_file.h> 16#include <linux/seq_file.h>
18 17
19#include <linux/kobject.h> 18#include <linux/kobject.h>
@@ -375,7 +374,6 @@ static int chrdev_open(struct inode *inode, struct file *filp)
375 p = inode->i_cdev; 374 p = inode->i_cdev;
376 if (!p) { 375 if (!p) {
377 inode->i_cdev = p = new; 376 inode->i_cdev = p = new;
378 inode->i_cindex = idx;
379 list_add(&inode->i_devices, &p->list); 377 list_add(&inode->i_devices, &p->list);
380 new = NULL; 378 new = NULL;
381 } else if (!cdev_get(p)) 379 } else if (!cdev_get(p))
@@ -405,6 +403,18 @@ static int chrdev_open(struct inode *inode, struct file *filp)
405 return ret; 403 return ret;
406} 404}
407 405
406int cdev_index(struct inode *inode)
407{
408 int idx;
409 struct kobject *kobj;
410
411 kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
412 if (!kobj)
413 return -1;
414 kobject_put(kobj);
415 return idx;
416}
417
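
The new cdev_index() replaces the i_cindex caching removed from chrdev_open() above: kobj_lookup() walks the cdev map for inode->i_rdev and returns the inode's offset into the matching device's minor range, or -1 if nothing is registered. A hedged sketch of a hypothetical caller (kernel context, not standalone):

/* hypothetical caller: recover the 0-based index of a character
 * device inode within its cdev's minor range; assumes the inode
 * really is a char-device node */
static int example_get_index(struct inode *inode)
{
        int idx = cdev_index(inode);

        if (idx < 0)
                return -ENODEV;   /* no cdev registered for i_rdev */
        return idx;
}
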
408void cd_forget(struct inode *inode) 418void cd_forget(struct inode *inode)
409{ 419{
410 spin_lock(&cdev_lock); 420 spin_lock(&cdev_lock);
@@ -557,6 +567,7 @@ EXPORT_SYMBOL(cdev_init);
557EXPORT_SYMBOL(cdev_alloc); 567EXPORT_SYMBOL(cdev_alloc);
558EXPORT_SYMBOL(cdev_del); 568EXPORT_SYMBOL(cdev_del);
559EXPORT_SYMBOL(cdev_add); 569EXPORT_SYMBOL(cdev_add);
570EXPORT_SYMBOL(cdev_index);
560EXPORT_SYMBOL(register_chrdev); 571EXPORT_SYMBOL(register_chrdev);
561EXPORT_SYMBOL(unregister_chrdev); 572EXPORT_SYMBOL(unregister_chrdev);
562EXPORT_SYMBOL(directly_mappable_cdev_bdi); 573EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index f20c4069c220..e85b1e4389e0 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,23 @@
1Version 1.60
2-------------
3Fix memory leak in reconnect. Fix oops in DFS mount error path.
4Set s_maxbytes to a smaller value (the max that the VFS can handle) so
5that sendfile works over cifs mounts again. Add noforcegid
6and noforceuid mount parameters.
7
8Version 1.59
9------------
10Client uses server inode numbers (which are persistent) rather than
11client-generated ones by default (mount option "serverino" turned
12on by default if the server supports it). Add forceuid and forcegid
13mount options (so that, when unix extensions are negotiated, specifying
14which uid mounted does not immediately force the server's reported
15uids to be overridden). Add support for the scope mount parameter. Improve
16hard link detection to use the same inode for both links. Do not set
17read-only dos attribute on directories (for chmod) since Windows
18explorer special cases this attribute bit for directories for
19a different purpose.
20
1Version 1.58 21Version 1.58
2------------ 22------------
3Guard against buffer overruns in various UCS-2 to UTF-8 string conversions 23Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
@@ -10,6 +30,8 @@ we converted from). Fix endianness of the vcnum field used during
10session setup to distinguish multiple mounts to same server from different 30session setup to distinguish multiple mounts to same server from different
11userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental 31userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental
12flag to be set to 2, and mount must enable krb5 to turn on extended security). 32flag to be set to 2, and mount must enable krb5 to turn on extended security).
33Performance of file create to Samba improved (posix create on lookup
34removes 1 of 2 network requests sent on file create).
13 35
14Version 1.57 36Version 1.57
15------------ 37------------
diff --git a/fs/cifs/README b/fs/cifs/README
index db208ddb9899..79c1a93400be 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -262,10 +262,11 @@ A partial list of the supported mount options follows:
262 mount. 262 mount.
263 domain Set the SMB/CIFS workgroup name prepended to the 263 domain Set the SMB/CIFS workgroup name prepended to the
264 username during CIFS session establishment 264 username during CIFS session establishment
265 uid Set the default uid for inodes. For mounts to servers 265 forceuid Set the default uid for inodes to the uid
266 passed in on mount. For mounts to servers
266 which do support the CIFS Unix extensions, such as a 267 which do support the CIFS Unix extensions, such as a
267 properly configured Samba server, the server provides 268 properly configured Samba server, the server provides
268 the uid, gid and mode so this parameter should not be 269 the uid, gid and mode so this parameter should not be
269 specified unless the server and client uid and gid 270 specified unless the server and client uid and gid
270 numbering differ. If the server and client are in the 271 numbering differ. If the server and client are in the
271 same domain (e.g. running winbind or nss_ldap) and 272 same domain (e.g. running winbind or nss_ldap) and
@@ -277,11 +278,7 @@ A partial list of the supported mount options follows:
277 of existing files will be the uid (gid) of the person 278 of existing files will be the uid (gid) of the person
278 who executed the mount (root, except when mount.cifs 279 who executed the mount (root, except when mount.cifs
279 is configured setuid for user mounts) unless the "uid=" 280 is configured setuid for user mounts) unless the "uid="
280 (gid) mount option is specified. For the uid (gid) of newly 281 (gid) mount option is specified. Also note that permission
281 created files and directories, ie files created since
282 the last mount of the server share, the expected uid
283 (gid) is cached as long as the inode remains in
284 memory on the client. Also note that permission
285 checks (authorization checks) on accesses to a file occur 282 checks (authorization checks) on accesses to a file occur
286 at the server, but there are cases in which an administrator 283 at the server, but there are cases in which an administrator
287 may want to restrict at the client as well. For those 284 may want to restrict at the client as well. For those
@@ -289,9 +286,18 @@ A partial list of the supported mount options follows:
289 (such as Windows), permissions can also be checked at the 286 (such as Windows), permissions can also be checked at the
290 client, and a crude form of client side permission checking 287 client, and a crude form of client side permission checking
291 can be enabled by specifying file_mode and dir_mode on 288 can be enabled by specifying file_mode and dir_mode on
292 the client. Note that the mount.cifs helper must be 289 the client. (default)
293 at version 1.10 or higher to support specifying the uid 290 forcegid (similar to above but for the groupid instead of uid) (default)
294 (or gid) in non-numeric form. 291 noforceuid Fill in file owner information (uid) by requesting it from
292 the server if possible. With this option, the value given in
293 the uid= option (on mount) will only be used if the server
294 cannot support returning uids on inodes.
295 noforcegid (similar to above but for the group owner, gid, instead of uid)
296 uid Set the default uid for inodes, and indicate to the
297 cifs kernel driver which local user mounted. If the server
298 supports the unix extensions, the default uid is
299 not used to fill in the owner fields of inodes (files)
300 unless the "forceuid" parameter is specified.
295 gid Set the default gid for inodes (similar to above). 301 gid Set the default gid for inodes (similar to above).
296 file_mode If CIFS Unix extensions are not supported by the server 302 file_mode If CIFS Unix extensions are not supported by the server
297 this overrides the default mode for file inodes. 303 this overrides the default mode for file inodes.
@@ -388,8 +394,13 @@ A partial list of the supported mount options follows:
388 or the CIFS Unix Extensions equivalent and for those 394 or the CIFS Unix Extensions equivalent and for those
389 this mount option will have no effect. Exporting cifs mounts 395 this mount option will have no effect. Exporting cifs mounts
390 under nfsd requires this mount option on the cifs mount. 396 under nfsd requires this mount option on the cifs mount.
397 This is now the default if the server supports the
398 required network operation.
391 noserverino Client generates inode numbers (rather than using the actual one 399 noserverino Client generates inode numbers (rather than using the actual one
392 from the server) by default. 400 from the server). These inode numbers will vary after
401 unmount or reboot, which can confuse some applications;
402 however, not all server filesystems support unique inode
403 numbers.
393 setuids If the CIFS Unix extensions are negotiated with the server 404 setuids If the CIFS Unix extensions are negotiated with the server
394 the client will attempt to set the effective uid and gid of 405 the client will attempt to set the effective uid and gid of
395 the local process on newly created files, directories, and 406 the local process on newly created files, directories, and
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 1b09f1670061..20692fbfdb24 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -49,6 +49,7 @@
49#define ASN1_OJI 6 /* Object Identifier */ 49#define ASN1_OJI 6 /* Object Identifier */
50#define ASN1_OJD 7 /* Object Description */ 50#define ASN1_OJD 7 /* Object Description */
51#define ASN1_EXT 8 /* External */ 51#define ASN1_EXT 8 /* External */
52#define ASN1_ENUM 10 /* Enumerated */
52#define ASN1_SEQ 16 /* Sequence */ 53#define ASN1_SEQ 16 /* Sequence */
53#define ASN1_SET 17 /* Set */ 54#define ASN1_SET 17 /* Set */
54#define ASN1_NUMSTR 18 /* Numerical String */ 55#define ASN1_NUMSTR 18 /* Numerical String */
@@ -78,10 +79,12 @@
78#define SPNEGO_OID_LEN 7 79#define SPNEGO_OID_LEN 7
79#define NTLMSSP_OID_LEN 10 80#define NTLMSSP_OID_LEN 10
80#define KRB5_OID_LEN 7 81#define KRB5_OID_LEN 7
82#define KRB5U2U_OID_LEN 8
81#define MSKRB5_OID_LEN 7 83#define MSKRB5_OID_LEN 7
82static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 }; 84static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 };
83static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 }; 85static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 };
84static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 }; 86static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 };
87static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 };
85static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 }; 88static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 };
86 89
87/* 90/*
@@ -122,6 +125,28 @@ asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
122 return 1; 125 return 1;
123} 126}
124 127
128#if 0 /* will be needed later by spnego decoding/encoding of ntlmssp */
129static unsigned char
130asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val)
131{
132 unsigned char ch;
133
134 if (ctx->pointer >= ctx->end) {
135 ctx->error = ASN1_ERR_DEC_EMPTY;
136 return 0;
137 }
138
139 ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */
140 if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */
141 *val = *(++(ctx->pointer)); /* value has enum value */
142 else
143 return 0;
144
145 ctx->pointer++;
146 return 1;
147}
148#endif
149
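
The disabled helper above assumes the minimal DER encoding of an ASN.1 ENUMERATED value: one tag octet (0x0a), one length octet, then the value itself. A standalone illustration of that byte layout, using a made-up blob:

#include <stdio.h>

int main(void)
{
        /* DER ENUMERATED: tag 0x0a, length 0x01, one value octet */
        unsigned char blob[] = { 0x0a, 0x01, 0x03 };

        if (blob[0] == 0x0a && blob[1] == 1)
                printf("enum value = %u\n", blob[2]);
        return 0;
}
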
125static unsigned char 150static unsigned char
126asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag) 151asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
127{ 152{
@@ -476,10 +501,9 @@ decode_negTokenInit(unsigned char *security_blob, int length,
476 unsigned int cls, con, tag, oidlen, rc; 501 unsigned int cls, con, tag, oidlen, rc;
477 bool use_ntlmssp = false; 502 bool use_ntlmssp = false;
478 bool use_kerberos = false; 503 bool use_kerberos = false;
504 bool use_kerberosu2u = false;
479 bool use_mskerberos = false; 505 bool use_mskerberos = false;
480 506
481 *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/
482
483 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */ 507 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */
484 508
485 asn1_open(&ctx, security_blob, length); 509 asn1_open(&ctx, security_blob, length);
@@ -515,6 +539,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
515 return 0; 539 return 0;
516 } 540 }
517 541
542 /* SPNEGO */
518 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 543 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
519 cFYI(1, ("Error decoding negTokenInit")); 544 cFYI(1, ("Error decoding negTokenInit"));
520 return 0; 545 return 0;
@@ -526,6 +551,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
526 return 0; 551 return 0;
527 } 552 }
528 553
554 /* negTokenInit */
529 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 555 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
530 cFYI(1, ("Error decoding negTokenInit")); 556 cFYI(1, ("Error decoding negTokenInit"));
531 return 0; 557 return 0;
@@ -537,6 +563,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
537 return 0; 563 return 0;
538 } 564 }
539 565
566 /* sequence */
540 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 567 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
541 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 568 cFYI(1, ("Error decoding 2nd part of negTokenInit"));
542 return 0; 569 return 0;
@@ -548,6 +575,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
548 return 0; 575 return 0;
549 } 576 }
550 577
578 /* sequence of */
551 if (asn1_header_decode 579 if (asn1_header_decode
552 (&ctx, &sequence_end, &cls, &con, &tag) == 0) { 580 (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
553 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 581 cFYI(1, ("Error decoding 2nd part of negTokenInit"));
@@ -560,6 +588,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
560 return 0; 588 return 0;
561 } 589 }
562 590
591 /* list of security mechanisms */
563 while (!asn1_eoc_decode(&ctx, sequence_end)) { 592 while (!asn1_eoc_decode(&ctx, sequence_end)) {
564 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); 593 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
565 if (!rc) { 594 if (!rc) {
@@ -576,11 +605,15 @@ decode_negTokenInit(unsigned char *security_blob, int length,
576 605
577 if (compare_oid(oid, oidlen, MSKRB5_OID, 606 if (compare_oid(oid, oidlen, MSKRB5_OID,
578 MSKRB5_OID_LEN) && 607 MSKRB5_OID_LEN) &&
579 !use_kerberos) 608 !use_mskerberos)
580 use_mskerberos = true; 609 use_mskerberos = true;
610 else if (compare_oid(oid, oidlen, KRB5U2U_OID,
611 KRB5U2U_OID_LEN) &&
612 !use_kerberosu2u)
613 use_kerberosu2u = true;
581 else if (compare_oid(oid, oidlen, KRB5_OID, 614 else if (compare_oid(oid, oidlen, KRB5_OID,
582 KRB5_OID_LEN) && 615 KRB5_OID_LEN) &&
583 !use_mskerberos) 616 !use_kerberos)
584 use_kerberos = true; 617 use_kerberos = true;
585 else if (compare_oid(oid, oidlen, NTLMSSP_OID, 618 else if (compare_oid(oid, oidlen, NTLMSSP_OID,
586 NTLMSSP_OID_LEN)) 619 NTLMSSP_OID_LEN))
@@ -593,7 +626,12 @@ decode_negTokenInit(unsigned char *security_blob, int length,
593 } 626 }
594 } 627 }
595 628
629 /* mechlistMIC */
596 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 630 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
631 /* Check if we have reached the end of the blob, but with
632 no mechListMic (e.g. NTLMSSP instead of KRB5) */
633 if (ctx.error == ASN1_ERR_DEC_EMPTY)
634 goto decode_negtoken_exit;
597 cFYI(1, ("Error decoding last part negTokenInit exit3")); 635 cFYI(1, ("Error decoding last part negTokenInit exit3"));
598 return 0; 636 return 0;
599 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { 637 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
@@ -602,6 +640,8 @@ decode_negTokenInit(unsigned char *security_blob, int length,
602 cls, con, tag, end, *end)); 640 cls, con, tag, end, *end));
603 return 0; 641 return 0;
604 } 642 }
643
644 /* sequence */
605 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 645 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
606 cFYI(1, ("Error decoding last part negTokenInit exit5")); 646 cFYI(1, ("Error decoding last part negTokenInit exit5"));
607 return 0; 647 return 0;
@@ -611,6 +651,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
611 cls, con, tag, end, *end)); 651 cls, con, tag, end, *end));
612 } 652 }
613 653
654 /* sequence of */
614 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 655 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
615 cFYI(1, ("Error decoding last part negTokenInit exit 7")); 656 cFYI(1, ("Error decoding last part negTokenInit exit 7"));
616 return 0; 657 return 0;
@@ -619,6 +660,8 @@ decode_negTokenInit(unsigned char *security_blob, int length,
619 cls, con, tag, end, *end)); 660 cls, con, tag, end, *end));
620 return 0; 661 return 0;
621 } 662 }
663
664 /* general string */
622 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 665 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
623 cFYI(1, ("Error decoding last part negTokenInit exit9")); 666 cFYI(1, ("Error decoding last part negTokenInit exit9"));
624 return 0; 667 return 0;
@@ -630,13 +673,13 @@ decode_negTokenInit(unsigned char *security_blob, int length,
630 } 673 }
631 cFYI(1, ("Need to call asn1_octets_decode() function for %s", 674 cFYI(1, ("Need to call asn1_octets_decode() function for %s",
632 ctx.pointer)); /* is this UTF-8 or ASCII? */ 675 ctx.pointer)); /* is this UTF-8 or ASCII? */
633 676decode_negtoken_exit:
634 if (use_kerberos) 677 if (use_kerberos)
635 *secType = Kerberos; 678 *secType = Kerberos;
636 else if (use_mskerberos) 679 else if (use_mskerberos)
637 *secType = MSKerberos; 680 *secType = MSKerberos;
638 else if (use_ntlmssp) 681 else if (use_ntlmssp)
639 *secType = NTLMSSP; 682 *secType = RawNTLMSSP;
640 683
641 return 1; 684 return 1;
642} 685}
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 7f19fefd3d45..42cec2a7c0cf 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -261,6 +261,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
261 atomic_set(&tcon->num_reads, 0); 261 atomic_set(&tcon->num_reads, 0);
262 atomic_set(&tcon->num_oplock_brks, 0); 262 atomic_set(&tcon->num_oplock_brks, 0);
263 atomic_set(&tcon->num_opens, 0); 263 atomic_set(&tcon->num_opens, 0);
264 atomic_set(&tcon->num_posixopens, 0);
265 atomic_set(&tcon->num_posixmkdirs, 0);
264 atomic_set(&tcon->num_closes, 0); 266 atomic_set(&tcon->num_closes, 0);
265 atomic_set(&tcon->num_deletes, 0); 267 atomic_set(&tcon->num_deletes, 0);
266 atomic_set(&tcon->num_mkdirs, 0); 268 atomic_set(&tcon->num_mkdirs, 0);
@@ -347,11 +349,15 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
347 atomic_read(&tcon->num_locks), 349 atomic_read(&tcon->num_locks),
348 atomic_read(&tcon->num_hardlinks), 350 atomic_read(&tcon->num_hardlinks),
349 atomic_read(&tcon->num_symlinks)); 351 atomic_read(&tcon->num_symlinks));
350 seq_printf(m, "\nOpens: %d Closes: %d" 352 seq_printf(m, "\nOpens: %d Closes: %d "
351 "Deletes: %d", 353 "Deletes: %d",
352 atomic_read(&tcon->num_opens), 354 atomic_read(&tcon->num_opens),
353 atomic_read(&tcon->num_closes), 355 atomic_read(&tcon->num_closes),
354 atomic_read(&tcon->num_deletes)); 356 atomic_read(&tcon->num_deletes));
357 seq_printf(m, "\nPosix Opens: %d "
358 "Posix Mkdirs: %d",
359 atomic_read(&tcon->num_posixopens),
360 atomic_read(&tcon->num_posixmkdirs));
355 seq_printf(m, "\nMkdirs: %d Rmdirs: %d", 361 seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
356 atomic_read(&tcon->num_mkdirs), 362 atomic_read(&tcon->num_mkdirs),
357 atomic_read(&tcon->num_rmdirs)); 363 atomic_read(&tcon->num_rmdirs));
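
The added trailing space in "Closes: %d " fixes run-together output: C concatenates adjacent string literals with nothing in between, so the old pair of literals printed "Closes: NDeletes: M". A two-line demonstration:

#include <stdio.h>

int main(void)
{
        printf("Opens: %d Closes: %d"
               "Deletes: %d\n", 1, 2, 3);    /* Closes: 2Deletes: 3 */
        printf("Opens: %d Closes: %d "
               "Deletes: %d\n", 1, 2, 3);    /* Closes: 2 Deletes: 3 */
        return 0;
}
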
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 83d62759c7c7..606912d8f2a8 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -55,7 +55,7 @@ void cifs_dfs_release_automount_timer(void)
55 * i.e. strips from UNC trailing path that is not part of share 55 * i.e. strips from UNC trailing path that is not part of share
56 * name and fixup missing '\' in the beginning of DFS node referral 56 * name and fixup missing '\' in the beginning of DFS node referral
57 * if necessary. 57 * if necessary.
58 * Returns pointer to share name on success or NULL on error. 58 * Returns pointer to share name on success or ERR_PTR on error.
59 * Caller is responsible for freeing returned string. 59 * Caller is responsible for freeing returned string.
60 */ 60 */
61static char *cifs_get_share_name(const char *node_name) 61static char *cifs_get_share_name(const char *node_name)
@@ -68,7 +68,7 @@ static char *cifs_get_share_name(const char *node_name)
68 UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */, 68 UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */,
69 GFP_KERNEL); 69 GFP_KERNEL);
70 if (!UNC) 70 if (!UNC)
71 return NULL; 71 return ERR_PTR(-ENOMEM);
72 72
73 /* get share name and server name */ 73 /* get share name and server name */
74 if (node_name[1] != '\\') { 74 if (node_name[1] != '\\') {
@@ -87,7 +87,7 @@ static char *cifs_get_share_name(const char *node_name)
87 cERROR(1, ("%s: no server name end in node name: %s", 87 cERROR(1, ("%s: no server name end in node name: %s",
88 __func__, node_name)); 88 __func__, node_name));
89 kfree(UNC); 89 kfree(UNC);
90 return NULL; 90 return ERR_PTR(-EINVAL);
91 } 91 }
92 92
93 /* find sharename end */ 93 /* find sharename end */
@@ -133,6 +133,12 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
133 return ERR_PTR(-EINVAL); 133 return ERR_PTR(-EINVAL);
134 134
135 *devname = cifs_get_share_name(ref->node_name); 135 *devname = cifs_get_share_name(ref->node_name);
136 if (IS_ERR(*devname)) {
137 rc = PTR_ERR(*devname);
138 *devname = NULL;
139 goto compose_mount_options_err;
140 }
141
136 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
137 if (rc != 0) { 143 if (rc != 0) {
138 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", 144 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
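
cifs_get_share_name() now reports failure with the kernel's ERR_PTR convention: a negative errno is encoded in the pointer value itself, so the caller above can distinguish -ENOMEM from -EINVAL via IS_ERR()/PTR_ERR(). A simplified, self-contained restatement of the idiom (the real macros live in include/linux/err.h):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095  /* errnos fit in the top page of the address space */

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static const void *get_share_name(const char *node_name)
{
        if (!node_name)
                return ERR_PTR(-EINVAL);  /* bad input, not out of memory */
        return node_name;
}

int main(void)
{
        const void *p = get_share_name(NULL);

        if (IS_ERR(p))
                printf("failed: %ld\n", PTR_ERR(p));  /* failed: -22 */
        return 0;
}
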
@@ -275,7 +281,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
275 case -EBUSY: 281 case -EBUSY:
276 /* someone else made a mount here whilst we were busy */ 282 /* someone else made a mount here whilst we were busy */
277 while (d_mountpoint(nd->path.dentry) && 283 while (d_mountpoint(nd->path.dentry) &&
278 follow_down(&nd->path.mnt, &nd->path.dentry)) 284 follow_down(&nd->path))
279 ; 285 ;
280 err = 0; 286 err = 0;
281 default: 287 default:
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 67bf93a40d2e..051caecf7d67 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -23,6 +23,7 @@
23#include <linux/string.h> 23#include <linux/string.h>
24#include <keys/user-type.h> 24#include <keys/user-type.h>
25#include <linux/key-type.h> 25#include <linux/key-type.h>
26#include <linux/inet.h>
26#include "cifsglob.h" 27#include "cifsglob.h"
27#include "cifs_spnego.h" 28#include "cifs_spnego.h"
28#include "cifs_debug.h" 29#include "cifs_debug.h"
@@ -73,9 +74,6 @@ struct key_type cifs_spnego_key_type = {
73 * strlen(";sec=ntlmsspi") */ 74 * strlen(";sec=ntlmsspi") */
74#define MAX_MECH_STR_LEN 13 75#define MAX_MECH_STR_LEN 13
75 76
76/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/128 */
77#define MAX_IPV6_ADDR_LEN 43
78
79/* strlen of "host=" */ 77/* strlen of "host=" */
80#define HOST_KEY_LEN 5 78#define HOST_KEY_LEN 5
81 79
@@ -88,6 +86,9 @@ struct key_type cifs_spnego_key_type = {
88/* strlen of ";user=" */ 86/* strlen of ";user=" */
89#define USER_KEY_LEN 6 87#define USER_KEY_LEN 6
90 88
89/* strlen of ";pid=0x" */
90#define PID_KEY_LEN 7
91
91/* get a key struct with a SPNEGO security blob, suitable for session setup */ 92/* get a key struct with a SPNEGO security blob, suitable for session setup */
92struct key * 93struct key *
93cifs_get_spnego_key(struct cifsSesInfo *sesInfo) 94cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
@@ -102,10 +103,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
102 host=hostname sec=mechanism uid=0xFF user=username */ 103 host=hostname sec=mechanism uid=0xFF user=username */
103 desc_len = MAX_VER_STR_LEN + 104 desc_len = MAX_VER_STR_LEN +
104 HOST_KEY_LEN + strlen(hostname) + 105 HOST_KEY_LEN + strlen(hostname) +
105 IP_KEY_LEN + MAX_IPV6_ADDR_LEN + 106 IP_KEY_LEN + INET6_ADDRSTRLEN +
106 MAX_MECH_STR_LEN + 107 MAX_MECH_STR_LEN +
107 UID_KEY_LEN + (sizeof(uid_t) * 2) + 108 UID_KEY_LEN + (sizeof(uid_t) * 2) +
108 USER_KEY_LEN + strlen(sesInfo->userName) + 1; 109 USER_KEY_LEN + strlen(sesInfo->userName) +
110 PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
109 111
110 spnego_key = ERR_PTR(-ENOMEM); 112 spnego_key = ERR_PTR(-ENOMEM);
111 description = kzalloc(desc_len, GFP_KERNEL); 113 description = kzalloc(desc_len, GFP_KERNEL);
@@ -143,6 +145,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
143 dp = description + strlen(description); 145 dp = description + strlen(description);
144 sprintf(dp, ";user=%s", sesInfo->userName); 146 sprintf(dp, ";user=%s", sesInfo->userName);
145 147
148 dp = description + strlen(description);
149 sprintf(dp, ";pid=0x%x", current->pid);
150
146 cFYI(1, ("key description = %s", description)); 151 cFYI(1, ("key description = %s", description));
147 spnego_key = request_key(&cifs_spnego_key_type, description, ""); 152 spnego_key = request_key(&cifs_spnego_key_type, description, "");
148 153
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 60e3c4253de0..714a542cbafc 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,7 +44,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
44 int maxwords = maxbytes / 2; 44 int maxwords = maxbytes / 2;
45 char tmp[NLS_MAX_CHARSET_SIZE]; 45 char tmp[NLS_MAX_CHARSET_SIZE];
46 46
47 for (i = 0; from[i] && i < maxwords; i++) { 47 for (i = 0; i < maxwords && from[i]; i++) {
48 charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, 48 charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp,
49 NLS_MAX_CHARSET_SIZE); 49 NLS_MAX_CHARSET_SIZE);
50 if (charlen > 0) 50 if (charlen > 0)
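
The reordered loop condition in cifs_ucs2_bytes() closes an out-of-bounds read: && evaluates left to right and short-circuits, so the bounds check i < maxwords must come before the dereference from[i]. The corrected pattern in isolation:

#include <stdio.h>

/* count UCS-2 code units up to a NUL, never reading past maxwords
 * even when the buffer carries no terminator at all */
static size_t ucs2_len(const unsigned short *from, size_t maxwords)
{
        size_t i;

        for (i = 0; i < maxwords && from[i]; i++)
                ;
        return i;
}

int main(void)
{
        unsigned short buf[3] = { 'a', 'b', 'c' };  /* unterminated */

        printf("%zu\n", ucs2_len(buf, 3));          /* 3, and no overrun */
        return 0;
}
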
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 57ecdc83c26f..6941c22398a6 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -327,7 +327,7 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
327 327
328static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, 328static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
329 struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, 329 struct cifs_sid *pownersid, struct cifs_sid *pgrpsid,
330 struct inode *inode) 330 struct cifs_fattr *fattr)
331{ 331{
332 int i; 332 int i;
333 int num_aces = 0; 333 int num_aces = 0;
@@ -340,7 +340,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
340 if (!pdacl) { 340 if (!pdacl) {
341 /* no DACL in the security descriptor, set 341 /* no DACL in the security descriptor, set
342 all the permissions for user/group/other */ 342 all the permissions for user/group/other */
343 inode->i_mode |= S_IRWXUGO; 343 fattr->cf_mode |= S_IRWXUGO;
344 return; 344 return;
345 } 345 }
346 346
@@ -357,7 +357,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
357 /* reset rwx permissions for user/group/other. 357 /* reset rwx permissions for user/group/other.
358 Also, if num_aces is 0 i.e. DACL has no ACEs, 358 Also, if num_aces is 0 i.e. DACL has no ACEs,
359 user/group/other have no permissions */ 359 user/group/other have no permissions */
360 inode->i_mode &= ~(S_IRWXUGO); 360 fattr->cf_mode &= ~(S_IRWXUGO);
361 361
362 acl_base = (char *)pdacl; 362 acl_base = (char *)pdacl;
363 acl_size = sizeof(struct cifs_acl); 363 acl_size = sizeof(struct cifs_acl);
@@ -379,17 +379,17 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
379 if (compare_sids(&(ppace[i]->sid), pownersid)) 379 if (compare_sids(&(ppace[i]->sid), pownersid))
380 access_flags_to_mode(ppace[i]->access_req, 380 access_flags_to_mode(ppace[i]->access_req,
381 ppace[i]->type, 381 ppace[i]->type,
382 &(inode->i_mode), 382 &fattr->cf_mode,
383 &user_mask); 383 &user_mask);
384 if (compare_sids(&(ppace[i]->sid), pgrpsid)) 384 if (compare_sids(&(ppace[i]->sid), pgrpsid))
385 access_flags_to_mode(ppace[i]->access_req, 385 access_flags_to_mode(ppace[i]->access_req,
386 ppace[i]->type, 386 ppace[i]->type,
387 &(inode->i_mode), 387 &fattr->cf_mode,
388 &group_mask); 388 &group_mask);
389 if (compare_sids(&(ppace[i]->sid), &sid_everyone)) 389 if (compare_sids(&(ppace[i]->sid), &sid_everyone))
390 access_flags_to_mode(ppace[i]->access_req, 390 access_flags_to_mode(ppace[i]->access_req,
391 ppace[i]->type, 391 ppace[i]->type,
392 &(inode->i_mode), 392 &fattr->cf_mode,
393 &other_mask); 393 &other_mask);
394 394
395/* memcpy((void *)(&(cifscred->aces[i])), 395/* memcpy((void *)(&(cifscred->aces[i])),
@@ -464,7 +464,7 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
464 464
465/* Convert CIFS ACL to POSIX form */ 465/* Convert CIFS ACL to POSIX form */
466static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len, 466static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
467 struct inode *inode) 467 struct cifs_fattr *fattr)
468{ 468{
469 int rc; 469 int rc;
470 struct cifs_sid *owner_sid_ptr, *group_sid_ptr; 470 struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
@@ -472,7 +472,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
472 char *end_of_acl = ((char *)pntsd) + acl_len; 472 char *end_of_acl = ((char *)pntsd) + acl_len;
473 __u32 dacloffset; 473 __u32 dacloffset;
474 474
475 if ((inode == NULL) || (pntsd == NULL)) 475 if (pntsd == NULL)
476 return -EIO; 476 return -EIO;
477 477
478 owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + 478 owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
@@ -497,7 +497,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
497 497
498 if (dacloffset) 498 if (dacloffset)
499 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, 499 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
500 group_sid_ptr, inode); 500 group_sid_ptr, fattr);
501 else 501 else
502 cFYI(1, ("no ACL")); /* BB grant all or default perms? */ 502 cFYI(1, ("no ACL")); /* BB grant all or default perms? */
503 503
@@ -508,7 +508,6 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
508 memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr, 508 memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr,
509 sizeof(struct cifs_sid)); */ 509 sizeof(struct cifs_sid)); */
510 510
511
512 return 0; 511 return 0;
513} 512}
514 513
@@ -552,134 +551,143 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
552 return rc; 551 return rc;
553} 552}
554 553
555 554static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
556/* Retrieve an ACL from the server */ 555 __u16 fid, u32 *pacllen)
557static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
558 const char *path, const __u16 *pfid)
559{ 556{
560 struct cifsFileInfo *open_file = NULL;
561 bool unlock_file = false;
562 int xid;
563 int rc = -EIO;
564 __u16 fid;
565 struct super_block *sb;
566 struct cifs_sb_info *cifs_sb;
567 struct cifs_ntsd *pntsd = NULL; 557 struct cifs_ntsd *pntsd = NULL;
558 int xid, rc;
559
560 xid = GetXid();
561 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
562 FreeXid(xid);
563
568 564
569 cFYI(1, ("get mode from ACL for %s", path)); 565 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
566 return pntsd;
567}
570 568
571 if (inode == NULL) 569static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
572 return NULL; 570 const char *path, u32 *pacllen)
571{
572 struct cifs_ntsd *pntsd = NULL;
573 int oplock = 0;
574 int xid, rc;
575 __u16 fid;
573 576
574 xid = GetXid(); 577 xid = GetXid();
575 if (pfid == NULL)
576 open_file = find_readable_file(CIFS_I(inode));
577 else
578 fid = *pfid;
579 578
580 sb = inode->i_sb; 579 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0,
581 if (sb == NULL) { 580 &fid, &oplock, NULL, cifs_sb->local_nls,
582 FreeXid(xid); 581 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
583 return NULL; 582 if (rc) {
584 } 583 cERROR(1, ("Unable to open file to get ACL"));
585 cifs_sb = CIFS_SB(sb); 584 goto out;
586
587 if (open_file) {
588 unlock_file = true;
589 fid = open_file->netfid;
590 } else if (pfid == NULL) {
591 int oplock = 0;
592 /* open file */
593 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
594 READ_CONTROL, 0, &fid, &oplock, NULL,
595 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
596 CIFS_MOUNT_MAP_SPECIAL_CHR);
597 if (rc != 0) {
598 cERROR(1, ("Unable to open file to get ACL"));
599 FreeXid(xid);
600 return NULL;
601 }
602 } 585 }
603 586
604 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 587 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
605 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); 588 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
606 if (unlock_file == true) /* find_readable_file increments ref count */
607 atomic_dec(&open_file->wrtPending);
608 else if (pfid == NULL) /* if opened above we have to close the handle */
609 CIFSSMBClose(xid, cifs_sb->tcon, fid);
610 /* else handle was passed in by caller */
611 589
590 CIFSSMBClose(xid, cifs_sb->tcon, fid);
591 out:
612 FreeXid(xid); 592 FreeXid(xid);
613 return pntsd; 593 return pntsd;
614} 594}
615 595
616/* Set an ACL on the server */ 596/* Retrieve an ACL from the server */
617static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, 597static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
618 struct inode *inode, const char *path) 598 struct inode *inode, const char *path,
599 u32 *pacllen)
619{ 600{
620 struct cifsFileInfo *open_file; 601 struct cifs_ntsd *pntsd = NULL;
621 bool unlock_file = false; 602 struct cifsFileInfo *open_file = NULL;
622 int xid;
623 int rc = -EIO;
624 __u16 fid;
625 struct super_block *sb;
626 struct cifs_sb_info *cifs_sb;
627 603
628 cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); 604 if (inode)
605 open_file = find_readable_file(CIFS_I(inode));
606 if (!open_file)
607 return get_cifs_acl_by_path(cifs_sb, path, pacllen);
629 608
630 if (!inode) 609 pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
631 return rc; 610 atomic_dec(&open_file->wrtPending);
611 return pntsd;
612}
632 613
633 sb = inode->i_sb; 614static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
634 if (sb == NULL) 615 struct cifs_ntsd *pnntsd, u32 acllen)
635 return rc; 616{
617 int xid, rc;
636 618
637 cifs_sb = CIFS_SB(sb);
638 xid = GetXid(); 619 xid = GetXid();
620 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
621 FreeXid(xid);
639 622
640 open_file = find_readable_file(CIFS_I(inode)); 623 cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
641 if (open_file) { 624 return rc;
642 unlock_file = true; 625}
643 fid = open_file->netfid; 626
644 } else { 627static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
645 int oplock = 0; 628 struct cifs_ntsd *pnntsd, u32 acllen)
646 /* open file */ 629{
647 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, 630 int oplock = 0;
648 WRITE_DAC, 0, &fid, &oplock, NULL, 631 int xid, rc;
649 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 632 __u16 fid;
650 CIFS_MOUNT_MAP_SPECIAL_CHR); 633
651 if (rc != 0) { 634 xid = GetXid();
652 cERROR(1, ("Unable to open file to set ACL")); 635
653 FreeXid(xid); 636 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0,
654 return rc; 637 &fid, &oplock, NULL, cifs_sb->local_nls,
655 } 638 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
639 if (rc) {
640 cERROR(1, ("Unable to open file to set ACL"));
641 goto out;
656 } 642 }
657 643
658 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 644 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
659 cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); 645 cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
660 if (unlock_file)
661 atomic_dec(&open_file->wrtPending);
662 else
663 CIFSSMBClose(xid, cifs_sb->tcon, fid);
664 646
647 CIFSSMBClose(xid, cifs_sb->tcon, fid);
648 out:
665 FreeXid(xid); 649 FreeXid(xid);
650 return rc;
651}
652
653/* Set an ACL on the server */
654static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
655 struct inode *inode, const char *path)
656{
657 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
658 struct cifsFileInfo *open_file;
659 int rc;
660
661 cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
666 662
663 open_file = find_readable_file(CIFS_I(inode));
664 if (!open_file)
665 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
666
667 rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
668 atomic_dec(&open_file->wrtPending);
667 return rc; 669 return rc;
668} 670}
669 671
670/* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */ 672void
671void acl_to_uid_mode(struct inode *inode, const char *path, const __u16 *pfid) 673void
674cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
675 struct inode *inode, const char *path, const __u16 *pfid)
672{ 676{
673 struct cifs_ntsd *pntsd = NULL; 677 struct cifs_ntsd *pntsd = NULL;
674 u32 acllen = 0; 678 u32 acllen = 0;
675 int rc = 0; 679 int rc = 0;
676 680
677 cFYI(DBG2, ("converting ACL to mode for %s", path)); 681 cFYI(DBG2, ("converting ACL to mode for %s", path));
678 pntsd = get_cifs_acl(&acllen, inode, path, pfid); 682
683 if (pfid)
684 pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
685 else
686 pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
679 687
680 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ 688 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
681 if (pntsd) 689 if (pntsd)
682 rc = parse_sec_desc(pntsd, acllen, inode); 690 rc = parse_sec_desc(pntsd, acllen, fattr);
683 if (rc) 691 if (rc)
684 cFYI(1, ("parse sec desc failed rc = %d", rc)); 692 cFYI(1, ("parse sec desc failed rc = %d", rc));
685 693
@@ -698,7 +706,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
698 cFYI(DBG2, ("set ACL from mode for %s", path)); 706 cFYI(DBG2, ("set ACL from mode for %s", path));
699 707
700 /* Get the security descriptor */ 708 /* Get the security descriptor */
701 pntsd = get_cifs_acl(&secdesclen, inode, path, NULL); 709 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
702 710
703 /* Add three ACEs for owner, group, everyone getting rid of 711 /* Add three ACEs for owner, group, everyone getting rid of
704 other ACEs as chmod disables ACEs and set the security descriptor */ 712 other ACEs as chmod disables ACEs and set the security descriptor */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5e6d35804d73..84b75253b05a 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -146,7 +146,7 @@ cifs_read_super(struct super_block *sb, void *data,
146#endif 146#endif
147 sb->s_blocksize = CIFS_MAX_MSGSIZE; 147 sb->s_blocksize = CIFS_MAX_MSGSIZE;
148 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 148 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
149 inode = cifs_iget(sb, ROOT_I); 149 inode = cifs_root_iget(sb, ROOT_I);
150 150
151 if (IS_ERR(inode)) { 151 if (IS_ERR(inode)) {
152 rc = PTR_ERR(inode); 152 rc = PTR_ERR(inode);
@@ -204,6 +204,9 @@ cifs_put_super(struct super_block *sb)
204 cFYI(1, ("Empty cifs superblock info passed to unmount")); 204 cFYI(1, ("Empty cifs superblock info passed to unmount"));
205 return; 205 return;
206 } 206 }
207
208 lock_kernel();
209
207 rc = cifs_umount(sb, cifs_sb); 210 rc = cifs_umount(sb, cifs_sb);
208 if (rc) 211 if (rc)
209 cERROR(1, ("cifs_umount failed with return code %d", rc)); 212 cERROR(1, ("cifs_umount failed with return code %d", rc));
@@ -216,7 +219,8 @@ cifs_put_super(struct super_block *sb)
216 219
217 unload_nls(cifs_sb->local_nls); 220 unload_nls(cifs_sb->local_nls);
218 kfree(cifs_sb); 221 kfree(cifs_sb);
219 return; 222
223 unlock_kernel();
220} 224}
221 225
222static int 226static int
@@ -304,7 +308,6 @@ cifs_alloc_inode(struct super_block *sb)
304 if (!cifs_inode) 308 if (!cifs_inode)
305 return NULL; 309 return NULL;
306 cifs_inode->cifsAttrs = 0x20; /* default */ 310 cifs_inode->cifsAttrs = 0x20; /* default */
307 atomic_set(&cifs_inode->inUse, 0);
308 cifs_inode->time = 0; 311 cifs_inode->time = 0;
309 cifs_inode->write_behind_rc = 0; 312 cifs_inode->write_behind_rc = 0;
310 /* Until the file is open and we have gotten oplock 313 /* Until the file is open and we have gotten oplock
@@ -329,6 +332,27 @@ cifs_destroy_inode(struct inode *inode)
329 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); 332 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
330} 333}
331 334
335static void
336cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
337{
338 seq_printf(s, ",addr=");
339
340 switch (server->addr.sockAddr.sin_family) {
341 case AF_INET:
342 seq_printf(s, "%pI4", &server->addr.sockAddr.sin_addr.s_addr);
343 break;
344 case AF_INET6:
345 seq_printf(s, "%pI6",
346 &server->addr.sockAddr6.sin6_addr.s6_addr);
347 if (server->addr.sockAddr6.sin6_scope_id)
348 seq_printf(s, "%%%u",
349 server->addr.sockAddr6.sin6_scope_id);
350 break;
351 default:
352 seq_printf(s, "(unknown)");
353 }
354}
355
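
%pI4 and %pI6 are printk format extensions that render a binary IPv4/IPv6 address straight from a pointer, which is what keeps cifs_show_address() this compact. A rough userspace analogue built on inet_ntop() (hypothetical helper name):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>

/* rough userspace analogue of the %pI4/%pI6 printk extensions */
static void show_addr(FILE *s, int family, const void *addr)
{
        char buf[INET6_ADDRSTRLEN];

        if (inet_ntop(family, addr, buf, sizeof(buf)))
                fprintf(s, ",addr=%s", buf);
        else
                fprintf(s, ",addr=(unknown)");
}

int main(void)
{
        struct in_addr a = { .s_addr = htonl(0x7f000001) };

        show_addr(stdout, AF_INET, &a);   /* ,addr=127.0.0.1 */
        putchar('\n');
        return 0;
}
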
332/* 356/*
333 * cifs_show_options() is for displaying mount options in /proc/mounts. 357 * cifs_show_options() is for displaying mount options in /proc/mounts.
334 * Not all settable options are displayed but most of the important 358 * Not all settable options are displayed but most of the important
@@ -339,83 +363,68 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
339{ 363{
340 struct cifs_sb_info *cifs_sb; 364 struct cifs_sb_info *cifs_sb;
341 struct cifsTconInfo *tcon; 365 struct cifsTconInfo *tcon;
342 struct TCP_Server_Info *server;
343 366
344 cifs_sb = CIFS_SB(m->mnt_sb); 367 cifs_sb = CIFS_SB(m->mnt_sb);
368 tcon = cifs_sb->tcon;
345 369
346 if (cifs_sb) { 370 seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName);
347 tcon = cifs_sb->tcon; 371 if (tcon->ses->userName)
348 if (tcon) { 372 seq_printf(s, ",username=%s", tcon->ses->userName);
349 seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName); 373 if (tcon->ses->domainName)
350 if (tcon->ses) { 374 seq_printf(s, ",domain=%s", tcon->ses->domainName);
351 if (tcon->ses->userName) 375
352 seq_printf(s, ",username=%s", 376 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
353 tcon->ses->userName); 377 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
354 if (tcon->ses->domainName) 378 seq_printf(s, ",forceuid");
355 seq_printf(s, ",domain=%s", 379 else
356 tcon->ses->domainName); 380 seq_printf(s, ",noforceuid");
357 server = tcon->ses->server; 381
358 if (server) { 382 seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
359 seq_printf(s, ",addr="); 383 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
360 switch (server->addr.sockAddr6. 384 seq_printf(s, ",forcegid");
361 sin6_family) { 385 else
362 case AF_INET6: 386 seq_printf(s, ",noforcegid");
363 seq_printf(s, "%pI6", 387
364 &server->addr.sockAddr6.sin6_addr); 388 cifs_show_address(s, tcon->ses->server);
365 break; 389
366 case AF_INET: 390 if (!tcon->unix_ext)
367 seq_printf(s, "%pI4", 391 seq_printf(s, ",file_mode=0%o,dir_mode=0%o",
368 &server->addr.sockAddr.sin_addr.s_addr);
369 break;
370 }
371 }
372 }
373 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) ||
374 !(tcon->unix_ext))
375 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
376 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) ||
377 !(tcon->unix_ext))
378 seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
379 if (!tcon->unix_ext) {
380 seq_printf(s, ",file_mode=0%o,dir_mode=0%o",
381 cifs_sb->mnt_file_mode, 392 cifs_sb->mnt_file_mode,
382 cifs_sb->mnt_dir_mode); 393 cifs_sb->mnt_dir_mode);
383 } 394 if (tcon->seal)
384 if (tcon->seal) 395 seq_printf(s, ",seal");
385 seq_printf(s, ",seal"); 396 if (tcon->nocase)
386 if (tcon->nocase) 397 seq_printf(s, ",nocase");
387 seq_printf(s, ",nocase"); 398 if (tcon->retry)
388 if (tcon->retry) 399 seq_printf(s, ",hard");
389 seq_printf(s, ",hard"); 400 if (cifs_sb->prepath)
390 } 401 seq_printf(s, ",prepath=%s", cifs_sb->prepath);
391 if (cifs_sb->prepath) 402 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
392 seq_printf(s, ",prepath=%s", cifs_sb->prepath); 403 seq_printf(s, ",posixpaths");
393 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) 404 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
394 seq_printf(s, ",posixpaths"); 405 seq_printf(s, ",setuids");
395 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) 406 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
396 seq_printf(s, ",setuids"); 407 seq_printf(s, ",serverino");
397 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) 408 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
398 seq_printf(s, ",serverino"); 409 seq_printf(s, ",directio");
399 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) 410 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
400 seq_printf(s, ",directio"); 411 seq_printf(s, ",nouser_xattr");
401 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 412 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
402 seq_printf(s, ",nouser_xattr"); 413 seq_printf(s, ",mapchars");
403 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) 414 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
404 seq_printf(s, ",mapchars"); 415 seq_printf(s, ",sfu");
405 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) 416 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
406 seq_printf(s, ",sfu"); 417 seq_printf(s, ",nobrl");
407 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) 418 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
408 seq_printf(s, ",nobrl"); 419 seq_printf(s, ",cifsacl");
409 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) 420 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
410 seq_printf(s, ",cifsacl"); 421 seq_printf(s, ",dynperm");
411 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) 422 if (m->mnt_sb->s_flags & MS_POSIXACL)
412 seq_printf(s, ",dynperm"); 423 seq_printf(s, ",acl");
413 if (m->mnt_sb->s_flags & MS_POSIXACL) 424
414 seq_printf(s, ",acl"); 425 seq_printf(s, ",rsize=%d", cifs_sb->rsize);
415 426 seq_printf(s, ",wsize=%d", cifs_sb->wsize);
416 seq_printf(s, ",rsize=%d", cifs_sb->rsize); 427
417 seq_printf(s, ",wsize=%d", cifs_sb->wsize);
418 }
419 return 0; 428 return 0;
420} 429}
421 430
@@ -531,9 +540,14 @@ static void cifs_umount_begin(struct super_block *sb)
531 if (tcon == NULL) 540 if (tcon == NULL)
532 return; 541 return;
533 542
534 lock_kernel();
535 read_lock(&cifs_tcp_ses_lock); 543 read_lock(&cifs_tcp_ses_lock);
536 if (tcon->tc_count == 1) 544 if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
545 /* we have other mounts to same share or we have
546 already tried to force umount this and woken up
547 all waiting network requests, nothing to do */
548 read_unlock(&cifs_tcp_ses_lock);
549 return;
550 } else if (tcon->tc_count == 1)
537 tcon->tidStatus = CifsExiting; 551 tcon->tidStatus = CifsExiting;
538 read_unlock(&cifs_tcp_ses_lock); 552 read_unlock(&cifs_tcp_ses_lock);
539 553
@@ -548,9 +562,7 @@ static void cifs_umount_begin(struct super_block *sb)
548 wake_up_all(&tcon->ses->server->response_q); 562 wake_up_all(&tcon->ses->server->response_q);
549 msleep(1); 563 msleep(1);
550 } 564 }
551/* BB FIXME - finish add checks for tidStatus BB */
552 565
553 unlock_kernel();
554 return; 566 return;
555} 567}
556 568
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 051b71cfdea9..6c170948300d 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -24,6 +24,19 @@
24 24
25#define ROOT_I 2 25#define ROOT_I 2
26 26
27/*
28 * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down
29 * so that it will fit.
30 */
31static inline ino_t
32cifs_uniqueid_to_ino_t(u64 fileid)
33{
34 ino_t ino = (ino_t) fileid;
35 if (sizeof(ino_t) < sizeof(u64))
36 ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8;
37 return ino;
38}
39
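
Rather than truncating, the helper folds the high half of the 64-bit server file id into the low half with XOR, so entropy in the upper 32 bits still perturbs the resulting ino_t. A standalone check of the folding, assuming a 32-bit ino_t:

#include <stdint.h>
#include <stdio.h>

/* same fold as cifs_uniqueid_to_ino_t when ino_t is 32 bits wide */
static uint32_t fold_fileid(uint64_t fileid)
{
        uint32_t ino = (uint32_t)fileid;

        ino ^= (uint32_t)(fileid >> 32);
        return ino;
}

int main(void)
{
        /* 0x9abcdef0 ^ 0x12345678 == 0x88888888 */
        printf("0x%x\n", fold_fileid(0x123456789abcdef0ULL));
        return 0;
}
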
27extern struct file_system_type cifs_fs_type; 40extern struct file_system_type cifs_fs_type;
28extern const struct address_space_operations cifs_addr_ops; 41extern const struct address_space_operations cifs_addr_ops;
29extern const struct address_space_operations cifs_addr_ops_smallbuf; 42extern const struct address_space_operations cifs_addr_ops_smallbuf;
@@ -36,7 +49,7 @@ extern void cifs_read_inode(struct inode *);
36 49
37/* Functions related to inodes */ 50/* Functions related to inodes */
38extern const struct inode_operations cifs_dir_inode_ops; 51extern const struct inode_operations cifs_dir_inode_ops;
39extern struct inode *cifs_iget(struct super_block *, unsigned long); 52extern struct inode *cifs_root_iget(struct super_block *, unsigned long);
40extern int cifs_create(struct inode *, struct dentry *, int, 53extern int cifs_create(struct inode *, struct dentry *, int,
41 struct nameidata *); 54 struct nameidata *);
42extern struct dentry *cifs_lookup(struct inode *, struct dentry *, 55extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
@@ -100,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
100extern const struct export_operations cifs_export_ops; 113extern const struct export_operations cifs_export_ops;
101#endif /* EXPERIMENTAL */ 114#endif /* EXPERIMENTAL */
102 115
103#define CIFS_VERSION "1.58" 116#define CIFS_VERSION "1.60"
104#endif /* _CIFSFS_H */ 117#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a61ab772c6f6..6084d6379c03 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -83,7 +83,7 @@ enum securityEnum {
83 NTLM, /* Legacy NTLM012 auth with NTLM hash */ 83 NTLM, /* Legacy NTLM012 auth with NTLM hash */
84 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ 84 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
85 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ 85 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
86 NTLMSSP, /* NTLMSSP via SPNEGO, NTLMv2 hash */ 86/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
87 Kerberos, /* Kerberos via SPNEGO */ 87 Kerberos, /* Kerberos via SPNEGO */
88 MSKerberos, /* MS Kerberos via SPNEGO */ 88 MSKerberos, /* MS Kerberos via SPNEGO */
89}; 89};
@@ -260,6 +260,8 @@ struct cifsTconInfo {
260 atomic_t num_closes; 260 atomic_t num_closes;
261 atomic_t num_deletes; 261 atomic_t num_deletes;
262 atomic_t num_mkdirs; 262 atomic_t num_mkdirs;
263 atomic_t num_posixopens;
264 atomic_t num_posixmkdirs;
263 atomic_t num_rmdirs; 265 atomic_t num_rmdirs;
264 atomic_t num_renames; 266 atomic_t num_renames;
265 atomic_t num_t2renames; 267 atomic_t num_t2renames;
@@ -364,13 +366,13 @@ struct cifsInodeInfo {
364 struct list_head openFileList; 366 struct list_head openFileList;
365 int write_behind_rc; 367 int write_behind_rc;
366 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ 368 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
367 atomic_t inUse; /* num concurrent users (local openers cifs) of file*/
368 unsigned long time; /* jiffies of last update/check of inode */ 369 unsigned long time; /* jiffies of last update/check of inode */
369 bool clientCanCacheRead:1; /* read oplock */ 370 bool clientCanCacheRead:1; /* read oplock */
370 bool clientCanCacheAll:1; /* read and writebehind oplock */ 371 bool clientCanCacheAll:1; /* read and writebehind oplock */
371 bool oplockPending:1; 372 bool oplockPending:1;
372 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 373 bool delete_pending:1; /* DELETE_ON_CLOSE is set */
373 u64 server_eof; /* current file size on server */ 374 u64 server_eof; /* current file size on server */
375 u64 uniqueid; /* server inode number */
374 struct inode vfs_inode; 376 struct inode vfs_inode;
375}; 377};
376 378
@@ -472,6 +474,32 @@ struct dfs_info3_param {
472 char *node_name; 474 char *node_name;
473}; 475};
474 476
477/*
478 * common struct for holding inode info when searching for or updating an
479 * inode with new info
480 */
481
482#define CIFS_FATTR_DFS_REFERRAL 0x1
483#define CIFS_FATTR_DELETE_PENDING 0x2
484#define CIFS_FATTR_NEED_REVAL 0x4
485
486struct cifs_fattr {
487 u32 cf_flags;
488 u32 cf_cifsattrs;
489 u64 cf_uniqueid;
490 u64 cf_eof;
491 u64 cf_bytes;
492 uid_t cf_uid;
493 gid_t cf_gid;
494 umode_t cf_mode;
495 dev_t cf_rdev;
496 unsigned int cf_nlink;
497 unsigned int cf_dtype;
498 struct timespec cf_atime;
499 struct timespec cf_mtime;
500 struct timespec cf_ctime;
501};
502
475static inline void free_dfs_info_param(struct dfs_info3_param *param) 503static inline void free_dfs_info_param(struct dfs_info3_param *param)
476{ 504{
477 if (param) { 505 if (param) {
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index a785f69dbc9f..2d07f890a842 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2328,19 +2328,7 @@ struct file_attrib_tag {
2328typedef struct { 2328typedef struct {
2329 __le32 NextEntryOffset; 2329 __le32 NextEntryOffset;
2330 __u32 ResumeKey; /* as with FileIndex - no need to convert */ 2330 __u32 ResumeKey; /* as with FileIndex - no need to convert */
2331 __le64 EndOfFile; 2331 FILE_UNIX_BASIC_INFO basic;
2332 __le64 NumOfBytes;
2333 __le64 LastStatusChange; /*SNIA specs DCE time for the 3 time fields */
2334 __le64 LastAccessTime;
2335 __le64 LastModificationTime;
2336 __le64 Uid;
2337 __le64 Gid;
2338 __le32 Type;
2339 __le64 DevMajor;
2340 __le64 DevMinor;
2341 __le64 UniqueId;
2342 __le64 Permissions;
2343 __le64 Nlinks;
2344 char FileName[1]; 2332 char FileName[1];
2345} __attribute__((packed)) FILE_UNIX_INFO; /* level 0x202 */ 2333} __attribute__((packed)) FILE_UNIX_INFO; /* level 0x202 */
2346 2334
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index fae083930eee..da8fbf565991 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -74,7 +74,7 @@ extern unsigned int smbCalcSize(struct smb_hdr *ptr);
74extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); 74extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
75extern int decode_negTokenInit(unsigned char *security_blob, int length, 75extern int decode_negTokenInit(unsigned char *security_blob, int length,
76 enum securityEnum *secType); 76 enum securityEnum *secType);
77extern int cifs_inet_pton(const int, const char *source, void *dst); 77extern int cifs_convert_address(char *src, void *dst);
78extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); 78extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
79extern void header_assemble(struct smb_hdr *, char /* command */ , 79extern void header_assemble(struct smb_hdr *, char /* command */ ,
80 const struct cifsTconInfo *, int /* length of 80 const struct cifsTconInfo *, int /* length of
@@ -90,17 +90,21 @@ extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16,
 			struct cifsTconInfo *);
 extern void DeleteOplockQEntry(struct oplock_q_entry *);
 extern void DeleteTconOplockQEntries(struct cifsTconInfo *);
-extern struct timespec cifs_NTtimeToUnix(u64 utc_nanoseconds_since_1601);
+extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
 extern u64 cifs_UnixTimeToNT(struct timespec);
-extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
-extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
+extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
+				      int offset);
 
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
 		struct super_block *sb, int mode, int oflags,
 		int *poplock, __u16 *pnetfid, int xid);
-extern void posix_fill_in_inode(struct inode *tmp_inode,
-		FILE_UNIX_BASIC_INFO *pData, int isNewInode);
-extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum);
+extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
+				     FILE_UNIX_BASIC_INFO *info,
+				     struct cifs_sb_info *cifs_sb);
+extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
+extern struct inode *cifs_iget(struct super_block *sb,
+			       struct cifs_fattr *fattr);
+
 extern int cifs_get_inode_info(struct inode **pinode,
 			const unsigned char *search_path,
 			FILE_ALL_INFO *pfile_info,
@@ -108,8 +112,9 @@ extern int cifs_get_inode_info(struct inode **pinode,
 extern int cifs_get_inode_info_unix(struct inode **pinode,
 			const unsigned char *search_path,
 			struct super_block *sb, int xid);
-extern void acl_to_uid_mode(struct inode *inode, const char *path,
-			const __u16 *pfid);
+extern void cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
+			      struct cifs_fattr *fattr, struct inode *inode,
+			      const char *path, const __u16 *pfid);
 extern int mode_to_acl(struct inode *inode, const char *path, __u64);
 
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
@@ -215,7 +220,11 @@ struct cifs_unix_set_info_args {
 	dev_t	device;
 };
 
-extern int CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *pTcon,
+extern int CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+				  const struct cifs_unix_set_info_args *args,
+				  u16 fid, u32 pid_of_opener);
+
+extern int CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *pTcon,
 			char *fileName,
 			const struct cifs_unix_set_info_args *args,
 			const struct nls_table *nls_codepage,
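One detail worth pausing on: cifs_NTtimeToUnix now takes the wire-format __le64 directly, moving the byte-order conversion into the helper so callers cannot forget it. The arithmetic itself is fixed by the formats: NT timestamps count 100 ns ticks from 1601-01-01, and the NT and Unix epochs differ by 11644473600 seconds. A standalone sketch of that conversion (host-order input assumed; not the kernel implementation):

#include <stdint.h>
#include <stdio.h>

#define NT_UNIX_EPOCH_DELTA 11644473600ULL	/* seconds from 1601 to 1970 */

struct unix_ts { int64_t tv_sec; long tv_nsec; };

static struct unix_ts nt_time_to_unix(uint64_t nt /* 100 ns ticks */)
{
	struct unix_ts ts;

	ts.tv_sec = (int64_t)(nt / 10000000ULL) - NT_UNIX_EPOCH_DELTA;
	ts.tv_nsec = (long)(nt % 10000000ULL) * 100;
	return ts;
}

int main(void)
{
	/* NT timestamp for 2009-01-01 00:00:00 UTC */
	struct unix_ts ts = nt_time_to_unix(128752416000000000ULL);
	printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);	/* 1230768000.000000000 */
	return 0;
}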
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d06260251c30..1866bc2927d4 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -524,8 +524,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		int val, seconds, remain, result;
 		struct timespec ts, utc;
 		utc = CURRENT_TIME;
-		ts = cnvrtDosUnixTm(le16_to_cpu(rsp->SrvTime.Date),
-				    le16_to_cpu(rsp->SrvTime.Time));
+		ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
+				    rsp->SrvTime.Time, 0);
 		cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d",
 			(int)ts.tv_sec, (int)utc.tv_sec,
 			(int)(utc.tv_sec - ts.tv_sec)));
@@ -594,7 +594,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	else if (secFlags & CIFSSEC_MAY_KRB5)
 		server->secType = Kerberos;
 	else if (secFlags & CIFSSEC_MAY_NTLMSSP)
-		server->secType = NTLMSSP;
+		server->secType = RawNTLMSSP;
 	else if (secFlags & CIFSSEC_MAY_LANMAN)
 		server->secType = LANMAN;
 /* #ifdef CONFIG_CIFS_EXPERIMENTAL
@@ -729,7 +729,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 	 * the tcon is no longer on the list, so no need to take lock before
 	 * checking this.
 	 */
-	if (tcon->need_reconnect)
+	if ((tcon->need_reconnect) || (tcon->ses->need_reconnect))
 		return 0;
 
 	rc = small_smb_init(SMB_COM_TREE_DISCONNECT, 0, tcon,
@@ -1113,7 +1113,10 @@ PsxCreat:
 psx_create_err:
 	cifs_buf_release(pSMB);
 
-	cifs_stats_inc(&tcon->num_mkdirs);
+	if (posix_flags & SMB_O_DIRECTORY)
+		cifs_stats_inc(&tcon->num_posixmkdirs);
+	else
+		cifs_stats_inc(&tcon->num_posixopens);
 
 	if (rc == -EAGAIN)
 		goto PsxCreat;
@@ -2427,8 +2430,7 @@ querySymLinkRetry:
 	params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ;
 	pSMB->TotalDataCount = 0;
 	pSMB->MaxParameterCount = cpu_to_le16(2);
-	/* BB find exact max data count below from sess structure BB */
-	pSMB->MaxDataCount = cpu_to_le16(4000);
+	pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
 	pSMB->MaxSetupCount = 0;
 	pSMB->Reserved = 0;
 	pSMB->Flags = 0;
@@ -5075,10 +5077,114 @@ SetAttrLgcyRetry:
 }
 #endif /* temporarily unneeded SetAttr legacy function */
 
+static void
+cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
+			const struct cifs_unix_set_info_args *args)
+{
+	u64 mode = args->mode;
+
+	/*
+	 * Samba server ignores set of file size to zero due to bugs in some
+	 * older clients, but we should be precise - we use SetFileSize to
+	 * set file size and do not want to truncate file size to zero
+	 * accidently as happened on one Samba server beta by putting
+	 * zero instead of -1 here
+	 */
+	data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64);
+	data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64);
+	data_offset->LastStatusChange = cpu_to_le64(args->ctime);
+	data_offset->LastAccessTime = cpu_to_le64(args->atime);
+	data_offset->LastModificationTime = cpu_to_le64(args->mtime);
+	data_offset->Uid = cpu_to_le64(args->uid);
+	data_offset->Gid = cpu_to_le64(args->gid);
+	/* better to leave device as zero when it is */
+	data_offset->DevMajor = cpu_to_le64(MAJOR(args->device));
+	data_offset->DevMinor = cpu_to_le64(MINOR(args->device));
+	data_offset->Permissions = cpu_to_le64(mode);
+
+	if (S_ISREG(mode))
+		data_offset->Type = cpu_to_le32(UNIX_FILE);
+	else if (S_ISDIR(mode))
+		data_offset->Type = cpu_to_le32(UNIX_DIR);
+	else if (S_ISLNK(mode))
+		data_offset->Type = cpu_to_le32(UNIX_SYMLINK);
+	else if (S_ISCHR(mode))
+		data_offset->Type = cpu_to_le32(UNIX_CHARDEV);
+	else if (S_ISBLK(mode))
+		data_offset->Type = cpu_to_le32(UNIX_BLOCKDEV);
+	else if (S_ISFIFO(mode))
+		data_offset->Type = cpu_to_le32(UNIX_FIFO);
+	else if (S_ISSOCK(mode))
+		data_offset->Type = cpu_to_le32(UNIX_SOCKET);
+}
+
 int
-CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
-		   const struct cifs_unix_set_info_args *args,
-		   const struct nls_table *nls_codepage, int remap)
+CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+		       const struct cifs_unix_set_info_args *args,
+		       u16 fid, u32 pid_of_opener)
+{
+	struct smb_com_transaction2_sfi_req *pSMB = NULL;
+	FILE_UNIX_BASIC_INFO *data_offset;
+	int rc = 0;
+	u16 params, param_offset, offset, byte_count, count;
+
+	cFYI(1, ("Set Unix Info (via SetFileInfo)"));
+	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
+
+	if (rc)
+		return rc;
+
+	pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
+	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
+
+	params = 6;
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
+	offset = param_offset + params;
+
+	data_offset = (FILE_UNIX_BASIC_INFO *)
+				((char *)(&pSMB->hdr.Protocol) + offset);
+	count = sizeof(FILE_UNIX_BASIC_INFO);
+
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find max SMB PDU from sess */
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
+	byte_count = 3 /* pad */ + params + count;
+	pSMB->DataCount = cpu_to_le16(count);
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->Fid = fid;
+	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
+	pSMB->Reserved4 = 0;
+	pSMB->hdr.smb_buf_length += byte_count;
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+
+	cifs_fill_unix_set_info(data_offset, args);
+
+	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+	if (rc)
+		cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc));
+
+	/* Note: On -EAGAIN error only caller can retry on handle based calls
+		since file handle passed in no longer valid */
+
+	return rc;
+}
+
+int
+CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
+		       const struct cifs_unix_set_info_args *args,
+		       const struct nls_table *nls_codepage, int remap)
 {
 	TRANSACTION2_SPI_REQ *pSMB = NULL;
 	TRANSACTION2_SPI_RSP *pSMBr = NULL;
@@ -5087,7 +5193,6 @@ CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
 	int bytes_returned = 0;
 	FILE_UNIX_BASIC_INFO *data_offset;
 	__u16 params, param_offset, offset, count, byte_count;
-	__u64 mode = args->mode;
 
 	cFYI(1, ("In SetUID/GID/Mode"));
 setPermsRetry:
@@ -5138,38 +5243,8 @@ setPermsRetry:
 	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
 	pSMB->Reserved4 = 0;
 	pSMB->hdr.smb_buf_length += byte_count;
-	/* Samba server ignores set of file size to zero due to bugs in some
-	older clients, but we should be precise - we use SetFileSize to
-	set file size and do not want to truncate file size to zero
-	accidently as happened on one Samba server beta by putting
-	zero instead of -1 here */
-	data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64);
-	data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64);
-	data_offset->LastStatusChange = cpu_to_le64(args->ctime);
-	data_offset->LastAccessTime = cpu_to_le64(args->atime);
-	data_offset->LastModificationTime = cpu_to_le64(args->mtime);
-	data_offset->Uid = cpu_to_le64(args->uid);
-	data_offset->Gid = cpu_to_le64(args->gid);
-	/* better to leave device as zero when it is */
-	data_offset->DevMajor = cpu_to_le64(MAJOR(args->device));
-	data_offset->DevMinor = cpu_to_le64(MINOR(args->device));
-	data_offset->Permissions = cpu_to_le64(mode);
-
-	if (S_ISREG(mode))
-		data_offset->Type = cpu_to_le32(UNIX_FILE);
-	else if (S_ISDIR(mode))
-		data_offset->Type = cpu_to_le32(UNIX_DIR);
-	else if (S_ISLNK(mode))
-		data_offset->Type = cpu_to_le32(UNIX_SYMLINK);
-	else if (S_ISCHR(mode))
-		data_offset->Type = cpu_to_le32(UNIX_CHARDEV);
-	else if (S_ISBLK(mode))
-		data_offset->Type = cpu_to_le32(UNIX_BLOCKDEV);
-	else if (S_ISFIFO(mode))
-		data_offset->Type = cpu_to_le32(UNIX_FIFO);
-	else if (S_ISSOCK(mode))
-		data_offset->Type = cpu_to_le32(UNIX_SOCKET);
 
+	cifs_fill_unix_set_info(data_offset, args);
 
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
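Stepping back from this file's first hunk: cnvrtDosUnixTm now receives the raw little-endian SrvTime words plus a timezone offset in seconds. DOS date/time packs day/month/year and second/minute/hour into bit fields, so the decode is pure shifting and masking. A standalone sketch of that layout (timegm() is a glibc/BSD extension, assumed available; not the kernel implementation):

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static time_t dos_time_to_unix(uint16_t date, uint16_t tm, int offset)
{
	struct tm t = {0};

	t.tm_mday = date & 0x1f;		/* bits 0-4: day of month */
	t.tm_mon  = ((date >> 5) & 0xf) - 1;	/* bits 5-8: month 1-12 */
	t.tm_year = ((date >> 9) & 0x7f) + 80;	/* bits 9-15: years since 1980 */
	t.tm_sec  = (tm & 0x1f) * 2;		/* bits 0-4: 2-second units */
	t.tm_min  = (tm >> 5) & 0x3f;		/* bits 5-10: minutes */
	t.tm_hour = (tm >> 11) & 0x1f;		/* bits 11-15: hours */

	return timegm(&t) + offset;	/* shift by server tz offset, seconds */
}

int main(void)
{
	/* 2009-07-01 12:30:10, no timezone correction */
	printf("%ld\n", (long)dos_time_to_unix(0x3AE1, 0x63C5, 0));
	return 0;
}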
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4aa81a507b74..1f3345d7fa79 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -35,6 +35,7 @@
 #include <linux/namei.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
+#include <linux/inet.h>
 #include <net/ipv6.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -61,7 +62,6 @@ struct smb_vol {
 	char *domainname;
 	char *UNC;
 	char *UNCip;
-	char *in6_addr;   /* ipv6 address as human readable form of in6_addr */
 	char *iocharset;  /* local code page for mapping to and from Unicode */
 	char source_rfc1001_name[16]; /* netbios name of client */
 	char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
@@ -70,7 +70,6 @@ struct smb_vol {
 	mode_t file_mode;
 	mode_t dir_mode;
 	unsigned secFlg;
-	bool rw:1;
 	bool retry:1;
 	bool intr:1;
 	bool setuids:1;
@@ -804,6 +803,10 @@ cifs_parse_mount_options(char *options, const char *devname,
 	char *data;
 	unsigned int temp_len, i, j;
 	char separator[2];
+	short int override_uid = -1;
+	short int override_gid = -1;
+	bool uid_specified = false;
+	bool gid_specified = false;
 
 	separator[0] = ',';
 	separator[1] = 0;
@@ -827,14 +830,15 @@ cifs_parse_mount_options(char *options, const char *devname,
 	vol->target_rfc1001_name[0] = 0;
 	vol->linux_uid = current_uid();  /* use current_euid() instead? */
 	vol->linux_gid = current_gid();
-	vol->dir_mode = S_IRWXUGO;
-	/* 2767 perms indicate mandatory locking support */
-	vol->file_mode = (S_IRWXUGO | S_ISGID) & (~S_IXGRP);
+
+	/* default to only allowing write access to owner of the mount */
+	vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR;
 
 	/* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */
-	vol->rw = true;
 	/* default is always to request posix paths. */
 	vol->posix_paths = 1;
+	/* default to using server inode numbers where available */
+	vol->server_ino = 1;
 
 	if (!options)
 		return 1;
@@ -955,10 +959,12 @@ cifs_parse_mount_options(char *options, const char *devname,
 				}
 				strcpy(vol->password, value);
 			}
-		} else if (strnicmp(data, "ip", 2) == 0) {
+		} else if (!strnicmp(data, "ip", 2) ||
+			   !strnicmp(data, "addr", 4)) {
 			if (!value || !*value) {
 				vol->UNCip = NULL;
-			} else if (strnlen(value, 35) < 35) {
+			} else if (strnlen(value, INET6_ADDRSTRLEN) <
+						INET6_ADDRSTRLEN) {
 				vol->UNCip = value;
 			} else {
 				printk(KERN_WARNING "CIFS: ip address "
@@ -1091,18 +1097,20 @@ cifs_parse_mount_options(char *options, const char *devname,
1091 "too long.\n"); 1097 "too long.\n");
1092 return 1; 1098 return 1;
1093 } 1099 }
1094 } else if (strnicmp(data, "uid", 3) == 0) { 1100 } else if (!strnicmp(data, "uid", 3) && value && *value) {
1095 if (value && *value) { 1101 vol->linux_uid = simple_strtoul(value, &value, 0);
1096 vol->linux_uid = 1102 uid_specified = true;
1097 simple_strtoul(value, &value, 0); 1103 } else if (!strnicmp(data, "forceuid", 8)) {
1098 vol->override_uid = 1; 1104 override_uid = 1;
1099 } 1105 } else if (!strnicmp(data, "noforceuid", 10)) {
1100 } else if (strnicmp(data, "gid", 3) == 0) { 1106 override_uid = 0;
1101 if (value && *value) { 1107 } else if (!strnicmp(data, "gid", 3) && value && *value) {
1102 vol->linux_gid = 1108 vol->linux_gid = simple_strtoul(value, &value, 0);
1103 simple_strtoul(value, &value, 0); 1109 gid_specified = true;
1104 vol->override_gid = 1; 1110 } else if (!strnicmp(data, "forcegid", 8)) {
1105 } 1111 override_gid = 1;
1112 } else if (!strnicmp(data, "noforcegid", 10)) {
1113 override_gid = 0;
1106 } else if (strnicmp(data, "file_mode", 4) == 0) { 1114 } else if (strnicmp(data, "file_mode", 4) == 0) {
1107 if (value && *value) { 1115 if (value && *value) {
1108 vol->file_mode = 1116 vol->file_mode =
@@ -1195,7 +1203,9 @@ cifs_parse_mount_options(char *options, const char *devname,
 		} else if (strnicmp(data, "guest", 5) == 0) {
 			/* ignore */
 		} else if (strnicmp(data, "rw", 2) == 0) {
-			vol->rw = true;
+			/* ignore */
+		} else if (strnicmp(data, "ro", 2) == 0) {
+			/* ignore */
 		} else if (strnicmp(data, "noblocksend", 11) == 0) {
 			vol->noblocksnd = 1;
 		} else if (strnicmp(data, "noautotune", 10) == 0) {
@@ -1214,8 +1224,6 @@ cifs_parse_mount_options(char *options, const char *devname,
 			   parse these options again and set anything and it
 			   is ok to just ignore them */
 			continue;
-		} else if (strnicmp(data, "ro", 2) == 0) {
-			vol->rw = false;
 		} else if (strnicmp(data, "hard", 4) == 0) {
 			vol->retry = 1;
 		} else if (strnicmp(data, "soft", 4) == 0) {
@@ -1315,16 +1323,6 @@ cifs_parse_mount_options(char *options, const char *devname,
 			vol->direct_io = 1;
 		} else if (strnicmp(data, "forcedirectio", 13) == 0) {
 			vol->direct_io = 1;
-		} else if (strnicmp(data, "in6_addr", 8) == 0) {
-			if (!value || !*value) {
-				vol->in6_addr = NULL;
-			} else if (strnlen(value, 49) == 48) {
-				vol->in6_addr = value;
-			} else {
-				printk(KERN_WARNING "CIFS: ip v6 address not "
-					"48 characters long\n");
-				return 1;
-			}
 		} else if (strnicmp(data, "noac", 4) == 0) {
 			printk(KERN_WARNING "CIFS: Mount option noac not "
 				"supported. Instead set "
@@ -1363,6 +1361,18 @@ cifs_parse_mount_options(char *options, const char *devname,
 	if (vol->UNCip == NULL)
 		vol->UNCip = &vol->UNC[2];
 
+	if (uid_specified)
+		vol->override_uid = override_uid;
+	else if (override_uid == 1)
+		printk(KERN_NOTICE "CIFS: ignoring forceuid mount option "
+				   "specified with no uid= option.\n");
+
+	if (gid_specified)
+		vol->override_gid = override_gid;
+	else if (override_gid == 1)
+		printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
+				   "specified with no gid= option.\n");
+
 	return 0;
 }
 
@@ -1392,8 +1402,10 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
 		    server->addr.sockAddr.sin_addr.s_addr))
 			continue;
 		else if (addr->ss_family == AF_INET6 &&
-		    !ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
-				    &addr6->sin6_addr))
+			 (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
+					   &addr6->sin6_addr) ||
+			  server->addr.sockAddr6.sin6_scope_id !=
+			  addr6->sin6_scope_id))
 			continue;
 
 		++server->srv_count;
@@ -1439,28 +1451,15 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 
 	memset(&addr, 0, sizeof(struct sockaddr_storage));
 
-	if (volume_info->UNCip && volume_info->UNC) {
-		rc = cifs_inet_pton(AF_INET, volume_info->UNCip,
-				    &sin_server->sin_addr.s_addr);
-
-		if (rc <= 0) {
-			/* not ipv4 address, try ipv6 */
-			rc = cifs_inet_pton(AF_INET6, volume_info->UNCip,
-					    &sin_server6->sin6_addr.in6_u);
-			if (rc > 0)
-				addr.ss_family = AF_INET6;
-		} else {
-			addr.ss_family = AF_INET;
-		}
+	cFYI(1, ("UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip));
 
-		if (rc <= 0) {
+	if (volume_info->UNCip && volume_info->UNC) {
+		rc = cifs_convert_address(volume_info->UNCip, &addr);
+		if (!rc) {
 			/* we failed translating address */
 			rc = -EINVAL;
 			goto out_err;
 		}
-
-		cFYI(1, ("UNC: %s ip: %s", volume_info->UNC,
-			 volume_info->UNCip));
 	} else if (volume_info->UNCip) {
 		/* BB using ip addr as tcp_ses name to connect to the
 		   DFS root below */
@@ -1519,14 +1518,14 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		cFYI(1, ("attempting ipv6 connect"));
 		/* BB should we allow ipv6 on port 139? */
 		/* other OS never observed in Wild doing 139 with v6 */
+		sin_server6->sin6_port = htons(volume_info->port);
 		memcpy(&tcp_ses->addr.sockAddr6, sin_server6,
 			sizeof(struct sockaddr_in6));
-		sin_server6->sin6_port = htons(volume_info->port);
 		rc = ipv6_connect(tcp_ses);
 	} else {
+		sin_server->sin_port = htons(volume_info->port);
 		memcpy(&tcp_ses->addr.sockAddr, sin_server,
 			sizeof(struct sockaddr_in));
-		sin_server->sin_port = htons(volume_info->port);
 		rc = ipv4_connect(tcp_ses);
 	}
 	if (rc < 0) {
@@ -2471,10 +2470,10 @@ try_mount_again:
 		tcon->local_lease = volume_info->local_lease;
 	}
 	if (pSesInfo) {
-		if (pSesInfo->capabilities & CAP_LARGE_FILES) {
-			sb->s_maxbytes = (u64) 1 << 63;
-		} else
-			sb->s_maxbytes = (u64) 1 << 31;	/* 2 GB */
+		if (pSesInfo->capabilities & CAP_LARGE_FILES)
+			sb->s_maxbytes = MAX_LFS_FILESIZE;
+		else
+			sb->s_maxbytes = MAX_NON_LFS;
 	}
 
 	/* BB FIXME fix time_gran to be larger for LANMAN sessions */
@@ -2563,11 +2562,20 @@ remote_path_check:
 
 	if (mount_data != mount_data_global)
 		kfree(mount_data);
+
 	mount_data = cifs_compose_mount_options(
 			cifs_sb->mountdata, full_path + 1,
 			referrals, &fake_devname);
-	kfree(fake_devname);
+
 	free_dfs_info_array(referrals, num_referrals);
+	kfree(fake_devname);
+	kfree(full_path);
+
+	if (IS_ERR(mount_data)) {
+		rc = PTR_ERR(mount_data);
+		mount_data = NULL;
+		goto mount_fail_check;
+	}
 
 	if (tcon)
 		cifs_put_tcon(tcon);
@@ -2575,8 +2583,6 @@ remote_path_check:
 		cifs_put_smb_ses(pSesInfo);
 
 	cleanup_volume_info(&volume_info);
-	FreeXid(xid);
-	kfree(full_path);
 	referral_walks_count++;
 	goto try_mount_again;
 }
@@ -2745,6 +2751,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
 
 	/* mostly informational -- no need to fail on error here */
+	kfree(tcon->nativeFileSystem);
 	tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr,
 						      bytes_left, is_unicode,
 						      nls_codepage);
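The option-parsing hunks in this file record forceuid/forcegid and uid=/gid= separately while scanning, then reconcile them only after the whole string has been parsed, so a bare forceuid with no uid= can be detected and ignored with a notice. A standalone sketch of that two-pass pattern (illustrative parser, not the kernel's):

#define _DEFAULT_SOURCE
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char opts[] = "forceuid,uid=1000,rw";	/* sample option string */
	char *save = opts, *tok;
	long uid = 0;
	bool uid_specified = false;
	int override_uid = -1;			/* -1: option not seen */

	/* pass 1: record what was seen, decide nothing yet */
	while ((tok = strsep(&save, ",")) != NULL) {
		if (strncmp(tok, "uid=", 4) == 0) {
			uid = strtol(tok + 4, NULL, 0);
			uid_specified = true;
		} else if (strcmp(tok, "noforceuid") == 0) {
			override_uid = 0;
		} else if (strcmp(tok, "forceuid") == 0) {
			override_uid = 1;
		}
	}

	/* pass 2: apply the override only if a uid was actually given */
	if (uid_specified)
		printf("uid=%ld, override=%d\n", uid, override_uid);
	else if (override_uid == 1)
		printf("ignoring forceuid: no uid= option\n");
	return 0;
}

Deferring the decision avoids order-dependence: "uid=1000,forceuid" and "forceuid,uid=1000" behave identically.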
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3758965d73d5..4326ffd90fa9 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -188,6 +188,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 	FILE_UNIX_BASIC_INFO *presp_data;
 	__u32 posix_flags = 0;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct cifs_fattr fattr;
 
 	cFYI(1, ("posix open %s", full_path));
 
@@ -236,22 +237,21 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 	if (presp_data->Type == cpu_to_le32(-1))
 		goto posix_open_ret;	/* open ok, caller does qpathinfo */
 
-	/* get new inode and set it up */
 	if (!pinode)
 		goto posix_open_ret; /* caller does not need info */
 
+	cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
+
+	/* get new inode and set it up */
 	if (*pinode == NULL) {
-		__u64 unique_id = le64_to_cpu(presp_data->UniqueId);
-		*pinode = cifs_new_inode(sb, &unique_id);
+		*pinode = cifs_iget(sb, &fattr);
+		if (!*pinode) {
+			rc = -ENOMEM;
+			goto posix_open_ret;
+		}
+	} else {
+		cifs_fattr_to_inode(*pinode, &fattr);
 	}
-	/* else an inode was passed in. Update its info, don't create one */
-
-	/* We do not need to close the file if new_inode fails since
-	   the caller will retry qpathinfo as long as inode is null */
-	if (*pinode == NULL)
-		goto posix_open_ret;
-
-	posix_fill_in_inode(*pinode, presp_data, 1);
 
 	cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only);
 
@@ -307,8 +307,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
+		rc = -ENOMEM;
 		FreeXid(xid);
-		return -ENOMEM;
+		return rc;
 	}
 
 	if (oplockEnabled)
@@ -424,9 +425,10 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 			args.uid = NO_CHANGE_64;
 			args.gid = NO_CHANGE_64;
 		}
-		CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
+		CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
 			cifs_sb->local_nls,
-			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+			cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
 	} else {
 		/* BB implement mode setting via Windows security
 		   descriptors e.g. */
@@ -514,10 +516,10 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 			args.uid = NO_CHANGE_64;
 			args.gid = NO_CHANGE_64;
 		}
-		rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path,
-				&args, cifs_sb->local_nls,
-				cifs_sb->mnt_cifs_flags &
-				CIFS_MOUNT_MAP_SPECIAL_CHR);
+		rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args,
+					    cifs_sb->local_nls,
+					    cifs_sb->mnt_cifs_flags &
+					    CIFS_MOUNT_MAP_SPECIAL_CHR);
 
 		if (!rc) {
 			rc = cifs_get_inode_info_unix(&newinode, full_path,
@@ -540,8 +542,9 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 		buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
 		if (buf == NULL) {
 			kfree(full_path);
+			rc = -ENOMEM;
 			FreeXid(xid);
-			return -ENOMEM;
+			return rc;
 		}
 
 		rc = CIFSSMBOpen(xid, pTcon, full_path,
@@ -641,6 +644,15 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 		}
 	}
 
+	/*
+	 * O_EXCL: optimize away the lookup, but don't hash the dentry. Let
+	 * the VFS handle the create.
+	 */
+	if (nd->flags & LOOKUP_EXCL) {
+		d_instantiate(direntry, NULL);
+		return 0;
+	}
+
 	/* can not grab the rename sem here since it would
 	   deadlock in the cases (beginning of sys_rename itself)
 	   in which we already have the sb rename sem */
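The LOOKUP_EXCL hunk above skips a server round trip because exclusive creates never use the lookup result: with O_CREAT|O_EXCL the open must fail if the name already exists, so whatever a prior lookup found is irrelevant. The userspace-visible contract being relied on, as a standalone illustration (hypothetical /tmp path, illustrative only):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/tmp/excl_demo";	/* hypothetical test file */
	int fd1 = open(path, O_CREAT | O_EXCL | O_WRONLY, 0600);
	int fd2 = open(path, O_CREAT | O_EXCL | O_WRONLY, 0600);

	/* the second open fails with EEXIST regardless of any lookup */
	printf("fd1=%d fd2=%d errno=%d\n", fd1, fd2, fd2 < 0 ? errno : 0);
	if (fd1 >= 0) {
		close(fd1);
		unlink(path);
	}
	return 0;
}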
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index df4a306f697e..87948147d7ec 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -35,26 +35,11 @@
  *		0 - name is not IP
  */
 static int
-is_ip(const char *name)
+is_ip(char *name)
 {
-	int rc;
-	struct sockaddr_in sin_server;
-	struct sockaddr_in6 sin_server6;
-
-	rc = cifs_inet_pton(AF_INET, name,
-			&sin_server.sin_addr.s_addr);
-
-	if (rc <= 0) {
-		/* not ipv4 address, try ipv6 */
-		rc = cifs_inet_pton(AF_INET6, name,
-				&sin_server6.sin6_addr.in6_u);
-		if (rc > 0)
-			return 1;
-	} else {
-		return 1;
-	}
-	/* we failed translating address */
-	return 0;
+	struct sockaddr_storage ss;
+
+	return cifs_convert_address(name, &ss);
 }
 
 static int
@@ -72,7 +57,7 @@ dns_resolver_instantiate(struct key *key, const void *data,
 	ip[datalen] = '\0';
 
 	/* make sure this looks like an address */
-	if (!is_ip((const char *) ip)) {
+	if (!is_ip(ip)) {
 		kfree(ip);
 		return -EINVAL;
 	}
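is_ip is now a thin wrapper around cifs_convert_address, which from its call sites appears to try IPv4 and then IPv6 against a single sockaddr_storage and report success via its return value. A standalone approximation built on userspace inet_pton (the kernel uses its own in4/in6 helpers; the function name here is illustrative):

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>

/* returns nonzero if 'src' parses as an IPv4 or IPv6 address */
static int convert_address(const char *src, struct sockaddr_storage *dst)
{
	struct sockaddr_in *s4 = (struct sockaddr_in *)dst;
	struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)dst;

	if (inet_pton(AF_INET, src, &s4->sin_addr) == 1) {
		dst->ss_family = AF_INET;
		return 1;
	}
	if (inet_pton(AF_INET6, src, &s6->sin6_addr) == 1) {
		dst->ss_family = AF_INET6;
		return 1;
	}
	return 0;	/* not an address */
}

int main(void)
{
	struct sockaddr_storage ss;

	printf("%d %d %d\n", convert_address("192.168.1.10", &ss),
	       convert_address("fe80::1", &ss),
	       convert_address("fileserver", &ss));	/* prints 1 1 0 */
	return 0;
}

Centralizing the two-family fallback is what lets both the mount path and this key-instantiation check shrink to one call.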
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 302ea15f02e6..c34b7f8a217b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -241,7 +241,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 	/* BB need same check in cifs_create too? */
 	/* if not oplocked, invalidate inode pages if mtime or file
 	   size changed */
-	temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
+	temp = cifs_NTtimeToUnix(buf->LastWriteTime);
 	if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
 	    (file->f_path.dentry->d_inode->i_size ==
 	     (loff_t)le64_to_cpu(buf->EndOfFile))) {
@@ -300,14 +300,16 @@ int cifs_open(struct inode *inode, struct file *file)
 	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
 	pCifsFile = cifs_fill_filedata(file);
 	if (pCifsFile) {
+		rc = 0;
 		FreeXid(xid);
-		return 0;
+		return rc;
 	}
 
 	full_path = build_path_from_dentry(file->f_path.dentry);
 	if (full_path == NULL) {
+		rc = -ENOMEM;
 		FreeXid(xid);
-		return -ENOMEM;
+		return rc;
 	}
 
 	cFYI(1, ("inode = 0x%p file flags are 0x%x for %s",
@@ -446,9 +448,9 @@ int cifs_open(struct inode *inode, struct file *file)
 				.mtime	= NO_CHANGE_64,
 				.device	= 0,
 			};
-			CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
+			CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
 					   cifs_sb->local_nls,
 					   cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 		}
 	}
@@ -491,11 +493,12 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 		return -EBADF;
 
 	xid = GetXid();
-	mutex_unlock(&pCifsFile->fh_mutex);
+	mutex_lock(&pCifsFile->fh_mutex);
 	if (!pCifsFile->invalidHandle) {
-		mutex_lock(&pCifsFile->fh_mutex);
+		mutex_unlock(&pCifsFile->fh_mutex);
+		rc = 0;
 		FreeXid(xid);
-		return 0;
+		return rc;
 	}
 
 	if (file->f_path.dentry == NULL) {
@@ -524,7 +527,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	if (full_path == NULL) {
 		rc = -ENOMEM;
 reopen_error_exit:
-		mutex_lock(&pCifsFile->fh_mutex);
+		mutex_unlock(&pCifsFile->fh_mutex);
 		FreeXid(xid);
 		return rc;
 	}
@@ -566,14 +569,14 @@ reopen_error_exit:
 				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
-		mutex_lock(&pCifsFile->fh_mutex);
+		mutex_unlock(&pCifsFile->fh_mutex);
 		cFYI(1, ("cifs_open returned 0x%x", rc));
 		cFYI(1, ("oplock: %d", oplock));
 	} else {
 reopen_success:
 		pCifsFile->netfid = netfid;
 		pCifsFile->invalidHandle = false;
-		mutex_lock(&pCifsFile->fh_mutex);
+		mutex_unlock(&pCifsFile->fh_mutex);
 		pCifsInode = CIFS_I(inode);
 		if (pCifsInode) {
 			if (can_flush) {
@@ -845,8 +848,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 	tcon = cifs_sb->tcon;
 
 	if (file->private_data == NULL) {
+		rc = -EBADF;
 		FreeXid(xid);
-		return -EBADF;
+		return rc;
 	}
 	netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
 
@@ -1805,8 +1809,9 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 	pTcon = cifs_sb->tcon;
 
 	if (file->private_data == NULL) {
+		rc = -EBADF;
 		FreeXid(xid);
-		return -EBADF;
+		return rc;
 	}
 	open_file = (struct cifsFileInfo *)file->private_data;
 
@@ -1885,8 +1890,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 	pTcon = cifs_sb->tcon;
 
 	if (file->private_data == NULL) {
+		rc = -EBADF;
 		FreeXid(xid);
-		return -EBADF;
+		return rc;
 	}
 	open_file = (struct cifsFileInfo *)file->private_data;
 
@@ -2019,8 +2025,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 
 	xid = GetXid();
 	if (file->private_data == NULL) {
+		rc = -EBADF;
 		FreeXid(xid);
-		return -EBADF;
+		return rc;
 	}
 	open_file = (struct cifsFileInfo *)file->private_data;
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
@@ -2185,8 +2192,9 @@ static int cifs_readpage(struct file *file, struct page *page)
 	xid = GetXid();
 
 	if (file->private_data == NULL) {
+		rc = -EBADF;
 		FreeXid(xid);
-		return -EBADF;
+		return rc;
 	}
 
 	cFYI(1, ("readpage %p at offset %d 0x%x\n",
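The fh_mutex hunks in cifs_reopen_file above fix an inverted lock/unlock pair: the old code unlocked a mutex it did not yet hold on entry and then locked it on every exit path, leaving the function effectively unserialized. A standalone sketch of the corrected discipline (pthreads stand-in for the kernel mutex; illustrative names):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t fh_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool invalid_handle = true;

static int reopen_file(void)
{
	pthread_mutex_lock(&fh_mutex);		/* take the lock first */
	if (!invalid_handle) {
		pthread_mutex_unlock(&fh_mutex);	/* drop on early return */
		return 0;
	}

	/* ... re-establish the handle while holding fh_mutex ... */
	invalid_handle = false;

	pthread_mutex_unlock(&fh_mutex);	/* drop on the success path too */
	return 0;
}

int main(void)
{
	printf("%d %d\n", reopen_file(), reopen_file());
	return 0;
}

Every path out of the function releases exactly the lock it acquired, which is the invariant the original code violated.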
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9c869a6dcba1..82d83839655e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -77,239 +77,202 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 	}
 }
 
-static void cifs_unix_info_to_inode(struct inode *inode,
-		FILE_UNIX_BASIC_INFO *info, int force_uid_gid)
+/* populate an inode with info from a cifs_fattr struct */
+void
+cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 {
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
-	__u64 num_of_bytes = le64_to_cpu(info->NumOfBytes);
-	__u64 end_of_file = le64_to_cpu(info->EndOfFile);
+	unsigned long oldtime = cifs_i->time;
+
+	inode->i_atime = fattr->cf_atime;
+	inode->i_mtime = fattr->cf_mtime;
+	inode->i_ctime = fattr->cf_ctime;
+	inode->i_rdev = fattr->cf_rdev;
+	inode->i_nlink = fattr->cf_nlink;
+	inode->i_uid = fattr->cf_uid;
+	inode->i_gid = fattr->cf_gid;
+
+	/* if dynperm is set, don't clobber existing mode */
+	if (inode->i_state & I_NEW ||
+	    !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM))
+		inode->i_mode = fattr->cf_mode;
+
+	cifs_i->cifsAttrs = fattr->cf_cifsattrs;
+	cifs_i->uniqueid = fattr->cf_uniqueid;
+
+	if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
+		cifs_i->time = 0;
+	else
+		cifs_i->time = jiffies;
+
+	cFYI(1, ("inode 0x%p old_time=%ld new_time=%ld", inode,
+		 oldtime, cifs_i->time));
 
-	inode->i_atime = cifs_NTtimeToUnix(le64_to_cpu(info->LastAccessTime));
-	inode->i_mtime =
-		cifs_NTtimeToUnix(le64_to_cpu(info->LastModificationTime));
-	inode->i_ctime = cifs_NTtimeToUnix(le64_to_cpu(info->LastStatusChange));
-	inode->i_mode = le64_to_cpu(info->Permissions);
+	cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
+
+	/*
+	 * Can't safely change the file size here if the client is writing to
+	 * it due to potential races.
+	 */
+	spin_lock(&inode->i_lock);
+	if (is_size_safe_to_change(cifs_i, fattr->cf_eof)) {
+		i_size_write(inode, fattr->cf_eof);
+
+		/*
+		 * i_blocks is not related to (i_size / i_blksize),
+		 * but instead 512 byte (2**9) size is required for
+		 * calculating num blocks.
+		 */
+		inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9;
+	}
+	spin_unlock(&inode->i_lock);
+
+	cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
+}
+
+/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */
+void
+cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
+			 struct cifs_sb_info *cifs_sb)
+{
+	memset(fattr, 0, sizeof(*fattr));
+	fattr->cf_uniqueid = le64_to_cpu(info->UniqueId);
+	fattr->cf_bytes = le64_to_cpu(info->NumOfBytes);
+	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
+
+	fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
+	fattr->cf_mtime = cifs_NTtimeToUnix(info->LastModificationTime);
+	fattr->cf_ctime = cifs_NTtimeToUnix(info->LastStatusChange);
+	fattr->cf_mode = le64_to_cpu(info->Permissions);
 
 	/*
 	 * Since we set the inode type below we need to mask off
 	 * to avoid strange results if bits set above.
 	 */
-	inode->i_mode &= ~S_IFMT;
+	fattr->cf_mode &= ~S_IFMT;
 	switch (le32_to_cpu(info->Type)) {
 	case UNIX_FILE:
-		inode->i_mode |= S_IFREG;
+		fattr->cf_mode |= S_IFREG;
+		fattr->cf_dtype = DT_REG;
 		break;
 	case UNIX_SYMLINK:
-		inode->i_mode |= S_IFLNK;
+		fattr->cf_mode |= S_IFLNK;
+		fattr->cf_dtype = DT_LNK;
 		break;
 	case UNIX_DIR:
-		inode->i_mode |= S_IFDIR;
+		fattr->cf_mode |= S_IFDIR;
+		fattr->cf_dtype = DT_DIR;
 		break;
 	case UNIX_CHARDEV:
-		inode->i_mode |= S_IFCHR;
-		inode->i_rdev = MKDEV(le64_to_cpu(info->DevMajor),
-				      le64_to_cpu(info->DevMinor) & MINORMASK);
+		fattr->cf_mode |= S_IFCHR;
+		fattr->cf_dtype = DT_CHR;
+		fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor),
+				       le64_to_cpu(info->DevMinor) & MINORMASK);
 		break;
 	case UNIX_BLOCKDEV:
-		inode->i_mode |= S_IFBLK;
-		inode->i_rdev = MKDEV(le64_to_cpu(info->DevMajor),
-				      le64_to_cpu(info->DevMinor) & MINORMASK);
+		fattr->cf_mode |= S_IFBLK;
+		fattr->cf_dtype = DT_BLK;
+		fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor),
+				       le64_to_cpu(info->DevMinor) & MINORMASK);
 		break;
 	case UNIX_FIFO:
-		inode->i_mode |= S_IFIFO;
+		fattr->cf_mode |= S_IFIFO;
+		fattr->cf_dtype = DT_FIFO;
 		break;
 	case UNIX_SOCKET:
-		inode->i_mode |= S_IFSOCK;
+		fattr->cf_mode |= S_IFSOCK;
+		fattr->cf_dtype = DT_SOCK;
 		break;
 	default:
 		/* safest to call it a file if we do not know */
-		inode->i_mode |= S_IFREG;
+		fattr->cf_mode |= S_IFREG;
+		fattr->cf_dtype = DT_REG;
 		cFYI(1, ("unknown type %d", le32_to_cpu(info->Type)));
 		break;
 	}
 
-	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) &&
-	    !force_uid_gid)
-		inode->i_uid = cifs_sb->mnt_uid;
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
+		fattr->cf_uid = cifs_sb->mnt_uid;
 	else
-		inode->i_uid = le64_to_cpu(info->Uid);
+		fattr->cf_uid = le64_to_cpu(info->Uid);
 
-	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) &&
-	    !force_uid_gid)
-		inode->i_gid = cifs_sb->mnt_gid;
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
+		fattr->cf_gid = cifs_sb->mnt_gid;
 	else
-		inode->i_gid = le64_to_cpu(info->Gid);
-
-	inode->i_nlink = le64_to_cpu(info->Nlinks);
-
-	cifsInfo->server_eof = end_of_file;
-	spin_lock(&inode->i_lock);
-	if (is_size_safe_to_change(cifsInfo, end_of_file)) {
-		/*
-		 * We can not safely change the file size here if the client
-		 * is writing to it due to potential races.
-		 */
-		i_size_write(inode, end_of_file);
+		fattr->cf_gid = le64_to_cpu(info->Gid);
 
-		/*
-		 * i_blocks is not related to (i_size / i_blksize),
-		 * but instead 512 byte (2**9) size is required for
-		 * calculating num blocks.
-		 */
-		inode->i_blocks = (512 - 1 + num_of_bytes) >> 9;
-	}
-	spin_unlock(&inode->i_lock);
+	fattr->cf_nlink = le64_to_cpu(info->Nlinks);
 }
 
-
 /*
- * Needed to setup inode data for the directory which is the
- * junction to the new submount (ie to setup the fake directory
- * which represents a DFS referral)
- */
-static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat,
-			       struct super_block *sb)
-{
-	struct inode *pinode = NULL;
-
-	memset(pfnd_dat, 0, sizeof(FILE_UNIX_BASIC_INFO));
-
-/*	__le64 pfnd_dat->EndOfFile = cpu_to_le64(0);
-	__le64 pfnd_dat->NumOfBytes = cpu_to_le64(0);
-	__u64 UniqueId = 0;  */
-	pfnd_dat->LastStatusChange =
-		cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
-	pfnd_dat->LastAccessTime =
-		cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
-	pfnd_dat->LastModificationTime =
-		cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
-	pfnd_dat->Type = cpu_to_le32(UNIX_DIR);
-	pfnd_dat->Permissions = cpu_to_le64(S_IXUGO | S_IRWXU);
-	pfnd_dat->Nlinks = cpu_to_le64(2);
-	if (sb->s_root)
-		pinode = sb->s_root->d_inode;
-	if (pinode == NULL)
-		return;
-
-	/* fill in default values for the remaining based on root
-	   inode since we can not query the server for this inode info */
-	pfnd_dat->DevMajor = cpu_to_le64(MAJOR(pinode->i_rdev));
-	pfnd_dat->DevMinor = cpu_to_le64(MINOR(pinode->i_rdev));
-	pfnd_dat->Uid = cpu_to_le64(pinode->i_uid);
-	pfnd_dat->Gid = cpu_to_le64(pinode->i_gid);
-}
-
-/**
- * cifs_new inode - create new inode, initialize, and hash it
- * @sb - pointer to superblock
- * @inum - if valid pointer and serverino is enabled, replace i_ino with val
- *
- * Create a new inode, initialize it for CIFS and hash it. Returns the new
- * inode or NULL if one couldn't be allocated.
+ * Fill a cifs_fattr struct with fake inode info.
  *
- * If the share isn't mounted with "serverino" or inum is a NULL pointer then
- * we'll just use the inode number assigned by new_inode(). Note that this can
- * mean i_ino collisions since the i_ino assigned by new_inode is not
- * guaranteed to be unique.
+ * Needed to setup cifs_fattr data for the directory which is the
+ * junction to the new submount (ie to setup the fake directory
+ * which represents a DFS referral).
  */
-struct inode *
-cifs_new_inode(struct super_block *sb, __u64 *inum)
+static void
+cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 {
-	struct inode *inode;
-
-	inode = new_inode(sb);
-	if (inode == NULL)
-		return NULL;
-
-	/*
-	 * BB: Is i_ino == 0 legal? Here, we assume that it is. If it isn't we
-	 *	stop passing inum as ptr. Are there sanity checks we can use to
-	 *	ensure that the server is really filling in that field? Also,
-	 *	if serverino is disabled, perhaps we should be using iunique()?
-	 */
-	if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
-		inode->i_ino = (unsigned long) *inum;
-
-	/*
-	 * must set this here instead of cifs_alloc_inode since VFS will
-	 * clobber i_flags
-	 */
-	if (sb->s_flags & MS_NOATIME)
-		inode->i_flags |= S_NOATIME | S_NOCMTIME;
-
-	insert_inode_hash(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
-	return inode;
+	cFYI(1, ("creating fake fattr for DFS referral"));
+
+	memset(fattr, 0, sizeof(*fattr));
+	fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
+	fattr->cf_uid = cifs_sb->mnt_uid;
+	fattr->cf_gid = cifs_sb->mnt_gid;
+	fattr->cf_atime = CURRENT_TIME;
+	fattr->cf_ctime = CURRENT_TIME;
+	fattr->cf_mtime = CURRENT_TIME;
+	fattr->cf_nlink = 2;
+	fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
 }
 
 int cifs_get_inode_info_unix(struct inode **pinode,
-	const unsigned char *full_path, struct super_block *sb, int xid)
+			     const unsigned char *full_path,
+			     struct super_block *sb, int xid)
 {
-	int rc = 0;
+	int rc;
 	FILE_UNIX_BASIC_INFO find_data;
-	struct cifsTconInfo *pTcon;
-	struct inode *inode;
+	struct cifs_fattr fattr;
+	struct cifsTconInfo *tcon;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	bool is_dfs_referral = false;
-	struct cifsInodeInfo *cifsInfo;
-	__u64 num_of_bytes;
-	__u64 end_of_file;
 
-	pTcon = cifs_sb->tcon;
+	tcon = cifs_sb->tcon;
 	cFYI(1, ("Getting info on %s", full_path));
 
 	/* could have done a find first instead but this returns more info */
-	rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data,
+	rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
 				  cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 				  CIFS_MOUNT_MAP_SPECIAL_CHR);
-	if (rc == -EREMOTE && !is_dfs_referral) {
-		is_dfs_referral = true;
-		cFYI(DBG2, ("DFS ref"));
-		/* for DFS, server does not give us real inode data */
-		fill_fake_finddataunix(&find_data, sb);
-		rc = 0;
-	} else if (rc)
-		goto cgiiu_exit;
 
-	num_of_bytes = le64_to_cpu(find_data.NumOfBytes);
-	end_of_file = le64_to_cpu(find_data.EndOfFile);
+	if (!rc) {
+		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, sb);
+		rc = 0;
+	} else {
+		return rc;
+	}
 
-	/* get new inode */
 	if (*pinode == NULL) {
-		__u64 unique_id = le64_to_cpu(find_data.UniqueId);
-		*pinode = cifs_new_inode(sb, &unique_id);
-		if (*pinode == NULL) {
+		/* get new inode */
+		*pinode = cifs_iget(sb, &fattr);
+		if (!*pinode)
 			rc = -ENOMEM;
-			goto cgiiu_exit;
-		}
+	} else {
+		/* we already have inode, update it */
+		cifs_fattr_to_inode(*pinode, &fattr);
 	}
 
-	inode = *pinode;
-	cifsInfo = CIFS_I(inode);
-
-	cFYI(1, ("Old time %ld", cifsInfo->time));
-	cifsInfo->time = jiffies;
-	cFYI(1, ("New time %ld", cifsInfo->time));
-	/* this is ok to set on every inode revalidate */
-	atomic_set(&cifsInfo->inUse, 1);
-
-	cifs_unix_info_to_inode(inode, &find_data, 0);
-
-	if (num_of_bytes < end_of_file)
-		cFYI(1, ("allocation size less than end of file"));
-	cFYI(1, ("Size %ld and blocks %llu",
-		 (unsigned long) inode->i_size,
-		 (unsigned long long)inode->i_blocks));
-
-	cifs_set_ops(inode, is_dfs_referral);
-cgiiu_exit:
 	return rc;
 }
 
-static int decode_sfu_inode(struct inode *inode, __u64 size,
-			    const unsigned char *path,
-			    struct cifs_sb_info *cifs_sb, int xid)
+static int
+cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
+	      struct cifs_sb_info *cifs_sb, int xid)
 {
 	int rc;
 	int oplock = 0;
@@ -321,10 +284,15 @@ static int decode_sfu_inode(struct inode *inode, __u64 size,
 
 	pbuf = buf;
 
-	if (size == 0) {
-		inode->i_mode |= S_IFIFO;
+	fattr->cf_mode &= ~S_IFMT;
+
+	if (fattr->cf_eof == 0) {
+		fattr->cf_mode |= S_IFIFO;
+		fattr->cf_dtype = DT_FIFO;
 		return 0;
-	} else if (size < 8) {
+	} else if (fattr->cf_eof < 8) {
+		fattr->cf_mode |= S_IFREG;
+		fattr->cf_dtype = DT_REG;
 		return -EINVAL;	 /* EOPNOTSUPP? */
 	}
 
@@ -336,42 +304,46 @@ static int decode_sfu_inode(struct inode *inode, __u64 size,
 	if (rc == 0) {
 		int buf_type = CIFS_NO_BUFFER;
 		/* Read header */
-		rc = CIFSSMBRead(xid, pTcon,
-			 netfid,
+		rc = CIFSSMBRead(xid, pTcon, netfid,
 			 24 /* length */, 0 /* offset */,
 			 &bytes_read, &pbuf, &buf_type);
 		if ((rc == 0) && (bytes_read >= 8)) {
 			if (memcmp("IntxBLK", pbuf, 8) == 0) {
 				cFYI(1, ("Block device"));
-				inode->i_mode |= S_IFBLK;
+				fattr->cf_mode |= S_IFBLK;
+				fattr->cf_dtype = DT_BLK;
 				if (bytes_read == 24) {
 					/* we have enough to decode dev num */
 					__u64 mjr; /* major */
 					__u64 mnr; /* minor */
 					mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
 					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
-					inode->i_rdev = MKDEV(mjr, mnr);
+					fattr->cf_rdev = MKDEV(mjr, mnr);
 				}
 			} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
 				cFYI(1, ("Char device"));
-				inode->i_mode |= S_IFCHR;
+				fattr->cf_mode |= S_IFCHR;
+				fattr->cf_dtype = DT_CHR;
 				if (bytes_read == 24) {
 					/* we have enough to decode dev num */
 					__u64 mjr; /* major */
 					__u64 mnr; /* minor */
 					mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
 					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
-					inode->i_rdev = MKDEV(mjr, mnr);
+					fattr->cf_rdev = MKDEV(mjr, mnr);
 				}
 			} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
 				cFYI(1, ("Symlink"));
-				inode->i_mode |= S_IFLNK;
+				fattr->cf_mode |= S_IFLNK;
+				fattr->cf_dtype = DT_LNK;
 			} else {
-				inode->i_mode |= S_IFREG; /* file? */
+				fattr->cf_mode |= S_IFREG; /* file? */
+				fattr->cf_dtype = DT_REG;
 				rc = -EOPNOTSUPP;
 			}
 		} else {
-			inode->i_mode |= S_IFREG; /* then it is a file */
+			fattr->cf_mode |= S_IFREG; /* then it is a file */
+			fattr->cf_dtype = DT_REG;
 			rc = -EOPNOTSUPP; /* or some unknown SFU type */
 		}
 		CIFSSMBClose(xid, pTcon, netfid);
@@ -381,9 +353,13 @@ static int decode_sfu_inode(struct inode *inode, __u64 size,
 
 #define SFBITS_MASK (S_ISVTX | S_ISGID | S_ISUID)  /* SETFILEBITS valid bits */
 
-static int get_sfu_mode(struct inode *inode,
-			const unsigned char *path,
-			struct cifs_sb_info *cifs_sb, int xid)
+/*
+ * Fetch mode bits as provided by SFU.
+ *
+ * FIXME: Doesn't this clobber the type bit we got from cifs_sfu_type ?
+ */
+static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
+			 struct cifs_sb_info *cifs_sb, int xid)
 {
 #ifdef CONFIG_CIFS_XATTR
 	ssize_t rc;
@@ -391,68 +367,80 @@ static int get_sfu_mode(struct inode *inode,
391 __u32 mode; 367 __u32 mode;
392 368
393 rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS", 369 rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS",
394 ea_value, 4 /* size of buf */, cifs_sb->local_nls, 370 ea_value, 4 /* size of buf */, cifs_sb->local_nls,
395 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 371 cifs_sb->mnt_cifs_flags &
372 CIFS_MOUNT_MAP_SPECIAL_CHR);
396 if (rc < 0) 373 if (rc < 0)
397 return (int)rc; 374 return (int)rc;
398 else if (rc > 3) { 375 else if (rc > 3) {
399 mode = le32_to_cpu(*((__le32 *)ea_value)); 376 mode = le32_to_cpu(*((__le32 *)ea_value));
400 inode->i_mode &= ~SFBITS_MASK; 377 fattr->cf_mode &= ~SFBITS_MASK;
401 cFYI(1, ("special bits 0%o org mode 0%o", mode, inode->i_mode)); 378 cFYI(1, ("special bits 0%o org mode 0%o", mode,
402 inode->i_mode = (mode & SFBITS_MASK) | inode->i_mode; 379 fattr->cf_mode));
380 fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode;
403 cFYI(1, ("special mode bits 0%o", mode)); 381 cFYI(1, ("special mode bits 0%o", mode));
404 return 0;
405 } else {
406 return 0;
407 } 382 }
383
384 return 0;
408#else 385#else
409 return -EOPNOTSUPP; 386 return -EOPNOTSUPP;
410#endif 387#endif
411} 388}
412 389
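The SETFILEBITS handling in cifs_sfu_mode() above only honors the SUID/SGID/sticky bits from the EA, folding them into a mode that was already derived from the DOS attributes. A minimal standalone sketch of that merge, reusing SFBITS_MASK from the patch (the surrounding program is illustrative, not part of the kernel code):

#include <stdio.h>
#include <stdint.h>
#include <sys/stat.h>

#define SFBITS_MASK (S_ISVTX | S_ISGID | S_ISUID) /* SETFILEBITS valid bits */

/* Fold the special bits carried by the "SETFILEBITS" EA into an existing
 * mode, the way cifs_sfu_mode() updates fattr->cf_mode. */
static uint32_t merge_sfu_bits(uint32_t cf_mode, uint32_t ea_mode)
{
        cf_mode &= ~SFBITS_MASK;                  /* drop stale special bits */
        return (ea_mode & SFBITS_MASK) | cf_mode; /* take SUID/SGID/VTX only */
}

int main(void)
{
        /* hypothetical values: 0644 regular file, EA asks for setuid */
        printf("0%o\n", merge_sfu_bits(S_IFREG | 0644, S_ISUID | 0777));
        return 0; /* prints 0104644 */
}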
413/* 390/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
414 * Needed to setup inode data for the directory which is the 391static void
415 * junction to the new submount (ie to setup the fake directory 392cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
416 * which represents a DFS referral) 393 struct cifs_sb_info *cifs_sb, bool adjust_tz)
417 */
418static void fill_fake_finddata(FILE_ALL_INFO *pfnd_dat,
419 struct super_block *sb)
420{ 394{
421 memset(pfnd_dat, 0, sizeof(FILE_ALL_INFO)); 395 memset(fattr, 0, sizeof(*fattr));
422 396 fattr->cf_cifsattrs = le32_to_cpu(info->Attributes);
423/* __le64 pfnd_dat->AllocationSize = cpu_to_le64(0); 397 if (info->DeletePending)
424 __le64 pfnd_dat->EndOfFile = cpu_to_le64(0); 398 fattr->cf_flags |= CIFS_FATTR_DELETE_PENDING;
425 __u8 pfnd_dat->DeletePending = 0; 399
426 __u8 pfnd_data->Directory = 0; 400 if (info->LastAccessTime)
427 __le32 pfnd_dat->EASize = 0; 401 fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
428 __u64 pfnd_dat->IndexNumber = 0; 402 else
429 __u64 pfnd_dat->IndexNumber1 = 0; */ 403 fattr->cf_atime = CURRENT_TIME;
430 pfnd_dat->CreationTime = 404
431 cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); 405 fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
432 pfnd_dat->LastAccessTime = 406 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
433 cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); 407
434 pfnd_dat->LastWriteTime = 408 if (adjust_tz) {
435 cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); 409 fattr->cf_ctime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
436 pfnd_dat->ChangeTime = 410 fattr->cf_mtime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
437 cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); 411 }
438 pfnd_dat->Attributes = cpu_to_le32(ATTR_DIRECTORY); 412
439 pfnd_dat->NumberOfLinks = cpu_to_le32(2); 413 fattr->cf_eof = le64_to_cpu(info->EndOfFile);
414 fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
415
416 if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
417 fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
418 fattr->cf_dtype = DT_DIR;
419 } else {
420 fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
421 fattr->cf_dtype = DT_REG;
422
423 /* clear write bits if ATTR_READONLY is set */
424 if (fattr->cf_cifsattrs & ATTR_READONLY)
425 fattr->cf_mode &= ~(S_IWUGO);
426 }
427
428 fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
429
430 fattr->cf_uid = cifs_sb->mnt_uid;
431 fattr->cf_gid = cifs_sb->mnt_gid;
440} 432}
441 433
442int cifs_get_inode_info(struct inode **pinode, 434int cifs_get_inode_info(struct inode **pinode,
443 const unsigned char *full_path, FILE_ALL_INFO *pfindData, 435 const unsigned char *full_path, FILE_ALL_INFO *pfindData,
444 struct super_block *sb, int xid, const __u16 *pfid) 436 struct super_block *sb, int xid, const __u16 *pfid)
445{ 437{
446 int rc = 0; 438 int rc = 0, tmprc;
447 __u32 attr;
448 struct cifsInodeInfo *cifsInfo;
449 struct cifsTconInfo *pTcon; 439 struct cifsTconInfo *pTcon;
450 struct inode *inode;
451 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 440 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
452 char *buf = NULL; 441 char *buf = NULL;
453 bool adjustTZ = false; 442 bool adjustTZ = false;
454 bool is_dfs_referral = false; 443 struct cifs_fattr fattr;
455 umode_t default_mode;
456 444
457 pTcon = cifs_sb->tcon; 445 pTcon = cifs_sb->tcon;
458 cFYI(1, ("Getting info on %s", full_path)); 446 cFYI(1, ("Getting info on %s", full_path));
@@ -487,166 +475,85 @@ int cifs_get_inode_info(struct inode **pinode,
487 adjustTZ = true; 475 adjustTZ = true;
488 } 476 }
489 } 477 }
490 /* dump_mem("\nQPathInfo return data",&findData, sizeof(findData)); */ 478
491 if (rc == -EREMOTE) { 479 if (!rc) {
492 is_dfs_referral = true; 480 cifs_all_info_to_fattr(&fattr, (FILE_ALL_INFO *) pfindData,
493 fill_fake_finddata(pfindData, sb); 481 cifs_sb, adjustTZ);
482 } else if (rc == -EREMOTE) {
483 cifs_create_dfs_fattr(&fattr, sb);
494 rc = 0; 484 rc = 0;
495 } else if (rc) 485 } else {
496 goto cgii_exit; 486 goto cgii_exit;
487 }
497 488
498 attr = le32_to_cpu(pfindData->Attributes); 489 /*
499 490 * If an inode wasn't passed in, then get the inode number
500 /* get new inode */ 491 *
492 * Is an i_ino of zero legal? Can we use that to check if the server
493 * supports returning inode numbers? Are there other sanity checks we
494 * can use to ensure that the server is really filling in that field?
495 *
496 * We can not use the IndexNumber field by default from Windows or
497 * Samba (in ALL_INFO buf) but we can request it explicitly. The SNIA
498 * CIFS spec claims that this value is unique within the scope of a
499 * share, and the windows docs hint that it's actually unique
500 * per-machine.
501 *
502 * There may be higher info levels that work but are there Windows
503 * server or network appliances for which IndexNumber field is not
504 * guaranteed unique?
505 */
501 if (*pinode == NULL) { 506 if (*pinode == NULL) {
502 __u64 inode_num;
503 __u64 *pinum = &inode_num;
504
505 /* Is an i_ino of zero legal? Can we use that to check
506 if the server supports returning inode numbers? Are
507 there other sanity checks we can use to ensure that
508 the server is really filling in that field? */
509
510 /* We can not use the IndexNumber field by default from
511 Windows or Samba (in ALL_INFO buf) but we can request
512 it explicitly. It may not be unique presumably if
513 the server has multiple devices mounted under one share */
514
515 /* There may be higher info levels that work but are
516 there Windows server or network appliances for which
517 IndexNumber field is not guaranteed unique? */
518
519 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 507 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
520 int rc1 = 0; 508 int rc1 = 0;
521 509
522 rc1 = CIFSGetSrvInodeNumber(xid, pTcon, 510 rc1 = CIFSGetSrvInodeNumber(xid, pTcon,
523 full_path, pinum, 511 full_path, &fattr.cf_uniqueid,
524 cifs_sb->local_nls, 512 cifs_sb->local_nls,
525 cifs_sb->mnt_cifs_flags & 513 cifs_sb->mnt_cifs_flags &
526 CIFS_MOUNT_MAP_SPECIAL_CHR); 514 CIFS_MOUNT_MAP_SPECIAL_CHR);
527 if (rc1) { 515 if (rc1) {
528 cFYI(1, ("GetSrvInodeNum rc %d", rc1)); 516 cFYI(1, ("GetSrvInodeNum rc %d", rc1));
529 pinum = NULL; 517 fattr.cf_uniqueid = iunique(sb, ROOT_I);
530 /* BB EOPNOSUPP disable SERVER_INUM? */ 518 /* disable serverino if call not supported */
519 if (rc1 == -EINVAL)
520 cifs_sb->mnt_cifs_flags &=
521 ~CIFS_MOUNT_SERVER_INUM;
531 } 522 }
532 } else { 523 } else {
533 pinum = NULL; 524 fattr.cf_uniqueid = iunique(sb, ROOT_I);
534 }
535
536 *pinode = cifs_new_inode(sb, pinum);
537 if (*pinode == NULL) {
538 rc = -ENOMEM;
539 goto cgii_exit;
540 } 525 }
541 }
542 inode = *pinode;
543 cifsInfo = CIFS_I(inode);
544 cifsInfo->cifsAttrs = attr;
545 cifsInfo->delete_pending = pfindData->DeletePending ? true : false;
546 cFYI(1, ("Old time %ld", cifsInfo->time));
547 cifsInfo->time = jiffies;
548 cFYI(1, ("New time %ld", cifsInfo->time));
549
550 /* blksize needs to be multiple of two. So safer to default to
551 blksize and blkbits set in superblock so 2**blkbits and blksize
552 will match rather than setting to:
553 (pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
554
555 /* Linux can not store file creation time so ignore it */
556 if (pfindData->LastAccessTime)
557 inode->i_atime = cifs_NTtimeToUnix
558 (le64_to_cpu(pfindData->LastAccessTime));
559 else /* do not need to use current_fs_time - time not stored */
560 inode->i_atime = CURRENT_TIME;
561 inode->i_mtime =
562 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
563 inode->i_ctime =
564 cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
565 cFYI(DBG2, ("Attributes came in as 0x%x", attr));
566 if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
567 inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj;
568 inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj;
569 }
570
571 /* get default inode mode */
572 if (attr & ATTR_DIRECTORY)
573 default_mode = cifs_sb->mnt_dir_mode;
574 else
575 default_mode = cifs_sb->mnt_file_mode;
576
577 /* set permission bits */
578 if (atomic_read(&cifsInfo->inUse) == 0 ||
579 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0)
580 inode->i_mode = default_mode;
581 else {
582 /* just reenable write bits if !ATTR_READONLY */
583 if ((inode->i_mode & S_IWUGO) == 0 &&
584 (attr & ATTR_READONLY) == 0)
585 inode->i_mode |= (S_IWUGO & default_mode);
586
587 inode->i_mode &= ~S_IFMT;
588 }
589 /* clear write bits if ATTR_READONLY is set */
590 if (attr & ATTR_READONLY)
591 inode->i_mode &= ~S_IWUGO;
592
593 /* set inode type */
594 if ((attr & ATTR_SYSTEM) &&
595 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) {
596 /* no need to fix endianness on 0 */
597 if (pfindData->EndOfFile == 0)
598 inode->i_mode |= S_IFIFO;
599 else if (decode_sfu_inode(inode,
600 le64_to_cpu(pfindData->EndOfFile),
601 full_path, cifs_sb, xid))
602 cFYI(1, ("unknown SFU file type\n"));
603 } else { 526 } else {
604 if (attr & ATTR_DIRECTORY) 527 fattr.cf_uniqueid = CIFS_I(*pinode)->uniqueid;
605 inode->i_mode |= S_IFDIR;
606 else
607 inode->i_mode |= S_IFREG;
608 } 528 }
609 529
610 cifsInfo->server_eof = le64_to_cpu(pfindData->EndOfFile); 530 /* query for SFU type info if supported and needed */
611 spin_lock(&inode->i_lock); 531 if (fattr.cf_cifsattrs & ATTR_SYSTEM &&
612 if (is_size_safe_to_change(cifsInfo, cifsInfo->server_eof)) { 532 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
613 /* can not safely shrink the file size here if the 533 tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
614 client is writing to it due to potential races */ 534 if (tmprc)
615 i_size_write(inode, cifsInfo->server_eof); 535 cFYI(1, ("cifs_sfu_type failed: %d", tmprc));
616
617 /* 512 bytes (2**9) is the fake blocksize that must be
618 used for this calculation */
619 inode->i_blocks = (512 - 1 + le64_to_cpu(
620 pfindData->AllocationSize)) >> 9;
621 } 536 }
622 spin_unlock(&inode->i_lock);
623 537
624 inode->i_nlink = le32_to_cpu(pfindData->NumberOfLinks);
625
626 /* BB fill in uid and gid here? with help from winbind?
627 or retrieve from NTFS stream extended attribute */
628#ifdef CONFIG_CIFS_EXPERIMENTAL 538#ifdef CONFIG_CIFS_EXPERIMENTAL
629 /* fill in 0777 bits from ACL */ 539 /* fill in 0777 bits from ACL */
630 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { 540 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
631 cFYI(1, ("Getting mode bits from ACL")); 541 cFYI(1, ("Getting mode bits from ACL"));
632 acl_to_uid_mode(inode, full_path, pfid); 542 cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
633 } 543 }
634#endif 544#endif
635 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
636 /* fill in remaining high mode bits e.g. SUID, VTX */
637 get_sfu_mode(inode, full_path, cifs_sb, xid);
638 } else if (atomic_read(&cifsInfo->inUse) == 0) {
639 inode->i_uid = cifs_sb->mnt_uid;
640 inode->i_gid = cifs_sb->mnt_gid;
641 /* set so we do not keep refreshing these fields with
642 bad data after user has changed them in memory */
643 atomic_set(&cifsInfo->inUse, 1);
644 }
645
646 cifs_set_ops(inode, is_dfs_referral);
647
648 545
546 /* fill in remaining high mode bits e.g. SUID, VTX */
547 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
548 cifs_sfu_mode(&fattr, full_path, cifs_sb, xid);
649 549
550 if (!*pinode) {
551 *pinode = cifs_iget(sb, &fattr);
552 if (!*pinode)
553 rc = -ENOMEM;
554 } else {
555 cifs_fattr_to_inode(*pinode, &fattr);
556 }
650 557
651cgii_exit: 558cgii_exit:
652 kfree(buf); 559 kfree(buf);
@@ -698,33 +605,78 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
698 return full_path; 605 return full_path;
699} 606}
700 607
608static int
609cifs_find_inode(struct inode *inode, void *opaque)
610{
611 struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
612
613 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
614 return 0;
615
616 return 1;
617}
618
619static int
620cifs_init_inode(struct inode *inode, void *opaque)
621{
622 struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
623
624 CIFS_I(inode)->uniqueid = fattr->cf_uniqueid;
625 return 0;
626}
627
628/* Given fattrs, get a corresponding inode */
629struct inode *
630cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
631{
632 unsigned long hash;
633 struct inode *inode;
634
635 cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid));
636
637 /* hash down to 32-bits on 32-bit arch */
638 hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
639
640 inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
641
642 /* we have fattrs in hand, update the inode */
643 if (inode) {
644 cifs_fattr_to_inode(inode, fattr);
645 if (sb->s_flags & MS_NOATIME)
646 inode->i_flags |= S_NOATIME | S_NOCMTIME;
647 if (inode->i_state & I_NEW) {
648 inode->i_ino = hash;
649 unlock_new_inode(inode);
650 }
651 }
652
653 return inode;
654}
655
701/* gets root inode */ 656/* gets root inode */
702struct inode *cifs_iget(struct super_block *sb, unsigned long ino) 657struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
703{ 658{
704 int xid; 659 int xid;
705 struct cifs_sb_info *cifs_sb; 660 struct cifs_sb_info *cifs_sb;
706 struct inode *inode; 661 struct inode *inode = NULL;
707 long rc; 662 long rc;
708 char *full_path; 663 char *full_path;
709 664
710 inode = iget_locked(sb, ino); 665 cifs_sb = CIFS_SB(sb);
711 if (!inode)
712 return ERR_PTR(-ENOMEM);
713 if (!(inode->i_state & I_NEW))
714 return inode;
715
716 cifs_sb = CIFS_SB(inode->i_sb);
717 full_path = cifs_build_path_to_root(cifs_sb); 666 full_path = cifs_build_path_to_root(cifs_sb);
718 if (full_path == NULL) 667 if (full_path == NULL)
719 return ERR_PTR(-ENOMEM); 668 return ERR_PTR(-ENOMEM);
720 669
721 xid = GetXid(); 670 xid = GetXid();
722 if (cifs_sb->tcon->unix_ext) 671 if (cifs_sb->tcon->unix_ext)
723 rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, 672 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
724 xid);
725 else 673 else
726 rc = cifs_get_inode_info(&inode, full_path, NULL, inode->i_sb, 674 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
727 xid, NULL); 675 xid, NULL);
676
677 if (!inode)
678 return ERR_PTR(-ENOMEM);
679
728 if (rc && cifs_sb->tcon->ipc) { 680 if (rc && cifs_sb->tcon->ipc) {
729 cFYI(1, ("ipc connection - fake read inode")); 681 cFYI(1, ("ipc connection - fake read inode"));
730 inode->i_mode |= S_IFDIR; 682 inode->i_mode |= S_IFDIR;
@@ -740,7 +692,6 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
740 return ERR_PTR(rc); 692 return ERR_PTR(rc);
741 } 693 }
742 694
743 unlock_new_inode(inode);
744 695
745 kfree(full_path); 696 kfree(full_path);
746 /* can not call macro FreeXid here since in a void func 697 /* can not call macro FreeXid here since in a void func
@@ -991,8 +942,9 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
991 * sb->s_vfs_rename_mutex here */ 942 * sb->s_vfs_rename_mutex here */
992 full_path = build_path_from_dentry(dentry); 943 full_path = build_path_from_dentry(dentry);
993 if (full_path == NULL) { 944 if (full_path == NULL) {
945 rc = -ENOMEM;
994 FreeXid(xid); 946 FreeXid(xid);
995 return -ENOMEM; 947 return rc;
996 } 948 }
997 949
998 if ((tcon->ses->capabilities & CAP_UNIX) && 950 if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -1065,44 +1017,6 @@ out_reval:
1065 return rc; 1017 return rc;
1066} 1018}
1067 1019
1068void posix_fill_in_inode(struct inode *tmp_inode,
1069 FILE_UNIX_BASIC_INFO *pData, int isNewInode)
1070{
1071 struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
1072 loff_t local_size;
1073 struct timespec local_mtime;
1074
1075 cifsInfo->time = jiffies;
1076 atomic_inc(&cifsInfo->inUse);
1077
1078 /* save mtime and size */
1079 local_mtime = tmp_inode->i_mtime;
1080 local_size = tmp_inode->i_size;
1081
1082 cifs_unix_info_to_inode(tmp_inode, pData, 1);
1083 cifs_set_ops(tmp_inode, false);
1084
1085 if (!S_ISREG(tmp_inode->i_mode))
1086 return;
1087
1088 /*
1089 * No sense invalidating pages for new inode
1090 * since we have not started caching
1091 * readahead file data yet.
1092 */
1093 if (isNewInode)
1094 return;
1095
1096 if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) &&
1097 (local_size == tmp_inode->i_size)) {
1098 cFYI(1, ("inode exists but unchanged"));
1099 } else {
1100 /* file may have changed on server */
1101 cFYI(1, ("invalidate inode, readdir detected change"));
1102 invalidate_remote_inode(tmp_inode);
1103 }
1104}
1105
1106int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) 1020int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1107{ 1021{
1108 int rc = 0, tmprc; 1022 int rc = 0, tmprc;
@@ -1111,6 +1025,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1111 struct cifsTconInfo *pTcon; 1025 struct cifsTconInfo *pTcon;
1112 char *full_path = NULL; 1026 char *full_path = NULL;
1113 struct inode *newinode = NULL; 1027 struct inode *newinode = NULL;
1028 struct cifs_fattr fattr;
1114 1029
1115 cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode)); 1030 cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode));
1116 1031
@@ -1121,8 +1036,9 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1121 1036
1122 full_path = build_path_from_dentry(direntry); 1037 full_path = build_path_from_dentry(direntry);
1123 if (full_path == NULL) { 1038 if (full_path == NULL) {
1039 rc = -ENOMEM;
1124 FreeXid(xid); 1040 FreeXid(xid);
1125 return -ENOMEM; 1041 return rc;
1126 } 1042 }
1127 1043
1128 if ((pTcon->ses->capabilities & CAP_UNIX) && 1044 if ((pTcon->ses->capabilities & CAP_UNIX) &&
@@ -1149,7 +1065,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1149 cFYI(1, ("posix mkdir returned 0x%x", rc)); 1065 cFYI(1, ("posix mkdir returned 0x%x", rc));
1150 d_drop(direntry); 1066 d_drop(direntry);
1151 } else { 1067 } else {
1152 __u64 unique_id;
1153 if (pInfo->Type == cpu_to_le32(-1)) { 1068 if (pInfo->Type == cpu_to_le32(-1)) {
1154 /* no return info, go query for it */ 1069 /* no return info, go query for it */
1155 kfree(pInfo); 1070 kfree(pInfo);
@@ -1163,20 +1078,15 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1163 else 1078 else
1164 direntry->d_op = &cifs_dentry_ops; 1079 direntry->d_op = &cifs_dentry_ops;
1165 1080
1166 unique_id = le64_to_cpu(pInfo->UniqueId); 1081 cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
1167 newinode = cifs_new_inode(inode->i_sb, &unique_id); 1082 newinode = cifs_iget(inode->i_sb, &fattr);
1168 if (newinode == NULL) { 1083 if (!newinode) {
1169 kfree(pInfo); 1084 kfree(pInfo);
1170 goto mkdir_get_info; 1085 goto mkdir_get_info;
1171 } 1086 }
1172 1087
1173 newinode->i_nlink = 2;
1174 d_instantiate(direntry, newinode); 1088 d_instantiate(direntry, newinode);
1175 1089
1176 /* we already checked in POSIXCreate whether
1177 frame was long enough */
1178 posix_fill_in_inode(direntry->d_inode,
1179 pInfo, 1 /* NewInode */);
1180#ifdef CONFIG_CIFS_DEBUG2 1090#ifdef CONFIG_CIFS_DEBUG2
1181 cFYI(1, ("instantiated dentry %p %s to inode %p", 1091 cFYI(1, ("instantiated dentry %p %s to inode %p",
1182 direntry, direntry->d_name.name, newinode)); 1092 direntry, direntry->d_name.name, newinode));
@@ -1239,10 +1149,10 @@ mkdir_get_info:
1239 args.uid = NO_CHANGE_64; 1149 args.uid = NO_CHANGE_64;
1240 args.gid = NO_CHANGE_64; 1150 args.gid = NO_CHANGE_64;
1241 } 1151 }
1242 CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args, 1152 CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args,
1243 cifs_sb->local_nls, 1153 cifs_sb->local_nls,
1244 cifs_sb->mnt_cifs_flags & 1154 cifs_sb->mnt_cifs_flags &
1245 CIFS_MOUNT_MAP_SPECIAL_CHR); 1155 CIFS_MOUNT_MAP_SPECIAL_CHR);
1246 } else { 1156 } else {
1247 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && 1157 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
1248 (mode & S_IWUGO) == 0) { 1158 (mode & S_IWUGO) == 0) {
@@ -1306,8 +1216,9 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1306 1216
1307 full_path = build_path_from_dentry(direntry); 1217 full_path = build_path_from_dentry(direntry);
1308 if (full_path == NULL) { 1218 if (full_path == NULL) {
1219 rc = -ENOMEM;
1309 FreeXid(xid); 1220 FreeXid(xid);
1310 return -ENOMEM; 1221 return rc;
1311 } 1222 }
1312 1223
1313 rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls, 1224 rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls,
@@ -1511,8 +1422,9 @@ int cifs_revalidate(struct dentry *direntry)
1511 since that would deadlock */ 1422 since that would deadlock */
1512 full_path = build_path_from_dentry(direntry); 1423 full_path = build_path_from_dentry(direntry);
1513 if (full_path == NULL) { 1424 if (full_path == NULL) {
1425 rc = -ENOMEM;
1514 FreeXid(xid); 1426 FreeXid(xid);
1515 return -ENOMEM; 1427 return rc;
1516 } 1428 }
1517 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld " 1429 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1518 "jiffies %ld", full_path, direntry->d_inode, 1430 "jiffies %ld", full_path, direntry->d_inode,
@@ -1621,6 +1533,7 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1621 if (!err) { 1533 if (!err) {
1622 generic_fillattr(dentry->d_inode, stat); 1534 generic_fillattr(dentry->d_inode, stat);
1623 stat->blksize = CIFS_MAX_MSGSIZE; 1535 stat->blksize = CIFS_MAX_MSGSIZE;
1536 stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
1624 } 1537 }
1625 return err; 1538 return err;
1626} 1539}
@@ -1785,6 +1698,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1785 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1698 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1786 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1699 struct cifsTconInfo *pTcon = cifs_sb->tcon;
1787 struct cifs_unix_set_info_args *args = NULL; 1700 struct cifs_unix_set_info_args *args = NULL;
1701 struct cifsFileInfo *open_file;
1788 1702
1789 cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x", 1703 cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x",
1790 direntry->d_name.name, attrs->ia_valid)); 1704 direntry->d_name.name, attrs->ia_valid));
@@ -1871,10 +1785,18 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1871 args->ctime = NO_CHANGE_64; 1785 args->ctime = NO_CHANGE_64;
1872 1786
1873 args->device = 0; 1787 args->device = 0;
1874 rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path, args, 1788 open_file = find_writable_file(cifsInode);
1875 cifs_sb->local_nls, 1789 if (open_file) {
1876 cifs_sb->mnt_cifs_flags & 1790 u16 nfid = open_file->netfid;
1877 CIFS_MOUNT_MAP_SPECIAL_CHR); 1791 u32 npid = open_file->pid;
1792 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
1793 atomic_dec(&open_file->wrtPending);
1794 } else {
1795 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
1796 cifs_sb->local_nls,
1797 cifs_sb->mnt_cifs_flags &
1798 CIFS_MOUNT_MAP_SPECIAL_CHR);
1799 }
1878 1800
1879 if (!rc) 1801 if (!rc)
1880 rc = inode_setattr(inode, attrs); 1802 rc = inode_setattr(inode, attrs);
@@ -1914,8 +1836,9 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1914 1836
1915 full_path = build_path_from_dentry(direntry); 1837 full_path = build_path_from_dentry(direntry);
1916 if (full_path == NULL) { 1838 if (full_path == NULL) {
1839 rc = -ENOMEM;
1917 FreeXid(xid); 1840 FreeXid(xid);
1918 return -ENOMEM; 1841 return rc;
1919 } 1842 }
1920 1843
1921 /* 1844 /*
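The new cifs_iget() above keys the inode cache on the 64-bit server uniqueid: iget5_locked() hashes on cifs_uniqueid_to_ino_t(fattr->cf_uniqueid) but matches via cifs_find_inode(), so a hash collision on a 32-bit ino_t still resolves to the right inode. The fold helper itself is defined outside this hunk; the sketch below is a plausible standalone version of the idea, and the XOR fold is an assumption rather than a quote from the patch:

#include <stdint.h>
#include <stdio.h>

/* Fold a 64-bit uniqueid down to a 32-bit inode-number hash by XORing
 * the high word into the low word. Illustrative only; the kernel helper
 * is cifs_uniqueid_to_ino_t(), whose body is not shown in this hunk. */
static uint32_t uniqueid_to_ino32(uint64_t uniqueid)
{
        return (uint32_t)uniqueid ^ (uint32_t)(uniqueid >> 32);
}

int main(void)
{
        printf("ino=%#x\n", uniqueid_to_ino32(0x9abcdef012345678ULL));
        return 0; /* prints ino=0x88888888 */
}

Because two uniqueids can fold to the same hash, the cifs_find_inode() callback comparing CIFS_I(inode)->uniqueid is what keeps collisions from aliasing distinct inodes.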
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index cd83c53fcbb5..fc1e0487eaee 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -172,8 +172,9 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
172 full_path = build_path_from_dentry(direntry); 172 full_path = build_path_from_dentry(direntry);
173 173
174 if (full_path == NULL) { 174 if (full_path == NULL) {
175 rc = -ENOMEM;
175 FreeXid(xid); 176 FreeXid(xid);
176 return -ENOMEM; 177 return rc;
177 } 178 }
178 179
179 cFYI(1, ("Full path: %s", full_path)); 180 cFYI(1, ("Full path: %s", full_path));
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index e2fe998989a3..bd6d6895730d 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -133,10 +133,12 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
133 {0, 0} 133 {0, 0}
134}; 134};
135 135
136/* Convert string containing dotted ip address to binary form */ 136/*
137/* returns 0 if invalid address */ 137 * Convert a string containing text IPv4 or IPv6 address to binary form.
138 138 *
139int 139 * Returns 0 on failure.
140 */
141static int
140cifs_inet_pton(const int address_family, const char *cp, void *dst) 142cifs_inet_pton(const int address_family, const char *cp, void *dst)
141{ 143{
142 int ret = 0; 144 int ret = 0;
@@ -153,6 +155,52 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
153 return ret; 155 return ret;
154} 156}
155 157
158/*
159 * Try to convert a string to an IPv4 address and then attempt to convert
160 * it to an IPv6 address if that fails. Set the family field if either
161 * succeeds. If it's an IPv6 address and it has a '%' sign in it, try to
162 * treat the part following it as a numeric sin6_scope_id.
163 *
164 * Returns 0 on failure.
165 */
166int
167cifs_convert_address(char *src, void *dst)
168{
169 int rc;
170 char *pct, *endp;
171 struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
172 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
173
174 /* IPv4 address */
175 if (cifs_inet_pton(AF_INET, src, &s4->sin_addr.s_addr)) {
176 s4->sin_family = AF_INET;
177 return 1;
178 }
179
180 /* temporarily terminate string */
181 pct = strchr(src, '%');
182 if (pct)
183 *pct = '\0';
184
185 rc = cifs_inet_pton(AF_INET6, src, &s6->sin6_addr.s6_addr);
186
187 /* repair temp termination (if any) and make pct point to scopeid */
188 if (pct)
189 *pct++ = '%';
190
191 if (!rc)
192 return rc;
193
194 s6->sin6_family = AF_INET6;
195 if (pct) {
196 s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0);
197 if (!*pct || *endp)
198 return 0;
199 }
200
201 return rc;
202}
203
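A userspace analog of the cifs_convert_address() flow above: try IPv4 first, then IPv6, temporarily NUL-terminating at '%' and treating the remainder as a numeric sin6_scope_id. The function name and the use of libc inet_pton()/strtoul() are illustrative stand-ins for the kernel helpers:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int convert_address(char *src, struct sockaddr_storage *dst)
{
        struct sockaddr_in *s4 = (struct sockaddr_in *)dst;
        struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)dst;
        char *pct, *endp;

        if (inet_pton(AF_INET, src, &s4->sin_addr) == 1) {
                s4->sin_family = AF_INET;
                return 1;
        }

        pct = strchr(src, '%');         /* temporarily terminate string */
        if (pct)
                *pct = '\0';

        if (inet_pton(AF_INET6, src, &s6->sin6_addr) != 1) {
                if (pct)
                        *pct = '%';     /* repair temp termination */
                return 0;
        }
        if (pct)
                *pct++ = '%';           /* repair; pct now points at scope */

        s6->sin6_family = AF_INET6;
        if (pct) {
                s6->sin6_scope_id = (uint32_t)strtoul(pct, &endp, 0);
                if (!*pct || *endp)
                        return 0;       /* empty or non-numeric scope id */
        }
        return 1;
}

int main(void)
{
        struct sockaddr_storage ss;
        char addr[] = "fe80::1%2";
        printf("%d\n", convert_address(addr, &ss)); /* prints 1 */
        return 0;
}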
156/***************************************************************************** 204/*****************************************************************************
157convert a NT status code to a dos class/code 205convert a NT status code to a dos class/code
158 *****************************************************************************/ 206 *****************************************************************************/
@@ -853,12 +901,12 @@ smbCalcSize_LE(struct smb_hdr *ptr)
853 901
854#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) 902#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
855 903
856 /* 904/*
857 * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) 905 * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
858 * into Unix UTC (based 1970-01-01, in seconds). 906 * into Unix UTC (based 1970-01-01, in seconds).
859 */ 907 */
860struct timespec 908struct timespec
861cifs_NTtimeToUnix(u64 ntutc) 909cifs_NTtimeToUnix(__le64 ntutc)
862{ 910{
863 struct timespec ts; 911 struct timespec ts;
864 /* BB what about the timezone? BB */ 912 /* BB what about the timezone? BB */
@@ -866,7 +914,7 @@ cifs_NTtimeToUnix(u64 ntutc)
866 /* Subtract the NTFS time offset, then convert to 1s intervals. */ 914 /* Subtract the NTFS time offset, then convert to 1s intervals. */
867 u64 t; 915 u64 t;
868 916
869 t = ntutc - NTFS_TIME_OFFSET; 917 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
870 ts.tv_nsec = do_div(t, 10000000) * 100; 918 ts.tv_nsec = do_div(t, 10000000) * 100;
871 ts.tv_sec = t; 919 ts.tv_sec = t;
872 return ts; 920 return ts;
@@ -883,16 +931,12 @@ cifs_UnixTimeToNT(struct timespec t)
883static int total_days_of_prev_months[] = 931static int total_days_of_prev_months[] =
884{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; 932{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334};
885 933
886 934struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
887__le64 cnvrtDosCifsTm(__u16 date, __u16 time)
888{
889 return cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(date, time)));
890}
891
892struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
893{ 935{
894 struct timespec ts; 936 struct timespec ts;
895 int sec, min, days, month, year; 937 int sec, min, days, month, year;
938 u16 date = le16_to_cpu(le_date);
939 u16 time = le16_to_cpu(le_time);
896 SMB_TIME *st = (SMB_TIME *)&time; 940 SMB_TIME *st = (SMB_TIME *)&time;
897 SMB_DATE *sd = (SMB_DATE *)&date; 941 SMB_DATE *sd = (SMB_DATE *)&date;
898 942
@@ -933,7 +977,7 @@ struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
933 days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0); 977 days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0);
934 sec += 24 * 60 * 60 * days; 978 sec += 24 * 60 * 60 * days;
935 979
936 ts.tv_sec = sec; 980 ts.tv_sec = sec + offset;
937 981
938 /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */ 982 /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */
939 983
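For reference, NTFS_TIME_OFFSET above encodes the 1601-to-1970 epoch gap in 100 ns ticks: 369 years plus 89 leap days is 134,774 days, i.e. 11,644,473,600 seconds. A standalone sketch of the same arithmetic that cifs_NTtimeToUnix() performs, with the le64 handling and do_div() replaced by plain C:

#include <stdint.h>
#include <stdio.h>

#define NTFS_TIME_OFFSET ((uint64_t)(369 * 365 + 89) * 24 * 3600 * 10000000)

/* NT UTC (100 ns ticks since 1601-01-01) -> Unix UTC (since 1970-01-01) */
static void nt_to_unix(uint64_t ntutc, uint64_t *sec, uint32_t *nsec)
{
        uint64_t t = ntutc - NTFS_TIME_OFFSET;

        *nsec = (uint32_t)(t % 10000000) * 100;
        *sec = t / 10000000;
}

int main(void)
{
        uint64_t sec;
        uint32_t nsec;

        /* 2009-01-01T00:00:00Z expressed as NT ticks */
        nt_to_unix(128752416000000000ULL, &sec, &nsec);
        printf("%llu.%09u\n", (unsigned long long)sec, nsec);
        return 0; /* prints 1230768000.000000000 */
}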
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 964e097c8203..f823a4a208a7 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -63,386 +63,123 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
63} 63}
64#endif /* DEBUG2 */ 64#endif /* DEBUG2 */
65 65
66/* Returns 1 if new inode created, 2 if both dentry and inode were */ 66/*
67/* Might check in the future if inode number changed so we can rehash inode */ 67 * Find the dentry that matches "name". If there isn't one, create one. If it's
68static int 68 * a negative dentry or the uniqueid changed, then drop it and recreate it.
69construct_dentry(struct qstr *qstring, struct file *file, 69 */
70 struct inode **ptmp_inode, struct dentry **pnew_dentry, 70static struct dentry *
71 __u64 *inum) 71cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
72 struct cifs_fattr *fattr)
72{ 73{
73 struct dentry *tmp_dentry = NULL; 74 struct dentry *dentry, *alias;
74 struct super_block *sb = file->f_path.dentry->d_sb; 75 struct inode *inode;
75 int rc = 0; 76 struct super_block *sb = parent->d_inode->i_sb;
76 77
77 cFYI(1, ("For %s", qstring->name)); 78 cFYI(1, ("For %s", name->name));
78 79
79 qstring->hash = full_name_hash(qstring->name, qstring->len); 80 dentry = d_lookup(parent, name);
80 tmp_dentry = d_lookup(file->f_path.dentry, qstring); 81 if (dentry) {
81 if (tmp_dentry) { 82 /* FIXME: check for inode number changes? */
82 /* BB: overwrite old name? i.e. tmp_dentry->d_name and 83 if (dentry->d_inode != NULL)
83 * tmp_dentry->d_name.len?? 84 return dentry;
84 */ 85 d_drop(dentry);
85 cFYI(0, ("existing dentry with inode 0x%p", 86 dput(dentry);
86 tmp_dentry->d_inode));
87 *ptmp_inode = tmp_dentry->d_inode;
88 if (*ptmp_inode == NULL) {
89 *ptmp_inode = cifs_new_inode(sb, inum);
90 if (*ptmp_inode == NULL)
91 return rc;
92 rc = 1;
93 }
94 } else {
95 tmp_dentry = d_alloc(file->f_path.dentry, qstring);
96 if (tmp_dentry == NULL) {
97 cERROR(1, ("Failed allocating dentry"));
98 *ptmp_inode = NULL;
99 return rc;
100 }
101
102 if (CIFS_SB(sb)->tcon->nocase)
103 tmp_dentry->d_op = &cifs_ci_dentry_ops;
104 else
105 tmp_dentry->d_op = &cifs_dentry_ops;
106
107 *ptmp_inode = cifs_new_inode(sb, inum);
108 if (*ptmp_inode == NULL)
109 return rc;
110 rc = 2;
111 } 87 }
112 88
113 tmp_dentry->d_time = jiffies; 89 dentry = d_alloc(parent, name);
114 *pnew_dentry = tmp_dentry; 90 if (dentry == NULL)
115 return rc; 91 return NULL;
116}
117 92
118static void AdjustForTZ(struct cifsTconInfo *tcon, struct inode *inode) 93 inode = cifs_iget(sb, fattr);
119{ 94 if (!inode) {
120 if ((tcon) && (tcon->ses) && (tcon->ses->server)) { 95 dput(dentry);
121 inode->i_ctime.tv_sec += tcon->ses->server->timeAdj; 96 return NULL;
122 inode->i_mtime.tv_sec += tcon->ses->server->timeAdj;
123 inode->i_atime.tv_sec += tcon->ses->server->timeAdj;
124 } 97 }
125 return;
126}
127
128 98
129static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, 99 if (CIFS_SB(sb)->tcon->nocase)
130 char *buf, unsigned int *pobject_type, int isNewInode) 100 dentry->d_op = &cifs_ci_dentry_ops;
131{ 101 else
132 loff_t local_size; 102 dentry->d_op = &cifs_dentry_ops;
133 struct timespec local_mtime; 103
134 104 alias = d_materialise_unique(dentry, inode);
135 struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode); 105 if (alias != NULL) {
136 struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb); 106 dput(dentry);
137 __u32 attr; 107 if (IS_ERR(alias))
138 __u64 allocation_size; 108 return NULL;
139 __u64 end_of_file; 109 dentry = alias;
140 umode_t default_mode;
141
142 /* save mtime and size */
143 local_mtime = tmp_inode->i_mtime;
144 local_size = tmp_inode->i_size;
145
146 if (new_buf_type) {
147 FILE_DIRECTORY_INFO *pfindData = (FILE_DIRECTORY_INFO *)buf;
148
149 attr = le32_to_cpu(pfindData->ExtFileAttributes);
150 allocation_size = le64_to_cpu(pfindData->AllocationSize);
151 end_of_file = le64_to_cpu(pfindData->EndOfFile);
152 tmp_inode->i_atime =
153 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
154 tmp_inode->i_mtime =
155 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
156 tmp_inode->i_ctime =
157 cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
158 } else { /* legacy, OS2 and DOS style */
159/* struct timespec ts;*/
160 FIND_FILE_STANDARD_INFO *pfindData =
161 (FIND_FILE_STANDARD_INFO *)buf;
162
163 tmp_inode->i_mtime = cnvrtDosUnixTm(
164 le16_to_cpu(pfindData->LastWriteDate),
165 le16_to_cpu(pfindData->LastWriteTime));
166 tmp_inode->i_atime = cnvrtDosUnixTm(
167 le16_to_cpu(pfindData->LastAccessDate),
168 le16_to_cpu(pfindData->LastAccessTime));
169 tmp_inode->i_ctime = cnvrtDosUnixTm(
170 le16_to_cpu(pfindData->LastWriteDate),
171 le16_to_cpu(pfindData->LastWriteTime));
172 AdjustForTZ(cifs_sb->tcon, tmp_inode);
173 attr = le16_to_cpu(pfindData->Attributes);
174 allocation_size = le32_to_cpu(pfindData->AllocationSize);
175 end_of_file = le32_to_cpu(pfindData->DataSize);
176 } 110 }
177 111
178 /* Linux can not store file creation time unfortunately so ignore it */ 112 return dentry;
113}
179 114
180 cifsInfo->cifsAttrs = attr; 115static void
181#ifdef CONFIG_CIFS_EXPERIMENTAL 116cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
182 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { 117{
183 /* get more accurate mode via ACL - so force inode refresh */ 118 fattr->cf_uid = cifs_sb->mnt_uid;
184 cifsInfo->time = 0; 119 fattr->cf_gid = cifs_sb->mnt_gid;
185 } else
186#endif /* CONFIG_CIFS_EXPERIMENTAL */
187 cifsInfo->time = jiffies;
188
189 /* treat dos attribute of read-only as read-only mode bit e.g. 555? */
190 /* 2767 perms - indicate mandatory locking */
191 /* BB fill in uid and gid here? with help from winbind?
192 or retrieve from NTFS stream extended attribute */
193 if (atomic_read(&cifsInfo->inUse) == 0) {
194 tmp_inode->i_uid = cifs_sb->mnt_uid;
195 tmp_inode->i_gid = cifs_sb->mnt_gid;
196 }
197 120
198 if (attr & ATTR_DIRECTORY) 121 if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
199 default_mode = cifs_sb->mnt_dir_mode; 122 fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
200 else 123 fattr->cf_dtype = DT_DIR;
201 default_mode = cifs_sb->mnt_file_mode; 124 } else {
202 125 fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
203 /* set initial permissions */ 126 fattr->cf_dtype = DT_REG;
204 if ((atomic_read(&cifsInfo->inUse) == 0) ||
205 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0)
206 tmp_inode->i_mode = default_mode;
207 else {
208 /* just reenable write bits if !ATTR_READONLY */
209 if ((tmp_inode->i_mode & S_IWUGO) == 0 &&
210 (attr & ATTR_READONLY) == 0)
211 tmp_inode->i_mode |= (S_IWUGO & default_mode);
212
213 tmp_inode->i_mode &= ~S_IFMT;
214 } 127 }
215 128
216 /* clear write bits if ATTR_READONLY is set */ 129 if (fattr->cf_cifsattrs & ATTR_READONLY)
217 if (attr & ATTR_READONLY) 130 fattr->cf_mode &= ~S_IWUGO;
218 tmp_inode->i_mode &= ~S_IWUGO;
219 131
220 /* set inode type */ 132 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL &&
221 if ((attr & ATTR_SYSTEM) && 133 fattr->cf_cifsattrs & ATTR_SYSTEM) {
222 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) { 134 if (fattr->cf_eof == 0) {
223 if (end_of_file == 0) { 135 fattr->cf_mode &= ~S_IFMT;
224 tmp_inode->i_mode |= S_IFIFO; 136 fattr->cf_mode |= S_IFIFO;
225 *pobject_type = DT_FIFO; 137 fattr->cf_dtype = DT_FIFO;
226 } else { 138 } else {
227 /* 139 /*
228 * trying to get the type can be slow, so just call 140 * trying to get the type and mode via SFU can be slow,
229 * this a regular file for now, and mark for reval 141 * so just call those regular files for now, and mark
142 * for reval
230 */ 143 */
231 tmp_inode->i_mode |= S_IFREG; 144 fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
232 *pobject_type = DT_REG;
233 cifsInfo->time = 0;
234 }
235 } else {
236 if (attr & ATTR_DIRECTORY) {
237 tmp_inode->i_mode |= S_IFDIR;
238 *pobject_type = DT_DIR;
239 } else {
240 tmp_inode->i_mode |= S_IFREG;
241 *pobject_type = DT_REG;
242 } 145 }
243 } 146 }
147}
244 148
245 /* can not fill in nlink here as in qpathinfo version and Unx search */ 149void
246 if (atomic_read(&cifsInfo->inUse) == 0) 150cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
247 atomic_set(&cifsInfo->inUse, 1); 151 struct cifs_sb_info *cifs_sb)
248 152{
249 cifsInfo->server_eof = end_of_file; 153 memset(fattr, 0, sizeof(*fattr));
250 spin_lock(&tmp_inode->i_lock); 154 fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes);
251 if (is_size_safe_to_change(cifsInfo, end_of_file)) { 155 fattr->cf_eof = le64_to_cpu(info->EndOfFile);
252 /* can not safely change the file size here if the 156 fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
253 client is writing to it due to potential races */ 157 fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
254 i_size_write(tmp_inode, end_of_file); 158 fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
255 159 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
256 /* 512 bytes (2**9) is the fake blocksize that must be used */ 160
257 /* for this calculation, even though the reported blocksize is larger */ 161 cifs_fill_common_info(fattr, cifs_sb);
258 tmp_inode->i_blocks = (512 - 1 + allocation_size) >> 9;
259 }
260 spin_unlock(&tmp_inode->i_lock);
261
262 if (allocation_size < end_of_file)
263 cFYI(1, ("May be sparse file, allocation less than file size"));
264 cFYI(1, ("File Size %ld and blocks %llu",
265 (unsigned long)tmp_inode->i_size,
266 (unsigned long long)tmp_inode->i_blocks));
267 if (S_ISREG(tmp_inode->i_mode)) {
268 cFYI(1, ("File inode"));
269 tmp_inode->i_op = &cifs_file_inode_ops;
270 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
271 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
272 tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
273 else
274 tmp_inode->i_fop = &cifs_file_direct_ops;
275 } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
276 tmp_inode->i_fop = &cifs_file_nobrl_ops;
277 else
278 tmp_inode->i_fop = &cifs_file_ops;
279
280 if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
281 (cifs_sb->tcon->ses->server->maxBuf <
282 PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
283 tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
284 else
285 tmp_inode->i_data.a_ops = &cifs_addr_ops;
286
287 if (isNewInode)
288 return; /* No sense invalidating pages for new inode
 289 since we have not started caching readahead file
290 data yet */
291
292 if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) &&
293 (local_size == tmp_inode->i_size)) {
294 cFYI(1, ("inode exists but unchanged"));
295 } else {
296 /* file may have changed on server */
297 cFYI(1, ("invalidate inode, readdir detected change"));
298 invalidate_remote_inode(tmp_inode);
299 }
300 } else if (S_ISDIR(tmp_inode->i_mode)) {
301 cFYI(1, ("Directory inode"));
302 tmp_inode->i_op = &cifs_dir_inode_ops;
303 tmp_inode->i_fop = &cifs_dir_ops;
304 } else if (S_ISLNK(tmp_inode->i_mode)) {
305 cFYI(1, ("Symbolic Link inode"));
306 tmp_inode->i_op = &cifs_symlink_inode_ops;
307 } else {
308 cFYI(1, ("Init special inode"));
309 init_special_inode(tmp_inode, tmp_inode->i_mode,
310 tmp_inode->i_rdev);
311 }
312} 162}
313 163
314static void unix_fill_in_inode(struct inode *tmp_inode, 164void
315 FILE_UNIX_INFO *pfindData, unsigned int *pobject_type, int isNewInode) 165cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
166 struct cifs_sb_info *cifs_sb)
316{ 167{
317 loff_t local_size; 168 int offset = cifs_sb->tcon->ses->server->timeAdj;
318 struct timespec local_mtime;
319
320 struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
321 struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb);
322
323 __u32 type = le32_to_cpu(pfindData->Type);
324 __u64 num_of_bytes = le64_to_cpu(pfindData->NumOfBytes);
325 __u64 end_of_file = le64_to_cpu(pfindData->EndOfFile);
326 cifsInfo->time = jiffies;
327 atomic_inc(&cifsInfo->inUse);
328
329 /* save mtime and size */
330 local_mtime = tmp_inode->i_mtime;
331 local_size = tmp_inode->i_size;
332
333 tmp_inode->i_atime =
334 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
335 tmp_inode->i_mtime =
336 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastModificationTime));
337 tmp_inode->i_ctime =
338 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastStatusChange));
339
340 tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions);
341 /* since we set the inode type below we need to mask off type
342 to avoid strange results if bits above were corrupt */
343 tmp_inode->i_mode &= ~S_IFMT;
344 if (type == UNIX_FILE) {
345 *pobject_type = DT_REG;
346 tmp_inode->i_mode |= S_IFREG;
347 } else if (type == UNIX_SYMLINK) {
348 *pobject_type = DT_LNK;
349 tmp_inode->i_mode |= S_IFLNK;
350 } else if (type == UNIX_DIR) {
351 *pobject_type = DT_DIR;
352 tmp_inode->i_mode |= S_IFDIR;
353 } else if (type == UNIX_CHARDEV) {
354 *pobject_type = DT_CHR;
355 tmp_inode->i_mode |= S_IFCHR;
356 tmp_inode->i_rdev = MKDEV(le64_to_cpu(pfindData->DevMajor),
357 le64_to_cpu(pfindData->DevMinor) & MINORMASK);
358 } else if (type == UNIX_BLOCKDEV) {
359 *pobject_type = DT_BLK;
360 tmp_inode->i_mode |= S_IFBLK;
361 tmp_inode->i_rdev = MKDEV(le64_to_cpu(pfindData->DevMajor),
362 le64_to_cpu(pfindData->DevMinor) & MINORMASK);
363 } else if (type == UNIX_FIFO) {
364 *pobject_type = DT_FIFO;
365 tmp_inode->i_mode |= S_IFIFO;
366 } else if (type == UNIX_SOCKET) {
367 *pobject_type = DT_SOCK;
368 tmp_inode->i_mode |= S_IFSOCK;
369 } else {
370 /* safest to just call it a file */
371 *pobject_type = DT_REG;
372 tmp_inode->i_mode |= S_IFREG;
373 cFYI(1, ("unknown inode type %d", type));
374 }
375 169
376 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) 170 memset(fattr, 0, sizeof(*fattr));
377 tmp_inode->i_uid = cifs_sb->mnt_uid; 171 fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate,
378 else 172 info->LastAccessTime, offset);
379 tmp_inode->i_uid = le64_to_cpu(pfindData->Uid); 173 fattr->cf_ctime = cnvrtDosUnixTm(info->LastWriteDate,
380 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) 174 info->LastWriteTime, offset);
381 tmp_inode->i_gid = cifs_sb->mnt_gid; 175 fattr->cf_mtime = cnvrtDosUnixTm(info->LastWriteDate,
382 else 176 info->LastWriteTime, offset);
383 tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
384 tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks);
385
386 cifsInfo->server_eof = end_of_file;
387 spin_lock(&tmp_inode->i_lock);
388 if (is_size_safe_to_change(cifsInfo, end_of_file)) {
389 /* can not safely change the file size here if the
390 client is writing to it due to potential races */
391 i_size_write(tmp_inode, end_of_file);
392
393 /* 512 bytes (2**9) is the fake blocksize that must be used */
394 /* for this calculation, not the real blocksize */
395 tmp_inode->i_blocks = (512 - 1 + num_of_bytes) >> 9;
396 }
397 spin_unlock(&tmp_inode->i_lock);
398 177
399 if (S_ISREG(tmp_inode->i_mode)) { 178 fattr->cf_cifsattrs = le16_to_cpu(info->Attributes);
400 cFYI(1, ("File inode")); 179 fattr->cf_bytes = le32_to_cpu(info->AllocationSize);
401 tmp_inode->i_op = &cifs_file_inode_ops; 180 fattr->cf_eof = le32_to_cpu(info->DataSize);
402 181
403 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { 182 cifs_fill_common_info(fattr, cifs_sb);
404 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
405 tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
406 else
407 tmp_inode->i_fop = &cifs_file_direct_ops;
408 } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
409 tmp_inode->i_fop = &cifs_file_nobrl_ops;
410 else
411 tmp_inode->i_fop = &cifs_file_ops;
412
413 if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
414 (cifs_sb->tcon->ses->server->maxBuf <
415 PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
416 tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
417 else
418 tmp_inode->i_data.a_ops = &cifs_addr_ops;
419
420 if (isNewInode)
421 return; /* No sense invalidating pages for new inode
422 since we have not started caching readahead
423 file data for it yet */
424
425 if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) &&
426 (local_size == tmp_inode->i_size)) {
427 cFYI(1, ("inode exists but unchanged"));
428 } else {
429 /* file may have changed on server */
430 cFYI(1, ("invalidate inode, readdir detected change"));
431 invalidate_remote_inode(tmp_inode);
432 }
433 } else if (S_ISDIR(tmp_inode->i_mode)) {
434 cFYI(1, ("Directory inode"));
435 tmp_inode->i_op = &cifs_dir_inode_ops;
436 tmp_inode->i_fop = &cifs_dir_ops;
437 } else if (S_ISLNK(tmp_inode->i_mode)) {
438 cFYI(1, ("Symbolic Link inode"));
439 tmp_inode->i_op = &cifs_symlink_inode_ops;
440/* tmp_inode->i_fop = *//* do not need to set to anything */
441 } else {
442 cFYI(1, ("Special inode"));
443 init_special_inode(tmp_inode, tmp_inode->i_mode,
444 tmp_inode->i_rdev);
445 }
446} 183}
447 184
448/* BB eventually need to add the following helper function to 185/* BB eventually need to add the following helper function to
@@ -884,7 +621,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
884 len = strnlen(filename, PATH_MAX); 621 len = strnlen(filename, PATH_MAX);
885 } 622 }
886 623
887 *pinum = le64_to_cpu(pFindData->UniqueId); 624 *pinum = le64_to_cpu(pFindData->basic.UniqueId);
888 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { 625 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
889 FILE_DIRECTORY_INFO *pFindData = 626 FILE_DIRECTORY_INFO *pFindData =
890 (FILE_DIRECTORY_INFO *)current_entry; 627 (FILE_DIRECTORY_INFO *)current_entry;
@@ -944,11 +681,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
944 int rc = 0; 681 int rc = 0;
945 struct qstr qstring; 682 struct qstr qstring;
946 struct cifsFileInfo *pCifsF; 683 struct cifsFileInfo *pCifsF;
947 unsigned int obj_type; 684 u64 inum;
948 __u64 inum; 685 ino_t ino;
686 struct super_block *sb;
949 struct cifs_sb_info *cifs_sb; 687 struct cifs_sb_info *cifs_sb;
950 struct inode *tmp_inode;
951 struct dentry *tmp_dentry; 688 struct dentry *tmp_dentry;
689 struct cifs_fattr fattr;
952 690
953 /* get filename and len into qstring */ 691 /* get filename and len into qstring */
954 /* get dentry */ 692 /* get dentry */
@@ -966,60 +704,53 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
966 if (rc != 0) 704 if (rc != 0)
967 return 0; 705 return 0;
968 706
969 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 707 sb = file->f_path.dentry->d_sb;
708 cifs_sb = CIFS_SB(sb);
970 709
971 qstring.name = scratch_buf; 710 qstring.name = scratch_buf;
972 rc = cifs_get_name_from_search_buf(&qstring, pfindEntry, 711 rc = cifs_get_name_from_search_buf(&qstring, pfindEntry,
973 pCifsF->srch_inf.info_level, 712 pCifsF->srch_inf.info_level,
974 pCifsF->srch_inf.unicode, cifs_sb, 713 pCifsF->srch_inf.unicode, cifs_sb,
975 max_len, 714 max_len, &inum /* returned */);
976 &inum /* returned */);
977 715
978 if (rc) 716 if (rc)
979 return rc; 717 return rc;
980 718
981 /* only these two infolevels return valid inode numbers */
982 if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX ||
983 pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
984 rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
985 &inum);
986 else
987 rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
988 NULL);
989
990 if ((tmp_inode == NULL) || (tmp_dentry == NULL))
991 return -ENOMEM;
992
993 /* we pass in rc below, indicating whether it is a new inode,
994 so we can figure out whether to invalidate the inode cached
995 data if the file has changed */
996 if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX) 719 if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX)
997 unix_fill_in_inode(tmp_inode, 720 cifs_unix_basic_to_fattr(&fattr,
998 (FILE_UNIX_INFO *)pfindEntry, 721 &((FILE_UNIX_INFO *) pfindEntry)->basic,
999 &obj_type, rc); 722 cifs_sb);
1000 else if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) 723 else if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD)
1001 fill_in_inode(tmp_inode, 0 /* old level 1 buffer type */, 724 cifs_std_info_to_fattr(&fattr, (FIND_FILE_STANDARD_INFO *)
1002 pfindEntry, &obj_type, rc); 725 pfindEntry, cifs_sb);
1003 else 726 else
1004 fill_in_inode(tmp_inode, 1 /* NT */, pfindEntry, &obj_type, rc); 727 cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *)
728 pfindEntry, cifs_sb);
1005 729
1006 if (rc) /* new inode - needs to be tied to dentry */ { 730 /* FIXME: make _to_fattr functions fill this out */
1007 d_instantiate(tmp_dentry, tmp_inode); 731 if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
1008 if (rc == 2) 732 fattr.cf_uniqueid = inum;
1009 d_rehash(tmp_dentry); 733 else
1010 } 734 fattr.cf_uniqueid = iunique(sb, ROOT_I);
1011 735
736 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
737 tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
1012 738
1013 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, 739 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos,
1014 tmp_inode->i_ino, obj_type); 740 ino, fattr.cf_dtype);
741
742 /*
743 * we can not return filldir errors to the caller since they are
744 * "normal" when the stat blocksize is too small - we return remapped
745 * error instead
746 *
747 * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above
748 * case already. Why should we be clobbering other errors from it?
749 */
1015 if (rc) { 750 if (rc) {
1016 cFYI(1, ("filldir rc = %d", rc)); 751 cFYI(1, ("filldir rc = %d", rc));
1017 /* we can not return filldir errors to the caller
1018 since they are "normal" when the stat blocksize
1019 is too small - we return remapped error instead */
1020 rc = -EOVERFLOW; 752 rc = -EOVERFLOW;
1021 } 753 }
1022
1023 dput(tmp_dentry); 754 dput(tmp_dentry);
1024 return rc; 755 return rc;
1025} 756}
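The mode and d_type selection in cifs_fill_common_info() earlier in this file reduces to a small pure function of the DOS attributes, so it is easy to check in isolation. A sketch under stated assumptions: the ATTR_* values are the standard SMB attribute bits, and the 0755/0644 modes stand in for the mount's mnt_dir_mode/mnt_file_mode:

#include <stdint.h>
#include <stdio.h>
#include <sys/stat.h>
#include <dirent.h>

#define ATTR_READONLY  0x0001
#define ATTR_SYSTEM    0x0004
#define ATTR_DIRECTORY 0x0010

/* Mirror of the mode/dtype decision in cifs_fill_common_info(): directories
 * become DT_DIR, read-only files lose write bits, and with SFU emulation a
 * zero-length ATTR_SYSTEM file is treated as a FIFO. Non-empty system files
 * would be left DT_REG and marked for revalidation. */
static unsigned char dos_attrs_to_dtype(uint32_t attrs, uint64_t eof,
                                        int sfu_emul, mode_t *mode)
{
        if (attrs & ATTR_DIRECTORY) {
                *mode = S_IFDIR | 0755;
                return DT_DIR;
        }
        *mode = S_IFREG | 0644;
        if (attrs & ATTR_READONLY)
                *mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH);
        if (sfu_emul && (attrs & ATTR_SYSTEM) && eof == 0) {
                *mode = (*mode & ~S_IFMT) | S_IFIFO;
                return DT_FIFO;
        }
        return DT_REG;
}

int main(void)
{
        mode_t m;

        printf("%d\n", dos_attrs_to_dtype(ATTR_SYSTEM, 0, 1, &m));
        return 0; /* prints 1, i.e. DT_FIFO */
}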
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 897a052270f9..7085a6275c4c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -802,7 +802,7 @@ ssetup_ntlmssp_authenticate:
802#endif /* CONFIG_CIFS_UPCALL */ 802#endif /* CONFIG_CIFS_UPCALL */
803 } else { 803 } else {
804#ifdef CONFIG_CIFS_EXPERIMENTAL 804#ifdef CONFIG_CIFS_EXPERIMENTAL
805 if ((experimEnabled > 1) && (type == RawNTLMSSP)) { 805 if (type == RawNTLMSSP) {
806 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { 806 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
807 cERROR(1, ("NTLMSSP requires Unicode support")); 807 cERROR(1, ("NTLMSSP requires Unicode support"));
808 rc = -ENOSYS; 808 rc = -ENOSYS;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index e9527eedc639..a75afa3dd9e1 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -64,8 +64,9 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
64 64
65 full_path = build_path_from_dentry(direntry); 65 full_path = build_path_from_dentry(direntry);
66 if (full_path == NULL) { 66 if (full_path == NULL) {
67 rc = -ENOMEM;
67 FreeXid(xid); 68 FreeXid(xid);
68 return -ENOMEM; 69 return rc;
69 } 70 }
70 if (ea_name == NULL) { 71 if (ea_name == NULL) {
71 cFYI(1, ("Null xattr names not supported")); 72 cFYI(1, ("Null xattr names not supported"));
@@ -118,8 +119,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
118 119
119 full_path = build_path_from_dentry(direntry); 120 full_path = build_path_from_dentry(direntry);
120 if (full_path == NULL) { 121 if (full_path == NULL) {
122 rc = -ENOMEM;
121 FreeXid(xid); 123 FreeXid(xid);
122 return -ENOMEM; 124 return rc;
123 } 125 }
124 /* return dos attributes as pseudo xattr */ 126 /* return dos attributes as pseudo xattr */
125 /* return alt name if available as pseudo attr */ 127 /* return alt name if available as pseudo attr */
@@ -225,8 +227,9 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
225 227
226 full_path = build_path_from_dentry(direntry); 228 full_path = build_path_from_dentry(direntry);
227 if (full_path == NULL) { 229 if (full_path == NULL) {
230 rc = -ENOMEM;
228 FreeXid(xid); 231 FreeXid(xid);
229 return -ENOMEM; 232 return rc;
230 } 233 }
231 /* return dos attributes as pseudo xattr */ 234 /* return dos attributes as pseudo xattr */
232 /* return alt name if available as pseudo attr */ 235 /* return alt name if available as pseudo attr */
@@ -351,8 +354,9 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
351 354
352 full_path = build_path_from_dentry(direntry); 355 full_path = build_path_from_dentry(direntry);
353 if (full_path == NULL) { 356 if (full_path == NULL) {
357 rc = -ENOMEM;
354 FreeXid(xid); 358 FreeXid(xid);
355 return -ENOMEM; 359 return rc;
356 } 360 }
357 /* return dos attributes as pseudo xattr */ 361 /* return dos attributes as pseudo xattr */
358 /* return alt name if available as pseudo attr */ 362 /* return alt name if available as pseudo attr */
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 6a347fbc998a..ffd42815fda1 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -47,6 +47,8 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
47 struct pipe_inode_info *pipe, size_t count, 47 struct pipe_inode_info *pipe, size_t count,
48 unsigned int flags) 48 unsigned int flags)
49{ 49{
50 ssize_t (*splice_read)(struct file *, loff_t *,
51 struct pipe_inode_info *, size_t, unsigned int);
50 struct coda_file_info *cfi; 52 struct coda_file_info *cfi;
51 struct file *host_file; 53 struct file *host_file;
52 54
@@ -54,10 +56,11 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
54 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 56 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
55 host_file = cfi->cfi_container; 57 host_file = cfi->cfi_container;
56 58
57 if (!host_file->f_op || !host_file->f_op->splice_read) 59 splice_read = host_file->f_op->splice_read;
58 return -EINVAL; 60 if (!splice_read)
61 splice_read = default_file_splice_read;
59 62
60 return host_file->f_op->splice_read(host_file, ppos, pipe, count,flags); 63 return splice_read(host_file, ppos, pipe, count, flags);
61} 64}
62 65
63static ssize_t 66static ssize_t
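The coda change above swaps a hard -EINVAL for a fallback: when the host file's ->splice_read is unset, the generic default_file_splice_read is used instead of failing. The same shape in miniature, with all names illustrative rather than taken from the kernel:

#include <stdio.h>

typedef int (*splice_fn)(const char *what);

static int default_splice_read(const char *what)
{
        printf("generic splice of %s\n", what);
        return 0;
}

struct file_ops {
        splice_fn splice_read;
};

static int do_splice(const struct file_ops *ops, const char *what)
{
        splice_fn fn = ops->splice_read;

        if (!fn)
                fn = default_splice_read; /* fall back instead of -EINVAL */
        return fn(what);
}

int main(void)
{
        struct file_ops none = { 0 };

        return do_splice(&none, "host_file");
}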
diff --git a/fs/compat.c b/fs/compat.c
index 681ed81e6be0..94502dab972a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -32,7 +32,6 @@
32#include <linux/smb_mount.h> 32#include <linux/smb_mount.h>
33#include <linux/ncp_mount.h> 33#include <linux/ncp_mount.h>
34#include <linux/nfs4_mount.h> 34#include <linux/nfs4_mount.h>
35#include <linux/smp_lock.h>
36#include <linux/syscalls.h> 35#include <linux/syscalls.h>
37#include <linux/ctype.h> 36#include <linux/ctype.h>
38#include <linux/module.h> 37#include <linux/module.h>
@@ -471,7 +470,7 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
471 ret = sys_fcntl(fd, cmd, (unsigned long)&f); 470 ret = sys_fcntl(fd, cmd, (unsigned long)&f);
472 set_fs(old_fs); 471 set_fs(old_fs);
473 if (cmd == F_GETLK && ret == 0) { 472 if (cmd == F_GETLK && ret == 0) {
474 /* GETLK was successfule and we need to return the data... 473 /* GETLK was successful and we need to return the data...
475 * but it needs to fit in the compat structure. 474 * but it needs to fit in the compat structure.
476 * l_start shouldn't be too big, unless the original 475 * l_start shouldn't be too big, unless the original
477 * start + end is greater than COMPAT_OFF_T_MAX, in which 476 * start + end is greater than COMPAT_OFF_T_MAX, in which
@@ -812,10 +811,8 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
812 } 811 }
813 } 812 }
814 813
815 lock_kernel();
816 retval = do_mount((char*)dev_page, dir_page, (char*)type_page, 814 retval = do_mount((char*)dev_page, dir_page, (char*)type_page,
817 flags, (void*)data_page); 815 flags, (void*)data_page);
818 unlock_kernel();
819 816
820 out4: 817 out4:
821 free_page(data_page); 818 free_page(data_page);
@@ -1488,8 +1485,8 @@ int compat_do_execve(char * filename,
1488 if (!bprm) 1485 if (!bprm)
1489 goto out_files; 1486 goto out_files;
1490 1487
1491 retval = mutex_lock_interruptible(&current->cred_exec_mutex); 1488 retval = -ERESTARTNOINTR;
1492 if (retval < 0) 1489 if (mutex_lock_interruptible(&current->cred_guard_mutex))
1493 goto out_free; 1490 goto out_free;
1494 current->in_execve = 1; 1491 current->in_execve = 1;
1495 1492
@@ -1550,7 +1547,7 @@ int compat_do_execve(char * filename,
1550 /* execve succeeded */ 1547 /* execve succeeded */
1551 current->fs->in_exec = 0; 1548 current->fs->in_exec = 0;
1552 current->in_execve = 0; 1549 current->in_execve = 0;
1553 mutex_unlock(&current->cred_exec_mutex); 1550 mutex_unlock(&current->cred_guard_mutex);
1554 acct_update_integrals(current); 1551 acct_update_integrals(current);
1555 free_bprm(bprm); 1552 free_bprm(bprm);
1556 if (displaced) 1553 if (displaced)
@@ -1573,7 +1570,7 @@ out_unmark:
1573 1570
1574out_unlock: 1571out_unlock:
1575 current->in_execve = 0; 1572 current->in_execve = 0;
1576 mutex_unlock(&current->cred_exec_mutex); 1573 mutex_unlock(&current->cred_guard_mutex);
1577 1574
1578out_free: 1575out_free:
1579 free_bprm(bprm); 1576 free_bprm(bprm);
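
Two independent changes in fs/compat.c above: compat_sys_mount() no
longer wraps do_mount() in the big kernel lock, and compat_do_execve()
follows the cred_exec_mutex -> cred_guard_mutex rename. The error
handling changes with it: mutex_lock_interruptible() would yield
-EINTR, but execve wants an interrupted attempt restarted transparently
once the signal handler returns, hence the preset -ERESTARTNOINTR.
The idiom in isolation (lock_or_restart() is a hypothetical helper):

static long lock_or_restart(struct mutex *m)
{
	/* -ERESTARTNOINTR makes signal delivery restart the syscall
	 * instead of surfacing an error to userspace */
	if (mutex_lock_interruptible(m))
		return -ERESTARTNOINTR;
	return 0;
}
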
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index b83f6bcfa51a..f91fd51b32e3 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -19,6 +19,7 @@
19#include <linux/compiler.h> 19#include <linux/compiler.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/smp_lock.h>
22#include <linux/ioctl.h> 23#include <linux/ioctl.h>
23#include <linux/if.h> 24#include <linux/if.h>
24#include <linux/if_bridge.h> 25#include <linux/if_bridge.h>
@@ -31,6 +32,7 @@
31#include <linux/skbuff.h> 32#include <linux/skbuff.h>
32#include <linux/netlink.h> 33#include <linux/netlink.h>
33#include <linux/vt.h> 34#include <linux/vt.h>
35#include <linux/falloc.h>
34#include <linux/fs.h> 36#include <linux/fs.h>
35#include <linux/file.h> 37#include <linux/file.h>
36#include <linux/ppp_defs.h> 38#include <linux/ppp_defs.h>
@@ -94,7 +96,6 @@
94#include <linux/atm_tcp.h> 96#include <linux/atm_tcp.h>
95#include <linux/sonet.h> 97#include <linux/sonet.h>
96#include <linux/atm_suni.h> 98#include <linux/atm_suni.h>
97#include <linux/mtd/mtd.h>
98 99
99#include <linux/usb.h> 100#include <linux/usb.h>
100#include <linux/usbdevice_fs.h> 101#include <linux/usbdevice_fs.h>
@@ -788,12 +789,6 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
788 if (put_user(compat_ptr(data), &sgio->usr_ptr)) 789 if (put_user(compat_ptr(data), &sgio->usr_ptr))
789 return -EFAULT; 790 return -EFAULT;
790 791
791 if (copy_in_user(&sgio->status, &sgio32->status,
792 (4 * sizeof(unsigned char)) +
793 (2 * sizeof(unsigned short)) +
794 (3 * sizeof(int))))
795 return -EFAULT;
796
797 err = sys_ioctl(fd, cmd, (unsigned long) sgio); 792 err = sys_ioctl(fd, cmd, (unsigned long) sgio);
798 793
799 if (err >= 0) { 794 if (err >= 0) {
@@ -1411,46 +1406,6 @@ static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg)
1411#define HIDPGETCONNLIST _IOR('H', 210, int) 1406#define HIDPGETCONNLIST _IOR('H', 210, int)
1412#define HIDPGETCONNINFO _IOR('H', 211, int) 1407#define HIDPGETCONNINFO _IOR('H', 211, int)
1413 1408
1414struct mtd_oob_buf32 {
1415 u_int32_t start;
1416 u_int32_t length;
1417 compat_caddr_t ptr; /* unsigned char* */
1418};
1419
1420#define MEMWRITEOOB32 _IOWR('M',3,struct mtd_oob_buf32)
1421#define MEMREADOOB32 _IOWR('M',4,struct mtd_oob_buf32)
1422
1423static int mtd_rw_oob(unsigned int fd, unsigned int cmd, unsigned long arg)
1424{
1425 struct mtd_oob_buf __user *buf = compat_alloc_user_space(sizeof(*buf));
1426 struct mtd_oob_buf32 __user *buf32 = compat_ptr(arg);
1427 u32 data;
1428 char __user *datap;
1429 unsigned int real_cmd;
1430 int err;
1431
1432 real_cmd = (cmd == MEMREADOOB32) ?
1433 MEMREADOOB : MEMWRITEOOB;
1434
1435 if (copy_in_user(&buf->start, &buf32->start,
1436 2 * sizeof(u32)) ||
1437 get_user(data, &buf32->ptr))
1438 return -EFAULT;
1439 datap = compat_ptr(data);
1440 if (put_user(datap, &buf->ptr))
1441 return -EFAULT;
1442
1443 err = sys_ioctl(fd, real_cmd, (unsigned long) buf);
1444
1445 if (!err) {
1446 if (copy_in_user(&buf32->start, &buf->start,
1447 2 * sizeof(u32)))
1448 err = -EFAULT;
1449 }
1450
1451 return err;
1452}
1453
1454#ifdef CONFIG_BLOCK 1409#ifdef CONFIG_BLOCK
1455struct raw32_config_request 1410struct raw32_config_request
1456{ 1411{
@@ -1765,7 +1720,7 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a
1765 1720
1766/* Since old style bridge ioctl's endup using SIOCDEVPRIVATE 1721/* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
1767 * for some operations; this forces use of the newer bridge-utils that 1722 * for some operations; this forces use of the newer bridge-utils that
1768 * use compatiable ioctls 1723 * use compatible ioctls
1769 */ 1724 */
1770static int old_bridge_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) 1725static int old_bridge_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
1771{ 1726{
@@ -1826,6 +1781,41 @@ lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
1826 return sys_ioctl(fd, cmd, (unsigned long)tn); 1781 return sys_ioctl(fd, cmd, (unsigned long)tn);
1827} 1782}
1828 1783
1784/* on ia32 l_start is on a 32-bit boundary */
1785#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
1786struct space_resv_32 {
1787 __s16 l_type;
1788 __s16 l_whence;
1789 __s64 l_start __attribute__((packed));
1790 /* len == 0 means until end of file */
1791 __s64 l_len __attribute__((packed));
1792 __s32 l_sysid;
1793 __u32 l_pid;
1794 __s32 l_pad[4]; /* reserve area */
1795};
1796
1797#define FS_IOC_RESVSP_32 _IOW ('X', 40, struct space_resv_32)
1798#define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32)
1799
1800/* just account for different alignment */
1801static int compat_ioctl_preallocate(struct file *file, unsigned long arg)
1802{
1803 struct space_resv_32 __user *p32 = (void __user *)arg;
1804 struct space_resv __user *p = compat_alloc_user_space(sizeof(*p));
1805
1806 if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) ||
1807 copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) ||
1808 copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) ||
1809 copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) ||
1810 copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) ||
1811 copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) ||
1812 copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32)))
1813 return -EFAULT;
1814
1815 return ioctl_preallocate(file, p);
1816}
1817#endif
1818
1829 1819
1830typedef int (*ioctl_trans_handler_t)(unsigned int, unsigned int, 1820typedef int (*ioctl_trans_handler_t)(unsigned int, unsigned int,
1831 unsigned long, struct file *); 1821 unsigned long, struct file *);
@@ -1915,6 +1905,7 @@ COMPATIBLE_IOCTL(FIONCLEX)
1915COMPATIBLE_IOCTL(FIOASYNC) 1905COMPATIBLE_IOCTL(FIOASYNC)
1916COMPATIBLE_IOCTL(FIONBIO) 1906COMPATIBLE_IOCTL(FIONBIO)
1917COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ 1907COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */
1908COMPATIBLE_IOCTL(FS_IOC_FIEMAP)
1918/* 0x00 */ 1909/* 0x00 */
1919COMPATIBLE_IOCTL(FIBMAP) 1910COMPATIBLE_IOCTL(FIBMAP)
1920COMPATIBLE_IOCTL(FIGETBSZ) 1911COMPATIBLE_IOCTL(FIGETBSZ)
@@ -2432,15 +2423,6 @@ COMPATIBLE_IOCTL(USBDEVFS_SUBMITURB32)
2432COMPATIBLE_IOCTL(USBDEVFS_REAPURB32) 2423COMPATIBLE_IOCTL(USBDEVFS_REAPURB32)
2433COMPATIBLE_IOCTL(USBDEVFS_REAPURBNDELAY32) 2424COMPATIBLE_IOCTL(USBDEVFS_REAPURBNDELAY32)
2434COMPATIBLE_IOCTL(USBDEVFS_CLEAR_HALT) 2425COMPATIBLE_IOCTL(USBDEVFS_CLEAR_HALT)
2435/* MTD */
2436COMPATIBLE_IOCTL(MEMGETINFO)
2437COMPATIBLE_IOCTL(MEMERASE)
2438COMPATIBLE_IOCTL(MEMLOCK)
2439COMPATIBLE_IOCTL(MEMUNLOCK)
2440COMPATIBLE_IOCTL(MEMGETREGIONCOUNT)
2441COMPATIBLE_IOCTL(MEMGETREGIONINFO)
2442COMPATIBLE_IOCTL(MEMGETBADBLOCK)
2443COMPATIBLE_IOCTL(MEMSETBADBLOCK)
2444/* NBD */ 2426/* NBD */
2445ULONG_IOCTL(NBD_SET_SOCK) 2427ULONG_IOCTL(NBD_SET_SOCK)
2446ULONG_IOCTL(NBD_SET_BLKSIZE) 2428ULONG_IOCTL(NBD_SET_BLKSIZE)
@@ -2550,8 +2532,6 @@ COMPATIBLE_IOCTL(JSIOCGBUTTONS)
2550COMPATIBLE_IOCTL(JSIOCGNAME(0)) 2532COMPATIBLE_IOCTL(JSIOCGNAME(0))
2551 2533
2552/* now things that need handlers */ 2534/* now things that need handlers */
2553HANDLE_IOCTL(MEMREADOOB32, mtd_rw_oob)
2554HANDLE_IOCTL(MEMWRITEOOB32, mtd_rw_oob)
2555#ifdef CONFIG_NET 2535#ifdef CONFIG_NET
2556HANDLE_IOCTL(SIOCGIFNAME, dev_ifname32) 2536HANDLE_IOCTL(SIOCGIFNAME, dev_ifname32)
2557HANDLE_IOCTL(SIOCGIFCONF, dev_ifconf) 2537HANDLE_IOCTL(SIOCGIFCONF, dev_ifconf)
@@ -2814,6 +2794,18 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
2814 case FIOQSIZE: 2794 case FIOQSIZE:
2815 break; 2795 break;
2816 2796
2797#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
2798 case FS_IOC_RESVSP_32:
2799 case FS_IOC_RESVSP64_32:
2800 error = compat_ioctl_preallocate(filp, arg);
2801 goto out_fput;
2802#else
2803 case FS_IOC_RESVSP:
2804 case FS_IOC_RESVSP64:
2805 error = ioctl_preallocate(filp, (void __user *)arg);
2806 goto out_fput;
2807#endif
2808
2817 case FIBMAP: 2809 case FIBMAP:
2818 case FIGETBSZ: 2810 case FIGETBSZ:
2819 case FIONREAD: 2811 case FIONREAD:
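
The main addition to fs/compat_ioctl.c handles preallocation ioctls
for 32-bit callers. On ia32 a __s64 struct member is aligned to 4
bytes, so l_start in the compat layout sits at offset 4, while the
native x86-64/ia64 layout aligns it to offset 8; hence the packed
attributes in space_resv_32, the distinct FS_IOC_RESVSP*_32 numbers,
and the field-by-field copy in compat_ioctl_preallocate(). (The MTD
entries removed above presumably migrate into the MTD driver itself;
the hunk does not show their new home.) The alignment claim can be
checked with plain userspace C:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct resv_native {		/* 64-bit ABI: 8-byte alignment */
	int16_t l_type;
	int16_t l_whence;
	int64_t l_start;
};

struct resv_ia32 {		/* ia32 ABI: 4-byte alignment */
	int16_t l_type;
	int16_t l_whence;
	int64_t l_start __attribute__((packed));
};

int main(void)
{
	/* prints 8 vs 4 on x86-64, which is why the compat handler
	 * must copy the structure one field at a time */
	printf("native l_start at %zu, ia32 l_start at %zu\n",
	       offsetof(struct resv_native, l_start),
	       offsetof(struct resv_ia32, l_start));
	return 0;
}
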
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 762d287123ca..da6061a6df40 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -39,6 +39,9 @@ struct configfs_dirent {
39 umode_t s_mode; 39 umode_t s_mode;
40 struct dentry * s_dentry; 40 struct dentry * s_dentry;
41 struct iattr * s_iattr; 41 struct iattr * s_iattr;
42#ifdef CONFIG_LOCKDEP
43 int s_depth;
44#endif
42}; 45};
43 46
44#define CONFIGFS_ROOT 0x0001 47#define CONFIGFS_ROOT 0x0001
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 05373db21a4e..8e48b52205aa 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -78,11 +78,97 @@ static const struct dentry_operations configfs_dentry_ops = {
78 .d_delete = configfs_d_delete, 78 .d_delete = configfs_d_delete,
79}; 79};
80 80
81#ifdef CONFIG_LOCKDEP
82
83/*
84 * Helpers to make lockdep happy with our recursive locking of default groups'
85 * inodes (see configfs_attach_group() and configfs_detach_group()).
86 * We put default groups i_mutexes in separate classes according to their depth
87 * from the youngest non-default group ancestor.
88 *
89 * For a non-default group A having default groups A/B, A/C, and A/C/D, default
90 * groups A/B and A/C will have their inode's mutex in class
91 * default_group_class[0], and default group A/C/D will be in
92 * default_group_class[1].
93 *
94 * The lock classes are declared and assigned in inode.c, according to the
95 * s_depth value.
96 * The s_depth value is initialized to -1, adjusted to >= 0 when attaching
97 * default groups, and reset to -1 when all default groups are attached. During
98 * attachment, if configfs_create() sees s_depth > 0, the lock class of the new
99 * inode's mutex is set to default_group_class[s_depth - 1].
100 */
101
102static void configfs_init_dirent_depth(struct configfs_dirent *sd)
103{
104 sd->s_depth = -1;
105}
106
107static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd,
108 struct configfs_dirent *sd)
109{
110 int parent_depth = parent_sd->s_depth;
111
112 if (parent_depth >= 0)
113 sd->s_depth = parent_depth + 1;
114}
115
116static void
117configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd)
118{
119 /*
120 * item's i_mutex class is already setup, so s_depth is now only
121 * used to set new sub-directories s_depth, which is always done
122 * with item's i_mutex locked.
123 */
124 /*
125 * sd->s_depth == -1 iff we are a non default group.
126 * else (we are a default group) sd->s_depth > 0 (see
127 * create_dir()).
128 */
129 if (sd->s_depth == -1)
130 /*
131 * We are a non default group and we are going to create
132 * default groups.
133 */
134 sd->s_depth = 0;
135}
136
137static void
138configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
139{
140 /* We will not create default groups anymore. */
141 sd->s_depth = -1;
142}
143
144#else /* CONFIG_LOCKDEP */
145
146static void configfs_init_dirent_depth(struct configfs_dirent *sd)
147{
148}
149
150static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd,
151 struct configfs_dirent *sd)
152{
153}
154
155static void
156configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd)
157{
158}
159
160static void
161configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
162{
163}
164
165#endif /* CONFIG_LOCKDEP */
166
81/* 167/*
82 * Allocates a new configfs_dirent and links it to the parent configfs_dirent 168 * Allocates a new configfs_dirent and links it to the parent configfs_dirent
83 */ 169 */
84static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * parent_sd, 170static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd,
85 void * element) 171 void *element, int type)
86{ 172{
87 struct configfs_dirent * sd; 173 struct configfs_dirent * sd;
88 174
@@ -94,6 +180,8 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare
94 INIT_LIST_HEAD(&sd->s_links); 180 INIT_LIST_HEAD(&sd->s_links);
95 INIT_LIST_HEAD(&sd->s_children); 181 INIT_LIST_HEAD(&sd->s_children);
96 sd->s_element = element; 182 sd->s_element = element;
183 sd->s_type = type;
184 configfs_init_dirent_depth(sd);
97 spin_lock(&configfs_dirent_lock); 185 spin_lock(&configfs_dirent_lock);
98 if (parent_sd->s_type & CONFIGFS_USET_DROPPING) { 186 if (parent_sd->s_type & CONFIGFS_USET_DROPPING) {
99 spin_unlock(&configfs_dirent_lock); 187 spin_unlock(&configfs_dirent_lock);
@@ -138,12 +226,11 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
138{ 226{
139 struct configfs_dirent * sd; 227 struct configfs_dirent * sd;
140 228
141 sd = configfs_new_dirent(parent_sd, element); 229 sd = configfs_new_dirent(parent_sd, element, type);
142 if (IS_ERR(sd)) 230 if (IS_ERR(sd))
143 return PTR_ERR(sd); 231 return PTR_ERR(sd);
144 232
145 sd->s_mode = mode; 233 sd->s_mode = mode;
146 sd->s_type = type;
147 sd->s_dentry = dentry; 234 sd->s_dentry = dentry;
148 if (dentry) { 235 if (dentry) {
149 dentry->d_fsdata = configfs_get(sd); 236 dentry->d_fsdata = configfs_get(sd);
@@ -187,6 +274,7 @@ static int create_dir(struct config_item * k, struct dentry * p,
187 error = configfs_make_dirent(p->d_fsdata, d, k, mode, 274 error = configfs_make_dirent(p->d_fsdata, d, k, mode,
188 CONFIGFS_DIR | CONFIGFS_USET_CREATING); 275 CONFIGFS_DIR | CONFIGFS_USET_CREATING);
189 if (!error) { 276 if (!error) {
277 configfs_set_dir_dirent_depth(p->d_fsdata, d->d_fsdata);
190 error = configfs_create(d, mode, init_dir); 278 error = configfs_create(d, mode, init_dir);
191 if (!error) { 279 if (!error) {
192 inc_nlink(p->d_inode); 280 inc_nlink(p->d_inode);
@@ -789,11 +877,13 @@ static int configfs_attach_group(struct config_item *parent_item,
789 * error, as rmdir() would. 877 * error, as rmdir() would.
790 */ 878 */
791 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 879 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
880 configfs_adjust_dir_dirent_depth_before_populate(sd);
792 ret = populate_groups(to_config_group(item)); 881 ret = populate_groups(to_config_group(item));
793 if (ret) { 882 if (ret) {
794 configfs_detach_item(item); 883 configfs_detach_item(item);
795 dentry->d_inode->i_flags |= S_DEAD; 884 dentry->d_inode->i_flags |= S_DEAD;
796 } 885 }
886 configfs_adjust_dir_dirent_depth_after_populate(sd);
797 mutex_unlock(&dentry->d_inode->i_mutex); 887 mutex_unlock(&dentry->d_inode->i_mutex);
798 if (ret) 888 if (ret)
799 d_delete(dentry); 889 d_delete(dentry);
@@ -916,11 +1006,11 @@ static int configfs_dump(struct configfs_dirent *sd, int level)
916 * Note, btw, that this can be called at *any* time, even when a configfs 1006 * Note, btw, that this can be called at *any* time, even when a configfs
917 * subsystem isn't registered, or when configfs is loading or unloading. 1007 * subsystem isn't registered, or when configfs is loading or unloading.
918 * Just like configfs_register_subsystem(). So we take the same 1008 * Just like configfs_register_subsystem(). So we take the same
919 * precautions. We pin the filesystem. We lock each i_mutex _in_order_ 1009 * precautions. We pin the filesystem. We lock configfs_dirent_lock.
920 * on our way down the tree. If we can find the target item in the 1010 * If we can find the target item in the
921 * configfs tree, it must be part of the subsystem tree as well, so we 1011 * configfs tree, it must be part of the subsystem tree as well, so we
922 * do not need the subsystem semaphore. Holding the i_mutex chain locks 1012 * do not need the subsystem semaphore. Holding configfs_dirent_lock helps
923 * out mkdir() and rmdir(), who might be racing us. 1013 * locking out mkdir() and rmdir(), who might be racing us.
924 */ 1014 */
925 1015
926/* 1016/*
@@ -933,17 +1023,21 @@ static int configfs_dump(struct configfs_dirent *sd, int level)
933 * do that so we can unlock it if we find nothing. 1023 * do that so we can unlock it if we find nothing.
934 * 1024 *
935 * Here we do a depth-first search of the dentry hierarchy looking for 1025 * Here we do a depth-first search of the dentry hierarchy looking for
936 * our object. We take i_mutex on each step of the way down. IT IS 1026 * our object.
937 * ESSENTIAL THAT i_mutex LOCKING IS ORDERED. If we come back up a branch, 1027 * We deliberately ignore items tagged as dropping since they are virtually
938 * we'll drop the i_mutex. 1028 * dead, as well as items in the middle of attachment since they virtually
1029 * do not exist yet. This completes the locking out of racing mkdir() and
1030 * rmdir().
1031 * Note: subdirectories in the middle of attachment start with s_type =
1032 * CONFIGFS_DIR|CONFIGFS_USET_CREATING set by create_dir(). When
1033 * CONFIGFS_USET_CREATING is set, we ignore the item. The actual set of
1034 * s_type is in configfs_new_dirent(), which has configfs_dirent_lock.
939 * 1035 *
940 * If the target is not found, -ENOENT is bubbled up and we have released 1036 * If the target is not found, -ENOENT is bubbled up.
941 * all locks. If the target was found, the locks will be cleared by
942 * configfs_depend_rollback().
943 * 1037 *
944 * This adds a requirement that all config_items be unique! 1038 * This adds a requirement that all config_items be unique!
945 * 1039 *
946 * This is recursive because the locking traversal is tricky. There isn't 1040 * This is recursive. There isn't
947 * much on the stack, though, so folks that need this function - be careful 1041 * much on the stack, though, so folks that need this function - be careful
948 * about your stack! Patches will be accepted to make it iterative. 1042 * about your stack! Patches will be accepted to make it iterative.
949 */ 1043 */
@@ -955,13 +1049,13 @@ static int configfs_depend_prep(struct dentry *origin,
955 1049
956 BUG_ON(!origin || !sd); 1050 BUG_ON(!origin || !sd);
957 1051
958 /* Lock this guy on the way down */
959 mutex_lock(&sd->s_dentry->d_inode->i_mutex);
960 if (sd->s_element == target) /* Boo-yah */ 1052 if (sd->s_element == target) /* Boo-yah */
961 goto out; 1053 goto out;
962 1054
963 list_for_each_entry(child_sd, &sd->s_children, s_sibling) { 1055 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
964 if (child_sd->s_type & CONFIGFS_DIR) { 1056 if ((child_sd->s_type & CONFIGFS_DIR) &&
1057 !(child_sd->s_type & CONFIGFS_USET_DROPPING) &&
1058 !(child_sd->s_type & CONFIGFS_USET_CREATING)) {
965 ret = configfs_depend_prep(child_sd->s_dentry, 1059 ret = configfs_depend_prep(child_sd->s_dentry,
966 target); 1060 target);
967 if (!ret) 1061 if (!ret)
@@ -970,33 +1064,12 @@ static int configfs_depend_prep(struct dentry *origin,
970 } 1064 }
971 1065
972 /* We looped all our children and didn't find target */ 1066 /* We looped all our children and didn't find target */
973 mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
974 ret = -ENOENT; 1067 ret = -ENOENT;
975 1068
976out: 1069out:
977 return ret; 1070 return ret;
978} 1071}
979 1072
980/*
981 * This is ONLY called if configfs_depend_prep() did its job. So we can
982 * trust the entire path from item back up to origin.
983 *
984 * We walk backwards from item, unlocking each i_mutex. We finish by
985 * unlocking origin.
986 */
987static void configfs_depend_rollback(struct dentry *origin,
988 struct config_item *item)
989{
990 struct dentry *dentry = item->ci_dentry;
991
992 while (dentry != origin) {
993 mutex_unlock(&dentry->d_inode->i_mutex);
994 dentry = dentry->d_parent;
995 }
996
997 mutex_unlock(&origin->d_inode->i_mutex);
998}
999
1000int configfs_depend_item(struct configfs_subsystem *subsys, 1073int configfs_depend_item(struct configfs_subsystem *subsys,
1001 struct config_item *target) 1074 struct config_item *target)
1002{ 1075{
@@ -1037,17 +1110,21 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
1037 1110
1038 /* Ok, now we can trust subsys/s_item */ 1111 /* Ok, now we can trust subsys/s_item */
1039 1112
1040 /* Scan the tree, locking i_mutex recursively, return 0 if found */ 1113 spin_lock(&configfs_dirent_lock);
1114 /* Scan the tree, return 0 if found */
1041 ret = configfs_depend_prep(subsys_sd->s_dentry, target); 1115 ret = configfs_depend_prep(subsys_sd->s_dentry, target);
1042 if (ret) 1116 if (ret)
1043 goto out_unlock_fs; 1117 goto out_unlock_dirent_lock;
1044 1118
1045 /* We hold all i_mutexes from the subsystem down to the target */ 1119 /*
1120 * We are sure that the item is not about to be removed by rmdir(), and
1121 * not in the middle of attachment by mkdir().
1122 */
1046 p = target->ci_dentry->d_fsdata; 1123 p = target->ci_dentry->d_fsdata;
1047 p->s_dependent_count += 1; 1124 p->s_dependent_count += 1;
1048 1125
1049 configfs_depend_rollback(subsys_sd->s_dentry, target); 1126out_unlock_dirent_lock:
1050 1127 spin_unlock(&configfs_dirent_lock);
1051out_unlock_fs: 1128out_unlock_fs:
1052 mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); 1129 mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
1053 1130
@@ -1072,10 +1149,10 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
1072 struct configfs_dirent *sd; 1149 struct configfs_dirent *sd;
1073 1150
1074 /* 1151 /*
1075 * Since we can trust everything is pinned, we just need i_mutex 1152 * Since we can trust everything is pinned, we just need
1076 * on the item. 1153 * configfs_dirent_lock.
1077 */ 1154 */
1078 mutex_lock(&target->ci_dentry->d_inode->i_mutex); 1155 spin_lock(&configfs_dirent_lock);
1079 1156
1080 sd = target->ci_dentry->d_fsdata; 1157 sd = target->ci_dentry->d_fsdata;
1081 BUG_ON(sd->s_dependent_count < 1); 1158 BUG_ON(sd->s_dependent_count < 1);
@@ -1086,7 +1163,7 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
1086 * After this unlock, we cannot trust the item to stay alive! 1163 * After this unlock, we cannot trust the item to stay alive!
1087 * DO NOT REFERENCE item after this unlock. 1164 * DO NOT REFERENCE item after this unlock.
1088 */ 1165 */
1089 mutex_unlock(&target->ci_dentry->d_inode->i_mutex); 1166 spin_unlock(&configfs_dirent_lock);
1090} 1167}
1091EXPORT_SYMBOL(configfs_undepend_item); 1168EXPORT_SYMBOL(configfs_undepend_item);
1092 1169
@@ -1286,13 +1363,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1286 if (sd->s_type & CONFIGFS_USET_DEFAULT) 1363 if (sd->s_type & CONFIGFS_USET_DEFAULT)
1287 return -EPERM; 1364 return -EPERM;
1288 1365
1289 /*
1290 * Here's where we check for dependents. We're protected by
1291 * i_mutex.
1292 */
1293 if (sd->s_dependent_count)
1294 return -EBUSY;
1295
1296 /* Get a working ref until we have the child */ 1366 /* Get a working ref until we have the child */
1297 parent_item = configfs_get_config_item(dentry->d_parent); 1367 parent_item = configfs_get_config_item(dentry->d_parent);
1298 subsys = to_config_group(parent_item)->cg_subsys; 1368 subsys = to_config_group(parent_item)->cg_subsys;
@@ -1316,9 +1386,17 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1316 1386
1317 mutex_lock(&configfs_symlink_mutex); 1387 mutex_lock(&configfs_symlink_mutex);
1318 spin_lock(&configfs_dirent_lock); 1388 spin_lock(&configfs_dirent_lock);
1319 ret = configfs_detach_prep(dentry, &wait_mutex); 1389 /*
1320 if (ret) 1390 * Here's where we check for dependents. We're protected by
1321 configfs_detach_rollback(dentry); 1391 * configfs_dirent_lock.
1392 * If no dependent, atomically tag the item as dropping.
1393 */
1394 ret = sd->s_dependent_count ? -EBUSY : 0;
1395 if (!ret) {
1396 ret = configfs_detach_prep(dentry, &wait_mutex);
1397 if (ret)
1398 configfs_detach_rollback(dentry);
1399 }
1322 spin_unlock(&configfs_dirent_lock); 1400 spin_unlock(&configfs_dirent_lock);
1323 mutex_unlock(&configfs_symlink_mutex); 1401 mutex_unlock(&configfs_symlink_mutex);
1324 1402
@@ -1429,7 +1507,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
1429 */ 1507 */
1430 err = -ENOENT; 1508 err = -ENOENT;
1431 if (configfs_dirent_is_ready(parent_sd)) { 1509 if (configfs_dirent_is_ready(parent_sd)) {
1432 file->private_data = configfs_new_dirent(parent_sd, NULL); 1510 file->private_data = configfs_new_dirent(parent_sd, NULL, 0);
1433 if (IS_ERR(file->private_data)) 1511 if (IS_ERR(file->private_data))
1434 err = PTR_ERR(file->private_data); 1512 err = PTR_ERR(file->private_data);
1435 else 1513 else
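
The thread running through the fs/configfs/dir.c changes:
configfs_depend_item() used to take i_mutex on every directory on the
way down the tree, with configfs_depend_rollback() unwinding the
chain, which both bounded the usable depth and required strict lock
ordering. The rewrite scans the configfs_dirent tree under the single
configfs_dirent_lock spinlock and skips entries tagged
CONFIGFS_USET_CREATING or CONFIGFS_USET_DROPPING, which are not yet
(or no longer) fully alive; that is what locks out racing mkdir() and
rmdir() without per-level mutexes. A simplified sketch of the new
search, not the exact kernel code:

/* depth-first search of the dirent tree; caller holds
 * configfs_dirent_lock */
static int find_dep(struct configfs_dirent *sd, void *target)
{
	struct configfs_dirent *child;

	if (sd->s_element == target)
		return 0;

	list_for_each_entry(child, &sd->s_children, s_sibling) {
		if ((child->s_type & CONFIGFS_DIR) &&
		    !(child->s_type & (CONFIGFS_USET_CREATING |
				       CONFIGFS_USET_DROPPING)) &&
		    !find_dep(child, target))
			return 0;	/* found in this subtree */
	}
	return -ENOENT;
}
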
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 5d349d38e056..4921e7426d95 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -33,10 +33,15 @@
33#include <linux/backing-dev.h> 33#include <linux/backing-dev.h>
34#include <linux/capability.h> 34#include <linux/capability.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/lockdep.h>
36 37
37#include <linux/configfs.h> 38#include <linux/configfs.h>
38#include "configfs_internal.h" 39#include "configfs_internal.h"
39 40
41#ifdef CONFIG_LOCKDEP
42static struct lock_class_key default_group_class[MAX_LOCK_DEPTH];
43#endif
44
40extern struct super_block * configfs_sb; 45extern struct super_block * configfs_sb;
41 46
42static const struct address_space_operations configfs_aops = { 47static const struct address_space_operations configfs_aops = {
@@ -150,6 +155,38 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
150 return inode; 155 return inode;
151} 156}
152 157
158#ifdef CONFIG_LOCKDEP
159
160static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
161 struct inode *inode)
162{
163 int depth = sd->s_depth;
164
165 if (depth > 0) {
166 if (depth <= ARRAY_SIZE(default_group_class)) {
167 lockdep_set_class(&inode->i_mutex,
168 &default_group_class[depth - 1]);
169 } else {
170 /*
171 * In practice the maximum level of locking depth is
172 * already reached. Just inform about possible reasons.
173 */
174 printk(KERN_INFO "configfs: Too many levels of inodes"
175 " for the locking correctness validator.\n");
176 printk(KERN_INFO "Spurious warnings may appear.\n");
177 }
178 }
179}
180
181#else /* CONFIG_LOCKDEP */
182
183static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
184 struct inode *inode)
185{
186}
187
188#endif /* CONFIG_LOCKDEP */
189
153int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) 190int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
154{ 191{
155 int error = 0; 192 int error = 0;
@@ -162,6 +199,7 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *
162 struct inode *p_inode = dentry->d_parent->d_inode; 199 struct inode *p_inode = dentry->d_parent->d_inode;
163 p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; 200 p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
164 } 201 }
202 configfs_set_inode_lock_class(sd, inode);
165 goto Proceed; 203 goto Proceed;
166 } 204 }
167 else 205 else
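
Why the lockdep plumbing above is needed: lockdep identifies locks by
class, not by instance, and every configfs i_mutex shares one class,
so the legitimate nested locking of default groups (parent first, then
child) looks to the validator like recursive self-locking. Giving each
nesting depth its own lock_class_key, driven by the s_depth counter
maintained in dir.c, tells lockdep that inodes at different depths are
distinct locks. Reduced to its core (a sketch of the hunk above):

static struct lock_class_key group_class[MAX_LOCK_DEPTH];

static void set_group_lock_class(struct inode *inode, int depth)
{
	/* depth <= 0 is a non-default group; it keeps the common class */
	if (depth > 0 && depth <= MAX_LOCK_DEPTH)
		lockdep_set_class(&inode->i_mutex,
				  &group_class[depth - 1]);
}
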
diff --git a/fs/dcache.c b/fs/dcache.c
index 75659a6fd1f8..9e5cd3c3a6ba 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1910,7 +1910,7 @@ char *__d_path(const struct path *path, struct path *root,
1910 1910
1911 spin_lock(&vfsmount_lock); 1911 spin_lock(&vfsmount_lock);
1912 prepend(&end, &buflen, "\0", 1); 1912 prepend(&end, &buflen, "\0", 1);
1913 if (!IS_ROOT(dentry) && d_unhashed(dentry) && 1913 if (d_unlinked(dentry) &&
1914 (prepend(&end, &buflen, " (deleted)", 10) != 0)) 1914 (prepend(&end, &buflen, " (deleted)", 10) != 0))
1915 goto Elong; 1915 goto Elong;
1916 1916
@@ -2035,7 +2035,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2035 2035
2036 spin_lock(&dcache_lock); 2036 spin_lock(&dcache_lock);
2037 prepend(&end, &buflen, "\0", 1); 2037 prepend(&end, &buflen, "\0", 1);
2038 if (!IS_ROOT(dentry) && d_unhashed(dentry) && 2038 if (d_unlinked(dentry) &&
2039 (prepend(&end, &buflen, "//deleted", 9) != 0)) 2039 (prepend(&end, &buflen, "//deleted", 9) != 0))
2040 goto Elong; 2040 goto Elong;
2041 if (buflen < 1) 2041 if (buflen < 1)
@@ -2097,9 +2097,8 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2097 read_unlock(&current->fs->lock); 2097 read_unlock(&current->fs->lock);
2098 2098
2099 error = -ENOENT; 2099 error = -ENOENT;
2100 /* Has the current directory has been unlinked? */
2101 spin_lock(&dcache_lock); 2100 spin_lock(&dcache_lock);
2102 if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) { 2101 if (!d_unlinked(pwd.dentry)) {
2103 unsigned long len; 2102 unsigned long len;
2104 struct path tmp = root; 2103 struct path tmp = root;
2105 char * cwd; 2104 char * cwd;
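
The dcache hunks substitute a single helper for the open-coded
!IS_ROOT(dentry) && d_unhashed(dentry) test; the getcwd() case is the
same substitution after De Morgan, since IS_ROOT(x) || !d_unhashed(x)
is exactly !d_unlinked(x). Presumably the helper introduced by this
series reads, give or take:

/* presumed shape of the new helper (include/linux/dcache.h) */
static inline int d_unlinked(struct dentry *dentry)
{
	return d_unhashed(dentry) && !IS_ROOT(dentry);
}
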
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 33a90120f6ad..4d74fc72c195 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -67,6 +67,8 @@ static int debugfs_u8_get(void *data, u64 *val)
67 return 0; 67 return 0;
68} 68}
69DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); 69DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
70DEFINE_SIMPLE_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n");
71DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
70 72
71/** 73/**
72 * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value 74 * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value
@@ -95,6 +97,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
95struct dentry *debugfs_create_u8(const char *name, mode_t mode, 97struct dentry *debugfs_create_u8(const char *name, mode_t mode,
96 struct dentry *parent, u8 *value) 98 struct dentry *parent, u8 *value)
97{ 99{
100 /* if there are no write bits set, make read only */
101 if (!(mode & S_IWUGO))
102 return debugfs_create_file(name, mode, parent, value, &fops_u8_ro);
103 /* if there are no read bits set, make write only */
104 if (!(mode & S_IRUGO))
105 return debugfs_create_file(name, mode, parent, value, &fops_u8_wo);
106
98 return debugfs_create_file(name, mode, parent, value, &fops_u8); 107 return debugfs_create_file(name, mode, parent, value, &fops_u8);
99} 108}
100EXPORT_SYMBOL_GPL(debugfs_create_u8); 109EXPORT_SYMBOL_GPL(debugfs_create_u8);
@@ -110,6 +119,8 @@ static int debugfs_u16_get(void *data, u64 *val)
110 return 0; 119 return 0;
111} 120}
112DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); 121DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
122DEFINE_SIMPLE_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n");
123DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
113 124
114/** 125/**
115 * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value 126 * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value
@@ -138,6 +149,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
138struct dentry *debugfs_create_u16(const char *name, mode_t mode, 149struct dentry *debugfs_create_u16(const char *name, mode_t mode,
139 struct dentry *parent, u16 *value) 150 struct dentry *parent, u16 *value)
140{ 151{
152 /* if there are no write bits set, make read only */
153 if (!(mode & S_IWUGO))
154 return debugfs_create_file(name, mode, parent, value, &fops_u16_ro);
155 /* if there are no read bits set, make write only */
156 if (!(mode & S_IRUGO))
157 return debugfs_create_file(name, mode, parent, value, &fops_u16_wo);
158
141 return debugfs_create_file(name, mode, parent, value, &fops_u16); 159 return debugfs_create_file(name, mode, parent, value, &fops_u16);
142} 160}
143EXPORT_SYMBOL_GPL(debugfs_create_u16); 161EXPORT_SYMBOL_GPL(debugfs_create_u16);
@@ -153,6 +171,8 @@ static int debugfs_u32_get(void *data, u64 *val)
153 return 0; 171 return 0;
154} 172}
155DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); 173DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
174DEFINE_SIMPLE_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n");
175DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
156 176
157/** 177/**
158 * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value 178 * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value
@@ -181,6 +201,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
181struct dentry *debugfs_create_u32(const char *name, mode_t mode, 201struct dentry *debugfs_create_u32(const char *name, mode_t mode,
182 struct dentry *parent, u32 *value) 202 struct dentry *parent, u32 *value)
183{ 203{
204 /* if there are no write bits set, make read only */
205 if (!(mode & S_IWUGO))
206 return debugfs_create_file(name, mode, parent, value, &fops_u32_ro);
207 /* if there are no read bits set, make write only */
208 if (!(mode & S_IRUGO))
209 return debugfs_create_file(name, mode, parent, value, &fops_u32_wo);
210
184 return debugfs_create_file(name, mode, parent, value, &fops_u32); 211 return debugfs_create_file(name, mode, parent, value, &fops_u32);
185} 212}
186EXPORT_SYMBOL_GPL(debugfs_create_u32); 213EXPORT_SYMBOL_GPL(debugfs_create_u32);
@@ -197,6 +224,8 @@ static int debugfs_u64_get(void *data, u64 *val)
197 return 0; 224 return 0;
198} 225}
199DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); 226DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
227DEFINE_SIMPLE_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n");
228DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
200 229
201/** 230/**
202 * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value 231 * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value
@@ -225,15 +254,28 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
225struct dentry *debugfs_create_u64(const char *name, mode_t mode, 254struct dentry *debugfs_create_u64(const char *name, mode_t mode,
226 struct dentry *parent, u64 *value) 255 struct dentry *parent, u64 *value)
227{ 256{
257 /* if there are no write bits set, make read only */
258 if (!(mode & S_IWUGO))
259 return debugfs_create_file(name, mode, parent, value, &fops_u64_ro);
260 /* if there are no read bits set, make write only */
261 if (!(mode & S_IRUGO))
262 return debugfs_create_file(name, mode, parent, value, &fops_u64_wo);
263
228 return debugfs_create_file(name, mode, parent, value, &fops_u64); 264 return debugfs_create_file(name, mode, parent, value, &fops_u64);
229} 265}
230EXPORT_SYMBOL_GPL(debugfs_create_u64); 266EXPORT_SYMBOL_GPL(debugfs_create_u64);
231 267
232DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); 268DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
269DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
270DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");
233 271
234DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n"); 272DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n");
273DEFINE_SIMPLE_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n");
274DEFINE_SIMPLE_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n");
235 275
236DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"); 276DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n");
277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
237 279
238/* 280/*
239 * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value 281 * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value
@@ -256,6 +298,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"
256struct dentry *debugfs_create_x8(const char *name, mode_t mode, 298struct dentry *debugfs_create_x8(const char *name, mode_t mode,
257 struct dentry *parent, u8 *value) 299 struct dentry *parent, u8 *value)
258{ 300{
301 /* if there are no write bits set, make read only */
302 if (!(mode & S_IWUGO))
303 return debugfs_create_file(name, mode, parent, value, &fops_x8_ro);
304 /* if there are no read bits set, make write only */
305 if (!(mode & S_IRUGO))
306 return debugfs_create_file(name, mode, parent, value, &fops_x8_wo);
307
259 return debugfs_create_file(name, mode, parent, value, &fops_x8); 308 return debugfs_create_file(name, mode, parent, value, &fops_x8);
260} 309}
261EXPORT_SYMBOL_GPL(debugfs_create_x8); 310EXPORT_SYMBOL_GPL(debugfs_create_x8);
@@ -273,6 +322,13 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
273struct dentry *debugfs_create_x16(const char *name, mode_t mode, 322struct dentry *debugfs_create_x16(const char *name, mode_t mode,
274 struct dentry *parent, u16 *value) 323 struct dentry *parent, u16 *value)
275{ 324{
325 /* if there are no write bits set, make read only */
326 if (!(mode & S_IWUGO))
327 return debugfs_create_file(name, mode, parent, value, &fops_x16_ro);
328 /* if there are no read bits set, make write only */
329 if (!(mode & S_IRUGO))
330 return debugfs_create_file(name, mode, parent, value, &fops_x16_wo);
331
276 return debugfs_create_file(name, mode, parent, value, &fops_x16); 332 return debugfs_create_file(name, mode, parent, value, &fops_x16);
277} 333}
278EXPORT_SYMBOL_GPL(debugfs_create_x16); 334EXPORT_SYMBOL_GPL(debugfs_create_x16);
@@ -290,6 +346,13 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
290struct dentry *debugfs_create_x32(const char *name, mode_t mode, 346struct dentry *debugfs_create_x32(const char *name, mode_t mode,
291 struct dentry *parent, u32 *value) 347 struct dentry *parent, u32 *value)
292{ 348{
349 /* if there are no write bits set, make read only */
350 if (!(mode & S_IWUGO))
351 return debugfs_create_file(name, mode, parent, value, &fops_x32_ro);
352 /* if there are no read bits set, make write only */
353 if (!(mode & S_IRUGO))
354 return debugfs_create_file(name, mode, parent, value, &fops_x32_wo);
355
293 return debugfs_create_file(name, mode, parent, value, &fops_x32); 356 return debugfs_create_file(name, mode, parent, value, &fops_x32);
294} 357}
295EXPORT_SYMBOL_GPL(debugfs_create_x32); 358EXPORT_SYMBOL_GPL(debugfs_create_x32);
@@ -419,7 +482,7 @@ static const struct file_operations fops_blob = {
419}; 482};
420 483
421/** 484/**
422 * debugfs_create_blob - create a debugfs file that is used to read and write a binary blob 485 * debugfs_create_blob - create a debugfs file that is used to read a binary blob
423 * @name: a pointer to a string containing the name of the file to create. 486 * @name: a pointer to a string containing the name of the file to create.
424 * @mode: the permission that the file should have 487 * @mode: the permission that the file should have
425 * @parent: a pointer to the parent dentry for this file. This should be a 488 * @parent: a pointer to the parent dentry for this file. This should be a
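
The debugfs change makes a file's behaviour agree with its mode bits:
DEFINE_SIMPLE_ATTRIBUTE with a NULL get or set callback produces fops
whose missing direction is rejected by the simple_attr helpers, so a
0444 file now refuses writes (even from root) instead of quietly
keeping writable fops behind a read-only mode. Callers need no
changes; an illustrative use, with hypothetical names:

static u32 my_val;

static void my_debugfs_init(struct dentry *dir)
{
	/* 0444: no write bits set, so the new _ro fops are selected
	 * and write() fails rather than updating my_val */
	debugfs_create_u32("my_val", 0444, dir, &my_val);
}
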
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 0662ba6de85a..d22438ef7674 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -403,6 +403,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
403 } 403 }
404 child = list_entry(parent->d_subdirs.next, struct dentry, 404 child = list_entry(parent->d_subdirs.next, struct dentry,
405 d_u.d_child); 405 d_u.d_child);
406 next_sibling:
406 407
407 /* 408 /*
408 * If "child" isn't empty, walk down the tree and 409 * If "child" isn't empty, walk down the tree and
@@ -417,6 +418,16 @@ void debugfs_remove_recursive(struct dentry *dentry)
417 __debugfs_remove(child, parent); 418 __debugfs_remove(child, parent);
418 if (parent->d_subdirs.next == &child->d_u.d_child) { 419 if (parent->d_subdirs.next == &child->d_u.d_child) {
419 /* 420 /*
421 * Try the next sibling.
422 */
423 if (child->d_u.d_child.next != &parent->d_subdirs) {
424 child = list_entry(child->d_u.d_child.next,
425 struct dentry,
426 d_u.d_child);
427 goto next_sibling;
428 }
429
430 /*
420 * Avoid infinite loop if we fail to remove 431 * Avoid infinite loop if we fail to remove
421 * one dentry. 432 * one dentry.
422 */ 433 */
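
Control-flow fix in debugfs_remove_recursive(): when __debugfs_remove()
fails to remove the current child (the parent's d_subdirs head still
points at it), the loop used to give up on the whole subtree to avoid
spinning. The next_sibling label instead advances to the following
entry, so one stuck dentry no longer aborts the rest of the removal.
A toy model of the same idea, not kernel code:

struct node {
	struct node *next;
	int removable;
};

/* walk a sibling list, unlink what we can, and skip -- rather than
 * spin on or bail at -- entries that refuse to go away */
static struct node *remove_all(struct node *head)
{
	struct node **link = &head;

	while (*link) {
		if ((*link)->removable)
			*link = (*link)->next;	/* unlink, stay put */
		else
			link = &(*link)->next;	/* stuck: next sibling */
	}
	return head;
}
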
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c68edb969441..75efb028974b 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -423,7 +423,6 @@ static void devpts_kill_sb(struct super_block *sb)
423} 423}
424 424
425static struct file_system_type devpts_fs_type = { 425static struct file_system_type devpts_fs_type = {
426 .owner = THIS_MODULE,
427 .name = "devpts", 426 .name = "devpts",
428 .get_sb = devpts_get_sb, 427 .get_sb = devpts_get_sb,
429 .kill_sb = devpts_kill_sb, 428 .kill_sb = devpts_kill_sb,
@@ -557,18 +556,11 @@ static int __init init_devpts_fs(void)
557 int err = register_filesystem(&devpts_fs_type); 556 int err = register_filesystem(&devpts_fs_type);
558 if (!err) { 557 if (!err) {
559 devpts_mnt = kern_mount(&devpts_fs_type); 558 devpts_mnt = kern_mount(&devpts_fs_type);
560 if (IS_ERR(devpts_mnt)) 559 if (IS_ERR(devpts_mnt)) {
561 err = PTR_ERR(devpts_mnt); 560 err = PTR_ERR(devpts_mnt);
561 unregister_filesystem(&devpts_fs_type);
562 }
562 } 563 }
563 return err; 564 return err;
564} 565}
565
566static void __exit exit_devpts_fs(void)
567{
568 unregister_filesystem(&devpts_fs_type);
569 mntput(devpts_mnt);
570}
571
572module_init(init_devpts_fs) 566module_init(init_devpts_fs)
573module_exit(exit_devpts_fs)
574MODULE_LICENSE("GPL");
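
Two related devpts changes: init_devpts_fs() gains the missing unwind,
unregistering the filesystem again when kern_mount() fails, and the
module-exit path, .owner and MODULE_LICENSE go away, consistent with
devpts no longer being buildable as a module, which makes init time
the only chance to clean up. The register-then-mount idiom with its
unwind, as a sketch (the myfs names are hypothetical):

static int __init init_myfs(void)
{
	int err = register_filesystem(&myfs_type);

	if (err)
		return err;
	myfs_mnt = kern_mount(&myfs_type);
	if (IS_ERR(myfs_mnt)) {
		err = PTR_ERR(myfs_mnt);
		/* undo the registration: no exit path will do it */
		unregister_filesystem(&myfs_type);
	}
	return err;
}
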
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 05763bbc2050..8b10b87dc01a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1127,7 +1127,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1127 rw = WRITE_ODIRECT; 1127 rw = WRITE_ODIRECT;
1128 1128
1129 if (bdev) 1129 if (bdev)
1130 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); 1130 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
1131 1131
1132 if (offset & blocksize_mask) { 1132 if (offset & blocksize_mask) {
1133 if (bdev) 1133 if (bdev)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 858fba14aaa6..c4dfa1dcc86f 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,7 +49,8 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
49 spin_unlock(&ls->ls_recover_list_lock); 49 spin_unlock(&ls->ls_recover_list_lock);
50 50
51 if (!found) 51 if (!found)
52 de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL); 52 de = kzalloc(sizeof(struct dlm_direntry) + len,
53 ls->ls_allocation);
53 return de; 54 return de;
54} 55}
55 56
@@ -211,7 +212,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
211 212
212 dlm_dir_clear(ls); 213 dlm_dir_clear(ls);
213 214
214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL); 215 last_name = kmalloc(DLM_RESNAME_MAXLEN, ls->ls_allocation);
215 if (!last_name) 216 if (!last_name)
216 goto out; 217 goto out;
217 218
@@ -322,7 +323,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
322 if (namelen > DLM_RESNAME_MAXLEN) 323 if (namelen > DLM_RESNAME_MAXLEN)
323 return -EINVAL; 324 return -EINVAL;
324 325
325 de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL); 326 de = kzalloc(sizeof(struct dlm_direntry) + namelen, ls->ls_allocation);
326 if (!de) 327 if (!de)
327 return -ENOMEM; 328 return -ENOMEM;
328 329
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 205ec95b347e..eb507c453c5f 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -435,7 +435,7 @@ static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
435static int find_rsb(struct dlm_ls *ls, char *name, int namelen, 435static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
436 unsigned int flags, struct dlm_rsb **r_ret) 436 unsigned int flags, struct dlm_rsb **r_ret)
437{ 437{
438 struct dlm_rsb *r, *tmp; 438 struct dlm_rsb *r = NULL, *tmp;
439 uint32_t hash, bucket; 439 uint32_t hash, bucket;
440 int error = -EINVAL; 440 int error = -EINVAL;
441 441
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index cd8e2df3c295..d489fcc86713 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -384,7 +384,7 @@ static void threads_stop(void)
384 dlm_astd_stop(); 384 dlm_astd_stop();
385} 385}
386 386
387static int new_lockspace(char *name, int namelen, void **lockspace, 387static int new_lockspace(const char *name, int namelen, void **lockspace,
388 uint32_t flags, int lvblen) 388 uint32_t flags, int lvblen)
389{ 389{
390 struct dlm_ls *ls; 390 struct dlm_ls *ls;
@@ -419,16 +419,14 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
419 break; 419 break;
420 } 420 }
421 ls->ls_create_count++; 421 ls->ls_create_count++;
422 module_put(THIS_MODULE); 422 *lockspace = ls;
423 error = 1; /* not an error, return 0 */ 423 error = 1;
424 break; 424 break;
425 } 425 }
426 spin_unlock(&lslist_lock); 426 spin_unlock(&lslist_lock);
427 427
428 if (error < 0)
429 goto out;
430 if (error) 428 if (error)
431 goto ret_zero; 429 goto out;
432 430
433 error = -ENOMEM; 431 error = -ENOMEM;
434 432
@@ -583,7 +581,6 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
583 dlm_create_debug_file(ls); 581 dlm_create_debug_file(ls);
584 582
585 log_debug(ls, "join complete"); 583 log_debug(ls, "join complete");
586 ret_zero:
587 *lockspace = ls; 584 *lockspace = ls;
588 return 0; 585 return 0;
589 586
@@ -614,7 +611,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
614 return error; 611 return error;
615} 612}
616 613
617int dlm_new_lockspace(char *name, int namelen, void **lockspace, 614int dlm_new_lockspace(const char *name, int namelen, void **lockspace,
618 uint32_t flags, int lvblen) 615 uint32_t flags, int lvblen)
619{ 616{
620 int error = 0; 617 int error = 0;
@@ -628,7 +625,9 @@ int dlm_new_lockspace(char *name, int namelen, void **lockspace,
628 error = new_lockspace(name, namelen, lockspace, flags, lvblen); 625 error = new_lockspace(name, namelen, lockspace, flags, lvblen);
629 if (!error) 626 if (!error)
630 ls_count++; 627 ls_count++;
631 else if (!ls_count) 628 if (error > 0)
629 error = 0;
630 if (!ls_count)
632 threads_stop(); 631 threads_stop();
633 out: 632 out:
634 mutex_unlock(&ls_lock); 633 mutex_unlock(&ls_lock);
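
new_lockspace() keeps a three-way return: a negative errno, 0 for a
freshly created lockspace, and 1 when the lockspace already existed,
in which case *lockspace is now set before lslist_lock is dropped
instead of via the removed ret_zero detour. The exported wrapper folds
the positive case into success, and the threads_stop() check now
stands alone rather than hanging off an else, so a failed first
lockspace still stops the helper threads. The convention, annotated:

error = new_lockspace(name, namelen, lockspace, flags, lvblen);
if (!error)
	ls_count++;		/* only newly created lockspaces count */
if (error > 0)
	error = 0;		/* "already exists" is success to callers */
if (!ls_count)
	threads_stop();		/* nothing created and none remain */
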
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 609108a83267..618a60f03886 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -309,6 +309,20 @@ static void lowcomms_state_change(struct sock *sk)
309 lowcomms_write_space(sk); 309 lowcomms_write_space(sk);
310} 310}
311 311
312int dlm_lowcomms_connect_node(int nodeid)
313{
314 struct connection *con;
315
316 if (nodeid == dlm_our_nodeid())
317 return 0;
318
319 con = nodeid2con(nodeid, GFP_NOFS);
320 if (!con)
321 return -ENOMEM;
322 lowcomms_connect_sock(con);
323 return 0;
324}
325
312/* Make a socket active */ 326/* Make a socket active */
313static int add_sock(struct socket *sock, struct connection *con) 327static int add_sock(struct socket *sock, struct connection *con)
314{ 328{
@@ -486,7 +500,7 @@ static void process_sctp_notification(struct connection *con,
486 return; 500 return;
487 } 501 }
488 502
489 new_con = nodeid2con(nodeid, GFP_KERNEL); 503 new_con = nodeid2con(nodeid, GFP_NOFS);
490 if (!new_con) 504 if (!new_con)
491 return; 505 return;
492 506
@@ -722,7 +736,7 @@ static int tcp_accept_from_sock(struct connection *con)
722 * the same time and the connections cross on the wire. 736 * the same time and the connections cross on the wire.
723 * In this case we store the incoming one in "othercon" 737 * In this case we store the incoming one in "othercon"
724 */ 738 */
725 newcon = nodeid2con(nodeid, GFP_KERNEL); 739 newcon = nodeid2con(nodeid, GFP_NOFS);
726 if (!newcon) { 740 if (!newcon) {
727 result = -ENOMEM; 741 result = -ENOMEM;
728 goto accept_err; 742 goto accept_err;
@@ -732,7 +746,7 @@ static int tcp_accept_from_sock(struct connection *con)
732 struct connection *othercon = newcon->othercon; 746 struct connection *othercon = newcon->othercon;
733 747
734 if (!othercon) { 748 if (!othercon) {
735 othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL); 749 othercon = kmem_cache_zalloc(con_cache, GFP_NOFS);
736 if (!othercon) { 750 if (!othercon) {
737 log_print("failed to allocate incoming socket"); 751 log_print("failed to allocate incoming socket");
738 mutex_unlock(&newcon->sock_mutex); 752 mutex_unlock(&newcon->sock_mutex);
@@ -888,7 +902,7 @@ static void tcp_connect_to_sock(struct connection *con)
888 int result = -EHOSTUNREACH; 902 int result = -EHOSTUNREACH;
889 struct sockaddr_storage saddr, src_addr; 903 struct sockaddr_storage saddr, src_addr;
890 int addr_len; 904 int addr_len;
891 struct socket *sock; 905 struct socket *sock = NULL;
892 906
893 if (con->nodeid == 0) { 907 if (con->nodeid == 0) {
894 log_print("attempt to connect sock 0 foiled"); 908 log_print("attempt to connect sock 0 foiled");
@@ -948,6 +962,8 @@ out_err:
948 if (con->sock) { 962 if (con->sock) {
949 sock_release(con->sock); 963 sock_release(con->sock);
950 con->sock = NULL; 964 con->sock = NULL;
965 } else if (sock) {
966 sock_release(sock);
951 } 967 }
952 /* 968 /*
953 * Some errors are fatal and this list might need adjusting. For other 969 * Some errors are fatal and this list might need adjusting. For other
@@ -1421,7 +1437,7 @@ static int work_start(void)
1421static void stop_conn(struct connection *con) 1437static void stop_conn(struct connection *con)
1422{ 1438{
1423 con->flags |= 0x0F; 1439 con->flags |= 0x0F;
1424 if (con->sock) 1440 if (con->sock && con->sock->sk)
1425 con->sock->sk->sk_user_data = NULL; 1441 con->sock->sk->sk_user_data = NULL;
1426} 1442}
1427 1443
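
Several fixes in fs/dlm/lowcomms.c. The GFP_KERNEL -> GFP_NOFS
switches matter because dlm sits underneath cluster filesystems: a
GFP_KERNEL allocation may enter direct reclaim and attempt writeback
of pages belonging to a filesystem that is itself blocked on this very
DLM traffic, a reclaim deadlock that GFP_NOFS avoids. The new
dlm_lowcomms_connect_node() lets member.c open connections eagerly
when a node joins rather than on first message. tcp_connect_to_sock()
additionally stops leaking a freshly created socket on error paths
where it was never attached to con->sock, and stop_conn() tolerates a
NULL con->sock->sk. The ownership rule behind the leak fix, as a toy
model rather than kernel code:

struct conn { void *sock; };

static void free_sock(void *s);	/* hypothetical release function */

/* free the socket through whichever pointer owns it -- exactly one */
static void unwind(struct conn *con, void *local_sock)
{
	if (con->sock) {
		free_sock(con->sock);	/* ownership was transferred */
		con->sock = NULL;
	} else if (local_sock) {
		free_sock(local_sock);	/* never attached: free locally */
	}
}
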
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index a9a9618c0d3f..1311e6426287 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -19,6 +19,7 @@ void dlm_lowcomms_stop(void);
19int dlm_lowcomms_close(int nodeid); 19int dlm_lowcomms_close(int nodeid);
20void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc); 20void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc);
21void dlm_lowcomms_commit_buffer(void *mh); 21void dlm_lowcomms_commit_buffer(void *mh);
22int dlm_lowcomms_connect_node(int nodeid);
22 23
23#endif /* __LOWCOMMS_DOT_H__ */ 24#endif /* __LOWCOMMS_DOT_H__ */
24 25
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 26133f05ae3a..b128775913b2 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -17,6 +17,7 @@
17#include "recover.h" 17#include "recover.h"
18#include "rcom.h" 18#include "rcom.h"
19#include "config.h" 19#include "config.h"
20#include "lowcomms.h"
20 21
21static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) 22static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
22{ 23{
@@ -45,9 +46,9 @@ static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
45static int dlm_add_member(struct dlm_ls *ls, int nodeid) 46static int dlm_add_member(struct dlm_ls *ls, int nodeid)
46{ 47{
47 struct dlm_member *memb; 48 struct dlm_member *memb;
48 int w; 49 int w, error;
49 50
50 memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL); 51 memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
51 if (!memb) 52 if (!memb)
52 return -ENOMEM; 53 return -ENOMEM;
53 54
@@ -57,6 +58,12 @@ static int dlm_add_member(struct dlm_ls *ls, int nodeid)
57 return w; 58 return w;
58 } 59 }
59 60
61 error = dlm_lowcomms_connect_node(nodeid);
62 if (error < 0) {
63 kfree(memb);
64 return error;
65 }
66
60 memb->nodeid = nodeid; 67 memb->nodeid = nodeid;
61 memb->weight = w; 68 memb->weight = w;
62 add_ordered_member(ls, memb); 69 add_ordered_member(ls, memb);
@@ -136,7 +143,7 @@ static void make_member_array(struct dlm_ls *ls)
136 143
137 ls->ls_total_weight = total; 144 ls->ls_total_weight = total;
138 145
139 array = kmalloc(sizeof(int) * total, GFP_KERNEL); 146 array = kmalloc(sizeof(int) * total, ls->ls_allocation);
140 if (!array) 147 if (!array)
141 return; 148 return;
142 149
@@ -219,7 +226,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
219 continue; 226 continue;
220 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]); 227 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
221 228
222 memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL); 229 memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
223 if (!memb) 230 if (!memb)
224 return -ENOMEM; 231 return -ENOMEM;
225 memb->nodeid = rv->new[i]; 232 memb->nodeid = rv->new[i];
@@ -334,7 +341,7 @@ int dlm_ls_start(struct dlm_ls *ls)
334 int *ids = NULL, *new = NULL; 341 int *ids = NULL, *new = NULL;
335 int error, ids_count = 0, new_count = 0; 342 int error, ids_count = 0, new_count = 0;
336 343
337 rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL); 344 rv = kzalloc(sizeof(struct dlm_recover), ls->ls_allocation);
338 if (!rv) 345 if (!rv)
339 return -ENOMEM; 346 return -ENOMEM;
340 347
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 894a32d438d5..16f682e26c07 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -353,7 +353,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
 {
 	struct dlm_plock_info info;
 	struct plock_op *op;
-	int found = 0;
+	int found = 0, do_callback = 0;
 
 	if (count != sizeof(info))
 		return -EINVAL;
@@ -366,21 +366,24 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
 
 	spin_lock(&ops_lock);
 	list_for_each_entry(op, &recv_list, list) {
-		if (op->info.fsid == info.fsid && op->info.number == info.number &&
+		if (op->info.fsid == info.fsid &&
+		    op->info.number == info.number &&
 		    op->info.owner == info.owner) {
+			struct plock_xop *xop = (struct plock_xop *)op;
 			list_del_init(&op->list);
-			found = 1;
-			op->done = 1;
 			memcpy(&op->info, &info, sizeof(info));
+			if (xop->callback)
+				do_callback = 1;
+			else
+				op->done = 1;
+			found = 1;
 			break;
 		}
 	}
 	spin_unlock(&ops_lock);
 
 	if (found) {
-		struct plock_xop *xop;
-		xop = (struct plock_xop *)op;
-		if (xop->callback)
+		if (do_callback)
 			dlm_plock_callback(op);
 		else
 			wake_up(&recv_wq);
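The dev_write() rework above is a use-after-free fix: the old code set op->done and dropped ops_lock before looking at xop->callback, so a synchronous waiter could wake, free the op, and leave dev_write() dereferencing freed memory. The decision is now taken while ops_lock is still held and only a flag escapes the critical section. The shape of the fix, with hypothetical names:

#include <linux/spinlock.h>

struct op_sketch {
	void (*callback)(struct op_sketch *op);
	int done;
};

/* Decide under the lock; never touch *op again once done is visible. */
static int op_complete(spinlock_t *lock, struct op_sketch *op)
{
	int do_callback = 0;

	spin_lock(lock);
	if (op->callback)
		do_callback = 1;	/* async op: caller runs the callback */
	else
		op->done = 1;		/* sync waiter may now free op */
	spin_unlock(lock);

	return do_callback;
}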
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index daa4183fbb84..7a2307c08911 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -35,7 +35,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
 	struct rq_entry *e;
 	int length = ms->m_header.h_length - sizeof(struct dlm_message);
 
-	e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
+	e = kmalloc(sizeof(struct rq_entry) + length, ls->ls_allocation);
 	if (!e) {
 		log_print("dlm_add_requestqueue: out of memory len %d", length);
 		return;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index b6a719a909f8..a2edb7913447 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -24,7 +24,7 @@ static void drop_pagecache_sb(struct super_block *sb)
 			continue;
 		__iget(inode);
 		spin_unlock(&inode_lock);
-		__invalidate_mapping_pages(inode->i_mapping, 0, -1, true);
+		invalidate_mapping_pages(inode->i_mapping, 0, -1);
 		iput(toput_inode);
 		toput_inode = inode;
 		spin_lock(&inode_lock);
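drop_pagecache_sb() moves to the plain exported helper: invalidate_mapping_pages(mapping, start, end) drops clean, unlocked pagecache pages in the given range and quietly skips dirty, locked, or mapped ones, so calling it with (0, -1) as above sweeps the whole file on a best-effort basis. A hedged usage sketch:

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Best-effort: only clean, unpinned pages are released. */
static void drop_clean_pages(struct inode *inode)
{
	invalidate_mapping_pages(inode->i_mapping, 0, -1);
}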
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index af737bb56cb7..259525c9abb8 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1303,6 +1303,13 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
 	}
 	(*new_auth_tok)->session_key.encrypted_key_size =
 		(body_size - (ECRYPTFS_SALT_SIZE + 5));
+	if ((*new_auth_tok)->session_key.encrypted_key_size
+	    > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
+		printk(KERN_WARNING "Tag 3 packet contains key larger "
+		       "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n");
+		rc = -EINVAL;
+		goto out_free;
+	}
 	if (unlikely(data[(*packet_size)++] != 0x04)) {
 		printk(KERN_WARNING "Unknown version number [%d]\n",
 		       data[(*packet_size) - 1]);
@@ -1449,6 +1456,12 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents,
 		rc = -EINVAL;
 		goto out;
 	}
+	if (unlikely((*tag_11_contents_size) > max_contents_bytes)) {
+		printk(KERN_ERR "Literal data section in tag 11 packet exceeds "
+		       "expected size\n");
+		rc = -EINVAL;
+		goto out;
+	}
 	if (data[(*packet_size)++] != 0x62) {
 		printk(KERN_WARNING "Unrecognizable packet\n");
 		rc = -EINVAL;
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index fa4c7e7d15d9..12d649602d3a 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -27,6 +27,7 @@
 #include <linux/mount.h>
 #include <linux/key.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/crypto.h>
 #include "ecryptfs_kernel.h"
@@ -120,9 +121,13 @@ static void ecryptfs_put_super(struct super_block *sb)
 {
 	struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
 
+	lock_kernel();
+
 	ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
 	kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
 	ecryptfs_set_superblock_private(sb, NULL);
+
+	unlock_kernel();
 }
 
 /**
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index 49308a29798a..7ee6f7e3a608 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -5,12 +5,12 @@
  */
 
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
 #include "efs.h"
 
 static int efs_readdir(struct file *, void *, filldir_t);
 
 const struct file_operations efs_dir_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= efs_readdir,
 };
@@ -33,8 +33,6 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
 	if (inode->i_size & (EFS_DIRBSIZE-1))
 		printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n");
 
-	lock_kernel();
-
 	/* work out where this entry can be found */
 	block = filp->f_pos >> EFS_DIRBSIZE_BITS;
 
@@ -107,7 +105,6 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
 
 	filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
 out:
-	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index c3fb5f9c4a44..1511bf9e5f80 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -8,7 +8,6 @@
 
 #include <linux/buffer_head.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/exportfs.h>
 #include "efs.h"
 
@@ -63,16 +62,12 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei
 	efs_ino_t inodenum;
 	struct inode * inode = NULL;
 
-	lock_kernel();
 	inodenum = efs_find_entry(dir, dentry->d_name.name, dentry->d_name.len);
 	if (inodenum) {
 		inode = efs_iget(dir->i_sb, inodenum);
-		if (IS_ERR(inode)) {
-			unlock_kernel();
+		if (IS_ERR(inode))
 			return ERR_CAST(inode);
-		}
 	}
-	unlock_kernel();
 
 	return d_splice_alias(inode, dentry);
 }
@@ -115,11 +110,9 @@ struct dentry *efs_get_parent(struct dentry *child)
 	struct dentry *parent = ERR_PTR(-ENOENT);
 	efs_ino_t ino;
 
-	lock_kernel();
 	ino = efs_find_entry(child->d_inode, "..", 2);
 	if (ino)
 		parent = d_obtain_alias(efs_iget(child->d_inode->i_sb, ino));
-	unlock_kernel();
 
 	return parent;
 }
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 41911ec83aaf..75117d0dac2b 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -9,7 +9,6 @@
 #include <linux/string.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
 #include "efs.h"
 
 static int efs_symlink_readpage(struct file *file, struct page *page)
@@ -22,9 +21,8 @@ static int efs_symlink_readpage(struct file *file, struct page *page)
 
 	err = -ENAMETOOLONG;
 	if (size > 2 * EFS_BLOCKSIZE)
-		goto fail_notlocked;
+		goto fail;
 
-	lock_kernel();
 	/* read first 512 bytes of link target */
 	err = -EIO;
 	bh = sb_bread(inode->i_sb, efs_bmap(inode, 0));
@@ -40,14 +38,11 @@ static int efs_symlink_readpage(struct file *file, struct page *page)
 		brelse(bh);
 	}
 	link[size] = '\0';
-	unlock_kernel();
 	SetPageUptodate(page);
 	kunmap(page);
 	unlock_page(page);
 	return 0;
 fail:
-	unlock_kernel();
-fail_notlocked:
 	SetPageError(page);
 	kunmap(page);
 	unlock_page(page);
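With the BKL calls gone, efs relies on VFS-level locking alone; the one subtlety is the new .llseek entry in efs_dir_operations near the top of these hunks, pinned explicitly because the fallback default_llseek in this kernel still takes the BKL. The resulting table shape (sketch; efs_readdir stands for the filesystem's own method):

#include <linux/fs.h>

static int efs_readdir(struct file *, void *, filldir_t);

const struct file_operations sketch_dir_operations = {
	.llseek		= generic_file_llseek,	/* BKL-free seek */
	.read		= generic_read_dir,
	.readdir	= efs_readdir,
};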
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 2a701d593d35..31d12de83a2a 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -14,34 +14,44 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/anon_inodes.h>
-#include <linux/eventfd.h>
 #include <linux/syscalls.h>
+#include <linux/module.h>
+#include <linux/kref.h>
+#include <linux/eventfd.h>
 
 struct eventfd_ctx {
+	struct kref kref;
 	wait_queue_head_t wqh;
 	/*
 	 * Every time that a write(2) is performed on an eventfd, the
 	 * value of the __u64 being written is added to "count" and a
 	 * wakeup is performed on "wqh". A read(2) will return the "count"
 	 * value to userspace, and will reset "count" to zero. The kernel
-	 * size eventfd_signal() also, adds to the "count" counter and
+	 * side eventfd_signal() also, adds to the "count" counter and
 	 * issue a wakeup.
 	 */
 	__u64 count;
 	unsigned int flags;
 };
 
-/*
- * Adds "n" to the eventfd counter "count". Returns "n" in case of
- * success, or a value lower then "n" in case of coutner overflow.
- * This function is supposed to be called by the kernel in paths
- * that do not allow sleeping. In this function we allow the counter
- * to reach the ULLONG_MAX value, and we signal this as overflow
- * condition by returining a POLLERR to poll(2).
+/**
+ * eventfd_signal - Adds @n to the eventfd counter.
+ * @ctx: [in] Pointer to the eventfd context.
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
+ *          The value cannot be negative.
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returining a POLLERR
+ * to poll(2).
+ *
+ * Returns @n in case of success, a non-negative number lower than @n in case
+ * of overflow, or the following error codes:
+ *
+ * -EINVAL    : The value of @n is negative.
  */
-int eventfd_signal(struct file *file, int n)
+int eventfd_signal(struct eventfd_ctx *ctx, int n)
 {
-	struct eventfd_ctx *ctx = file->private_data;
 	unsigned long flags;
 
 	if (n < 0)
@@ -56,10 +66,47 @@ int eventfd_signal(struct file *file, int n)
 
 	return n;
 }
+EXPORT_SYMBOL_GPL(eventfd_signal);
+
+static void eventfd_free(struct kref *kref)
+{
+	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
+
+	kfree(ctx);
+}
+
+/**
+ * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to the eventfd context.
+ *
+ * Returns: In case of success, returns a pointer to the eventfd context.
+ */
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
+{
+	kref_get(&ctx->kref);
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_get);
+
+/**
+ * eventfd_ctx_put - Releases a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to eventfd context.
+ *
+ * The eventfd context reference must have been previously acquired either
+ * with eventfd_ctx_get() or eventfd_ctx_fdget()).
+ */
+void eventfd_ctx_put(struct eventfd_ctx *ctx)
+{
+	kref_put(&ctx->kref, eventfd_free);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_put);
 
 static int eventfd_release(struct inode *inode, struct file *file)
 {
-	kfree(file->private_data);
+	struct eventfd_ctx *ctx = file->private_data;
+
+	wake_up_poll(&ctx->wqh, POLLHUP);
+	eventfd_ctx_put(ctx);
 	return 0;
 }
 
@@ -183,6 +230,16 @@ static const struct file_operations eventfd_fops = {
 	.write		= eventfd_write,
 };
 
+/**
+ * eventfd_fget - Acquire a reference of an eventfd file descriptor.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the eventfd file structure in case of success, or the
+ * following error pointer:
+ *
+ * -EBADF    : Invalid @fd file descriptor.
+ * -EINVAL   : The @fd file descriptor is not an eventfd file.
+ */
 struct file *eventfd_fget(int fd)
 {
 	struct file *file;
@@ -197,6 +254,49 @@ struct file *eventfd_fget(int fd)
 
 	return file;
 }
+EXPORT_SYMBOL_GPL(eventfd_fget);
+
+/**
+ * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointers returned by the following functions:
+ *
+ * eventfd_fget
+ */
+struct eventfd_ctx *eventfd_ctx_fdget(int fd)
+{
+	struct file *file;
+	struct eventfd_ctx *ctx;
+
+	file = eventfd_fget(fd);
+	if (IS_ERR(file))
+		return (struct eventfd_ctx *) file;
+	ctx = eventfd_ctx_get(file->private_data);
+	fput(file);
+
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
+
+/**
+ * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
+ * @file: [in] Eventfd file pointer.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointer:
+ *
+ * -EINVAL   : The @fd file descriptor is not an eventfd file.
+ */
+struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
+{
+	if (file->f_op != &eventfd_fops)
+		return ERR_PTR(-EINVAL);
+
+	return eventfd_ctx_get(file->private_data);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
 
 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
@@ -214,6 +314,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 	if (!ctx)
 		return -ENOMEM;
 
+	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
 	ctx->flags = flags;
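The eventfd rework above decouples the context's lifetime from its struct file: in-kernel users (callers of the new eventfd_ctx_fdget()/eventfd_ctx_fileget()) can hold a counted reference to the context and keep signaling it after the last userspace fput(). It is the standard kref pattern, reduced to its core:

#include <linux/kref.h>
#include <linux/slab.h>

struct ctx_sketch {
	struct kref kref;	/* one reference per holder, file included */
	/* ... payload ... */
};

static void ctx_free(struct kref *kref)
{
	kfree(container_of(kref, struct ctx_sketch, kref));
}

static struct ctx_sketch *ctx_get(struct ctx_sketch *ctx)
{
	kref_get(&ctx->kref);
	return ctx;
}

static void ctx_put(struct ctx_sketch *ctx)
{
	kref_put(&ctx->kref, ctx_free);	/* frees on the last put */
}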
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5458e80fc558..085c5c063420 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -98,7 +98,7 @@ struct epoll_filefd {
 struct nested_call_node {
 	struct list_head llink;
 	void *cookie;
-	int cpu;
+	void *ctx;
 };
 
 /*
@@ -317,17 +317,17 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
  * @nproc: Nested call core function pointer.
  * @priv: Opaque data to be passed to the @nproc callback.
  * @cookie: Cookie to be used to identify this nested call.
+ * @ctx: This instance context.
  *
  * Returns: Returns the code returned by the @nproc callback, or -1 if
  *          the maximum recursion limit has been exceeded.
  */
 static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 			  int (*nproc)(void *, void *, int), void *priv,
-			  void *cookie)
+			  void *cookie, void *ctx)
 {
 	int error, call_nests = 0;
 	unsigned long flags;
-	int this_cpu = get_cpu();
 	struct list_head *lsthead = &ncalls->tasks_call_list;
 	struct nested_call_node *tncur;
 	struct nested_call_node tnode;
@@ -340,7 +340,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 	 * very much limited.
 	 */
 	list_for_each_entry(tncur, lsthead, llink) {
-		if (tncur->cpu == this_cpu &&
+		if (tncur->ctx == ctx &&
 		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
 			/*
 			 * Ops ... loop detected or maximum nest level reached.
@@ -352,7 +352,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 	}
 
 	/* Add the current task and cookie to the list */
-	tnode.cpu = this_cpu;
+	tnode.ctx = ctx;
 	tnode.cookie = cookie;
 	list_add(&tnode.llink, lsthead);
 
@@ -364,10 +364,9 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 	/* Remove the current task from the list */
 	spin_lock_irqsave(&ncalls->lock, flags);
 	list_del(&tnode.llink);
- out_unlock:
+out_unlock:
 	spin_unlock_irqrestore(&ncalls->lock, flags);
 
-	put_cpu();
 	return error;
 }
 
@@ -408,8 +407,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
  */
 static void ep_poll_safewake(wait_queue_head_t *wq)
 {
+	int this_cpu = get_cpu();
+
 	ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
-		       ep_poll_wakeup_proc, NULL, wq);
+		       ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
+
+	put_cpu();
 }
 
 /*
@@ -663,7 +666,7 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 	 * could re-enter here.
 	 */
 	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
-				   ep_poll_readyevents_proc, ep, ep);
+				   ep_poll_readyevents_proc, ep, ep, current);
 
 	return pollflags != -1 ? pollflags : 0;
 }
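Keying the nested-call bookkeeping on an opaque ctx instead of a cpu id matters because ep_call_nested() can be entered from preemptible, sleepable paths: a task may migrate between cpus mid-call, and two tasks may share a cpu, so the cpu id is neither stable nor unique. The wakeup path, which brackets the call with get_cpu()/put_cpu(), keeps using the pinned cpu id as its ctx; the poll path simply uses the task itself. A one-line sketch of the sleepable-path identity:

#include <linux/sched.h>

/* For any sleepable caller, the task pointer is a stable, unique key. */
static void *nested_call_identity(void)
{
	return current;
}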
diff --git a/fs/exec.c b/fs/exec.c
index 895823d0149d..fb4f3cdda78c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
+#include <linux/perf_counter.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
 #include <linux/key.h>
@@ -677,8 +678,8 @@ exit:
 }
 EXPORT_SYMBOL(open_exec);
 
-int kernel_read(struct file *file, unsigned long offset,
+int kernel_read(struct file *file, loff_t offset,
 		char *addr, unsigned long count)
 {
 	mm_segment_t old_fs;
 	loff_t pos = offset;
@@ -922,6 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
 	task_lock(tsk);
 	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
 	task_unlock(tsk);
+	perf_counter_comm(tsk);
 }
 
 int flush_old_exec(struct linux_binprm * bprm)
@@ -990,6 +992,13 @@ int flush_old_exec(struct linux_binprm * bprm)
 
 	current->personality &= ~bprm->per_clear;
 
+	/*
+	 * Flush performance counters when crossing a
+	 * security domain:
+	 */
+	if (!get_dumpable(current->mm))
+		perf_counter_exit_task(current);
+
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
 
@@ -1016,7 +1025,7 @@ void install_exec_creds(struct linux_binprm *bprm)
 	commit_creds(bprm->cred);
 	bprm->cred = NULL;
 
-	/* cred_exec_mutex must be held at least to this point to prevent
+	/* cred_guard_mutex must be held at least to this point to prevent
 	 * ptrace_attach() from altering our determination of the task's
 	 * credentials; any time after this it may be unlocked */
 
@@ -1026,7 +1035,7 @@ EXPORT_SYMBOL(install_exec_creds);
 
 /*
  * determine how safe it is to execute the proposed program
- * - the caller must hold current->cred_exec_mutex to protect against
+ * - the caller must hold current->cred_guard_mutex to protect against
  *   PTRACE_ATTACH
  */
 int check_unsafe_exec(struct linux_binprm *bprm)
@@ -1268,8 +1277,8 @@ int do_execve(char * filename,
 	if (!bprm)
 		goto out_files;
 
-	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
-	if (retval < 0)
+	retval = -ERESTARTNOINTR;
+	if (mutex_lock_interruptible(&current->cred_guard_mutex))
 		goto out_free;
 	current->in_execve = 1;
 
@@ -1331,7 +1340,7 @@ int do_execve(char * filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_exec_mutex);
+	mutex_unlock(&current->cred_guard_mutex);
 	acct_update_integrals(current);
 	free_bprm(bprm);
 	if (displaced)
@@ -1354,7 +1363,7 @@ out_unmark:
 
 out_unlock:
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_exec_mutex);
+	mutex_unlock(&current->cred_guard_mutex);
 
 out_free:
 	free_bprm(bprm);
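One behavioural detail in the do_execve() hunk above: mutex_lock_interruptible() itself returns -EINTR, but execve must not fail with -EINTR halfway through setup, so the code substitutes -ERESTARTNOINTR, which makes the kernel transparently restart the syscall once the interrupting signal has been handled. The idiom in isolation (sketch):

#include <linux/mutex.h>
#include <linux/errno.h>

/* Take a guard mutex, converting a signal into a transparent restart. */
static int lock_or_restart(struct mutex *guard)
{
	if (mutex_lock_interruptible(guard))
		return -ERESTARTNOINTR;
	return 0;
}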
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index b1512c4bb8c7..c6718e4817fe 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -2,9 +2,7 @@
  * common.h - Common definitions for both Kernel and user-mode utilities
  *
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
@@ -175,10 +173,4 @@ int exofs_async_op(struct osd_request *or,
 
 int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
 
-int osd_req_read_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
-
-int osd_req_write_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
-
 #endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 65b0c8c776a1..4cfab1cc75c0 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -1,8 +1,6 @@
 /*
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 0fd4c7859679..5ec72e020b22 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -1,8 +1,6 @@
 /*
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
@@ -156,6 +154,9 @@ ino_t exofs_parent_ino(struct dentry *child);
 int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
 		struct inode *);
 
+/* super.c */
+int exofs_sync_fs(struct super_block *sb, int wait);
+
 /*********************
  * operation vectors *
  *********************/
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 6ed7fe484752..839b9dc1e70f 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -1,8 +1,6 @@
 /*
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
@@ -47,16 +45,23 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
 {
 	int ret;
 	struct address_space *mapping = filp->f_mapping;
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb;
 
 	ret = filemap_write_and_wait(mapping);
 	if (ret)
 		return ret;
 
-	/*Note: file_fsync below also calles sync_blockdev, which is a no-op
-	 * for exofs, but other then that it does sync_inode and
-	 * sync_superblock which is what we need here.
-	 */
-	return file_fsync(filp, dentry, datasync);
+	/* sync the inode attributes */
+	ret = write_inode_now(inode, 1);
+
+	/* This is a good place to write the sb */
+	/* TODO: Sechedule an sb-sync on create */
+	sb = inode->i_sb;
+	if (sb->s_dirt)
+		exofs_sync_fs(sb, 1);
+
+	return ret;
 }
 
 static int exofs_flush(struct file *file, fl_owner_t id)
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index ba8d9fab4693..6c10f7476699 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1,8 +1,6 @@
1/* 1/*
2 * Copyright (C) 2005, 2006 2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) 3 * Avishay Traeger (avishay@gmail.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009 4 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com> 5 * Boaz Harrosh <bharrosh@panasas.com>
8 * 6 *
@@ -59,10 +57,9 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
59 struct inode *inode) 57 struct inode *inode)
60{ 58{
61 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 59 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
62 struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue;
63 60
64 pcol->sbi = sbi; 61 pcol->sbi = sbi;
65 pcol->req_q = req_q; 62 pcol->req_q = osd_request_queue(sbi->s_dev);
66 pcol->inode = inode; 63 pcol->inode = inode;
67 pcol->expected_pages = expected_pages; 64 pcol->expected_pages = expected_pages;
68 65
@@ -266,7 +263,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
266 goto err; 263 goto err;
267 } 264 }
268 265
269 osd_req_read(or, &obj, pcol->bio, i_start); 266 osd_req_read(or, &obj, i_start, pcol->bio, pcol->length);
270 267
271 if (is_sync) { 268 if (is_sync) {
272 exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred); 269 exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
@@ -296,6 +293,9 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
296err: 293err:
297 if (!is_sync) 294 if (!is_sync)
298 _unlock_pcol_pages(pcol, ret, READ); 295 _unlock_pcol_pages(pcol, ret, READ);
296 else /* Pages unlocked by caller in sync mode only free bio */
297 pcol_free(pcol);
298
299 kfree(pcol_copy); 299 kfree(pcol_copy);
300 if (or) 300 if (or)
301 osd_end_request(or); 301 osd_end_request(or);
@@ -522,7 +522,8 @@ static int write_exec(struct page_collect *pcol)
522 522
523 *pcol_copy = *pcol; 523 *pcol_copy = *pcol;
524 524
525 osd_req_write(or, &obj, pcol_copy->bio, i_start); 525 pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
526 osd_req_write(or, &obj, i_start, pcol_copy->bio, pcol_copy->length);
526 ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred); 527 ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
527 if (unlikely(ret)) { 528 if (unlikely(ret)) {
528 EXOFS_ERR("write_exec: exofs_async_op() Faild\n"); 529 EXOFS_ERR("write_exec: exofs_async_op() Faild\n");
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 77fdd765e76d..b7dd0c236863 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -1,8 +1,6 @@
 /*
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
index b249ae97fb15..4372542df284 100644
--- a/fs/exofs/osd.c
+++ b/fs/exofs/osd.c
@@ -1,8 +1,6 @@
 /*
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
@@ -50,10 +48,10 @@ int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
 
 	/* FIXME: should be include in osd_sense_info */
 	if (in_resid)
-		*in_resid = or->in.req ? or->in.req->data_len : 0;
+		*in_resid = or->in.req ? or->in.req->resid_len : 0;
 
 	if (out_resid)
-		*out_resid = or->out.req ? or->out.req->data_len : 0;
+		*out_resid = or->out.req ? or->out.req->resid_len : 0;
 
 	return ret;
 }
@@ -125,29 +123,3 @@ int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
 
 	return -EIO;
 }
-
-int osd_req_read_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
-{
-	struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
-	struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
-
-	if (!bio)
-		return -ENOMEM;
-
-	osd_req_read(or, obj, bio, offset);
-	return 0;
-}
-
-int osd_req_write_kern(struct osd_request *or,
-	const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
-{
-	struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
-	struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
-
-	if (!bio)
-		return -ENOMEM;
-
-	osd_req_write(or, obj, bio, offset);
-	return 0;
-}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9f1985e857e2..5ab10c3bbebe 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -1,8 +1,6 @@
 /*
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
@@ -33,6 +31,7 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  */
 
+#include <linux/smp_lock.h>
 #include <linux/string.h>
 #include <linux/parser.h>
 #include <linux/vfs.h>
@@ -200,20 +199,21 @@ static const struct export_operations exofs_export_ops;
 /*
  * Write the superblock to the OSD
  */
-static void exofs_write_super(struct super_block *sb)
+int exofs_sync_fs(struct super_block *sb, int wait)
 {
 	struct exofs_sb_info *sbi;
 	struct exofs_fscb *fscb;
 	struct osd_request *or;
 	struct osd_obj_id obj;
-	int ret;
+	int ret = -ENOMEM;
 
 	fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
 	if (!fscb) {
 		EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
-		return;
+		return -ENOMEM;
 	}
 
+	lock_super(sb);
 	lock_kernel();
 	sbi = sb->s_fs_info;
 	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -246,7 +246,17 @@ out:
 	if (or)
 		osd_end_request(or);
 	unlock_kernel();
+	unlock_super(sb);
 	kfree(fscb);
+	return ret;
+}
+
+static void exofs_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		exofs_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 /*
@@ -258,6 +268,11 @@ static void exofs_put_super(struct super_block *sb)
 	int num_pend;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
 
+	lock_kernel();
+
+	if (sb->s_dirt)
+		exofs_write_super(sb);
+
 	/* make sure there are no pending commands */
 	for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
 	     num_pend = atomic_read(&sbi->s_curr_pending)) {
@@ -271,6 +286,8 @@ static void exofs_put_super(struct super_block *sb)
 	osduld_put_device(sbi->s_dev);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 /*
@@ -484,6 +501,7 @@ static const struct super_operations exofs_sops = {
 	.delete_inode	= exofs_delete_inode,
 	.put_super	= exofs_put_super,
 	.write_super	= exofs_write_super,
+	.sync_fs	= exofs_sync_fs,
 	.statfs		= exofs_statfs,
 };
 
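The exofs conversion above (and the ext2 one later in this diff) follows the same split: sync_fs() does the real superblock write, can report errors, and takes a wait flag, while write_super() shrinks to a periodic-writeback wrapper that skips read-only mounts. A sketch of the delegation, with placeholder names:

#include <linux/fs.h>

/* Sketch: real implementations write the on-disk superblock here. */
static int sketch_sync_fs(struct super_block *sb, int wait)
{
	sb->s_dirt = 0;
	return 0;
}

static void sketch_write_super(struct super_block *sb)
{
	if (!(sb->s_flags & MS_RDONLY))
		sketch_sync_fs(sb, 1);	/* periodic flush always waits */
	else
		sb->s_dirt = 0;		/* nothing to write when ro */
}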
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
index 36e2d7bc7f7b..4dd687c3e747 100644
--- a/fs/exofs/symlink.c
+++ b/fs/exofs/symlink.c
@@ -1,8 +1,6 @@
 /*
  * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
- * Copyright (C) 2005, 2006
- * International Business Machines
+ * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
  * Boaz Harrosh <bharrosh@panasas.com>
  *
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index e0b2b43c1fdb..f42af45cfd88 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_EXT2_FS) += ext2.o
 
-ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \
+ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
 	  ioctl.o namei.o super.o symlink.o
 
 ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index d46e38cb85c5..d636e1297cad 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -125,37 +125,12 @@ fail:
 	return ERR_PTR(-EINVAL);
 }
 
-static inline struct posix_acl *
-ext2_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-	struct posix_acl *acl = EXT2_ACL_NOT_CACHED;
-
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT2_ACL_NOT_CACHED)
-		acl = posix_acl_dup(*i_acl);
-	spin_unlock(&inode->i_lock);
-
-	return acl;
-}
-
-static inline void
-ext2_iset_acl(struct inode *inode, struct posix_acl **i_acl,
-	      struct posix_acl *acl)
-{
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT2_ACL_NOT_CACHED)
-		posix_acl_release(*i_acl);
-	*i_acl = posix_acl_dup(acl);
-	spin_unlock(&inode->i_lock);
-}
-
 /*
  * inode->i_mutex: don't care
  */
 static struct posix_acl *
 ext2_get_acl(struct inode *inode, int type)
 {
-	struct ext2_inode_info *ei = EXT2_I(inode);
 	int name_index;
 	char *value = NULL;
 	struct posix_acl *acl;
@@ -164,23 +139,19 @@ ext2_get_acl(struct inode *inode, int type)
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return NULL;
 
-	switch(type) {
-	case ACL_TYPE_ACCESS:
-		acl = ext2_iget_acl(inode, &ei->i_acl);
-		if (acl != EXT2_ACL_NOT_CACHED)
-			return acl;
-		name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
-		break;
-
-	case ACL_TYPE_DEFAULT:
-		acl = ext2_iget_acl(inode, &ei->i_default_acl);
-		if (acl != EXT2_ACL_NOT_CACHED)
-			return acl;
-		name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT;
-		break;
-
-	default:
-		return ERR_PTR(-EINVAL);
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		BUG();
 	}
 	retval = ext2_xattr_get(inode, name_index, "", NULL, 0);
 	if (retval > 0) {
@@ -197,17 +168,9 @@ ext2_get_acl(struct inode *inode, int type)
 		acl = ERR_PTR(retval);
 	kfree(value);
 
-	if (!IS_ERR(acl)) {
-		switch(type) {
-		case ACL_TYPE_ACCESS:
-			ext2_iset_acl(inode, &ei->i_acl, acl);
-			break;
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 
-		case ACL_TYPE_DEFAULT:
-			ext2_iset_acl(inode, &ei->i_default_acl, acl);
-			break;
-		}
-	}
 	return acl;
 }
 
@@ -217,7 +180,6 @@ ext2_get_acl(struct inode *inode, int type)
 static int
 ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 {
-	struct ext2_inode_info *ei = EXT2_I(inode);
 	int name_index;
 	void *value = NULL;
 	size_t size = 0;
@@ -263,17 +225,8 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	error = ext2_xattr_set(inode, name_index, "", value, size, 0);
 
 	kfree(value);
-	if (!error) {
-		switch(type) {
-		case ACL_TYPE_ACCESS:
-			ext2_iset_acl(inode, &ei->i_acl, acl);
-			break;
-
-		case ACL_TYPE_DEFAULT:
-			ext2_iset_acl(inode, &ei->i_default_acl, acl);
-			break;
-		}
-	}
+	if (!error)
+		set_cached_acl(inode, type, acl);
 	return error;
 }
 
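The acl.c rewrite drops ext2's private ACL cache (two pointers in ext2_inode_info guarded by i_lock) in favour of the VFS-generic one behind get_cached_acl()/set_cached_acl(); that is also why the matching #ifdef blocks in ext2.h, inode.c, and super.c disappear later in this diff. The consumer-side shape, sketched with a stubbed xattr read:

#include <linux/fs.h>
#include <linux/posix_acl.h>

/* Sketch of the lookup-then-fill pattern ext2_get_acl() now follows. */
static struct posix_acl *get_acl_sketch(struct inode *inode, int type)
{
	struct posix_acl *acl = get_cached_acl(inode, type);

	if (acl != ACL_NOT_CACHED)
		return acl;	/* hit: helper returned its own reference */

	acl = NULL;		/* real code parses the on-disk xattr here */
	set_cached_acl(inode, type, acl);
	return acl;
}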
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index b42cf578554b..ecefe478898f 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -53,10 +53,6 @@ static inline int ext2_acl_count(size_t size)
 
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
 
-/* Value for inode->u.ext2_i.i_acl and inode->u.ext2_i.i_default_acl
-   if the ACL has not been cached */
-#define EXT2_ACL_NOT_CACHED ((void *)-1)
-
 /* acl.c */
 extern int ext2_permission (struct inode *, int);
 extern int ext2_acl_chmod (struct inode *);
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2999d72153b7..6cde970b0a1a 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -450,7 +450,7 @@ ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
 
 /* Releases the page */
 void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
-		   struct page *page, struct inode *inode)
+		   struct page *page, struct inode *inode, int update_times)
 {
 	loff_t pos = page_offset(page) +
 			(char *) de - (char *) page_address(page);
@@ -465,7 +465,8 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
 	ext2_set_de_type(de, inode);
 	err = ext2_commit_chunk(page, pos, len);
 	ext2_put_page(page);
-	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	if (update_times)
+		dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
 	EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
 	mark_inode_dirty(dir);
 }
@@ -720,5 +721,5 @@ const struct file_operations ext2_dir_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
-	.fsync		= ext2_sync_file,
+	.fsync		= simple_fsync,
 };
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 3203042b36ef..9a8a8e27a063 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -27,7 +27,7 @@ struct ext2_inode_info {
 	/*
 	 * i_block_group is the number of the block group which contains
 	 * this file's inode. Constant across the lifetime of the inode,
-	 * it is ued for making block allocation decisions - we try to
+	 * it is used for making block allocation decisions - we try to
 	 * place a file's data blocks near its inode block, and new inodes
 	 * near to their parent directory's inode.
 	 */
@@ -47,10 +47,6 @@ struct ext2_inode_info {
 	 */
 	struct rw_semaphore xattr_sem;
 #endif
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
-	struct posix_acl *i_acl;
-	struct posix_acl *i_default_acl;
-#endif
 	rwlock_t i_meta_lock;
 
 	/*
@@ -111,10 +107,7 @@ extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *,
 extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
 extern int ext2_empty_dir (struct inode *);
 extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
-extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *);
-
-/* fsync.c */
-extern int ext2_sync_file (struct file *, struct dentry *, int);
+extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
 
 /* ialloc.c */
 extern struct inode * ext2_new_inode (struct inode *, int);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 45ed07122182..2b9e47dc9222 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -55,7 +55,7 @@ const struct file_operations ext2_file_operations = {
 	.mmap		= generic_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
-	.fsync		= ext2_sync_file,
+	.fsync		= simple_fsync,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 };
@@ -72,7 +72,7 @@ const struct file_operations ext2_xip_file_operations = {
 	.mmap		= xip_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
-	.fsync		= ext2_sync_file,
+	.fsync		= simple_fsync,
 };
 #endif
 
diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c
deleted file mode 100644
index fc66c93fcb5c..000000000000
--- a/fs/ext2/fsync.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * linux/fs/ext2/fsync.c
- *
- * Copyright (C) 1993 Stephen Tweedie (sct@dcs.ed.ac.uk)
- * from
- * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext2fs fsync primitive
- *
- * Big-endian to little-endian byte-swapping/bitmaps by
- * David S. Miller (davem@caip.rutgers.edu), 1995
- *
- * Removed unnecessary code duplication for little endian machines
- * and excessive __inline__s.
- * Andi Kleen, 1997
- *
- * Major simplications and cleanup - we only need to do the metadata, because
- * we can depend on generic_block_fdatasync() to sync the data blocks.
- */
-
-#include "ext2.h"
-#include <linux/buffer_head.h>	/* for sync_mapping_buffers() */
-
-
-/*
- * File may be NULL when we are called. Perhaps we shouldn't
- * even pass file to fsync ?
- */
-
-int ext2_sync_file(struct file *file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-	int ret;
-
-	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return ret;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
-
-	err = ext2_sync_inode(inode);
-	if (ret == 0)
-		ret = err;
-	return ret;
-}
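Deleting fs/ext2/fsync.c is safe because the body removed above is essentially what the new generic helper simple_fsync() provides for any plain blockdev filesystem: flush the buffers associated with the mapping, then write the inode back only if (and only as far as) it is dirty. A sketch of that logic, mirroring the deleted code:

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>

/* Sketch of simple_fsync()'s shape, per the code deleted above. */
static int fsync_sketch(struct file *file, struct dentry *dentry, int datasync)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 0,	/* metadata only; data already synced */
	};
	struct inode *inode = dentry->d_inode;
	int err, ret;

	ret = sync_mapping_buffers(inode->i_mapping);
	if (!(inode->i_state & I_DIRTY))
		return ret;
	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
		return ret;

	err = sync_inode(inode, &wbc);
	if (ret == 0)
		ret = err;
	return ret;
}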
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index acf678831103..e27130341d4f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -41,8 +41,6 @@ MODULE_AUTHOR("Remy Card and others");
 MODULE_DESCRIPTION("Second Extended Filesystem");
 MODULE_LICENSE("GPL");
 
-static int ext2_update_inode(struct inode * inode, int do_sync);
-
 /*
  * Test whether an inode is a fast symlink.
  */
@@ -66,7 +64,7 @@ void ext2_delete_inode (struct inode * inode)
 		goto no_delete;
 	EXT2_I(inode)->i_dtime = get_seconds();
 	mark_inode_dirty(inode);
-	ext2_update_inode(inode, inode_needs_sync(inode));
+	ext2_write_inode(inode, inode_needs_sync(inode));
 
 	inode->i_size = 0;
 	if (inode->i_blocks)
@@ -1226,10 +1224,6 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 		return inode;
 
 	ei = EXT2_I(inode);
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
-	ei->i_acl = EXT2_ACL_NOT_CACHED;
-	ei->i_default_acl = EXT2_ACL_NOT_CACHED;
-#endif
 	ei->i_block_alloc_info = NULL;
 
 	raw_inode = ext2_get_inode(inode->i_sb, ino, &bh);
@@ -1337,7 +1331,7 @@ bad_inode:
 	return ERR_PTR(ret);
 }
 
-static int ext2_update_inode(struct inode * inode, int do_sync)
+int ext2_write_inode(struct inode *inode, int do_sync)
 {
 	struct ext2_inode_info *ei = EXT2_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -1442,11 +1436,6 @@ static int ext2_update_inode(struct inode * inode, int do_sync)
 	return err;
 }
 
-int ext2_write_inode(struct inode *inode, int wait)
-{
-	return ext2_update_inode(inode, wait);
-}
-
 int ext2_sync_inode(struct inode *inode)
 {
 	struct writeback_control wbc = {
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 7cb4badef927..e7431309bdca 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -13,7 +13,6 @@
 #include <linux/sched.h>
 #include <linux/compat.h>
 #include <linux/mount.h>
-#include <linux/smp_lock.h>
 #include <asm/current.h>
 #include <asm/uaccess.h>
 
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 90ea17998a73..e1dedb0f7873 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -66,8 +66,16 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
 	inode = NULL;
 	if (ino) {
 		inode = ext2_iget(dir->i_sb, ino);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
+		if (unlikely(IS_ERR(inode))) {
+			if (PTR_ERR(inode) == -ESTALE) {
+				ext2_error(dir->i_sb, __func__,
+						"deleted inode referenced: %lu",
+						ino);
+				return ERR_PTR(-EIO);
+			} else {
+				return ERR_CAST(inode);
+			}
+		}
 	}
 	return d_splice_alias(inode, dentry);
 }
@@ -320,7 +328,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 		if (!new_de)
 			goto out_dir;
 		inode_inc_link_count(old_inode);
-		ext2_set_link(new_dir, new_de, new_page, old_inode);
+		ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
 			drop_nlink(new_inode);
@@ -352,7 +360,8 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 	inode_dec_link_count(old_inode);
 
 	if (dir_de) {
-		ext2_set_link(old_inode, dir_de, dir_page, new_dir);
+		if (old_dir != new_dir)
+			ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
 		inode_dec_link_count(old_dir);
 	}
 	return 0;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 5c4afe652245..1a9ffee47d56 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -42,6 +42,7 @@ static void ext2_sync_super(struct super_block *sb,
 			    struct ext2_super_block *es);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
+static int ext2_sync_fs(struct super_block *sb, int wait);

 void ext2_error (struct super_block * sb, const char * function,
 		const char * fmt, ...)
@@ -114,6 +115,11 @@ static void ext2_put_super (struct super_block * sb)
 	int i;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);

+	lock_kernel();
+
+	if (sb->s_dirt)
+		ext2_write_super(sb);
+
 	ext2_xattr_put_super(sb);
 	if (!(sb->s_flags & MS_RDONLY)) {
 		struct ext2_super_block *es = sbi->s_es;
@@ -135,7 +141,7 @@ static void ext2_put_super (struct super_block * sb)
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);

-	return;
+	unlock_kernel();
 }

 static struct kmem_cache * ext2_inode_cachep;
@@ -146,10 +152,6 @@ static struct inode *ext2_alloc_inode(struct super_block *sb)
 	ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
-	ei->i_acl = EXT2_ACL_NOT_CACHED;
-	ei->i_default_acl = EXT2_ACL_NOT_CACHED;
-#endif
 	ei->i_block_alloc_info = NULL;
 	ei->vfs_inode.i_version = 1;
 	return &ei->vfs_inode;
@@ -192,18 +194,6 @@ static void destroy_inodecache(void)
 static void ext2_clear_inode(struct inode *inode)
 {
 	struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info;
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
-	struct ext2_inode_info *ei = EXT2_I(inode);
-
-	if (ei->i_acl && ei->i_acl != EXT2_ACL_NOT_CACHED) {
-		posix_acl_release(ei->i_acl);
-		ei->i_acl = EXT2_ACL_NOT_CACHED;
-	}
-	if (ei->i_default_acl && ei->i_default_acl != EXT2_ACL_NOT_CACHED) {
-		posix_acl_release(ei->i_default_acl);
-		ei->i_default_acl = EXT2_ACL_NOT_CACHED;
-	}
-#endif
 	ext2_discard_reservation(inode);
 	EXT2_I(inode)->i_block_alloc_info = NULL;
 	if (unlikely(rsv))
@@ -304,6 +294,7 @@ static const struct super_operations ext2_sops = {
 	.delete_inode	= ext2_delete_inode,
 	.put_super	= ext2_put_super,
 	.write_super	= ext2_write_super,
+	.sync_fs	= ext2_sync_fs,
 	.statfs		= ext2_statfs,
 	.remount_fs	= ext2_remount,
 	.clear_inode	= ext2_clear_inode,
@@ -1093,6 +1084,7 @@ failed_mount:
 	brelse(bh);
 failed_sbi:
 	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	return ret;
 }
@@ -1126,25 +1118,36 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
  * set s_state to EXT2_VALID_FS after some corrections.
  */

-void ext2_write_super (struct super_block * sb)
+static int ext2_sync_fs(struct super_block *sb, int wait)
 {
-	struct ext2_super_block * es;
+	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
+
 	lock_kernel();
-	if (!(sb->s_flags & MS_RDONLY)) {
-		es = EXT2_SB(sb)->s_es;
-
-		if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
-			ext2_debug ("setting valid to 0\n");
-			es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
-			es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
-			es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
-			es->s_mtime = cpu_to_le32(get_seconds());
-			ext2_sync_super(sb, es);
-		} else
-			ext2_commit_super (sb, es);
+	if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
+		ext2_debug("setting valid to 0\n");
+		es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
+		es->s_free_blocks_count =
+			cpu_to_le32(ext2_count_free_blocks(sb));
+		es->s_free_inodes_count =
+			cpu_to_le32(ext2_count_free_inodes(sb));
+		es->s_mtime = cpu_to_le32(get_seconds());
+		ext2_sync_super(sb, es);
+	} else {
+		ext2_commit_super(sb, es);
 	}
 	sb->s_dirt = 0;
 	unlock_kernel();
+
+	return 0;
+}
+
+
+void ext2_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		ext2_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }

 static int ext2_remount (struct super_block * sb, int * flags, char * data)
@@ -1156,6 +1159,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	unsigned long old_sb_flags;
 	int err;

+	lock_kernel();
+
 	/* Store the old options */
 	old_sb_flags = sb->s_flags;
 	old_opts.s_mount_opt = sbi->s_mount_opt;
@@ -1191,12 +1196,16 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
 		sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
 	}
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+		unlock_kernel();
 		return 0;
+	}
 	if (*flags & MS_RDONLY) {
 		if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
-		    !(sbi->s_mount_state & EXT2_VALID_FS))
+		    !(sbi->s_mount_state & EXT2_VALID_FS)) {
+			unlock_kernel();
 			return 0;
+		}
 		/*
 		 * OK, we are remounting a valid rw partition rdonly, so set
 		 * the rdonly flag and then mark the partition as valid again.
@@ -1223,12 +1232,14 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		sb->s_flags &= ~MS_RDONLY;
 	}
 	ext2_sync_super(sb, es);
+	unlock_kernel();
 	return 0;
 restore_opts:
 	sbi->s_mount_opt = old_opts.s_mount_opt;
 	sbi->s_resuid = old_opts.s_resuid;
 	sbi->s_resgid = old_opts.s_resgid;
 	sb->s_flags = old_sb_flags;
+	unlock_kernel();
 	return err;
 }

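The ext2/super.c hunks above split the old write_super into a proper sync_fs method plus a thin wrapper. As a rough orientation, the sketch below approximates how the 2.6.31-era VFS drives the two operations; the example_* helpers are invented names, and the bodies paraphrase fs/super.c and fs/sync.c of that period rather than quoting them.

#include <linux/fs.h>

/* Sketch only: pdflush-style periodic writeback, gated on sb->s_dirt. */
static void example_periodic_writeback(struct super_block *sb)
{
	if (sb->s_dirt && sb->s_op->write_super)
		sb->s_op->write_super(sb);	/* ext2: ext2_write_super() */
}

/* Sketch only: sync(2)/umount path, which may wait for completion. */
static int example_sync_filesystem(struct super_block *sb)
{
	if (sb->s_op->sync_fs)
		return sb->s_op->sync_fs(sb, 1);	/* ext2: ext2_sync_fs() */
	return 0;
}

With this split, the periodic path and the explicit-sync path funnel into the same ext2_sync_fs(), which is why the hunk can reduce write_super to a two-line wrapper.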
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index fb3c1a21b135..522b15498f45 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -29,23 +29,25 @@ config EXT3_FS
 	  module will be called ext3.

 config EXT3_DEFAULTS_TO_ORDERED
-	bool "Default to 'data=ordered' in ext3 (legacy option)"
+	bool "Default to 'data=ordered' in ext3"
 	depends on EXT3_FS
 	help
-	  If a filesystem does not explicitly specify a data ordering
-	  mode, and the journal capability allowed it, ext3 used to
-	  historically default to 'data=ordered'.
-
-	  That was a rather unfortunate choice, because it leads to all
-	  kinds of latency problems, and the 'data=writeback' mode is more
-	  appropriate these days.
-
-	  You should probably always answer 'n' here, and if you really
-	  want to use 'data=ordered' mode, set it in the filesystem itself
-	  with 'tune2fs -o journal_data_ordered'.
-
-	  But if you really want to enable the legacy default, you can do
-	  so by answering 'y' to this question.
+	  The journal mode options for ext3 have different tradeoffs
+	  between when data is guaranteed to be on disk and
+	  performance.  The use of "data=writeback" can cause
+	  unwritten data to appear in files after a system crash or
+	  power failure, which can be a security issue.  However,
+	  "data=ordered" mode can also result in major performance
+	  problems, including seconds-long delays before an fsync()
+	  call returns.  For details, see:
+
+	  http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs
+
+	  If you have been historically happy with ext3's performance,
+	  data=ordered mode will be a safe choice and you should
+	  answer 'y' here.  If you understand the reliability and data
+	  privacy issues of data=writeback and are willing to make
+	  that trade off, answer 'n'.

 config EXT3_FS_XATTR
 	bool "Ext3 extended attributes"
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index d81ef2fdb08e..e167bae37ef0 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -126,30 +126,6 @@ fail:
 	return ERR_PTR(-EINVAL);
 }

-static inline struct posix_acl *
-ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-	struct posix_acl *acl = EXT3_ACL_NOT_CACHED;
-
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT3_ACL_NOT_CACHED)
-		acl = posix_acl_dup(*i_acl);
-	spin_unlock(&inode->i_lock);
-
-	return acl;
-}
-
-static inline void
-ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
-	      struct posix_acl *acl)
-{
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT3_ACL_NOT_CACHED)
-		posix_acl_release(*i_acl);
-	*i_acl = posix_acl_dup(acl);
-	spin_unlock(&inode->i_lock);
-}
-
 /*
  * Inode operation get_posix_acl().
  *
@@ -158,7 +134,6 @@ ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
 static struct posix_acl *
 ext3_get_acl(struct inode *inode, int type)
 {
-	struct ext3_inode_info *ei = EXT3_I(inode);
 	int name_index;
 	char *value = NULL;
 	struct posix_acl *acl;
@@ -167,24 +142,21 @@ ext3_get_acl(struct inode *inode, int type)
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return NULL;

-	switch(type) {
-	case ACL_TYPE_ACCESS:
-		acl = ext3_iget_acl(inode, &ei->i_acl);
-		if (acl != EXT3_ACL_NOT_CACHED)
-			return acl;
-		name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
-		break;
-
-	case ACL_TYPE_DEFAULT:
-		acl = ext3_iget_acl(inode, &ei->i_default_acl);
-		if (acl != EXT3_ACL_NOT_CACHED)
-			return acl;
-		name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
-		break;
-
-	default:
-		return ERR_PTR(-EINVAL);
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		BUG();
 	}
+
 	retval = ext3_xattr_get(inode, name_index, "", NULL, 0);
 	if (retval > 0) {
 		value = kmalloc(retval, GFP_NOFS);
@@ -200,17 +172,9 @@ ext3_get_acl(struct inode *inode, int type)
 		acl = ERR_PTR(retval);
 	kfree(value);

-	if (!IS_ERR(acl)) {
-		switch(type) {
-		case ACL_TYPE_ACCESS:
-			ext3_iset_acl(inode, &ei->i_acl, acl);
-			break;
-
-		case ACL_TYPE_DEFAULT:
-			ext3_iset_acl(inode, &ei->i_default_acl, acl);
-			break;
-		}
-	}
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+
 	return acl;
 }

@@ -223,7 +187,6 @@ static int
 ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 	     struct posix_acl *acl)
 {
-	struct ext3_inode_info *ei = EXT3_I(inode);
 	int name_index;
 	void *value = NULL;
 	size_t size = 0;
@@ -268,17 +231,10 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 				      value, size, 0);

 	kfree(value);
-	if (!error) {
-		switch(type) {
-		case ACL_TYPE_ACCESS:
-			ext3_iset_acl(inode, &ei->i_acl, acl);
-			break;
-
-		case ACL_TYPE_DEFAULT:
-			ext3_iset_acl(inode, &ei->i_default_acl, acl);
-			break;
-		}
-	}
+
+	if (!error)
+		set_cached_acl(inode, type, acl);
+
 	return error;
 }

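The acl.c hunks drop the filesystem-private EXT3_ACL_NOT_CACHED sentinel in favour of the generic get_cached_acl()/set_cached_acl() helpers. Those helpers are real (they cache ACLs in the generic inode->i_acl and inode->i_default_acl fields introduced alongside this series), but the body below is only an approximation of their behaviour for orientation, not a verbatim copy of include/linux/posix_acl.h.

#include <linux/fs.h>
#include <linux/posix_acl.h>

/* Approximate behaviour of get_cached_acl(); sketch, not kernel source. */
static struct posix_acl *sketch_get_cached_acl(struct inode *inode, int type)
{
	struct posix_acl **p = (type == ACL_TYPE_ACCESS) ?
				&inode->i_acl : &inode->i_default_acl;
	struct posix_acl *acl = ACL_NOT_CACHED;

	spin_lock(&inode->i_lock);
	if (*p != ACL_NOT_CACHED)
		acl = posix_acl_dup(*p);	/* take a reference */
	spin_unlock(&inode->i_lock);
	return acl;
}

The net effect is that the lock/dup/release dance deleted above now lives in one shared place instead of being cloned into ext2, ext3, and ext4.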
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 42da16b8cac0..07d15a3a5969 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -53,10 +53,6 @@ static inline int ext3_acl_count(size_t size)

 #ifdef CONFIG_EXT3_FS_POSIX_ACL

-/* Value for inode->u.ext3_i.i_acl and inode->u.ext3_i.i_default_acl
-   if the ACL has not been cached */
-#define EXT3_ACL_NOT_CACHED ((void *)-1)
-
 /* acl.c */
 extern int ext3_permission (struct inode *, int);
 extern int ext3_acl_chmod (struct inode *);
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 225202db8974..27967f92e820 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -649,7 +649,7 @@ do_more:
 		count = overflow;
 		goto do_more;
 	}
-	sb->s_dirt = 1;
+
 error_return:
 	brelse(bitmap_bh);
 	ext3_std_error(sb, err);
@@ -1708,7 +1708,6 @@ allocated:
 	if (!fatal)
 		fatal = err;

-	sb->s_dirt = 1;
 	if (fatal)
 		goto out;

diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 3d724a95882f..373fa90c796a 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp,
 		struct buffer_head *bh = NULL;

 		map_bh.b_state = 0;
-		err = ext3_get_blocks_handle(NULL, inode, blk, 1,
-					&map_bh, 0, 0);
+		err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index dd13d60d524b..b39991285136 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -181,7 +181,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
 	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
 	if (!fatal)
 		fatal = err;
-	sb->s_dirt = 1;
+
 error_return:
 	brelse(bitmap_bh);
 	ext3_std_error(sb, fatal);
@@ -537,7 +537,6 @@ got:
 	percpu_counter_dec(&sbi->s_freeinodes_counter);
 	if (S_ISDIR(mode))
 		percpu_counter_inc(&sbi->s_dirs_counter);
-	sb->s_dirt = 1;

 	inode->i_uid = current_fsuid();
 	if (test_opt (sb, GRPID))
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index fcfa24361856..b49908a167ae 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -788,7 +788,7 @@ err_out:
 int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 		sector_t iblock, unsigned long maxblocks,
 		struct buffer_head *bh_result,
-		int create, int extend_disksize)
+		int create)
 {
 	int err = -EIO;
 	int offsets[4];
@@ -820,7 +820,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	while (count < maxblocks && count <= blocks_to_boundary) {
 		ext3_fsblk_t blk;

-		if (!verify_chain(chain, partial)) {
+		if (!verify_chain(chain, chain + depth - 1)) {
 			/*
 			 * Indirect block might be removed by
 			 * truncate while we were reading it.
@@ -911,13 +911,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	if (!err)
 		err = ext3_splice_branch(handle, inode, iblock,
 					partial, indirect_blks, count);
-	/*
-	 * i_disksize growing is protected by truncate_mutex. Don't forget to
-	 * protect it if you're about to implement concurrent
-	 * ext3_get_block() -bzzz
-	 */
-	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
-		ei->i_disksize = inode->i_size;
 	mutex_unlock(&ei->truncate_mutex);
 	if (err)
 		goto cleanup;
@@ -972,7 +965,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock,
 	}

 	ret = ext3_get_blocks_handle(handle, inode, iblock,
-					max_blocks, bh_result, create, 0);
+					max_blocks, bh_result, create);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
@@ -1005,7 +998,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
 	err = ext3_get_blocks_handle(handle, inode, block, 1,
-					&dummy, create, 1);
+					&dummy, create);
 	/*
 	 * ext3_get_blocks_handle() returns number of blocks
 	 * mapped. 0 in case of a HOLE.
@@ -1193,15 +1186,16 @@ write_begin_failed:
 		 * i_size_read because we hold i_mutex.
 		 *
 		 * Add inode to orphan list in case we crash before truncate
-		 * finishes.
+		 * finishes. Do this only if ext3_can_truncate() agrees so
+		 * that orphan processing code is happy.
 		 */
-		if (pos + len > inode->i_size)
+		if (pos + len > inode->i_size && ext3_can_truncate(inode))
 			ext3_orphan_add(handle, inode);
 		ext3_journal_stop(handle);
 		unlock_page(page);
 		page_cache_release(page);
 		if (pos + len > inode->i_size)
-			vmtruncate(inode, inode->i_size);
+			ext3_truncate(inode);
 	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
@@ -1287,7 +1281,7 @@ static int ext3_ordered_write_end(struct file *file,
 	 * There may be allocated blocks outside of i_size because
 	 * we failed to copy some data. Prepare for truncate.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	ret2 = ext3_journal_stop(handle);
 	if (!ret)
@@ -1296,7 +1290,7 @@ static int ext3_ordered_write_end(struct file *file,
 	page_cache_release(page);

 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	return ret ? ret : copied;
 }

@@ -1315,14 +1309,14 @@ static int ext3_writeback_write_end(struct file *file,
 	 * There may be allocated blocks outside of i_size because
 	 * we failed to copy some data. Prepare for truncate.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	ret = ext3_journal_stop(handle);
 	unlock_page(page);
 	page_cache_release(page);

 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	return ret ? ret : copied;
 }

@@ -1358,7 +1352,7 @@ static int ext3_journalled_write_end(struct file *file,
 	 * There may be allocated blocks outside of i_size because
 	 * we failed to copy some data. Prepare for truncate.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
 	if (inode->i_size > EXT3_I(inode)->i_disksize) {
@@ -1375,7 +1369,7 @@ static int ext3_journalled_write_end(struct file *file,
 	page_cache_release(page);

 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	return ret ? ret : copied;
 }

@@ -2374,7 +2368,7 @@ void ext3_truncate(struct inode *inode)
 	struct page *page;

 	if (!ext3_can_truncate(inode))
-		return;
+		goto out_notrans;

 	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
 		ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
@@ -2390,7 +2384,7 @@ void ext3_truncate(struct inode *inode)
 		page = grab_cache_page(mapping,
 				inode->i_size >> PAGE_CACHE_SHIFT);
 		if (!page)
-			return;
+			goto out_notrans;
 	}

 	handle = start_transaction(inode);
@@ -2401,7 +2395,7 @@ void ext3_truncate(struct inode *inode)
 			unlock_page(page);
 			page_cache_release(page);
 		}
-		return;		/* AKPM: return what? */
+		goto out_notrans;
 	}

 	last_block = (inode->i_size + blocksize-1)
@@ -2525,6 +2519,14 @@ out_stop:
 		ext3_orphan_del(handle, inode);

 	ext3_journal_stop(handle);
+	return;
+out_notrans:
+	/*
+	 * Delete the inode from orphan list so that it doesn't stay there
+	 * forever and trigger assertion on umount.
+	 */
+	if (inode->i_nlink)
+		ext3_orphan_del(NULL, inode);
 }

 static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
@@ -2744,10 +2746,6 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
 		return inode;

 	ei = EXT3_I(inode);
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	ei->i_acl = EXT3_ACL_NOT_CACHED;
-	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
-#endif
 	ei->i_block_alloc_info = NULL;

 	ret = __ext3_get_inode_loc(inode, &iloc, 0);
@@ -2960,7 +2958,6 @@ static int ext3_do_update_inode(handle_t *handle,
 			ext3_update_dynamic_rev(sb);
 			EXT3_SET_RO_COMPAT_FEATURE(sb,
 					EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
-			sb->s_dirt = 1;
 			handle->h_sync = 1;
 			err = ext3_journal_dirty_metadata(handle,
 					EXT3_SB(sb)->s_sbh);
@@ -3123,12 +3120,6 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)

 	rc = inode_setattr(inode, attr);

-	/* If inode_setattr's call to ext3_truncate failed to get a
-	 * transaction handle at all, we need to clean up the in-core
-	 * orphan list manually. */
-	if (inode->i_nlink)
-		ext3_orphan_del(NULL, inode);
-
 	if (!rc && (ia_valid & ATTR_MODE))
 		rc = ext3_acl_chmod(inode);

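Every write_end path above now asks ext3_can_truncate() before putting the inode on the orphan list. For reference, the predicate is roughly the following (a sketch of the helper in fs/ext3/inode.c; the sketch_ prefix marks it as an illustration rather than the committed source):

#include <linux/fs.h>

/* Sketch: truncate is only meaningful for these inode types/flags. */
static int sketch_ext3_can_truncate(struct inode *inode)
{
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return 0;
	if (S_ISREG(inode->i_mode))
		return 1;
	if (S_ISDIR(inode->i_mode))
		return 1;
	if (S_ISLNK(inode->i_mode))
		return !ext3_inode_is_fast_symlink(inode);
	return 0;
}

That is also why ext3_truncate() itself gained the out_notrans label: if the function bails out early, an inode that was orphan-listed by write_begin must still be removed from the in-core orphan list, which the deleted ext3_setattr() cleanup used to paper over.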
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 78fdf3836370..8359e7b3dc89 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -934,7 +934,6 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
 			   EXT3_INODES_PER_GROUP(sb));

 	ext3_journal_dirty_metadata(handle, sbi->s_sbh);
-	sb->s_dirt = 1;

 exit_journal:
 	unlock_super(sb);
@@ -991,7 +990,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 			sb->s_id, n_blocks_count);
 		if (sizeof(sector_t) < 8)
 			ext3_warning(sb, __func__,
-			"CONFIG_LBD not enabled\n");
+			"CONFIG_LBDAF not enabled\n");
 		return -EINVAL;
 	}

@@ -1066,7 +1065,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	}
 	es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
 	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-	sb->s_dirt = 1;
 	unlock_super(sb);
 	ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
 		   o_blocks_count + add);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 599dbfe504c3..a8d80a7f1105 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -67,7 +67,6 @@ static const char *ext3_decode_error(struct super_block * sb, int errno,
 static int ext3_remount (struct super_block * sb, int * flags, char * data);
 static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext3_unfreeze(struct super_block *sb);
-static void ext3_write_super (struct super_block * sb);
 static int ext3_freeze(struct super_block *sb);

 /*
@@ -399,6 +398,8 @@ static void ext3_put_super (struct super_block * sb)
 	struct ext3_super_block *es = sbi->s_es;
 	int i, err;

+	lock_kernel();
+
 	ext3_xattr_put_super(sb);
 	err = journal_destroy(sbi->s_journal);
 	sbi->s_journal = NULL;
@@ -447,7 +448,8 @@ static void ext3_put_super (struct super_block * sb)
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-	return;
+
+	unlock_kernel();
 }

 static struct kmem_cache *ext3_inode_cachep;
@@ -462,10 +464,6 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	ei->i_acl = EXT3_ACL_NOT_CACHED;
-	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
-#endif
 	ei->i_block_alloc_info = NULL;
 	ei->vfs_inode.i_version = 1;
 	return &ei->vfs_inode;
@@ -516,18 +514,6 @@ static void destroy_inodecache(void)
 static void ext3_clear_inode(struct inode *inode)
 {
 	struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	if (EXT3_I(inode)->i_acl &&
-	    EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
-		posix_acl_release(EXT3_I(inode)->i_acl);
-		EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED;
-	}
-	if (EXT3_I(inode)->i_default_acl &&
-	    EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) {
-		posix_acl_release(EXT3_I(inode)->i_default_acl);
-		EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
-	}
-#endif
 	ext3_discard_reservation(inode);
 	EXT3_I(inode)->i_block_alloc_info = NULL;
 	if (unlikely(rsv))
@@ -557,6 +543,19 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
 #endif
 }

+static char *data_mode_string(unsigned long mode)
+{
+	switch (mode) {
+	case EXT3_MOUNT_JOURNAL_DATA:
+		return "journal";
+	case EXT3_MOUNT_ORDERED_DATA:
+		return "ordered";
+	case EXT3_MOUNT_WRITEBACK_DATA:
+		return "writeback";
+	}
+	return "unknown";
+}
+
 /*
  * Show an option if
  *  - it's set to a non-default value OR
@@ -630,13 +629,8 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, NOBH))
 		seq_puts(seq, ",nobh");

-	if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
-		seq_puts(seq, ",data=journal");
-	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
-		seq_puts(seq, ",data=ordered");
-	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
-		seq_puts(seq, ",data=writeback");
-
+	seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt &
+						     EXT3_MOUNT_DATA_FLAGS));
 	if (test_opt(sb, DATA_ERR_ABORT))
 		seq_puts(seq, ",data_err=abort");

@@ -761,7 +755,6 @@ static const struct super_operations ext3_sops = {
 	.dirty_inode	= ext3_dirty_inode,
 	.delete_inode	= ext3_delete_inode,
 	.put_super	= ext3_put_super,
-	.write_super	= ext3_write_super,
 	.sync_fs	= ext3_sync_fs,
 	.freeze_fs	= ext3_freeze,
 	.unfreeze_fs	= ext3_unfreeze,
@@ -1039,12 +1032,18 @@ static int parse_options (char *options, struct super_block *sb,
 	datacheck:
 		if (is_remount) {
 			if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
-					!= data_opt) {
-				printk(KERN_ERR
-					"EXT3-fs: cannot change data "
-					"mode on remount\n");
-				return 0;
-			}
+					== data_opt)
+				break;
+			printk(KERN_ERR
+				"EXT3-fs (device %s): Cannot change "
+				"data mode on remount. The filesystem "
+				"is mounted in data=%s mode and you "
+				"try to remount it in data=%s mode.\n",
+				sb->s_id,
+				data_mode_string(sbi->s_mount_opt &
+						 EXT3_MOUNT_DATA_FLAGS),
+				data_mode_string(data_opt));
+			return 0;
 		} else {
 			sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS;
 			sbi->s_mount_opt |= data_opt;
@@ -1696,7 +1695,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}

-	hblock = bdev_hardsect_size(sb->s_bdev);
+	hblock = bdev_logical_block_size(sb->s_bdev);
 	if (sb->s_blocksize != blocksize) {
 		/*
 		 * Make sure the blocksize for the filesystem is larger
@@ -1785,7 +1784,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 #else
 		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
 #endif
-		sb->s_dirt = 1;
 	}

 	if (sbi->s_blocks_per_group > blocksize * 8) {
@@ -1812,7 +1810,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
 			" too large to mount safely\n", sb->s_id);
 		if (sizeof(sector_t) < 8)
-			printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not "
+			printk(KERN_WARNING "EXT3-fs: CONFIG_LBDAF not "
 				"enabled\n");
 		goto failed_mount;
 	}
@@ -2021,6 +2019,7 @@ failed_mount:
 	brelse(bh);
 out_fail:
 	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	lock_kernel();
 	return ret;
@@ -2119,7 +2118,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	}

 	blocksize = sb->s_blocksize;
-	hblock = bdev_hardsect_size(bdev);
+	hblock = bdev_logical_block_size(bdev);
 	if (blocksize < hblock) {
 		printk(KERN_ERR
 			"EXT3-fs: blocksize too small for journal device.\n");
@@ -2264,7 +2263,6 @@ static int ext3_load_journal(struct super_block *sb,
 	if (journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
 		es->s_journal_dev = cpu_to_le32(journal_devnum);
-		sb->s_dirt = 1;

 		/* Make sure we flush the recovery flag to disk. */
 		ext3_commit_super(sb, es, 1);
@@ -2307,7 +2305,6 @@ static int ext3_create_journal(struct super_block * sb,
 	EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);

 	es->s_journal_inum = cpu_to_le32(journal_inum);
-	sb->s_dirt = 1;

 	/* Make sure we flush the recovery flag to disk. */
 	ext3_commit_super(sb, es, 1);
@@ -2353,7 +2350,6 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
 	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
 	    sb->s_flags & MS_RDONLY) {
 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-		sb->s_dirt = 0;
 		ext3_commit_super(sb, es, 1);
 	}
 	unlock_super(sb);
@@ -2412,29 +2408,14 @@ int ext3_force_commit(struct super_block *sb)
 		return 0;

 	journal = EXT3_SB(sb)->s_journal;
-	sb->s_dirt = 0;
 	ret = ext3_journal_force_commit(journal);
 	return ret;
 }

-/*
- * Ext3 always journals updates to the superblock itself, so we don't
- * have to propagate any other updates to the superblock on disk at this
- * point.  (We can probably nuke this function altogether, and remove
- * any mention to sb->s_dirt in all of fs/ext3; eventual cleanup...)
- */
-static void ext3_write_super (struct super_block * sb)
-{
-	if (mutex_trylock(&sb->s_lock) != 0)
-		BUG();
-	sb->s_dirt = 0;
-}
-
 static int ext3_sync_fs(struct super_block *sb, int wait)
 {
 	tid_t target;

-	sb->s_dirt = 0;
 	if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
 		if (wait)
 			log_wait_commit(EXT3_SB(sb)->s_journal, target);
@@ -2450,7 +2431,6 @@ static int ext3_freeze(struct super_block *sb)
 {
 	int error = 0;
 	journal_t *journal;
-	sb->s_dirt = 0;

 	if (!(sb->s_flags & MS_RDONLY)) {
 		journal = EXT3_SB(sb)->s_journal;
@@ -2508,7 +2488,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 	int i;
 #endif

+	lock_kernel();
+
 	/* Store the original options */
+	lock_super(sb);
 	old_sb_flags = sb->s_flags;
 	old_opts.s_mount_opt = sbi->s_mount_opt;
 	old_opts.s_resuid = sbi->s_resuid;
@@ -2616,6 +2599,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
 			kfree(old_opts.s_qf_names[i]);
 #endif
+	unlock_super(sb);
+	unlock_kernel();
 	return 0;
 restore_opts:
 	sb->s_flags = old_sb_flags;
@@ -2632,6 +2617,8 @@ restore_opts:
 		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
 	}
 #endif
+	unlock_super(sb);
+	unlock_kernel();
 	return err;
 }

diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 83b7be849bd5..545e37c4b91e 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -463,7 +463,6 @@ static void ext3_xattr_update_super_block(handle_t *handle,

 	if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
 		EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR);
-		sb->s_dirt = 1;
 		ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	}
 }
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index a8ff003a00f7..8867b2a1e5fe 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -5,8 +5,8 @@
 obj-$(CONFIG_EXT4_FS) += ext4.o

 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		ext4_jbd2.o migrate.o mballoc.o
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o

 ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 647e0d65a284..f6d8967149ca 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -126,30 +126,6 @@ fail:
 	return ERR_PTR(-EINVAL);
 }

-static inline struct posix_acl *
-ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-	struct posix_acl *acl = EXT4_ACL_NOT_CACHED;
-
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT4_ACL_NOT_CACHED)
-		acl = posix_acl_dup(*i_acl);
-	spin_unlock(&inode->i_lock);
-
-	return acl;
-}
-
-static inline void
-ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl,
-	      struct posix_acl *acl)
-{
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT4_ACL_NOT_CACHED)
-		posix_acl_release(*i_acl);
-	*i_acl = posix_acl_dup(acl);
-	spin_unlock(&inode->i_lock);
-}
-
 /*
  * Inode operation get_posix_acl().
  *
@@ -158,7 +134,6 @@ ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl,
 static struct posix_acl *
 ext4_get_acl(struct inode *inode, int type)
 {
-	struct ext4_inode_info *ei = EXT4_I(inode);
 	int name_index;
 	char *value = NULL;
 	struct posix_acl *acl;
@@ -167,23 +142,19 @@ ext4_get_acl(struct inode *inode, int type)
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return NULL;

+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		acl = ext4_iget_acl(inode, &ei->i_acl);
-		if (acl != EXT4_ACL_NOT_CACHED)
-			return acl;
 		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
 		break;
-
 	case ACL_TYPE_DEFAULT:
-		acl = ext4_iget_acl(inode, &ei->i_default_acl);
-		if (acl != EXT4_ACL_NOT_CACHED)
-			return acl;
 		name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
 		break;
-
 	default:
-		return ERR_PTR(-EINVAL);
+		BUG();
 	}
 	retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
 	if (retval > 0) {
@@ -200,17 +171,9 @@ ext4_get_acl(struct inode *inode, int type)
 		acl = ERR_PTR(retval);
 	kfree(value);

-	if (!IS_ERR(acl)) {
-		switch (type) {
-		case ACL_TYPE_ACCESS:
-			ext4_iset_acl(inode, &ei->i_acl, acl);
-			break;
-
-		case ACL_TYPE_DEFAULT:
-			ext4_iset_acl(inode, &ei->i_default_acl, acl);
-			break;
-		}
-	}
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+
 	return acl;
 }

@@ -223,7 +186,6 @@ static int
 ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	     struct posix_acl *acl)
 {
-	struct ext4_inode_info *ei = EXT4_I(inode);
 	int name_index;
 	void *value = NULL;
 	size_t size = 0;
@@ -268,17 +230,9 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 				      value, size, 0);

 	kfree(value);
-	if (!error) {
-		switch (type) {
-		case ACL_TYPE_ACCESS:
-			ext4_iset_acl(inode, &ei->i_acl, acl);
-			break;
-
-		case ACL_TYPE_DEFAULT:
-			ext4_iset_acl(inode, &ei->i_default_acl, acl);
-			break;
-		}
-	}
+	if (!error)
+		set_cached_acl(inode, type, acl);
+
 	return error;
 }

diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index cb45257a246e..949789d2bba6 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -53,10 +53,6 @@ static inline int ext4_acl_count(size_t size)

 #ifdef CONFIG_EXT4_FS_POSIX_ACL

-/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
-   if the ACL has not been cached */
-#define EXT4_ACL_NOT_CACHED ((void *)-1)
-
 /* acl.c */
 extern int ext4_permission(struct inode *, int);
 extern int ext4_acl_chmod(struct inode *);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 53c72ad85877..e2126d70dff5 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -19,7 +19,6 @@
 #include <linux/buffer_head.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
-#include "group.h"
 #include "mballoc.h"

 /*
@@ -88,6 +87,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 ext4_group_t block_group, struct ext4_group_desc *gdp)
 {
 	int bit, bit_max;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	unsigned free_blocks, group_blocks;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);

@@ -123,7 +123,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		bit_max += ext4_bg_num_gdb(sb, block_group);
 	}

-	if (block_group == sbi->s_groups_count - 1) {
+	if (block_group == ngroups - 1) {
 		/*
 		 * Even though mke2fs always initialize first and last group
 		 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
@@ -131,7 +131,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 */
 		group_blocks = ext4_blocks_count(sbi->s_es) -
 			le32_to_cpu(sbi->s_es->s_first_data_block) -
-			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
+			(EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
 	} else {
 		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
 	}
@@ -205,18 +205,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 {
 	unsigned int group_desc;
 	unsigned int offset;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);

-	if (block_group >= sbi->s_groups_count) {
+	if (block_group >= ngroups) {
 		ext4_error(sb, "ext4_get_group_desc",
 			   "block_group >= groups_count - "
 			   "block_group = %u, groups_count = %u",
-			   block_group, sbi->s_groups_count);
+			   block_group, ngroups);

 		return NULL;
 	}
-	smp_rmb();

 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
@@ -326,16 +326,16 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 		unlock_buffer(bh);
 		return bh;
 	}
-	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	ext4_lock_group(sb, block_group);
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		ext4_init_block_bitmap(sb, bh, block_group, desc);
 		set_bitmap_uptodate(bh);
 		set_buffer_uptodate(bh);
-		spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+		ext4_unlock_group(sb, block_group);
 		unlock_buffer(bh);
 		return bh;
 	}
-	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	ext4_unlock_group(sb, block_group);
 	if (buffer_uptodate(bh)) {
 		/*
 		 * if not uninit if bh is uptodate,
@@ -451,7 +451,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	down_write(&grp->alloc_sem);
 	for (i = 0, blocks_freed = 0; i < count; i++) {
 		BUFFER_TRACE(bitmap_bh, "clear bit");
-		if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+		if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
 						bit + i, bitmap_bh->b_data)) {
 			ext4_error(sb, __func__,
 				   "bit already cleared for block %llu",
@@ -461,11 +461,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 			blocks_freed++;
 		}
 	}
-	spin_lock(sb_bgl_lock(sbi, block_group));
+	ext4_lock_group(sb, block_group);
 	blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
 	ext4_free_blks_set(sb, desc, blk_free_count);
 	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
-	spin_unlock(sb_bgl_lock(sbi, block_group));
+	ext4_unlock_group(sb, block_group);
 	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);

 	if (sbi->s_log_groups_per_flex) {
@@ -665,7 +665,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 	ext4_fsblk_t desc_count;
 	struct ext4_group_desc *gdp;
 	ext4_group_t i;
-	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 #ifdef EXT4FS_DEBUG
 	struct ext4_super_block *es;
 	ext4_fsblk_t bitmap_count;
@@ -677,7 +677,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 	bitmap_count = 0;
 	gdp = NULL;

-	smp_rmb();
 	for (i = 0; i < ngroups; i++) {
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
@@ -700,7 +699,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 		return bitmap_count;
 #else
 	desc_count = 0;
-	smp_rmb();
 	for (i = 0; i < ngroups; i++) {
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
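The balloc.c hunks replace direct reads of sbi->s_groups_count (plus the scattered smp_rmb() calls) with ext4_get_groups_count(). The helper is defined in fs/ext4/ext4.h in this series; the version below is a close paraphrase for orientation and may differ cosmetically from the committed one.

#include "ext4.h"

/* Paraphrased helper: one ordered read of the (resizable) group count. */
static inline ext4_group_t sketch_ext4_get_groups_count(struct super_block *sb)
{
	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;

	smp_rmb();	/* pairs with the barrier taken by online resize */
	return ngroups;
}

Centralizing the barrier means callers can no longer forget it when online resize grows s_groups_count underneath them.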
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
new file mode 100644
index 000000000000..50784ef07563
--- /dev/null
+++ b/fs/ext4/block_validity.c
@@ -0,0 +1,244 @@
1/*
2 * linux/fs/ext4/block_validity.c
3 *
4 * Copyright (C) 2009
5 * Theodore Ts'o (tytso@mit.edu)
6 *
7 * Track which blocks in the filesystem are metadata blocks that
8 * should never be used as data blocks by files or directories.
9 */
10
11#include <linux/time.h>
12#include <linux/fs.h>
13#include <linux/namei.h>
14#include <linux/quotaops.h>
15#include <linux/buffer_head.h>
16#include <linux/module.h>
17#include <linux/swap.h>
18#include <linux/pagemap.h>
19#include <linux/version.h>
20#include <linux/blkdev.h>
21#include <linux/mutex.h>
22#include "ext4.h"
23
24struct ext4_system_zone {
25 struct rb_node node;
26 ext4_fsblk_t start_blk;
27 unsigned int count;
28};
29
30static struct kmem_cache *ext4_system_zone_cachep;
31
32int __init init_ext4_system_zone(void)
33{
34 ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone,
35 SLAB_RECLAIM_ACCOUNT);
36 if (ext4_system_zone_cachep == NULL)
37 return -ENOMEM;
38 return 0;
39}
40
41void exit_ext4_system_zone(void)
42{
43 kmem_cache_destroy(ext4_system_zone_cachep);
44}
45
46static inline int can_merge(struct ext4_system_zone *entry1,
47 struct ext4_system_zone *entry2)
48{
49 if ((entry1->start_blk + entry1->count) == entry2->start_blk)
50 return 1;
51 return 0;
52}
53
54/*
55 * Mark a range of blocks as belonging to the "system zone" --- that
56 * is, filesystem metadata blocks which should never be used by
57 * inodes.
58 */
59static int add_system_zone(struct ext4_sb_info *sbi,
60 ext4_fsblk_t start_blk,
61 unsigned int count)
62{
63 struct ext4_system_zone *new_entry = NULL, *entry;
64 struct rb_node **n = &sbi->system_blks.rb_node, *node;
65 struct rb_node *parent = NULL, *new_node = NULL;
66
67 while (*n) {
68 parent = *n;
69 entry = rb_entry(parent, struct ext4_system_zone, node);
70 if (start_blk < entry->start_blk)
71 n = &(*n)->rb_left;
72 else if (start_blk >= (entry->start_blk + entry->count))
73 n = &(*n)->rb_right;
74 else {
75 if (start_blk + count > (entry->start_blk +
76 entry->count))
77 entry->count = (start_blk + count -
78 entry->start_blk);
79 new_node = *n;
80 new_entry = rb_entry(new_node, struct ext4_system_zone,
81 node);
82 break;
83 }
84 }
85
86 if (!new_entry) {
87 new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
88 GFP_KERNEL);
89 if (!new_entry)
90 return -ENOMEM;
91 new_entry->start_blk = start_blk;
92 new_entry->count = count;
93 new_node = &new_entry->node;
94
95 rb_link_node(new_node, parent, n);
96 rb_insert_color(new_node, &sbi->system_blks);
97 }
98
99 /* Can we merge to the left? */
100 node = rb_prev(new_node);
101 if (node) {
102 entry = rb_entry(node, struct ext4_system_zone, node);
103 if (can_merge(entry, new_entry)) {
104 new_entry->start_blk = entry->start_blk;
105 new_entry->count += entry->count;
106 rb_erase(node, &sbi->system_blks);
107 kmem_cache_free(ext4_system_zone_cachep, entry);
108 }
109 }
110
111 /* Can we merge to the right? */
112 node = rb_next(new_node);
113 if (node) {
114 entry = rb_entry(node, struct ext4_system_zone, node);
115 if (can_merge(new_entry, entry)) {
116 new_entry->count += entry->count;
117 rb_erase(node, &sbi->system_blks);
118 kmem_cache_free(ext4_system_zone_cachep, entry);
119 }
120 }
121 return 0;
122}
123
124static void debug_print_tree(struct ext4_sb_info *sbi)
125{
126 struct rb_node *node;
127 struct ext4_system_zone *entry;
128 int first = 1;
129
130 printk(KERN_INFO "System zones: ");
131 node = rb_first(&sbi->system_blks);
132 while (node) {
133 entry = rb_entry(node, struct ext4_system_zone, node);
134 printk("%s%llu-%llu", first ? "" : ", ",
135 entry->start_blk, entry->start_blk + entry->count - 1);
136 first = 0;
137 node = rb_next(node);
138 }
139 printk("\n");
140}
141
142int ext4_setup_system_zone(struct super_block *sb)
143{
144 ext4_group_t ngroups = ext4_get_groups_count(sb);
145 struct ext4_sb_info *sbi = EXT4_SB(sb);
146 struct ext4_group_desc *gdp;
147 ext4_group_t i;
148 int flex_size = ext4_flex_bg_size(sbi);
149 int ret;
150
151 if (!test_opt(sb, BLOCK_VALIDITY)) {
152 if (EXT4_SB(sb)->system_blks.rb_node)
153 ext4_release_system_zone(sb);
154 return 0;
155 }
156 if (EXT4_SB(sb)->system_blks.rb_node)
157 return 0;
158
159 for (i=0; i < ngroups; i++) {
160 if (ext4_bg_has_super(sb, i) &&
161 ((i < 5) || ((i % flex_size) == 0)))
162 add_system_zone(sbi, ext4_group_first_block_no(sb, i),
163 sbi->s_gdb_count + 1);
164 gdp = ext4_get_group_desc(sb, i, NULL);
165 ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
166 if (ret)
167 return ret;
168 ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
169 if (ret)
170 return ret;
171 ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
172 sbi->s_itb_per_group);
173 if (ret)
174 return ret;
175 }
176
177 if (test_opt(sb, DEBUG))
178 debug_print_tree(EXT4_SB(sb));
179 return 0;
180}
181
182/* Called when the filesystem is unmounted */
183void ext4_release_system_zone(struct super_block *sb)
184{
185 struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node;
186 struct rb_node *parent;
187 struct ext4_system_zone *entry;
188
189 while (n) {
190 /* Do the node's children first */
191 if (n->rb_left) {
192 n = n->rb_left;
193 continue;
194 }
195 if (n->rb_right) {
196 n = n->rb_right;
197 continue;
198 }
199 /*
200 * The node has no children; free it, and then zero
201 * out parent's link to it. Finally go to the
202 * beginning of the loop and try to free the parent
203 * node.
204 */
205 parent = rb_parent(n);
206 entry = rb_entry(n, struct ext4_system_zone, node);
207 kmem_cache_free(ext4_system_zone_cachep, entry);
208 if (!parent)
209 EXT4_SB(sb)->system_blks.rb_node = NULL;
210 else if (parent->rb_left == n)
211 parent->rb_left = NULL;
212 else if (parent->rb_right == n)
213 parent->rb_right = NULL;
214 n = parent;
215 }
216 EXT4_SB(sb)->system_blks.rb_node = NULL;
217}
218
219/*
220 * Returns 1 if the passed-in block region (start_blk,
221 * start_blk+count) is valid; 0 if some part of the block region
222 * overlaps with filesystem metadata blocks.
223 */
224int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
225 unsigned int count)
226{
227 struct ext4_system_zone *entry;
228 struct rb_node *n = sbi->system_blks.rb_node;
229
230 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
231 (start_blk + count > ext4_blocks_count(sbi->s_es)))
232 return 0;
233 while (n) {
234 entry = rb_entry(n, struct ext4_system_zone, node);
235 if (start_blk + count - 1 < entry->start_blk)
236 n = n->rb_left;
237 else if (start_blk >= (entry->start_blk + entry->count))
238 n = n->rb_right;
239 else
240 return 0;
241 }
242 return 1;
243}
244
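/*
 * Editor's usage sketch (hypothetical helper, not part of the patch):
 * block-mapping paths can reject corrupt on-disk block pointers before
 * dereferencing them; the extents.c hunks below convert
 * ext4_valid_extent()/ext4_valid_extent_idx() to do exactly this.
 */
static inline int example_check_pblk(struct inode *inode,
				     ext4_fsblk_t pblk, unsigned int len)
{
	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), pblk, len)) {
		ext4_error(inode->i_sb, __func__, "bad block %llu",
			   (unsigned long long) pblk);
		return -EIO;
	}
	return 0;
}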
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index b64789929a65..9dc93168e262 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -131,8 +131,7 @@ static int ext4_readdir(struct file *filp,
131 struct buffer_head *bh = NULL; 131 struct buffer_head *bh = NULL;
132 132
133 map_bh.b_state = 0; 133 map_bh.b_state = 0;
134 err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 134 err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0);
135 0, 0, 0);
136 if (err > 0) { 135 if (err > 0) {
137 pgoff_t index = map_bh.b_blocknr >> 136 pgoff_t index = map_bh.b_blocknr >>
138 (PAGE_CACHE_SHIFT - inode->i_blkbits); 137 (PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d0f15ef56de1..9714db393efe 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -21,7 +21,14 @@
21#include <linux/magic.h> 21#include <linux/magic.h>
22#include <linux/jbd2.h> 22#include <linux/jbd2.h>
23#include <linux/quota.h> 23#include <linux/quota.h>
24#include "ext4_i.h" 24#include <linux/rwsem.h>
25#include <linux/rbtree.h>
26#include <linux/seqlock.h>
27#include <linux/mutex.h>
28#include <linux/timer.h>
29#include <linux/wait.h>
30#include <linux/blockgroup_lock.h>
31#include <linux/percpu_counter.h>
25 32
26/* 33/*
27 * The fourth extended filesystem constants/structures 34 * The fourth extended filesystem constants/structures
@@ -46,6 +53,19 @@
46#define ext4_debug(f, a...) do {} while (0) 53#define ext4_debug(f, a...) do {} while (0)
47#endif 54#endif
48 55
56/* data type for block offset of block group */
57typedef int ext4_grpblk_t;
58
59/* data type for filesystem-wide blocks number */
60typedef unsigned long long ext4_fsblk_t;
61
62/* data type for file logical block number */
63typedef __u32 ext4_lblk_t;
64
65/* data type for block group number */
66typedef unsigned int ext4_group_t;
67
68
49/* prefer goal again. length */ 69/* prefer goal again. length */
50#define EXT4_MB_HINT_MERGE 1 70#define EXT4_MB_HINT_MERGE 1
51/* blocks already reserved */ 71/* blocks already reserved */
@@ -73,20 +93,20 @@
73struct ext4_allocation_request { 93struct ext4_allocation_request {
74 /* target inode for block we're allocating */ 94 /* target inode for block we're allocating */
75 struct inode *inode; 95 struct inode *inode;
96 /* how many blocks we want to allocate */
97 unsigned int len;
76 /* logical block in target inode */ 98 /* logical block in target inode */
77 ext4_lblk_t logical; 99 ext4_lblk_t logical;
78 /* phys. target (a hint) */
79 ext4_fsblk_t goal;
80 /* the closest logical allocated block to the left */ 100 /* the closest logical allocated block to the left */
81 ext4_lblk_t lleft; 101 ext4_lblk_t lleft;
82 /* phys. block for ^^^ */
83 ext4_fsblk_t pleft;
84 /* the closest logical allocated block to the right */ 102 /* the closest logical allocated block to the right */
85 ext4_lblk_t lright; 103 ext4_lblk_t lright;
86 /* phys. block for ^^^ */ 104 /* phys. target (a hint) */
105 ext4_fsblk_t goal;
106 /* phys. block for the closest logical allocated block to the left */
107 ext4_fsblk_t pleft;
108 /* phys. block for the closest logical allocated block to the right */
87 ext4_fsblk_t pright; 109 ext4_fsblk_t pright;
88 /* how many blocks we want to allocate */
89 unsigned int len;
90 /* flags. see above EXT4_MB_HINT_* */ 110 /* flags. see above EXT4_MB_HINT_* */
91 unsigned int flags; 111 unsigned int flags;
92}; 112};
@@ -179,9 +199,6 @@ struct flex_groups {
179#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ 199#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
180#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ 200#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
181 201
182#ifdef __KERNEL__
183#include "ext4_sb.h"
184#endif
185/* 202/*
186 * Macro-instructions used to manage group descriptors 203 * Macro-instructions used to manage group descriptors
187 */ 204 */
@@ -297,10 +314,23 @@ struct ext4_new_group_data {
297}; 314};
298 315
299/* 316/*
300 * Following is used by preallocation code to tell get_blocks() that we 317 * Flags used by ext4_get_blocks()
301 * want uninitialzed extents.
302 */ 318 */
303#define EXT4_CREATE_UNINITIALIZED_EXT 2 319 /* Allocate any needed blocks and/or convert an uninitialized
 320 extent into an initialized one */
321#define EXT4_GET_BLOCKS_CREATE 0x0001
 322 /* Request the creation of an uninitialized extent */
323#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002
324#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
325 EXT4_GET_BLOCKS_CREATE)
326 /* Caller is from the delayed allocation writeout path,
327 so set the magic i_delalloc_reserve_flag after taking the
 328 inode allocation semaphore */
329#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
330 /* Call ext4_da_update_reserve_space() after successfully
331 allocating the blocks */
332#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
333
304 334
305/* 335/*
306 * ioctl commands 336 * ioctl commands
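/*
 * Editor's note (illustrative, not part of the patch): the old
 * create/extend_disksize/flag argument triple collapses into this one
 * bitmask.  A fallocate-style caller that wants an unwritten extent now
 * passes:
 *
 *	ret = ext4_get_blocks(handle, inode, block, max_blocks, &map_bh,
 *			      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
 *
 * a pure lookup passes 0, and the delalloc writeout path ORs in
 * EXT4_GET_BLOCKS_DELALLOC_RESERVE.
 */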
@@ -322,6 +352,7 @@ struct ext4_new_group_data {
322 /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ 352 /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
323 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ 353 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
324#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 354#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
355#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
325 356
326/* 357/*
327 * ioctl commands in 32 bit emulation 358 * ioctl commands in 32 bit emulation
@@ -417,6 +448,15 @@ struct ext4_inode {
417 __le32 i_version_hi; /* high 32 bits for 64-bit version */ 448 __le32 i_version_hi; /* high 32 bits for 64-bit version */
418}; 449};
419 450
451struct move_extent {
452 __u32 reserved; /* should be zero */
453 __u32 donor_fd; /* donor file descriptor */
454 __u64 orig_start; /* logical start offset in block for orig */
455 __u64 donor_start; /* logical start offset in block for donor */
456 __u64 len; /* block length to be moved */
457 __u64 moved_len; /* moved block length */
458};
459#define MAX_DEFRAG_SIZE ((1UL<<31) - 1)
420 460
421#define EXT4_EPOCH_BITS 2 461#define EXT4_EPOCH_BITS 2
422#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) 462#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
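/*
 * Editor's userspace sketch for the new ioctl (hypothetical helper, not
 * part of the patch; assumes struct move_extent and EXT4_IOC_MOVE_EXT
 * are visible to the build, e.g. copied from this header).  An online
 * defragmenter opens the fragmented file plus a preallocated donor file
 * and asks the kernel to swap in the donor's contiguous blocks:
 */
static int defrag_example(int orig_fd, int donor_fd, __u64 blocks_to_move)
{
	struct move_extent me = {
		.donor_fd    = donor_fd,
		.orig_start  = 0,		/* logical offsets, in blocks */
		.donor_start = 0,
		.len         = blocks_to_move,
	};

	if (ioctl(orig_fd, EXT4_IOC_MOVE_EXT, &me) < 0)
		return -1;
	/* me.moved_len reports how many blocks were actually moved */
	return 0;
}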
@@ -516,6 +556,106 @@ do { \
516#endif /* defined(__KERNEL__) || defined(__linux__) */ 556#endif /* defined(__KERNEL__) || defined(__linux__) */
517 557
518/* 558/*
559 * storage for cached extent
560 */
561struct ext4_ext_cache {
562 ext4_fsblk_t ec_start;
563 ext4_lblk_t ec_block;
564 __u32 ec_len; /* must be 32bit to return holes */
565 __u32 ec_type;
566};
567
568/*
569 * fourth extended file system inode data in memory
570 */
571struct ext4_inode_info {
572 __le32 i_data[15]; /* unconverted */
573 __u32 i_flags;
574 ext4_fsblk_t i_file_acl;
575 __u32 i_dtime;
576
577 /*
578 * i_block_group is the number of the block group which contains
579 * this file's inode. Constant across the lifetime of the inode,
 580 * it is used for making block allocation decisions - we try to
581 * place a file's data blocks near its inode block, and new inodes
582 * near to their parent directory's inode.
583 */
584 ext4_group_t i_block_group;
585 __u32 i_state; /* Dynamic state flags for ext4 */
586
587 ext4_lblk_t i_dir_start_lookup;
588#ifdef CONFIG_EXT4_FS_XATTR
589 /*
590 * Extended attributes can be read independently of the main file
591 * data. Taking i_mutex even when reading would cause contention
592 * between readers of EAs and writers of regular file data, so
593 * instead we synchronize on xattr_sem when reading or changing
594 * EAs.
595 */
596 struct rw_semaphore xattr_sem;
597#endif
598
599 struct list_head i_orphan; /* unlinked but open inodes */
600
601 /*
602 * i_disksize keeps track of what the inode size is ON DISK, not
603 * in memory. During truncate, i_size is set to the new size by
604 * the VFS prior to calling ext4_truncate(), but the filesystem won't
605 * set i_disksize to 0 until the truncate is actually under way.
606 *
607 * The intent is that i_disksize always represents the blocks which
608 * are used by this file. This allows recovery to restart truncate
609 * on orphans if we crash during truncate. We actually write i_disksize
610 * into the on-disk inode when writing inodes out, instead of i_size.
611 *
612 * The only time when i_disksize and i_size may be different is when
613 * a truncate is in progress. The only things which change i_disksize
 614 * are ext4_get_block (growth) and ext4_truncate (shrinkage).
615 */
616 loff_t i_disksize;
617
618 /*
619 * i_data_sem is for serialising ext4_truncate() against
620 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
621 * data tree are chopped off during truncate. We can't do that in
622 * ext4 because whenever we perform intermediate commits during
623 * truncate, the inode and all the metadata blocks *must* be in a
624 * consistent state which allows truncation of the orphans to restart
625 * during recovery. Hence we must fix the get_block-vs-truncate race
626 * by other means, so we have i_data_sem.
627 */
628 struct rw_semaphore i_data_sem;
629 struct inode vfs_inode;
630 struct jbd2_inode jinode;
631
632 struct ext4_ext_cache i_cached_extent;
633 /*
 634 * File creation time. Its function is the same as that of
635 * struct timespec i_{a,c,m}time in the generic inode.
636 */
637 struct timespec i_crtime;
638
639 /* mballoc */
640 struct list_head i_prealloc_list;
641 spinlock_t i_prealloc_lock;
642
643 /* ialloc */
644 ext4_group_t i_last_alloc_group;
645
646 /* allocation reservation info for delalloc */
647 unsigned int i_reserved_data_blocks;
648 unsigned int i_reserved_meta_blocks;
649 unsigned int i_allocated_meta_blocks;
650 unsigned short i_delalloc_reserved_flag;
651
652 /* on-disk additional length */
653 __u16 i_extra_isize;
654
655 spinlock_t i_block_reservation_lock;
656};
657
658/*
519 * File system states 659 * File system states
520 */ 660 */
521#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ 661#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */
@@ -540,7 +680,6 @@ do { \
540#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ 680#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
541#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ 681#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
542#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ 682#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
543#define EXT4_MOUNT_ABORT 0x00200 /* Fatal error detected */
544#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ 683#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
545#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ 684#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
546#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ 685#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
@@ -560,18 +699,12 @@ do { \
560#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 699#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
561#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 700#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
562#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 701#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
702#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
563 703
564/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
565#ifndef _LINUX_EXT2_FS_H
566#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 704#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
567#define set_opt(o, opt) o |= EXT4_MOUNT_##opt 705#define set_opt(o, opt) o |= EXT4_MOUNT_##opt
568#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ 706#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \
569 EXT4_MOUNT_##opt) 707 EXT4_MOUNT_##opt)
570#else
571#define EXT2_MOUNT_NOLOAD EXT4_MOUNT_NOLOAD
572#define EXT2_MOUNT_ABORT EXT4_MOUNT_ABORT
573#define EXT2_MOUNT_DATA_FLAGS EXT4_MOUNT_DATA_FLAGS
574#endif
575 708
576#define ext4_set_bit ext2_set_bit 709#define ext4_set_bit ext2_set_bit
577#define ext4_set_bit_atomic ext2_set_bit_atomic 710#define ext4_set_bit_atomic ext2_set_bit_atomic
@@ -689,6 +822,146 @@ struct ext4_super_block {
689}; 822};
690 823
691#ifdef __KERNEL__ 824#ifdef __KERNEL__
825
826/*
827 * run-time mount flags
828 */
829#define EXT4_MF_MNTDIR_SAMPLED 0x0001
830#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
831
832/*
833 * fourth extended-fs super-block data in memory
834 */
835struct ext4_sb_info {
836 unsigned long s_desc_size; /* Size of a group descriptor in bytes */
837 unsigned long s_inodes_per_block;/* Number of inodes per block */
838 unsigned long s_blocks_per_group;/* Number of blocks in a group */
839 unsigned long s_inodes_per_group;/* Number of inodes in a group */
840 unsigned long s_itb_per_group; /* Number of inode table blocks per group */
841 unsigned long s_gdb_count; /* Number of group descriptor blocks */
842 unsigned long s_desc_per_block; /* Number of group descriptors per block */
843 ext4_group_t s_groups_count; /* Number of groups in the fs */
844 unsigned long s_overhead_last; /* Last calculated overhead */
845 unsigned long s_blocks_last; /* Last seen block count */
846 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
847 struct buffer_head * s_sbh; /* Buffer containing the super block */
848 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
849 struct buffer_head **s_group_desc;
850 unsigned int s_mount_opt;
851 unsigned int s_mount_flags;
852 ext4_fsblk_t s_sb_block;
853 uid_t s_resuid;
854 gid_t s_resgid;
855 unsigned short s_mount_state;
856 unsigned short s_pad;
857 int s_addr_per_block_bits;
858 int s_desc_per_block_bits;
859 int s_inode_size;
860 int s_first_ino;
861 unsigned int s_inode_readahead_blks;
862 unsigned int s_inode_goal;
863 spinlock_t s_next_gen_lock;
864 u32 s_next_generation;
865 u32 s_hash_seed[4];
866 int s_def_hash_version;
 867 int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
868 struct percpu_counter s_freeblocks_counter;
869 struct percpu_counter s_freeinodes_counter;
870 struct percpu_counter s_dirs_counter;
871 struct percpu_counter s_dirtyblocks_counter;
872 struct blockgroup_lock *s_blockgroup_lock;
873 struct proc_dir_entry *s_proc;
874 struct kobject s_kobj;
875 struct completion s_kobj_unregister;
876
877 /* Journaling */
878 struct inode *s_journal_inode;
879 struct journal_s *s_journal;
880 struct list_head s_orphan;
881 struct mutex s_orphan_lock;
882 struct mutex s_resize_lock;
883 unsigned long s_commit_interval;
884 u32 s_max_batch_time;
885 u32 s_min_batch_time;
886 struct block_device *journal_bdev;
887#ifdef CONFIG_JBD2_DEBUG
888 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
889 wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
890#endif
891#ifdef CONFIG_QUOTA
892 char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
893 int s_jquota_fmt; /* Format of quota to use */
894#endif
895 unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
896 struct rb_root system_blks;
897
898#ifdef EXTENTS_STATS
899 /* ext4 extents stats */
900 unsigned long s_ext_min;
901 unsigned long s_ext_max;
902 unsigned long s_depth_max;
903 spinlock_t s_ext_stats_lock;
904 unsigned long s_ext_blocks;
905 unsigned long s_ext_extents;
906#endif
907
908 /* for buddy allocator */
909 struct ext4_group_info ***s_group_info;
910 struct inode *s_buddy_cache;
911 long s_blocks_reserved;
912 spinlock_t s_reserve_lock;
913 spinlock_t s_md_lock;
914 tid_t s_last_transaction;
915 unsigned short *s_mb_offsets;
916 unsigned int *s_mb_maxs;
917
918 /* tunables */
919 unsigned long s_stripe;
920 unsigned int s_mb_stream_request;
921 unsigned int s_mb_max_to_scan;
922 unsigned int s_mb_min_to_scan;
923 unsigned int s_mb_stats;
924 unsigned int s_mb_order2_reqs;
925 unsigned int s_mb_group_prealloc;
926 /* where last allocation was done - for stream allocation */
927 unsigned long s_mb_last_group;
928 unsigned long s_mb_last_start;
929
930 /* history to debug policy */
931 struct ext4_mb_history *s_mb_history;
932 int s_mb_history_cur;
933 int s_mb_history_max;
934 int s_mb_history_num;
935 spinlock_t s_mb_history_lock;
936 int s_mb_history_filter;
937
938 /* stats for buddy allocator */
939 spinlock_t s_mb_pa_lock;
940 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
941 atomic_t s_bal_success; /* we found long enough chunks */
942 atomic_t s_bal_allocated; /* in blocks */
943 atomic_t s_bal_ex_scanned; /* total extents scanned */
944 atomic_t s_bal_goals; /* goal hits */
945 atomic_t s_bal_breaks; /* too long searches */
946 atomic_t s_bal_2orders; /* 2^order hits */
947 spinlock_t s_bal_lock;
948 unsigned long s_mb_buddies_generated;
949 unsigned long long s_mb_generation_time;
950 atomic_t s_mb_lost_chunks;
951 atomic_t s_mb_preallocated;
952 atomic_t s_mb_discarded;
953
954 /* locality groups */
955 struct ext4_locality_group *s_locality_groups;
956
957 /* for write statistics */
958 unsigned long s_sectors_written_start;
959 u64 s_kbytes_written;
960
961 unsigned int s_log_groups_per_flex;
962 struct flex_groups *s_flex_groups;
963};
964
692static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 965static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
693{ 966{
694 return sb->s_fs_info; 967 return sb->s_fs_info;
@@ -704,7 +977,6 @@ static inline struct timespec ext4_current_time(struct inode *inode)
704 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; 977 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
705} 978}
706 979
707
708static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) 980static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
709{ 981{
710 return ino == EXT4_ROOT_INO || 982 return ino == EXT4_ROOT_INO ||
@@ -1014,6 +1286,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
1014 ext4_group_t block_group, 1286 ext4_group_t block_group,
1015 struct buffer_head ** bh); 1287 struct buffer_head ** bh);
1016extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); 1288extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
1289struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
1290 ext4_group_t block_group);
1291extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1292 struct buffer_head *bh,
1293 ext4_group_t group,
1294 struct ext4_group_desc *desc);
1295#define ext4_free_blocks_after_init(sb, group, desc) \
1296 ext4_init_block_bitmap(sb, NULL, group, desc)
1017 1297
1018/* dir.c */ 1298/* dir.c */
1019extern int ext4_check_dir_entry(const char *, struct inode *, 1299extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1032,12 +1312,18 @@ extern int ext4fs_dirhash(const char *name, int len, struct
1032 dx_hash_info *hinfo); 1312 dx_hash_info *hinfo);
1033 1313
1034/* ialloc.c */ 1314/* ialloc.c */
1035extern struct inode * ext4_new_inode(handle_t *, struct inode *, int); 1315extern struct inode *ext4_new_inode(handle_t *, struct inode *, int,
1316 const struct qstr *qstr, __u32 goal);
1036extern void ext4_free_inode(handle_t *, struct inode *); 1317extern void ext4_free_inode(handle_t *, struct inode *);
1037extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); 1318extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1038extern unsigned long ext4_count_free_inodes(struct super_block *); 1319extern unsigned long ext4_count_free_inodes(struct super_block *);
1039extern unsigned long ext4_count_dirs(struct super_block *); 1320extern unsigned long ext4_count_dirs(struct super_block *);
1040extern void ext4_check_inodes_bitmap(struct super_block *); 1321extern void ext4_check_inodes_bitmap(struct super_block *);
1322extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
1323 struct buffer_head *bh,
1324 ext4_group_t group,
1325 struct ext4_group_desc *desc);
1326extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
1041 1327
1042/* mballoc.c */ 1328/* mballoc.c */
1043extern long ext4_mb_stats; 1329extern long ext4_mb_stats;
@@ -1051,7 +1337,7 @@ extern void ext4_discard_preallocations(struct inode *);
1051extern int __init init_ext4_mballoc(void); 1337extern int __init init_ext4_mballoc(void);
1052extern void exit_ext4_mballoc(void); 1338extern void exit_ext4_mballoc(void);
1053extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1339extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1054 unsigned long, unsigned long, int, unsigned long *); 1340 ext4_fsblk_t, unsigned long, int, unsigned long *);
1055extern int ext4_mb_add_groupinfo(struct super_block *sb, 1341extern int ext4_mb_add_groupinfo(struct super_block *sb,
1056 ext4_group_t i, struct ext4_group_desc *desc); 1342 ext4_group_t i, struct ext4_group_desc *desc);
1057extern void ext4_mb_update_group_info(struct ext4_group_info *grp, 1343extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
@@ -1123,6 +1409,8 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1123 __attribute__ ((format (printf, 3, 4))); 1409 __attribute__ ((format (printf, 3, 4)));
1124extern void ext4_warning(struct super_block *, const char *, const char *, ...) 1410extern void ext4_warning(struct super_block *, const char *, const char *, ...)
1125 __attribute__ ((format (printf, 3, 4))); 1411 __attribute__ ((format (printf, 3, 4)));
1412extern void ext4_msg(struct super_block *, const char *, const char *, ...)
1413 __attribute__ ((format (printf, 3, 4)));
1126extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, 1414extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
1127 const char *, const char *, ...) 1415 const char *, const char *, ...)
1128 __attribute__ ((format (printf, 4, 5))); 1416 __attribute__ ((format (printf, 4, 5)));
@@ -1161,6 +1449,10 @@ extern void ext4_used_dirs_set(struct super_block *sb,
1161 struct ext4_group_desc *bg, __u32 count); 1449 struct ext4_group_desc *bg, __u32 count);
1162extern void ext4_itable_unused_set(struct super_block *sb, 1450extern void ext4_itable_unused_set(struct super_block *sb,
1163 struct ext4_group_desc *bg, __u32 count); 1451 struct ext4_group_desc *bg, __u32 count);
1452extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
1453 struct ext4_group_desc *gdp);
1454extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
1455 struct ext4_group_desc *gdp);
1164 1456
1165static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) 1457static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
1166{ 1458{
@@ -1228,6 +1520,18 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
1228 return grp_info[indexv][indexh]; 1520 return grp_info[indexv][indexh];
1229} 1521}
1230 1522
1523/*
1524 * Reading s_groups_count requires using smp_rmb() afterwards. See
1525 * the locking protocol documented in the comments of ext4_group_add()
1526 * in resize.c
1527 */
1528static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
1529{
1530 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
1531
1532 smp_rmb();
1533 return ngroups;
1534}
1231 1535
1232static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, 1536static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
1233 ext4_group_t block_group) 1537 ext4_group_t block_group)
@@ -1283,33 +1587,25 @@ struct ext4_group_info {
1283}; 1587};
1284 1588
1285#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 1589#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
1286#define EXT4_GROUP_INFO_LOCKED_BIT 1
1287 1590
1288#define EXT4_MB_GRP_NEED_INIT(grp) \ 1591#define EXT4_MB_GRP_NEED_INIT(grp) \
1289 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) 1592 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
1290 1593
1291static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) 1594static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
1595 ext4_group_t group)
1292{ 1596{
1293 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); 1597 return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
1294
1295 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1296} 1598}
1297 1599
1298static inline void ext4_unlock_group(struct super_block *sb, 1600static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
1299 ext4_group_t group)
1300{ 1601{
1301 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); 1602 spin_lock(ext4_group_lock_ptr(sb, group));
1302
1303 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1304} 1603}
1305 1604
1306static inline int ext4_is_group_locked(struct super_block *sb, 1605static inline void ext4_unlock_group(struct super_block *sb,
1307 ext4_group_t group) 1606 ext4_group_t group)
1308{ 1607{
1309 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); 1608 spin_unlock(ext4_group_lock_ptr(sb, group));
1310
1311 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
1312 &(grinfo->bb_state));
1313} 1609}
1314 1610
1315/* 1611/*
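/*
 * Editor's note (design observation, not part of the patch): group
 * locking moves from a bit spinlock embedded in each group's bb_state
 * to the hashed spinlock array behind sbi->s_blockgroup_lock.  Callers
 * keep the same pattern:
 *
 *	ext4_lock_group(sb, group);
 *	... modify the group's bitmaps/counters ...
 *	ext4_unlock_group(sb, group);
 *
 * while the new ext4_group_lock_ptr() exposes the underlying spinlock
 * for code that needs it directly.
 */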
@@ -1326,11 +1622,21 @@ extern const struct file_operations ext4_file_operations;
1326/* namei.c */ 1622/* namei.c */
1327extern const struct inode_operations ext4_dir_inode_operations; 1623extern const struct inode_operations ext4_dir_inode_operations;
1328extern const struct inode_operations ext4_special_inode_operations; 1624extern const struct inode_operations ext4_special_inode_operations;
1625extern struct dentry *ext4_get_parent(struct dentry *child);
1329 1626
1330/* symlink.c */ 1627/* symlink.c */
1331extern const struct inode_operations ext4_symlink_inode_operations; 1628extern const struct inode_operations ext4_symlink_inode_operations;
1332extern const struct inode_operations ext4_fast_symlink_inode_operations; 1629extern const struct inode_operations ext4_fast_symlink_inode_operations;
1333 1630
1631/* block_validity */
1632extern void ext4_release_system_zone(struct super_block *sb);
1633extern int ext4_setup_system_zone(struct super_block *sb);
1634extern int __init init_ext4_system_zone(void);
1635extern void exit_ext4_system_zone(void);
1636extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
1637 ext4_fsblk_t start_blk,
1638 unsigned int count);
1639
1334/* extents.c */ 1640/* extents.c */
1335extern int ext4_ext_tree_init(handle_t *handle, struct inode *); 1641extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1336extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 1642extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
@@ -1338,19 +1644,22 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1338 int chunk); 1644 int chunk);
1339extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1645extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1340 ext4_lblk_t iblock, unsigned int max_blocks, 1646 ext4_lblk_t iblock, unsigned int max_blocks,
1341 struct buffer_head *bh_result, 1647 struct buffer_head *bh_result, int flags);
1342 int create, int extend_disksize);
1343extern void ext4_ext_truncate(struct inode *); 1648extern void ext4_ext_truncate(struct inode *);
1344extern void ext4_ext_init(struct super_block *); 1649extern void ext4_ext_init(struct super_block *);
1345extern void ext4_ext_release(struct super_block *); 1650extern void ext4_ext_release(struct super_block *);
1346extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1651extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1347 loff_t len); 1652 loff_t len);
1348extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, 1653extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1349 sector_t block, unsigned int max_blocks, 1654 sector_t block, unsigned int max_blocks,
1350 struct buffer_head *bh, int create, 1655 struct buffer_head *bh, int flags);
1351 int extend_disksize, int flag);
1352extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1656extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1353 __u64 start, __u64 len); 1657 __u64 start, __u64 len);
1658/* move_extent.c */
1659extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
1660 __u64 start_orig, __u64 start_donor,
1661 __u64 len, __u64 *moved_len);
1662
1354 1663
1355/* 1664/*
 1356 * Add new method to test whether block and inode bitmaps are properly 1665
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index f0c3ec85bd48..20a84105a10b 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -221,12 +221,16 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
221} 221}
222 222
223extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); 223extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
224extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
224extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 225extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
225extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); 226extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
226extern int ext4_extent_tree_init(handle_t *, struct inode *); 227extern int ext4_extent_tree_init(handle_t *, struct inode *);
227extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, 228extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
228 int num, 229 int num,
229 struct ext4_ext_path *path); 230 struct ext4_ext_path *path);
231extern int ext4_can_extents_be_merged(struct inode *inode,
232 struct ext4_extent *ex1,
233 struct ext4_extent *ex2);
230extern int ext4_ext_try_to_merge(struct inode *inode, 234extern int ext4_ext_try_to_merge(struct inode *inode,
231 struct ext4_ext_path *path, 235 struct ext4_ext_path *path,
232 struct ext4_extent *); 236 struct ext4_extent *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
deleted file mode 100644
index 4ce2187123aa..000000000000
--- a/fs/ext4/ext4_i.h
+++ /dev/null
@@ -1,140 +0,0 @@
1/*
2 * ext4_i.h
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/include/linux/minix_fs_i.h
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 */
15
16#ifndef _EXT4_I
17#define _EXT4_I
18
19#include <linux/rwsem.h>
20#include <linux/rbtree.h>
21#include <linux/seqlock.h>
22#include <linux/mutex.h>
23
24/* data type for block offset of block group */
25typedef int ext4_grpblk_t;
26
27/* data type for filesystem-wide blocks number */
28typedef unsigned long long ext4_fsblk_t;
29
30/* data type for file logical block number */
31typedef __u32 ext4_lblk_t;
32
33/* data type for block group number */
34typedef unsigned int ext4_group_t;
35
36/*
37 * storage for cached extent
38 */
39struct ext4_ext_cache {
40 ext4_fsblk_t ec_start;
41 ext4_lblk_t ec_block;
42 __u32 ec_len; /* must be 32bit to return holes */
43 __u32 ec_type;
44};
45
46/*
47 * fourth extended file system inode data in memory
48 */
49struct ext4_inode_info {
50 __le32 i_data[15]; /* unconverted */
51 __u32 i_flags;
52 ext4_fsblk_t i_file_acl;
53 __u32 i_dtime;
54
55 /*
56 * i_block_group is the number of the block group which contains
57 * this file's inode. Constant across the lifetime of the inode,
58 * it is ued for making block allocation decisions - we try to
59 * place a file's data blocks near its inode block, and new inodes
60 * near to their parent directory's inode.
61 */
62 ext4_group_t i_block_group;
63 __u32 i_state; /* Dynamic state flags for ext4 */
64
65 ext4_lblk_t i_dir_start_lookup;
66#ifdef CONFIG_EXT4_FS_XATTR
67 /*
68 * Extended attributes can be read independently of the main file
69 * data. Taking i_mutex even when reading would cause contention
70 * between readers of EAs and writers of regular file data, so
71 * instead we synchronize on xattr_sem when reading or changing
72 * EAs.
73 */
74 struct rw_semaphore xattr_sem;
75#endif
76#ifdef CONFIG_EXT4_FS_POSIX_ACL
77 struct posix_acl *i_acl;
78 struct posix_acl *i_default_acl;
79#endif
80
81 struct list_head i_orphan; /* unlinked but open inodes */
82
83 /*
84 * i_disksize keeps track of what the inode size is ON DISK, not
85 * in memory. During truncate, i_size is set to the new size by
86 * the VFS prior to calling ext4_truncate(), but the filesystem won't
87 * set i_disksize to 0 until the truncate is actually under way.
88 *
89 * The intent is that i_disksize always represents the blocks which
90 * are used by this file. This allows recovery to restart truncate
91 * on orphans if we crash during truncate. We actually write i_disksize
92 * into the on-disk inode when writing inodes out, instead of i_size.
93 *
94 * The only time when i_disksize and i_size may be different is when
95 * a truncate is in progress. The only things which change i_disksize
96 * are ext4_get_block (growth) and ext4_truncate (shrinkth).
97 */
98 loff_t i_disksize;
99
100 /*
101 * i_data_sem is for serialising ext4_truncate() against
102 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
103 * data tree are chopped off during truncate. We can't do that in
104 * ext4 because whenever we perform intermediate commits during
105 * truncate, the inode and all the metadata blocks *must* be in a
106 * consistent state which allows truncation of the orphans to restart
107 * during recovery. Hence we must fix the get_block-vs-truncate race
108 * by other means, so we have i_data_sem.
109 */
110 struct rw_semaphore i_data_sem;
111 struct inode vfs_inode;
112 struct jbd2_inode jinode;
113
114 struct ext4_ext_cache i_cached_extent;
115 /*
116 * File creation time. Its function is same as that of
117 * struct timespec i_{a,c,m}time in the generic inode.
118 */
119 struct timespec i_crtime;
120
121 /* mballoc */
122 struct list_head i_prealloc_list;
123 spinlock_t i_prealloc_lock;
124
125 /* ialloc */
126 ext4_group_t i_last_alloc_group;
127
128 /* allocation reservation info for delalloc */
129 unsigned int i_reserved_data_blocks;
130 unsigned int i_reserved_meta_blocks;
131 unsigned int i_allocated_meta_blocks;
132 unsigned short i_delalloc_reserved_flag;
133
134 /* on-disk additional length */
135 __u16 i_extra_isize;
136
137 spinlock_t i_block_reservation_lock;
138};
139
140#endif /* _EXT4_I */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index ad13a84644e1..eb27fd0f2ee8 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -43,6 +43,8 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
43 ext4_journal_abort_handle(where, __func__, bh, 43 ext4_journal_abort_handle(where, __func__, bh,
44 handle, err); 44 handle, err);
45 } 45 }
46 else
47 brelse(bh);
46 return err; 48 return err;
47} 49}
48 50
@@ -57,6 +59,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
57 ext4_journal_abort_handle(where, __func__, bh, 59 ext4_journal_abort_handle(where, __func__, bh,
58 handle, err); 60 handle, err);
59 } 61 }
62 else
63 brelse(bh);
60 return err; 64 return err;
61} 65}
62 66
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index be2f426f6805..139fb8cb87e4 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -131,9 +131,11 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
131int __ext4_journal_get_write_access(const char *where, handle_t *handle, 131int __ext4_journal_get_write_access(const char *where, handle_t *handle,
132 struct buffer_head *bh); 132 struct buffer_head *bh);
133 133
134/* When called with an invalid handle, this will still do a put on the BH */
134int __ext4_journal_forget(const char *where, handle_t *handle, 135int __ext4_journal_forget(const char *where, handle_t *handle,
135 struct buffer_head *bh); 136 struct buffer_head *bh);
136 137
138/* When called with an invalid handle, this will still do a put on the BH */
137int __ext4_journal_revoke(const char *where, handle_t *handle, 139int __ext4_journal_revoke(const char *where, handle_t *handle,
138 ext4_fsblk_t blocknr, struct buffer_head *bh); 140 ext4_fsblk_t blocknr, struct buffer_head *bh);
139 141
@@ -281,10 +283,10 @@ static inline int ext4_should_order_data(struct inode *inode)
281 283
282static inline int ext4_should_writeback_data(struct inode *inode) 284static inline int ext4_should_writeback_data(struct inode *inode)
283{ 285{
284 if (EXT4_JOURNAL(inode) == NULL)
285 return 0;
286 if (!S_ISREG(inode->i_mode)) 286 if (!S_ISREG(inode->i_mode))
287 return 0; 287 return 0;
288 if (EXT4_JOURNAL(inode) == NULL)
289 return 1;
288 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 290 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
289 return 0; 291 return 0;
290 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 292 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
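/*
 * Editor's note (illustrative, not part of the patch): reordering the
 * tests above changes the journal-less case.  For a regular file:
 *
 *	before:	EXT4_JOURNAL(inode) == NULL -> ext4_should_writeback_data() == 0
 *	after:	EXT4_JOURNAL(inode) == NULL -> ext4_should_writeback_data() == 1
 *
 * so a filesystem running without a journal now behaves as if it were
 * in writeback data mode rather than matching no data mode at all.
 */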
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
deleted file mode 100644
index 57b71fefbccf..000000000000
--- a/fs/ext4/ext4_sb.h
+++ /dev/null
@@ -1,161 +0,0 @@
1/*
2 * ext4_sb.h
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/include/linux/minix_fs_sb.h
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 */
15
16#ifndef _EXT4_SB
17#define _EXT4_SB
18
19#ifdef __KERNEL__
20#include <linux/timer.h>
21#include <linux/wait.h>
22#include <linux/blockgroup_lock.h>
23#include <linux/percpu_counter.h>
24#endif
25#include <linux/rbtree.h>
26
27/*
28 * fourth extended-fs super-block data in memory
29 */
30struct ext4_sb_info {
31 unsigned long s_desc_size; /* Size of a group descriptor in bytes */
32 unsigned long s_inodes_per_block;/* Number of inodes per block */
33 unsigned long s_blocks_per_group;/* Number of blocks in a group */
34 unsigned long s_inodes_per_group;/* Number of inodes in a group */
35 unsigned long s_itb_per_group; /* Number of inode table blocks per group */
36 unsigned long s_gdb_count; /* Number of group descriptor blocks */
37 unsigned long s_desc_per_block; /* Number of group descriptors per block */
38 ext4_group_t s_groups_count; /* Number of groups in the fs */
39 unsigned long s_overhead_last; /* Last calculated overhead */
40 unsigned long s_blocks_last; /* Last seen block count */
41 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
42 struct buffer_head * s_sbh; /* Buffer containing the super block */
43 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
44 struct buffer_head **s_group_desc;
45 unsigned long s_mount_opt;
46 ext4_fsblk_t s_sb_block;
47 uid_t s_resuid;
48 gid_t s_resgid;
49 unsigned short s_mount_state;
50 unsigned short s_pad;
51 int s_addr_per_block_bits;
52 int s_desc_per_block_bits;
53 int s_inode_size;
54 int s_first_ino;
55 unsigned int s_inode_readahead_blks;
56 spinlock_t s_next_gen_lock;
57 u32 s_next_generation;
58 u32 s_hash_seed[4];
59 int s_def_hash_version;
60 int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
61 struct percpu_counter s_freeblocks_counter;
62 struct percpu_counter s_freeinodes_counter;
63 struct percpu_counter s_dirs_counter;
64 struct percpu_counter s_dirtyblocks_counter;
65 struct blockgroup_lock *s_blockgroup_lock;
66 struct proc_dir_entry *s_proc;
67 struct kobject s_kobj;
68 struct completion s_kobj_unregister;
69
70 /* Journaling */
71 struct inode *s_journal_inode;
72 struct journal_s *s_journal;
73 struct list_head s_orphan;
74 unsigned long s_commit_interval;
75 u32 s_max_batch_time;
76 u32 s_min_batch_time;
77 struct block_device *journal_bdev;
78#ifdef CONFIG_JBD2_DEBUG
79 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
80 wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
81#endif
82#ifdef CONFIG_QUOTA
83 char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
84 int s_jquota_fmt; /* Format of quota to use */
85#endif
86 unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
87
88#ifdef EXTENTS_STATS
89 /* ext4 extents stats */
90 unsigned long s_ext_min;
91 unsigned long s_ext_max;
92 unsigned long s_depth_max;
93 spinlock_t s_ext_stats_lock;
94 unsigned long s_ext_blocks;
95 unsigned long s_ext_extents;
96#endif
97
98 /* for buddy allocator */
99 struct ext4_group_info ***s_group_info;
100 struct inode *s_buddy_cache;
101 long s_blocks_reserved;
102 spinlock_t s_reserve_lock;
103 spinlock_t s_md_lock;
104 tid_t s_last_transaction;
105 unsigned short *s_mb_offsets;
106 unsigned int *s_mb_maxs;
107
108 /* tunables */
109 unsigned long s_stripe;
110 unsigned int s_mb_stream_request;
111 unsigned int s_mb_max_to_scan;
112 unsigned int s_mb_min_to_scan;
113 unsigned int s_mb_stats;
114 unsigned int s_mb_order2_reqs;
115 unsigned int s_mb_group_prealloc;
116 /* where last allocation was done - for stream allocation */
117 unsigned long s_mb_last_group;
118 unsigned long s_mb_last_start;
119
120 /* history to debug policy */
121 struct ext4_mb_history *s_mb_history;
122 int s_mb_history_cur;
123 int s_mb_history_max;
124 int s_mb_history_num;
125 spinlock_t s_mb_history_lock;
126 int s_mb_history_filter;
127
128 /* stats for buddy allocator */
129 spinlock_t s_mb_pa_lock;
130 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
131 atomic_t s_bal_success; /* we found long enough chunks */
132 atomic_t s_bal_allocated; /* in blocks */
133 atomic_t s_bal_ex_scanned; /* total extents scanned */
134 atomic_t s_bal_goals; /* goal hits */
135 atomic_t s_bal_breaks; /* too long searches */
136 atomic_t s_bal_2orders; /* 2^order hits */
137 spinlock_t s_bal_lock;
138 unsigned long s_mb_buddies_generated;
139 unsigned long long s_mb_generation_time;
140 atomic_t s_mb_lost_chunks;
141 atomic_t s_mb_preallocated;
142 atomic_t s_mb_discarded;
143
144 /* locality groups */
145 struct ext4_locality_group *s_locality_groups;
146
147 /* for write statistics */
148 unsigned long s_sectors_written_start;
149 u64 s_kbytes_written;
150
151 unsigned int s_log_groups_per_flex;
152 struct flex_groups *s_flex_groups;
153};
154
155static inline spinlock_t *
156sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
157{
158 return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
159}
160
161#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e3a55eb8b26a..73ebfb44ad75 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -49,7 +49,7 @@
49 * ext_pblock: 49 * ext_pblock:
50 * combine low and high parts of physical block number into ext4_fsblk_t 50 * combine low and high parts of physical block number into ext4_fsblk_t
51 */ 51 */
52static ext4_fsblk_t ext_pblock(struct ext4_extent *ex) 52ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
53{ 53{
54 ext4_fsblk_t block; 54 ext4_fsblk_t block;
55 55
@@ -326,32 +326,18 @@ ext4_ext_max_entries(struct inode *inode, int depth)
326 326
327static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) 327static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
328{ 328{
329 ext4_fsblk_t block = ext_pblock(ext), valid_block; 329 ext4_fsblk_t block = ext_pblock(ext);
330 int len = ext4_ext_get_actual_len(ext); 330 int len = ext4_ext_get_actual_len(ext);
331 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
332 331
333 valid_block = le32_to_cpu(es->s_first_data_block) + 332 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
334 EXT4_SB(inode->i_sb)->s_gdb_count;
335 if (unlikely(block <= valid_block ||
336 ((block + len) > ext4_blocks_count(es))))
337 return 0;
338 else
339 return 1;
340} 333}
341 334
342static int ext4_valid_extent_idx(struct inode *inode, 335static int ext4_valid_extent_idx(struct inode *inode,
343 struct ext4_extent_idx *ext_idx) 336 struct ext4_extent_idx *ext_idx)
344{ 337{
345 ext4_fsblk_t block = idx_pblock(ext_idx), valid_block; 338 ext4_fsblk_t block = idx_pblock(ext_idx);
346 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
347 339
348 valid_block = le32_to_cpu(es->s_first_data_block) + 340 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
349 EXT4_SB(inode->i_sb)->s_gdb_count;
350 if (unlikely(block <= valid_block ||
351 (block >= ext4_blocks_count(es))))
352 return 0;
353 else
354 return 1;
355} 341}
356 342
357static int ext4_valid_extent_entries(struct inode *inode, 343static int ext4_valid_extent_entries(struct inode *inode,
@@ -1431,7 +1417,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1431 return err; 1417 return err;
1432} 1418}
1433 1419
1434static int 1420int
1435ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, 1421ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1436 struct ext4_extent *ex2) 1422 struct ext4_extent *ex2)
1437{ 1423{
@@ -1991,6 +1977,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
1991 */ 1977 */
1992 /* 1 bitmap, 1 block group descriptor */ 1978 /* 1 bitmap, 1 block group descriptor */
1993 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); 1979 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
1980 return ret;
1994 } 1981 }
1995 } 1982 }
1996 1983
@@ -2097,12 +2084,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2097 ex = EXT_LAST_EXTENT(eh); 2084 ex = EXT_LAST_EXTENT(eh);
2098 2085
2099 ex_ee_block = le32_to_cpu(ex->ee_block); 2086 ex_ee_block = le32_to_cpu(ex->ee_block);
2100 if (ext4_ext_is_uninitialized(ex))
2101 uninitialized = 1;
2102 ex_ee_len = ext4_ext_get_actual_len(ex); 2087 ex_ee_len = ext4_ext_get_actual_len(ex);
2103 2088
2104 while (ex >= EXT_FIRST_EXTENT(eh) && 2089 while (ex >= EXT_FIRST_EXTENT(eh) &&
2105 ex_ee_block + ex_ee_len > start) { 2090 ex_ee_block + ex_ee_len > start) {
2091
2092 if (ext4_ext_is_uninitialized(ex))
2093 uninitialized = 1;
2094 else
2095 uninitialized = 0;
2096
2106 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); 2097 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
2107 path[depth].p_ext = ex; 2098 path[depth].p_ext = ex;
2108 2099
@@ -2784,7 +2775,7 @@ fix_extent_len:
2784int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 2775int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2785 ext4_lblk_t iblock, 2776 ext4_lblk_t iblock,
2786 unsigned int max_blocks, struct buffer_head *bh_result, 2777 unsigned int max_blocks, struct buffer_head *bh_result,
2787 int create, int extend_disksize) 2778 int flags)
2788{ 2779{
2789 struct ext4_ext_path *path = NULL; 2780 struct ext4_ext_path *path = NULL;
2790 struct ext4_extent_header *eh; 2781 struct ext4_extent_header *eh;
@@ -2793,7 +2784,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2793 int err = 0, depth, ret, cache_type; 2784 int err = 0, depth, ret, cache_type;
2794 unsigned int allocated = 0; 2785 unsigned int allocated = 0;
2795 struct ext4_allocation_request ar; 2786 struct ext4_allocation_request ar;
2796 loff_t disksize;
2797 2787
2798 __clear_bit(BH_New, &bh_result->b_state); 2788 __clear_bit(BH_New, &bh_result->b_state);
2799 ext_debug("blocks %u/%u requested for inode %u\n", 2789 ext_debug("blocks %u/%u requested for inode %u\n",
@@ -2803,7 +2793,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2803 cache_type = ext4_ext_in_cache(inode, iblock, &newex); 2793 cache_type = ext4_ext_in_cache(inode, iblock, &newex);
2804 if (cache_type) { 2794 if (cache_type) {
2805 if (cache_type == EXT4_EXT_CACHE_GAP) { 2795 if (cache_type == EXT4_EXT_CACHE_GAP) {
2806 if (!create) { 2796 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
2807 /* 2797 /*
2808 * block isn't allocated yet and 2798 * block isn't allocated yet and
2809 * user doesn't want to allocate it 2799 * user doesn't want to allocate it
@@ -2869,9 +2859,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2869 EXT4_EXT_CACHE_EXTENT); 2859 EXT4_EXT_CACHE_EXTENT);
2870 goto out; 2860 goto out;
2871 } 2861 }
2872 if (create == EXT4_CREATE_UNINITIALIZED_EXT) 2862 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
2873 goto out; 2863 goto out;
2874 if (!create) { 2864 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
2865 if (allocated > max_blocks)
2866 allocated = max_blocks;
2875 /* 2867 /*
2876 * We have blocks reserved already. We 2868 * We have blocks reserved already. We
2877 * return allocated blocks so that delalloc 2869 * return allocated blocks so that delalloc
@@ -2879,8 +2871,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2879 * the buffer head will be unmapped so that 2871 * the buffer head will be unmapped so that
2880 * a read from the block returns 0s. 2872 * a read from the block returns 0s.
2881 */ 2873 */
2882 if (allocated > max_blocks)
2883 allocated = max_blocks;
2884 set_buffer_unwritten(bh_result); 2874 set_buffer_unwritten(bh_result);
2885 bh_result->b_bdev = inode->i_sb->s_bdev; 2875 bh_result->b_bdev = inode->i_sb->s_bdev;
2886 bh_result->b_blocknr = newblock; 2876 bh_result->b_blocknr = newblock;
@@ -2903,7 +2893,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2903 * requested block isn't allocated yet; 2893 * requested block isn't allocated yet;
2904 * we couldn't try to create block if create flag is zero 2894 * we couldn't try to create block if create flag is zero
2905 */ 2895 */
2906 if (!create) { 2896 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
2907 /* 2897 /*
2908 * put just found gap into cache to speed up 2898 * put just found gap into cache to speed up
2909 * subsequent requests 2899 * subsequent requests
@@ -2932,10 +2922,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2932 * EXT_UNINIT_MAX_LEN. 2922 * EXT_UNINIT_MAX_LEN.
2933 */ 2923 */
2934 if (max_blocks > EXT_INIT_MAX_LEN && 2924 if (max_blocks > EXT_INIT_MAX_LEN &&
2935 create != EXT4_CREATE_UNINITIALIZED_EXT) 2925 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
2936 max_blocks = EXT_INIT_MAX_LEN; 2926 max_blocks = EXT_INIT_MAX_LEN;
2937 else if (max_blocks > EXT_UNINIT_MAX_LEN && 2927 else if (max_blocks > EXT_UNINIT_MAX_LEN &&
2938 create == EXT4_CREATE_UNINITIALIZED_EXT) 2928 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
2939 max_blocks = EXT_UNINIT_MAX_LEN; 2929 max_blocks = EXT_UNINIT_MAX_LEN;
2940 2930
2941 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ 2931 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
@@ -2966,7 +2956,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2966 /* try to insert new extent into found leaf and return */ 2956 /* try to insert new extent into found leaf and return */
2967 ext4_ext_store_pblock(&newex, newblock); 2957 ext4_ext_store_pblock(&newex, newblock);
2968 newex.ee_len = cpu_to_le16(ar.len); 2958 newex.ee_len = cpu_to_le16(ar.len);
2969 if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ 2959 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */
2970 ext4_ext_mark_uninitialized(&newex); 2960 ext4_ext_mark_uninitialized(&newex);
2971 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2961 err = ext4_ext_insert_extent(handle, inode, path, &newex);
2972 if (err) { 2962 if (err) {
@@ -2983,18 +2973,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2983 newblock = ext_pblock(&newex); 2973 newblock = ext_pblock(&newex);
2984 allocated = ext4_ext_get_actual_len(&newex); 2974 allocated = ext4_ext_get_actual_len(&newex);
2985outnew: 2975outnew:
2986 if (extend_disksize) {
2987 disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
2988 if (disksize > i_size_read(inode))
2989 disksize = i_size_read(inode);
2990 if (disksize > EXT4_I(inode)->i_disksize)
2991 EXT4_I(inode)->i_disksize = disksize;
2992 }
2993
2994 set_buffer_new(bh_result); 2976 set_buffer_new(bh_result);
2995 2977
2996 /* Cache only when it is _not_ an uninitialized extent */ 2978 /* Cache only when it is _not_ an uninitialized extent */
2997 if (create != EXT4_CREATE_UNINITIALIZED_EXT) 2979 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
2998 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 2980 ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
2999 EXT4_EXT_CACHE_EXTENT); 2981 EXT4_EXT_CACHE_EXTENT);
3000out: 2982out:
@@ -3150,9 +3132,10 @@ retry:
3150 ret = PTR_ERR(handle); 3132 ret = PTR_ERR(handle);
3151 break; 3133 break;
3152 } 3134 }
3153 ret = ext4_get_blocks_wrap(handle, inode, block, 3135 map_bh.b_state = 0;
3154 max_blocks, &map_bh, 3136 ret = ext4_get_blocks(handle, inode, block,
3155 EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); 3137 max_blocks, &map_bh,
3138 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3156 if (ret <= 0) { 3139 if (ret <= 0) {
3157#ifdef EXT4FS_DEBUG 3140#ifdef EXT4FS_DEBUG
3158 WARN_ON(ret <= 0); 3141 WARN_ON(ret <= 0);
@@ -3195,7 +3178,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3195 void *data) 3178 void *data)
3196{ 3179{
3197 struct fiemap_extent_info *fieinfo = data; 3180 struct fiemap_extent_info *fieinfo = data;
3198 unsigned long blksize_bits = inode->i_sb->s_blocksize_bits; 3181 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
3199 __u64 logical; 3182 __u64 logical;
3200 __u64 physical; 3183 __u64 physical;
3201 __u64 length; 3184 __u64 length;
@@ -3242,9 +3225,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3242 * 3225 *
3243 * XXX this might miss a single-block extent at EXT_MAX_BLOCK 3226 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3244 */ 3227 */
3245 if (logical + length - 1 == EXT_MAX_BLOCK || 3228 if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
3246 ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK) 3229 newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
3230 loff_t size = i_size_read(inode);
3231 loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
3232
3247 flags |= FIEMAP_EXTENT_LAST; 3233 flags |= FIEMAP_EXTENT_LAST;
3234 if ((flags & FIEMAP_EXTENT_DELALLOC) &&
3235 logical+length > size)
3236 length = (size - logical + bs - 1) & ~(bs-1);
3237 }
3248 3238
3249 error = fiemap_fill_next_extent(fieinfo, logical, physical, 3239 error = fiemap_fill_next_extent(fieinfo, logical, physical,
3250 length, flags); 3240 length, flags);
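The new branch clamps a delayed-allocation extent that runs past EOF, so fiemap reports i_size rounded up to a whole block rather than the raw in-memory length. A worked example of the rounding expression (numbers invented):

	/* size = 10000 bytes, logical = 8192, bs = 4096:
	 *   size - logical + bs - 1 = 5903
	 *   5903 & ~(bs - 1)        = 4096
	 * so exactly one block is reported past 'logical' */
	length = (size - logical + bs - 1) & ~(bs - 1);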
@@ -3318,10 +3308,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3318 * Walk the extent tree gathering extent information. 3308 * Walk the extent tree gathering extent information.
3319 * ext4_ext_fiemap_cb will push extents back to user. 3309 * ext4_ext_fiemap_cb will push extents back to user.
3320 */ 3310 */
3321 down_write(&EXT4_I(inode)->i_data_sem); 3311 down_read(&EXT4_I(inode)->i_data_sem);
3322 error = ext4_ext_walk_space(inode, start_blk, len_blks, 3312 error = ext4_ext_walk_space(inode, start_blk, len_blks,
3323 ext4_ext_fiemap_cb, fieinfo); 3313 ext4_ext_fiemap_cb, fieinfo);
3324 up_write(&EXT4_I(inode)->i_data_sem); 3314 up_read(&EXT4_I(inode)->i_data_sem);
3325 } 3315 }
3326 3316
3327 return error; 3317 return error;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 588af8c77246..3f1873fef1c6 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -21,6 +21,8 @@
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/jbd2.h> 23#include <linux/jbd2.h>
24#include <linux/mount.h>
25#include <linux/path.h>
24#include "ext4.h" 26#include "ext4.h"
25#include "ext4_jbd2.h" 27#include "ext4_jbd2.h"
26#include "xattr.h" 28#include "xattr.h"
@@ -145,6 +147,38 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
145 return 0; 147 return 0;
146} 148}
147 149
150static int ext4_file_open(struct inode * inode, struct file * filp)
151{
152 struct super_block *sb = inode->i_sb;
153 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
154 struct vfsmount *mnt = filp->f_path.mnt;
155 struct path path;
156 char buf[64], *cp;
157
158 if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
159 !(sb->s_flags & MS_RDONLY))) {
160 sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
161 /*
162 * Sample where the filesystem has been mounted and
163 * store it in the superblock for sysadmin convenience
164 * when trying to sort through large numbers of block
165 * devices or filesystem images.
166 */
167 memset(buf, 0, sizeof(buf));
168 path.mnt = mnt->mnt_parent;
169 path.dentry = mnt->mnt_mountpoint;
170 path_get(&path);
171 cp = d_path(&path, buf, sizeof(buf));
172 path_put(&path);
173 if (!IS_ERR(cp)) {
174 memcpy(sbi->s_es->s_last_mounted, cp,
175 sizeof(sbi->s_es->s_last_mounted));
176 sb->s_dirt = 1;
177 }
178 }
179 return generic_file_open(inode, filp);
180}
181
148const struct file_operations ext4_file_operations = { 182const struct file_operations ext4_file_operations = {
149 .llseek = generic_file_llseek, 183 .llseek = generic_file_llseek,
150 .read = do_sync_read, 184 .read = do_sync_read,
@@ -156,7 +190,7 @@ const struct file_operations ext4_file_operations = {
156 .compat_ioctl = ext4_compat_ioctl, 190 .compat_ioctl = ext4_compat_ioctl,
157#endif 191#endif
158 .mmap = ext4_file_mmap, 192 .mmap = ext4_file_mmap,
159 .open = generic_file_open, 193 .open = ext4_file_open,
160 .release = ext4_release_file, 194 .release = ext4_release_file,
161 .fsync = ext4_sync_file, 195 .fsync = ext4_sync_file,
162 .splice_read = generic_file_splice_read, 196 .splice_read = generic_file_splice_read,
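The IS_ERR() check in the new ext4_file_open() is load-bearing because of how d_path() behaves: it assembles the path backwards from the end of the caller's buffer and returns a pointer into that buffer, or an ERR_PTR when the name does not fit. A minimal sketch of the contract (dst is illustrative, not from the patch):

	char buf[64], *cp;

	cp = d_path(&path, buf, sizeof(buf));	/* fills from the END of buf */
	if (IS_ERR(cp))
		return;				/* e.g. ERR_PTR(-ENAMETOOLONG) */
	/* cp points somewhere inside buf; buf itself is not the string start */
	memcpy(dst, cp, strlen(cp) + 1);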
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 5afe4370840b..83cf6415f599 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,10 +28,12 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/jbd2.h> 29#include <linux/jbd2.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/marker.h> 31
32#include "ext4.h" 32#include "ext4.h"
33#include "ext4_jbd2.h" 33#include "ext4_jbd2.h"
34 34
35#include <trace/events/ext4.h>
36
35/* 37/*
36 * akpm: A new design for ext4_sync_file(). 38 * akpm: A new design for ext4_sync_file().
37 * 39 *
@@ -52,9 +54,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
52 54
53 J_ASSERT(ext4_journal_current_handle() == NULL); 55 J_ASSERT(ext4_journal_current_handle() == NULL);
54 56
55 trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld", 57 trace_ext4_sync_file(file, dentry, datasync);
56 inode->i_sb->s_id, datasync, inode->i_ino,
57 dentry->d_parent->d_inode->i_ino);
58 58
59 /* 59 /*
60 * data=writeback: 60 * data=writeback:
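This is one instance of a conversion that repeats through the rest of the diff: free-form trace_mark() format strings become static tracepoints declared once in include/trace/events/ext4.h. For reference, a condensed sketch of what such a declaration looks like (the field layout below is illustrative, not copied from the header):

	TRACE_EVENT(ext4_sync_file,
		TP_PROTO(struct file *file, struct dentry *dentry, int datasync),
		TP_ARGS(file, dentry, datasync),
		TP_STRUCT__entry(
			__field(dev_t,	dev)
			__field(ino_t,	ino)
			__field(int,	datasync)
		),
		TP_fast_assign(
			__entry->dev	  = dentry->d_inode->i_sb->s_dev;
			__entry->ino	  = dentry->d_inode->i_ino;
			__entry->datasync = datasync;
		),
		TP_printk("dev %d,%d ino %lu datasync %d",
			  MAJOR(__entry->dev), MINOR(__entry->dev),
			  (unsigned long) __entry->ino, __entry->datasync)
	);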
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
deleted file mode 100644
index c2c0a8d06d0e..000000000000
--- a/fs/ext4/group.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * linux/fs/ext4/group.h
3 *
4 * Copyright (C) 2007 Cluster File Systems, Inc
5 *
6 * Author: Andreas Dilger <adilger@clusterfs.com>
7 */
8
9#ifndef _LINUX_EXT4_GROUP_H
10#define _LINUX_EXT4_GROUP_H
11
12extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
13 struct ext4_group_desc *gdp);
14extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
15 struct ext4_group_desc *gdp);
16struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
17 ext4_group_t block_group);
18extern unsigned ext4_init_block_bitmap(struct super_block *sb,
19 struct buffer_head *bh,
20 ext4_group_t group,
21 struct ext4_group_desc *desc);
22#define ext4_free_blocks_after_init(sb, group, desc) \
23 ext4_init_block_bitmap(sb, NULL, group, desc)
24extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
25 struct buffer_head *bh,
26 ext4_group_t group,
27 struct ext4_group_desc *desc);
28extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
29#endif /* _LINUX_EXT4_GROUP_H */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f18e0a08a6b5..29e6dc7299b8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -23,11 +23,13 @@
23#include <linux/bitops.h> 23#include <linux/bitops.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <asm/byteorder.h> 25#include <asm/byteorder.h>
26
26#include "ext4.h" 27#include "ext4.h"
27#include "ext4_jbd2.h" 28#include "ext4_jbd2.h"
28#include "xattr.h" 29#include "xattr.h"
29#include "acl.h" 30#include "acl.h"
30#include "group.h" 31
32#include <trace/events/ext4.h>
31 33
32/* 34/*
33 * ialloc.c contains the inodes allocation and deallocation routines 35 * ialloc.c contains the inodes allocation and deallocation routines
@@ -123,16 +125,16 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
123 unlock_buffer(bh); 125 unlock_buffer(bh);
124 return bh; 126 return bh;
125 } 127 }
126 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 128 ext4_lock_group(sb, block_group);
127 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 129 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
128 ext4_init_inode_bitmap(sb, bh, block_group, desc); 130 ext4_init_inode_bitmap(sb, bh, block_group, desc);
129 set_bitmap_uptodate(bh); 131 set_bitmap_uptodate(bh);
130 set_buffer_uptodate(bh); 132 set_buffer_uptodate(bh);
131 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 133 ext4_unlock_group(sb, block_group);
132 unlock_buffer(bh); 134 unlock_buffer(bh);
133 return bh; 135 return bh;
134 } 136 }
135 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 137 ext4_unlock_group(sb, block_group);
136 if (buffer_uptodate(bh)) { 138 if (buffer_uptodate(bh)) {
137 /* 139 /*
138 * if not uninit, and bh is uptodate, 140
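sb_bgl_lock() callers throughout ialloc.c (and below) move to ext4_lock_group()/ext4_unlock_group(). The wrappers are roughly the following (a sketch of the helpers this series adds to ext4.h; the underlying per-group spinlocks are unchanged):

	static inline spinlock_t *
	ext4_group_lock_ptr(struct super_block *sb, ext4_group_t group)
	{
		return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
	}

	static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
	{
		spin_lock(ext4_group_lock_ptr(sb, group));
	}

	static inline void ext4_unlock_group(struct super_block *sb, ext4_group_t group)
	{
		spin_unlock(ext4_group_lock_ptr(sb, group));
	}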
@@ -209,11 +211,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
209 211
210 ino = inode->i_ino; 212 ino = inode->i_ino;
211 ext4_debug("freeing inode %lu\n", ino); 213 ext4_debug("freeing inode %lu\n", ino);
212 trace_mark(ext4_free_inode, 214 trace_ext4_free_inode(inode);
213 "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
214 sb->s_id, inode->i_ino, inode->i_mode,
215 (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
216 (unsigned long long) inode->i_blocks);
217 215
218 /* 216 /*
219 * Note: we must free any quota before locking the superblock, 217 * Note: we must free any quota before locking the superblock,
@@ -247,9 +245,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
247 goto error_return; 245 goto error_return;
248 246
249 /* Ok, now we can actually update the inode bitmaps.. */ 247 /* Ok, now we can actually update the inode bitmaps.. */
250 spin_lock(sb_bgl_lock(sbi, block_group)); 248 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
251 cleared = ext4_clear_bit(bit, bitmap_bh->b_data); 249 bit, bitmap_bh->b_data);
252 spin_unlock(sb_bgl_lock(sbi, block_group));
253 if (!cleared) 250 if (!cleared)
254 ext4_error(sb, "ext4_free_inode", 251 ext4_error(sb, "ext4_free_inode",
255 "bit already cleared for inode %lu", ino); 252 "bit already cleared for inode %lu", ino);
@@ -261,7 +258,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
261 if (fatal) goto error_return; 258 if (fatal) goto error_return;
262 259
263 if (gdp) { 260 if (gdp) {
264 spin_lock(sb_bgl_lock(sbi, block_group)); 261 ext4_lock_group(sb, block_group);
265 count = ext4_free_inodes_count(sb, gdp) + 1; 262 count = ext4_free_inodes_count(sb, gdp) + 1;
266 ext4_free_inodes_set(sb, gdp, count); 263 ext4_free_inodes_set(sb, gdp, count);
267 if (is_directory) { 264 if (is_directory) {
@@ -277,7 +274,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
277 } 274 }
278 gdp->bg_checksum = ext4_group_desc_csum(sbi, 275 gdp->bg_checksum = ext4_group_desc_csum(sbi,
279 block_group, gdp); 276 block_group, gdp);
280 spin_unlock(sb_bgl_lock(sbi, block_group)); 277 ext4_unlock_group(sb, block_group);
281 percpu_counter_inc(&sbi->s_freeinodes_counter); 278 percpu_counter_inc(&sbi->s_freeinodes_counter);
282 if (is_directory) 279 if (is_directory)
283 percpu_counter_dec(&sbi->s_dirs_counter); 280 percpu_counter_dec(&sbi->s_dirs_counter);
@@ -316,7 +313,7 @@ error_return:
316static int find_group_dir(struct super_block *sb, struct inode *parent, 313static int find_group_dir(struct super_block *sb, struct inode *parent,
317 ext4_group_t *best_group) 314 ext4_group_t *best_group)
318{ 315{
319 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 316 ext4_group_t ngroups = ext4_get_groups_count(sb);
320 unsigned int freei, avefreei; 317 unsigned int freei, avefreei;
321 struct ext4_group_desc *desc, *best_desc = NULL; 318 struct ext4_group_desc *desc, *best_desc = NULL;
322 ext4_group_t group; 319 ext4_group_t group;
@@ -349,11 +346,10 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
349{ 346{
350 struct ext4_sb_info *sbi = EXT4_SB(sb); 347 struct ext4_sb_info *sbi = EXT4_SB(sb);
351 struct ext4_group_desc *desc; 348 struct ext4_group_desc *desc;
352 struct buffer_head *bh;
353 struct flex_groups *flex_group = sbi->s_flex_groups; 349 struct flex_groups *flex_group = sbi->s_flex_groups;
354 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 350 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
355 ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); 351 ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
356 ext4_group_t ngroups = sbi->s_groups_count; 352 ext4_group_t ngroups = ext4_get_groups_count(sb);
357 int flex_size = ext4_flex_bg_size(sbi); 353 int flex_size = ext4_flex_bg_size(sbi);
358 ext4_group_t best_flex = parent_fbg_group; 354 ext4_group_t best_flex = parent_fbg_group;
359 int blocks_per_flex = sbi->s_blocks_per_group * flex_size; 355 int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
@@ -362,7 +358,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
362 ext4_group_t n_fbg_groups; 358 ext4_group_t n_fbg_groups;
363 ext4_group_t i; 359 ext4_group_t i;
364 360
365 n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> 361 n_fbg_groups = (ngroups + flex_size - 1) >>
366 sbi->s_log_groups_per_flex; 362 sbi->s_log_groups_per_flex;
367 363
368find_close_to_parent: 364find_close_to_parent:
@@ -404,7 +400,7 @@ find_close_to_parent:
404found_flexbg: 400found_flexbg:
405 for (i = best_flex * flex_size; i < ngroups && 401 for (i = best_flex * flex_size; i < ngroups &&
406 i < (best_flex + 1) * flex_size; i++) { 402 i < (best_flex + 1) * flex_size; i++) {
407 desc = ext4_get_group_desc(sb, i, &bh); 403 desc = ext4_get_group_desc(sb, i, NULL);
408 if (ext4_free_inodes_count(sb, desc)) { 404 if (ext4_free_inodes_count(sb, desc)) {
409 *best_group = i; 405 *best_group = i;
410 goto out; 406 goto out;
@@ -474,24 +470,27 @@ void get_orlov_stats(struct super_block *sb, ext4_group_t g,
474 */ 470 */
475 471
476static int find_group_orlov(struct super_block *sb, struct inode *parent, 472static int find_group_orlov(struct super_block *sb, struct inode *parent,
477 ext4_group_t *group, int mode) 473 ext4_group_t *group, int mode,
474 const struct qstr *qstr)
478{ 475{
479 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 476 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
480 struct ext4_sb_info *sbi = EXT4_SB(sb); 477 struct ext4_sb_info *sbi = EXT4_SB(sb);
481 ext4_group_t ngroups = sbi->s_groups_count; 478 ext4_group_t real_ngroups = ext4_get_groups_count(sb);
482 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 479 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
483 unsigned int freei, avefreei; 480 unsigned int freei, avefreei;
484 ext4_fsblk_t freeb, avefreeb; 481 ext4_fsblk_t freeb, avefreeb;
485 unsigned int ndirs; 482 unsigned int ndirs;
486 int max_dirs, min_inodes; 483 int max_dirs, min_inodes;
487 ext4_grpblk_t min_blocks; 484 ext4_grpblk_t min_blocks;
488 ext4_group_t i, grp, g; 485 ext4_group_t i, grp, g, ngroups;
489 struct ext4_group_desc *desc; 486 struct ext4_group_desc *desc;
490 struct orlov_stats stats; 487 struct orlov_stats stats;
491 int flex_size = ext4_flex_bg_size(sbi); 488 int flex_size = ext4_flex_bg_size(sbi);
489 struct dx_hash_info hinfo;
492 490
491 ngroups = real_ngroups;
493 if (flex_size > 1) { 492 if (flex_size > 1) {
494 ngroups = (ngroups + flex_size - 1) >> 493 ngroups = (real_ngroups + flex_size - 1) >>
495 sbi->s_log_groups_per_flex; 494 sbi->s_log_groups_per_flex;
496 parent_group >>= sbi->s_log_groups_per_flex; 495 parent_group >>= sbi->s_log_groups_per_flex;
497 } 496 }
@@ -509,7 +508,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
509 int best_ndir = inodes_per_group; 508 int best_ndir = inodes_per_group;
510 int ret = -1; 509 int ret = -1;
511 510
512 get_random_bytes(&grp, sizeof(grp)); 511 if (qstr) {
512 hinfo.hash_version = DX_HASH_HALF_MD4;
513 hinfo.seed = sbi->s_hash_seed;
514 ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
515 grp = hinfo.hash;
516 } else
517 get_random_bytes(&grp, sizeof(grp));
513 parent_group = (unsigned)grp % ngroups; 518 parent_group = (unsigned)grp % ngroups;
514 for (i = 0; i < ngroups; i++) { 519 for (i = 0; i < ngroups; i++) {
515 g = (parent_group + i) % ngroups; 520 g = (parent_group + i) % ngroups;
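Hashing the new name makes Orlov's starting group deterministic: creating the same directory name always begins the search in the same group, where get_random_bytes() scattered it before. The kernel uses half-MD4 through ext4fs_dirhash(); any stable hash gives the same property, e.g. (standalone illustration, djb2 standing in for half-MD4):

	/* illustration only, not the kernel's hash */
	static unsigned pick_start_group(const char *name, size_t len,
					 unsigned ngroups)
	{
		unsigned h = 5381;			/* djb2 seed */
		while (len--)
			h = h * 33 + (unsigned char)*name++;
		return h % ngroups;	/* equal names, equal start group */
	}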
@@ -543,7 +548,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
543 */ 548 */
544 grp *= flex_size; 549 grp *= flex_size;
545 for (i = 0; i < flex_size; i++) { 550 for (i = 0; i < flex_size; i++) {
546 if (grp+i >= sbi->s_groups_count) 551 if (grp+i >= real_ngroups)
547 break; 552 break;
548 desc = ext4_get_group_desc(sb, grp+i, NULL); 553 desc = ext4_get_group_desc(sb, grp+i, NULL);
549 if (desc && ext4_free_inodes_count(sb, desc)) { 554 if (desc && ext4_free_inodes_count(sb, desc)) {
@@ -583,7 +588,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
583 } 588 }
584 589
585fallback: 590fallback:
586 ngroups = sbi->s_groups_count; 591 ngroups = real_ngroups;
587 avefreei = freei / ngroups; 592 avefreei = freei / ngroups;
588fallback_retry: 593fallback_retry:
589 parent_group = EXT4_I(parent)->i_block_group; 594 parent_group = EXT4_I(parent)->i_block_group;
@@ -613,9 +618,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
613 ext4_group_t *group, int mode) 618 ext4_group_t *group, int mode)
614{ 619{
615 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 620 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
616 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 621 ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
617 struct ext4_group_desc *desc; 622 struct ext4_group_desc *desc;
618 ext4_group_t i, last;
619 int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); 623 int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
620 624
621 /* 625 /*
@@ -653,7 +657,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
653 *group = parent_group + flex_size; 657 *group = parent_group + flex_size;
654 if (*group > ngroups) 658 if (*group > ngroups)
655 *group = 0; 659 *group = 0;
656 return find_group_orlov(sb, parent, group, mode); 660 return find_group_orlov(sb, parent, group, mode, 0);
657 } 661 }
658 662
659 /* 663 /*
@@ -708,10 +712,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
708 712
709/* 713/*
710 * claim the inode from the inode bitmap. If the group 714 * claim the inode from the inode bitmap. If the group
711 * is uninit we need to take the groups's sb_bgl_lock 715 * is uninit, we need to take the group's ext4_group_lock
712 * and clear the uninit flag. The inode bitmap update 716 * and clear the uninit flag. The inode bitmap update
713 * and group desc uninit flag clear should be done 717 * and group desc uninit flag clear should be done
714 * after holding sb_bgl_lock so that ext4_read_inode_bitmap 718 * after holding ext4_group_lock so that ext4_read_inode_bitmap
715 * doesn't race with the ext4_claim_inode 719 * doesn't race with the ext4_claim_inode
716 */ 720 */
717static int ext4_claim_inode(struct super_block *sb, 721static int ext4_claim_inode(struct super_block *sb,
@@ -722,7 +726,7 @@ static int ext4_claim_inode(struct super_block *sb,
722 struct ext4_sb_info *sbi = EXT4_SB(sb); 726 struct ext4_sb_info *sbi = EXT4_SB(sb);
723 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); 727 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
724 728
725 spin_lock(sb_bgl_lock(sbi, group)); 729 ext4_lock_group(sb, group);
726 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { 730 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
727 /* not a free inode */ 731 /* not a free inode */
728 retval = 1; 732 retval = 1;
@@ -731,7 +735,7 @@ static int ext4_claim_inode(struct super_block *sb,
731 ino++; 735 ino++;
732 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || 736 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
733 ino > EXT4_INODES_PER_GROUP(sb)) { 737 ino > EXT4_INODES_PER_GROUP(sb)) {
734 spin_unlock(sb_bgl_lock(sbi, group)); 738 ext4_unlock_group(sb, group);
735 ext4_error(sb, __func__, 739 ext4_error(sb, __func__,
736 "reserved inode or inode > inodes count - " 740 "reserved inode or inode > inodes count - "
737 "block_group = %u, inode=%lu", group, 741 "block_group = %u, inode=%lu", group,
@@ -780,7 +784,7 @@ static int ext4_claim_inode(struct super_block *sb,
780 } 784 }
781 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 785 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
782err_ret: 786err_ret:
783 spin_unlock(sb_bgl_lock(sbi, group)); 787 ext4_unlock_group(sb, group);
784 return retval; 788 return retval;
785} 789}
786 790
@@ -794,16 +798,16 @@ err_ret:
794 * For other inodes, search forward from the parent directory's block 798 * For other inodes, search forward from the parent directory's block
795 * group to find a free inode. 799 * group to find a free inode.
796 */ 800 */
797struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) 801struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
802 const struct qstr *qstr, __u32 goal)
798{ 803{
799 struct super_block *sb; 804 struct super_block *sb;
800 struct buffer_head *inode_bitmap_bh = NULL; 805 struct buffer_head *inode_bitmap_bh = NULL;
801 struct buffer_head *group_desc_bh; 806 struct buffer_head *group_desc_bh;
802 ext4_group_t group = 0; 807 ext4_group_t ngroups, group = 0;
803 unsigned long ino = 0; 808 unsigned long ino = 0;
804 struct inode *inode; 809 struct inode *inode;
805 struct ext4_group_desc *gdp = NULL; 810 struct ext4_group_desc *gdp = NULL;
806 struct ext4_super_block *es;
807 struct ext4_inode_info *ei; 811 struct ext4_inode_info *ei;
808 struct ext4_sb_info *sbi; 812 struct ext4_sb_info *sbi;
809 int ret2, err = 0; 813 int ret2, err = 0;
@@ -818,15 +822,23 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
818 return ERR_PTR(-EPERM); 822 return ERR_PTR(-EPERM);
819 823
820 sb = dir->i_sb; 824 sb = dir->i_sb;
821 trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, 825 ngroups = ext4_get_groups_count(sb);
822 dir->i_ino, mode); 826 trace_ext4_request_inode(dir, mode);
823 inode = new_inode(sb); 827 inode = new_inode(sb);
824 if (!inode) 828 if (!inode)
825 return ERR_PTR(-ENOMEM); 829 return ERR_PTR(-ENOMEM);
826 ei = EXT4_I(inode); 830 ei = EXT4_I(inode);
827
828 sbi = EXT4_SB(sb); 831 sbi = EXT4_SB(sb);
829 es = sbi->s_es; 832
833 if (!goal)
834 goal = sbi->s_inode_goal;
835
836 if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
837 group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
838 ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
839 ret2 = 0;
840 goto got_group;
841 }
830 842
831 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { 843 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
832 ret2 = find_group_flex(sb, dir, &group); 844 ret2 = find_group_flex(sb, dir, &group);
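The new goal fast path (and the sbi->s_inode_goal default it falls back to) maps a 1-based inode number straight onto a group and a bitmap offset, skipping the allocator heuristics entirely. The arithmetic, with a worked example:

	/* EXT4_INODES_PER_GROUP(sb) == 8192, goal == 12345:
	 *   group = (12345 - 1) / 8192 = 1
	 *   ino   = (12345 - 1) % 8192 = 4152
	 * i.e. try bit 4152 of block group 1's inode bitmap first */
	group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
	ino   = (goal - 1) % EXT4_INODES_PER_GROUP(sb);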
@@ -846,7 +858,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
846 if (test_opt(sb, OLDALLOC)) 858 if (test_opt(sb, OLDALLOC))
847 ret2 = find_group_dir(sb, dir, &group); 859 ret2 = find_group_dir(sb, dir, &group);
848 else 860 else
849 ret2 = find_group_orlov(sb, dir, &group, mode); 861 ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
850 } else 862 } else
851 ret2 = find_group_other(sb, dir, &group, mode); 863 ret2 = find_group_other(sb, dir, &group, mode);
852 864
@@ -856,7 +868,7 @@ got_group:
856 if (ret2 == -1) 868 if (ret2 == -1)
857 goto out; 869 goto out;
858 870
859 for (i = 0; i < sbi->s_groups_count; i++) { 871 for (i = 0; i < ngroups; i++, ino = 0) {
860 err = -EIO; 872 err = -EIO;
861 873
862 gdp = ext4_get_group_desc(sb, group, &group_desc_bh); 874 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
@@ -868,8 +880,6 @@ got_group:
868 if (!inode_bitmap_bh) 880 if (!inode_bitmap_bh)
869 goto fail; 881 goto fail;
870 882
871 ino = 0;
872
873repeat_in_this_group: 883repeat_in_this_group:
874 ino = ext4_find_next_zero_bit((unsigned long *) 884 ino = ext4_find_next_zero_bit((unsigned long *)
875 inode_bitmap_bh->b_data, 885 inode_bitmap_bh->b_data,
@@ -917,7 +927,7 @@ repeat_in_this_group:
917 * group descriptor metadata has not yet been updated. 927 * group descriptor metadata has not yet been updated.
918 * So we just go onto the next blockgroup. 928 * So we just go onto the next blockgroup.
919 */ 929 */
920 if (++group == sbi->s_groups_count) 930 if (++group == ngroups)
921 group = 0; 931 group = 0;
922 } 932 }
923 err = -ENOSPC; 933 err = -ENOSPC;
@@ -938,7 +948,7 @@ got:
938 } 948 }
939 949
940 free = 0; 950 free = 0;
941 spin_lock(sb_bgl_lock(sbi, group)); 951 ext4_lock_group(sb, group);
942 /* recheck and clear flag under lock if we still need to */ 952 /* recheck and clear flag under lock if we still need to */
943 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 953 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
944 free = ext4_free_blocks_after_init(sb, group, gdp); 954 free = ext4_free_blocks_after_init(sb, group, gdp);
@@ -947,7 +957,7 @@ got:
947 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, 957 gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
948 gdp); 958 gdp);
949 } 959 }
950 spin_unlock(sb_bgl_lock(sbi, group)); 960 ext4_unlock_group(sb, group);
951 961
952 /* Don't need to dirty bitmap block if we didn't change it */ 962 /* Don't need to dirty bitmap block if we didn't change it */
953 if (free) { 963 if (free) {
@@ -1052,8 +1062,7 @@ got:
1052 } 1062 }
1053 1063
1054 ext4_debug("allocating inode %lu\n", inode->i_ino); 1064 ext4_debug("allocating inode %lu\n", inode->i_ino);
1055 trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d", 1065 trace_ext4_allocate_inode(inode, dir, mode);
1056 sb->s_id, inode->i_ino, dir->i_ino, mode);
1057 goto really_out; 1066 goto really_out;
1058fail: 1067fail:
1059 ext4_std_error(sb, err); 1068 ext4_std_error(sb, err);
@@ -1158,7 +1167,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1158{ 1167{
1159 unsigned long desc_count; 1168 unsigned long desc_count;
1160 struct ext4_group_desc *gdp; 1169 struct ext4_group_desc *gdp;
1161 ext4_group_t i; 1170 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
1162#ifdef EXT4FS_DEBUG 1171#ifdef EXT4FS_DEBUG
1163 struct ext4_super_block *es; 1172 struct ext4_super_block *es;
1164 unsigned long bitmap_count, x; 1173 unsigned long bitmap_count, x;
@@ -1168,7 +1177,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1168 desc_count = 0; 1177 desc_count = 0;
1169 bitmap_count = 0; 1178 bitmap_count = 0;
1170 gdp = NULL; 1179 gdp = NULL;
1171 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1180 for (i = 0; i < ngroups; i++) {
1172 gdp = ext4_get_group_desc(sb, i, NULL); 1181 gdp = ext4_get_group_desc(sb, i, NULL);
1173 if (!gdp) 1182 if (!gdp)
1174 continue; 1183 continue;
@@ -1190,7 +1199,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1190 return desc_count; 1199 return desc_count;
1191#else 1200#else
1192 desc_count = 0; 1201 desc_count = 0;
1193 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1202 for (i = 0; i < ngroups; i++) {
1194 gdp = ext4_get_group_desc(sb, i, NULL); 1203 gdp = ext4_get_group_desc(sb, i, NULL);
1195 if (!gdp) 1204 if (!gdp)
1196 continue; 1205 continue;
@@ -1205,9 +1214,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1205unsigned long ext4_count_dirs(struct super_block * sb) 1214unsigned long ext4_count_dirs(struct super_block * sb)
1206{ 1215{
1207 unsigned long count = 0; 1216 unsigned long count = 0;
1208 ext4_group_t i; 1217 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
1209 1218
1210 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1219 for (i = 0; i < ngroups; i++) {
1211 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); 1220 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1212 if (!gdp) 1221 if (!gdp)
1213 continue; 1222 continue;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2a9ffd528dd1..f9c642b22efa 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,11 +37,14 @@
37#include <linux/namei.h> 37#include <linux/namei.h>
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40
40#include "ext4_jbd2.h" 41#include "ext4_jbd2.h"
41#include "xattr.h" 42#include "xattr.h"
42#include "acl.h" 43#include "acl.h"
43#include "ext4_extents.h" 44#include "ext4_extents.h"
44 45
46#include <trace/events/ext4.h>
47
45#define MPAGE_DA_EXTENT_TAIL 0x01 48#define MPAGE_DA_EXTENT_TAIL 0x01
46 49
47static inline int ext4_begin_ordered_truncate(struct inode *inode, 50static inline int ext4_begin_ordered_truncate(struct inode *inode,
@@ -75,22 +78,20 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
75 * but there may still be a record of it in the journal, and that record 78 * but there may still be a record of it in the journal, and that record
76 * still needs to be revoked. 79 * still needs to be revoked.
77 * 80 *
78 * If the handle isn't valid we're not journaling so there's nothing to do. 81 * If the handle isn't valid we're not journaling, but we still need to
82 * call into ext4_journal_revoke() to put the buffer head.
79 */ 83 */
80int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 84int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
81 struct buffer_head *bh, ext4_fsblk_t blocknr) 85 struct buffer_head *bh, ext4_fsblk_t blocknr)
82{ 86{
83 int err; 87 int err;
84 88
85 if (!ext4_handle_valid(handle))
86 return 0;
87
88 might_sleep(); 89 might_sleep();
89 90
90 BUFFER_TRACE(bh, "enter"); 91 BUFFER_TRACE(bh, "enter");
91 92
92 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 93 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
93 "data mode %lx\n", 94 "data mode %x\n",
94 bh, is_metadata, inode->i_mode, 95 bh, is_metadata, inode->i_mode,
95 test_opt(inode->i_sb, DATA_FLAGS)); 96 test_opt(inode->i_sb, DATA_FLAGS));
96 97
@@ -329,8 +330,8 @@ static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
329 */ 330 */
330 331
331static int ext4_block_to_path(struct inode *inode, 332static int ext4_block_to_path(struct inode *inode,
332 ext4_lblk_t i_block, 333 ext4_lblk_t i_block,
333 ext4_lblk_t offsets[4], int *boundary) 334 ext4_lblk_t offsets[4], int *boundary)
334{ 335{
335 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); 336 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
336 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); 337 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
@@ -362,9 +363,9 @@ static int ext4_block_to_path(struct inode *inode,
362 final = ptrs; 363 final = ptrs;
363 } else { 364 } else {
364 ext4_warning(inode->i_sb, "ext4_block_to_path", 365 ext4_warning(inode->i_sb, "ext4_block_to_path",
365 "block %lu > max in inode %lu", 366 "block %lu > max in inode %lu",
366 i_block + direct_blocks + 367 i_block + direct_blocks +
367 indirect_blocks + double_blocks, inode->i_ino); 368 indirect_blocks + double_blocks, inode->i_ino);
368 } 369 }
369 if (boundary) 370 if (boundary)
370 *boundary = final - 1 - (i_block & (ptrs - 1)); 371 *boundary = final - 1 - (i_block & (ptrs - 1));
@@ -372,31 +373,32 @@ static int ext4_block_to_path(struct inode *inode,
372} 373}
373 374
374static int __ext4_check_blockref(const char *function, struct inode *inode, 375static int __ext4_check_blockref(const char *function, struct inode *inode,
375 __le32 *p, unsigned int max) { 376 __le32 *p, unsigned int max)
376 377{
377 unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
378 __le32 *bref = p; 378 __le32 *bref = p;
379 unsigned int blk;
380
379 while (bref < p+max) { 381 while (bref < p+max) {
380 if (unlikely(le32_to_cpu(*bref) >= maxblocks)) { 382 blk = le32_to_cpu(*bref++);
383 if (blk &&
384 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
385 blk, 1))) {
381 ext4_error(inode->i_sb, function, 386 ext4_error(inode->i_sb, function,
382 "block reference %u >= max (%u) " 387 "invalid block reference %u "
383 "in inode #%lu, offset=%d", 388 "in inode #%lu", blk, inode->i_ino);
384 le32_to_cpu(*bref), maxblocks, 389 return -EIO;
385 inode->i_ino, (int)(bref-p)); 390 }
386 return -EIO; 391 }
387 } 392 return 0;
388 bref++;
389 }
390 return 0;
391} 393}
392 394
393 395
394#define ext4_check_indirect_blockref(inode, bh) \ 396#define ext4_check_indirect_blockref(inode, bh) \
395 __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ 397 __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \
396 EXT4_ADDR_PER_BLOCK((inode)->i_sb)) 398 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
397 399
398#define ext4_check_inode_blockref(inode) \ 400#define ext4_check_inode_blockref(inode) \
399 __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ 401 __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \
400 EXT4_NDIR_BLOCKS) 402 EXT4_NDIR_BLOCKS)
401 403
402/** 404/**
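__ext4_check_blockref() now delegates to ext4_data_block_valid(), which rejects more than just out-of-range numbers. Roughly what that predicate does (a sketch; the real function also walks an rbtree of reserved system zones such as bitmaps and group descriptors):

	/* a block range is plausible file data only if it lies inside
	 * the filesystem and does not overlap known metadata */
	if (start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block) ||
	    start_blk + count > ext4_blocks_count(sbi->s_es))
		return 0;
	/* ...then check sbi->system_blks for metadata overlap... */
	return 1;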
@@ -446,7 +448,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
446 bh = sb_getblk(sb, le32_to_cpu(p->key)); 448 bh = sb_getblk(sb, le32_to_cpu(p->key));
447 if (unlikely(!bh)) 449 if (unlikely(!bh))
448 goto failure; 450 goto failure;
449 451
450 if (!bh_uptodate_or_lock(bh)) { 452 if (!bh_uptodate_or_lock(bh)) {
451 if (bh_submit_read(bh) < 0) { 453 if (bh_submit_read(bh) < 0) {
452 put_bh(bh); 454 put_bh(bh);
@@ -458,7 +460,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
458 goto failure; 460 goto failure;
459 } 461 }
460 } 462 }
461 463
462 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 464 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
463 /* Reader: end */ 465 /* Reader: end */
464 if (!p->key) 466 if (!p->key)
@@ -551,7 +553,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
551 * returns it. 553 * returns it.
552 */ 554 */
553static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 555static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
554 Indirect *partial) 556 Indirect *partial)
555{ 557{
556 /* 558 /*
557 * XXX need to get goal block from mballoc's data structures 559 * XXX need to get goal block from mballoc's data structures
@@ -573,7 +575,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
573 * direct and indirect blocks. 575 * direct and indirect blocks.
574 */ 576 */
575static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, 577static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
576 int blocks_to_boundary) 578 int blocks_to_boundary)
577{ 579{
578 unsigned int count = 0; 580 unsigned int count = 0;
579 581
@@ -609,9 +611,9 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
609 * direct blocks 611 * direct blocks
610 */ 612 */
611static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 613static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
612 ext4_lblk_t iblock, ext4_fsblk_t goal, 614 ext4_lblk_t iblock, ext4_fsblk_t goal,
613 int indirect_blks, int blks, 615 int indirect_blks, int blks,
614 ext4_fsblk_t new_blocks[4], int *err) 616 ext4_fsblk_t new_blocks[4], int *err)
615{ 617{
616 struct ext4_allocation_request ar; 618 struct ext4_allocation_request ar;
617 int target, i; 619 int target, i;
@@ -682,10 +684,10 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
682 } 684 }
683 if (!*err) { 685 if (!*err) {
684 if (target == blks) { 686 if (target == blks) {
685 /* 687 /*
686 * save the new block number 688 * save the new block number
687 * for the first direct block 689 * for the first direct block
688 */ 690 */
689 new_blocks[index] = current_block; 691 new_blocks[index] = current_block;
690 } 692 }
691 blk_allocated += ar.len; 693 blk_allocated += ar.len;
@@ -727,9 +729,9 @@ failed_out:
727 * as described above and return 0. 729 * as described above and return 0.
728 */ 730 */
729static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 731static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
730 ext4_lblk_t iblock, int indirect_blks, 732 ext4_lblk_t iblock, int indirect_blks,
731 int *blks, ext4_fsblk_t goal, 733 int *blks, ext4_fsblk_t goal,
732 ext4_lblk_t *offsets, Indirect *branch) 734 ext4_lblk_t *offsets, Indirect *branch)
733{ 735{
734 int blocksize = inode->i_sb->s_blocksize; 736 int blocksize = inode->i_sb->s_blocksize;
735 int i, n = 0; 737 int i, n = 0;
@@ -776,7 +778,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
776 * the chain to point to the new allocated 778 * the chain to point to the new allocated
777 * data blocks numbers 779 * data blocks numbers
778 */ 780 */
779 for (i=1; i < num; i++) 781 for (i = 1; i < num; i++)
780 *(branch[n].p + i) = cpu_to_le32(++current_block); 782 *(branch[n].p + i) = cpu_to_le32(++current_block);
781 } 783 }
782 BUFFER_TRACE(bh, "marking uptodate"); 784 BUFFER_TRACE(bh, "marking uptodate");
@@ -819,7 +821,8 @@ failed:
819 * chain to new block and return 0. 821 * chain to new block and return 0.
820 */ 822 */
821static int ext4_splice_branch(handle_t *handle, struct inode *inode, 823static int ext4_splice_branch(handle_t *handle, struct inode *inode,
822 ext4_lblk_t block, Indirect *where, int num, int blks) 824 ext4_lblk_t block, Indirect *where, int num,
825 int blks)
823{ 826{
824 int i; 827 int i;
825 int err = 0; 828 int err = 0;
@@ -851,10 +854,6 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
851 } 854 }
852 855
853 /* We are done with atomic stuff, now do the rest of housekeeping */ 856 /* We are done with atomic stuff, now do the rest of housekeeping */
854
855 inode->i_ctime = ext4_current_time(inode);
856 ext4_mark_inode_dirty(handle, inode);
857
858 /* had we spliced it onto indirect block? */ 857 /* had we spliced it onto indirect block? */
859 if (where->bh) { 858 if (where->bh) {
860 /* 859 /*
@@ -873,8 +872,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
873 } else { 872 } else {
874 /* 873 /*
875 * OK, we spliced it into the inode itself on a direct block. 874 * OK, we spliced it into the inode itself on a direct block.
876 * Inode was dirtied above.
877 */ 875 */
876 ext4_mark_inode_dirty(handle, inode);
878 jbd_debug(5, "splicing direct\n"); 877 jbd_debug(5, "splicing direct\n");
879 } 878 }
880 return err; 879 return err;
@@ -892,6 +891,10 @@ err_out:
892} 891}
893 892
894/* 893/*
894 * The ext4_ind_get_blocks() function handles non-extents inodes
895 * (i.e., using the traditional indirect/double-indirect i_blocks
896 * scheme) for ext4_get_blocks().
897 *
895 * Allocation strategy is simple: if we have to allocate something, we will 898 * Allocation strategy is simple: if we have to allocate something, we will
896 * have to go the whole way to leaf. So let's do it before attaching anything 899 * have to go the whole way to leaf. So let's do it before attaching anything
897 * to tree, set linkage between the newborn blocks, write them if sync is 900 * to tree, set linkage between the newborn blocks, write them if sync is
@@ -909,15 +912,16 @@ err_out:
909 * return = 0, if plain lookup failed. 912 * return = 0, if plain lookup failed.
910 * return < 0, error case. 913 * return < 0, error case.
911 * 914 *
912 * 915 * The ext4_ind_get_blocks() function should be called with
913 * Need to be called with 916 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
914 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 917 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
915 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) 918 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
919 * blocks.
916 */ 920 */
917static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 921static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
918 ext4_lblk_t iblock, unsigned int maxblocks, 922 ext4_lblk_t iblock, unsigned int maxblocks,
919 struct buffer_head *bh_result, 923 struct buffer_head *bh_result,
920 int create, int extend_disksize) 924 int flags)
921{ 925{
922 int err = -EIO; 926 int err = -EIO;
923 ext4_lblk_t offsets[4]; 927 ext4_lblk_t offsets[4];
@@ -927,16 +931,13 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
927 int indirect_blks; 931 int indirect_blks;
928 int blocks_to_boundary = 0; 932 int blocks_to_boundary = 0;
929 int depth; 933 int depth;
930 struct ext4_inode_info *ei = EXT4_I(inode);
931 int count = 0; 934 int count = 0;
932 ext4_fsblk_t first_block = 0; 935 ext4_fsblk_t first_block = 0;
933 loff_t disksize;
934
935 936
936 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 937 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
937 J_ASSERT(handle != NULL || create == 0); 938 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
938 depth = ext4_block_to_path(inode, iblock, offsets, 939 depth = ext4_block_to_path(inode, iblock, offsets,
939 &blocks_to_boundary); 940 &blocks_to_boundary);
940 941
941 if (depth == 0) 942 if (depth == 0)
942 goto out; 943 goto out;
@@ -963,7 +964,7 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
963 } 964 }
964 965
965 /* Next simple case - plain lookup or failed read of indirect block */ 966 /* Next simple case - plain lookup or failed read of indirect block */
966 if (!create || err == -EIO) 967 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
967 goto cleanup; 968 goto cleanup;
968 969
969 /* 970 /*
@@ -984,8 +985,8 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
984 * Block out ext4_truncate while we alter the tree 985 * Block out ext4_truncate while we alter the tree
985 */ 986 */
986 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, 987 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
987 &count, goal, 988 &count, goal,
988 offsets + (partial - chain), partial); 989 offsets + (partial - chain), partial);
989 990
990 /* 991 /*
991 * The ext4_splice_branch call will free and forget any buffers 992 * The ext4_splice_branch call will free and forget any buffers
@@ -996,20 +997,8 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
996 */ 997 */
997 if (!err) 998 if (!err)
998 err = ext4_splice_branch(handle, inode, iblock, 999 err = ext4_splice_branch(handle, inode, iblock,
999 partial, indirect_blks, count); 1000 partial, indirect_blks, count);
1000 /* 1001 else
1001 * i_disksize growing is protected by i_data_sem. Don't forget to
1002 * protect it if you're about to implement concurrent
1003 * ext4_get_block() -bzzz
1004 */
1005 if (!err && extend_disksize) {
1006 disksize = ((loff_t) iblock + count) << inode->i_blkbits;
1007 if (disksize > i_size_read(inode))
1008 disksize = i_size_read(inode);
1009 if (disksize > ei->i_disksize)
1010 ei->i_disksize = disksize;
1011 }
1012 if (err)
1013 goto cleanup; 1002 goto cleanup;
1014 1003
1015 set_buffer_new(bh_result); 1004 set_buffer_new(bh_result);
@@ -1120,8 +1109,23 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1120 ext4_discard_preallocations(inode); 1109 ext4_discard_preallocations(inode);
1121} 1110}
1122 1111
1112static int check_block_validity(struct inode *inode, sector_t logical,
1113 sector_t phys, int len)
1114{
1115 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1116 ext4_error(inode->i_sb, "check_block_validity",
1117 "inode #%lu logical block %llu mapped to %llu "
1118 "(size %d)", inode->i_ino,
1119 (unsigned long long) logical,
1120 (unsigned long long) phys, len);
1121 WARN_ON(1);
1122 return -EIO;
1123 }
1124 return 0;
1125}
1126
1123/* 1127/*
1124 * The ext4_get_blocks_wrap() function try to look up the requested blocks, 1128 * The ext4_get_blocks() function tries to look up the requested blocks,
1125 * and returns them directly if the blocks are already mapped. 1129
1126 * 1130 *
1127 * Otherwise it takes the write lock of the i_data_sem and allocates blocks 1131
@@ -1129,7 +1133,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1129 * mapped. 1133 * mapped.
1130 * 1134 *
1131 * If file type is extents based, it will call ext4_ext_get_blocks(), 1135 * If file type is extents based, it will call ext4_ext_get_blocks(),
1132 * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping 1136 * Otherwise, it calls ext4_ind_get_blocks() to handle indirect-mapping
1133 * based files 1137 * based files
1134 * 1138 *
1135 * On success, it returns the number of blocks being mapped or allocated. 1139
@@ -1142,9 +1146,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1142 * 1146 *
1143 * It returns the error in case of allocation failure. 1147 * It returns the error in case of allocation failure.
1144 */ 1148 */
1145int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 1149int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1146 unsigned int max_blocks, struct buffer_head *bh, 1150 unsigned int max_blocks, struct buffer_head *bh,
1147 int create, int extend_disksize, int flag) 1151 int flags)
1148{ 1152{
1149 int retval; 1153 int retval;
1150 1154
@@ -1152,21 +1156,28 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1152 clear_buffer_unwritten(bh); 1156 clear_buffer_unwritten(bh);
1153 1157
1154 /* 1158 /*
1155 * Try to see if we can get the block without requesting 1159 * Try to see if we can get the block without requesting a new
1156 * for new file system block. 1160 * file system block.
1157 */ 1161 */
1158 down_read((&EXT4_I(inode)->i_data_sem)); 1162 down_read((&EXT4_I(inode)->i_data_sem));
1159 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1163 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
1160 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1164 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
1161 bh, 0, 0); 1165 bh, 0);
1162 } else { 1166 } else {
1163 retval = ext4_get_blocks_handle(handle, 1167 retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
1164 inode, block, max_blocks, bh, 0, 0); 1168 bh, 0);
1165 } 1169 }
1166 up_read((&EXT4_I(inode)->i_data_sem)); 1170 up_read((&EXT4_I(inode)->i_data_sem));
1167 1171
1172 if (retval > 0 && buffer_mapped(bh)) {
1173 int ret = check_block_validity(inode, block,
1174 bh->b_blocknr, retval);
1175 if (ret != 0)
1176 return ret;
1177 }
1178
1168 /* If it is only a block(s) look up */ 1179 /* If it is only a block(s) look up */
1169 if (!create) 1180 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
1170 return retval; 1181 return retval;
1171 1182
1172 /* 1183 /*
@@ -1205,7 +1216,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1205 * let the underlying get_block() function know to 1216 * let the underlying get_block() function know to
1206 * avoid double accounting 1217 * avoid double accounting
1207 */ 1218 */
1208 if (flag) 1219 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1209 EXT4_I(inode)->i_delalloc_reserved_flag = 1; 1220 EXT4_I(inode)->i_delalloc_reserved_flag = 1;
1210 /* 1221 /*
1211 * We need to check for EXT4 here because migrate 1222 * We need to check for EXT4 here because migrate
@@ -1213,10 +1224,10 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1213 */ 1224 */
1214 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1225 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
1215 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1226 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
1216 bh, create, extend_disksize); 1227 bh, flags);
1217 } else { 1228 } else {
1218 retval = ext4_get_blocks_handle(handle, inode, block, 1229 retval = ext4_ind_get_blocks(handle, inode, block,
1219 max_blocks, bh, create, extend_disksize); 1230 max_blocks, bh, flags);
1220 1231
1221 if (retval > 0 && buffer_new(bh)) { 1232 if (retval > 0 && buffer_new(bh)) {
1222 /* 1233 /*
@@ -1229,18 +1240,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1229 } 1240 }
1230 } 1241 }
1231 1242
1232 if (flag) { 1243 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1233 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1244 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1234 /* 1245
1235 * Update reserved blocks/metadata blocks 1246 /*
1236 * after successful block allocation 1247 * Update reserved blocks/metadata blocks after successful
1237 * which were deferred till now 1248 * block allocation which had been deferred till now.
1238 */ 1249 */
1239 if ((retval > 0) && buffer_delay(bh)) 1250 if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
1240 ext4_da_update_reserve_space(inode, retval); 1251 ext4_da_update_reserve_space(inode, retval);
1241 }
1242 1252
1243 up_write((&EXT4_I(inode)->i_data_sem)); 1253 up_write((&EXT4_I(inode)->i_data_sem));
1254 if (retval > 0 && buffer_mapped(bh)) {
1255 int ret = check_block_validity(inode, block,
1256 bh->b_blocknr, retval);
1257 if (ret != 0)
1258 return ret;
1259 }
1244 return retval; 1260 return retval;
1245} 1261}
1246 1262
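Taken together these hunks give ext4_get_blocks() a single contract for every caller. An illustrative caller (not from the patch; consume_mapping() and handle_hole() are placeholders):

	struct buffer_head bh;
	int ret;

	bh.b_state = 0;		/* callers must clear the state bits first */
	ret = ext4_get_blocks(handle, inode, lblk, max_blocks, &bh,
			      EXT4_GET_BLOCKS_CREATE);
	if (ret > 0)
		consume_mapping(&bh, ret);  /* ret blocks mapped at bh.b_blocknr */
	else if (ret == 0)
		handle_hole(lblk);	    /* hole; only without _CREATE */
	else
		return ret;		    /* negative errno, e.g. -EIO, -ENOSPC */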
@@ -1268,8 +1284,8 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
1268 started = 1; 1284 started = 1;
1269 } 1285 }
1270 1286
1271 ret = ext4_get_blocks_wrap(handle, inode, iblock, 1287 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
1272 max_blocks, bh_result, create, 0, 0); 1288 create ? EXT4_GET_BLOCKS_CREATE : 0);
1273 if (ret > 0) { 1289 if (ret > 0) {
1274 bh_result->b_size = (ret << inode->i_blkbits); 1290 bh_result->b_size = (ret << inode->i_blkbits);
1275 ret = 0; 1291 ret = 0;
@@ -1288,17 +1304,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1288{ 1304{
1289 struct buffer_head dummy; 1305 struct buffer_head dummy;
1290 int fatal = 0, err; 1306 int fatal = 0, err;
1307 int flags = 0;
1291 1308
1292 J_ASSERT(handle != NULL || create == 0); 1309 J_ASSERT(handle != NULL || create == 0);
1293 1310
1294 dummy.b_state = 0; 1311 dummy.b_state = 0;
1295 dummy.b_blocknr = -1000; 1312 dummy.b_blocknr = -1000;
1296 buffer_trace_init(&dummy.b_history); 1313 buffer_trace_init(&dummy.b_history);
1297 err = ext4_get_blocks_wrap(handle, inode, block, 1, 1314 if (create)
1298 &dummy, create, 1, 0); 1315 flags |= EXT4_GET_BLOCKS_CREATE;
1316 err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
1299 /* 1317 /*
1300 * ext4_get_blocks_handle() returns number of blocks 1318 * ext4_get_blocks() returns number of blocks mapped. 0 in
1301 * mapped. 0 in case of a HOLE. 1319 * case of a HOLE.
1302 */ 1320 */
1303 if (err > 0) { 1321 if (err > 0) {
1304 if (err > 1) 1322 if (err > 1)
@@ -1385,8 +1403,7 @@ static int walk_page_buffers(handle_t *handle,
1385 1403
1386 for (bh = head, block_start = 0; 1404 for (bh = head, block_start = 0;
1387 ret == 0 && (bh != head || !block_start); 1405 ret == 0 && (bh != head || !block_start);
1388 block_start = block_end, bh = next) 1406 block_start = block_end, bh = next) {
1389 {
1390 next = bh->b_this_page; 1407 next = bh->b_this_page;
1391 block_end = block_start + blocksize; 1408 block_end = block_start + blocksize;
1392 if (block_end <= from || block_start >= to) { 1409 if (block_end <= from || block_start >= to) {
@@ -1427,7 +1444,7 @@ static int walk_page_buffers(handle_t *handle,
1427 * write. 1444 * write.
1428 */ 1445 */
1429static int do_journal_get_write_access(handle_t *handle, 1446static int do_journal_get_write_access(handle_t *handle,
1430 struct buffer_head *bh) 1447 struct buffer_head *bh)
1431{ 1448{
1432 if (!buffer_mapped(bh) || buffer_freed(bh)) 1449 if (!buffer_mapped(bh) || buffer_freed(bh))
1433 return 0; 1450 return 0;
@@ -1435,22 +1452,24 @@ static int do_journal_get_write_access(handle_t *handle,
1435} 1452}
1436 1453
1437static int ext4_write_begin(struct file *file, struct address_space *mapping, 1454static int ext4_write_begin(struct file *file, struct address_space *mapping,
1438 loff_t pos, unsigned len, unsigned flags, 1455 loff_t pos, unsigned len, unsigned flags,
1439 struct page **pagep, void **fsdata) 1456 struct page **pagep, void **fsdata)
1440{ 1457{
1441 struct inode *inode = mapping->host; 1458 struct inode *inode = mapping->host;
1442 int ret, needed_blocks = ext4_writepage_trans_blocks(inode); 1459 int ret, needed_blocks;
1443 handle_t *handle; 1460 handle_t *handle;
1444 int retries = 0; 1461 int retries = 0;
1445 struct page *page; 1462 struct page *page;
1446 pgoff_t index; 1463 pgoff_t index;
1447 unsigned from, to; 1464 unsigned from, to;
1448 1465
1449 trace_mark(ext4_write_begin, 1466 trace_ext4_write_begin(inode, pos, len, flags);
1450 "dev %s ino %lu pos %llu len %u flags %u", 1467 /*
1451 inode->i_sb->s_id, inode->i_ino, 1468 * Reserve one block more for addition to orphan list in case
1452 (unsigned long long) pos, len, flags); 1469 * we allocate blocks but write fails for some reason
1453 index = pos >> PAGE_CACHE_SHIFT; 1470 */
1471 needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
1472 index = pos >> PAGE_CACHE_SHIFT;
1454 from = pos & (PAGE_CACHE_SIZE - 1); 1473 from = pos & (PAGE_CACHE_SIZE - 1);
1455 to = from + len; 1474 to = from + len;
1456 1475
@@ -1483,15 +1502,30 @@ retry:
1483 1502
1484 if (ret) { 1503 if (ret) {
1485 unlock_page(page); 1504 unlock_page(page);
1486 ext4_journal_stop(handle);
1487 page_cache_release(page); 1505 page_cache_release(page);
1488 /* 1506 /*
1489 * block_write_begin may have instantiated a few blocks 1507 * block_write_begin may have instantiated a few blocks
1490 * outside i_size. Trim these off again. Don't need 1508 * outside i_size. Trim these off again. Don't need
1491 * i_size_read because we hold i_mutex. 1509 * i_size_read because we hold i_mutex.
1510 *
1511 * Add inode to orphan list in case we crash before
1512 * truncate finishes
1492 */ 1513 */
1493 if (pos + len > inode->i_size) 1514 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1494 vmtruncate(inode, inode->i_size); 1515 ext4_orphan_add(handle, inode);
1516
1517 ext4_journal_stop(handle);
1518 if (pos + len > inode->i_size) {
1519 ext4_truncate(inode);
1520 /*
1521 * If truncate failed early the inode might
1522 * still be on the orphan list; we need to
1523 * make sure the inode is removed from the
1524 * orphan list in that case.
1525 */
1526 if (inode->i_nlink)
1527 ext4_orphan_del(NULL, inode);
1528 }
1495 } 1529 }
1496 1530
1497 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1531 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
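The failure path above is the new common shape for short writes, and it recurs in the write_end variants below: park the inode on the orphan list while allocated blocks may stick out past i_size, truncate once the handle is closed, then unhook the inode if it is still live. Condensed from the hunk:

	if (pos + len > inode->i_size && ext4_can_truncate(inode))
		ext4_orphan_add(handle, inode);	/* recovery truncates after a crash */
	ext4_journal_stop(handle);
	if (pos + len > inode->i_size) {
		ext4_truncate(inode);		/* drop blocks beyond i_size */
		if (inode->i_nlink)		/* truncate may already have */
			ext4_orphan_del(NULL, inode);
	}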
@@ -1509,6 +1543,52 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1509 return ext4_handle_dirty_metadata(handle, NULL, bh); 1543 return ext4_handle_dirty_metadata(handle, NULL, bh);
1510} 1544}
1511 1545
1546static int ext4_generic_write_end(struct file *file,
1547 struct address_space *mapping,
1548 loff_t pos, unsigned len, unsigned copied,
1549 struct page *page, void *fsdata)
1550{
1551 int i_size_changed = 0;
1552 struct inode *inode = mapping->host;
1553 handle_t *handle = ext4_journal_current_handle();
1554
1555 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1556
1557 /*
1558 * No need to use i_size_read() here, the i_size
1559 * cannot change under us because we hold i_mutex.
1560 *
1561 * But it's important to update i_size while still holding page lock:
1562 * page writeout could otherwise come in and zero beyond i_size.
1563 */
1564 if (pos + copied > inode->i_size) {
1565 i_size_write(inode, pos + copied);
1566 i_size_changed = 1;
1567 }
1568
1569 if (pos + copied > EXT4_I(inode)->i_disksize) {
1570 /* We need to mark inode dirty even if
1571 * new_i_size is less than inode->i_size
1572 * but greater than i_disksize (hint: delalloc)
1573 */
1574 ext4_update_i_disksize(inode, (pos + copied));
1575 i_size_changed = 1;
1576 }
1577 unlock_page(page);
1578 page_cache_release(page);
1579
1580 /*
1581 * Don't mark the inode dirty under the page lock. First, that holds
1582 * the page lock longer than necessary. Second, it forces lock
1583 * ordering of page lock and transaction start for journaling
1584 * filesystems.
1585 */
1586 if (i_size_changed)
1587 ext4_mark_inode_dirty(handle, inode);
1588
1589 return copied;
1590}
1591
1512/* 1592/*
1513 * We need to pick up the new inode size which generic_commit_write gave us 1593 * We need to pick up the new inode size which generic_commit_write gave us
1514 * `file' can be NULL - eg, when called from page_symlink(). 1594 * `file' can be NULL - eg, when called from page_symlink().
@@ -1517,36 +1597,27 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1517 * buffers are managed internally. 1597 * buffers are managed internally.
1518 */ 1598 */
1519static int ext4_ordered_write_end(struct file *file, 1599static int ext4_ordered_write_end(struct file *file,
1520 struct address_space *mapping, 1600 struct address_space *mapping,
1521 loff_t pos, unsigned len, unsigned copied, 1601 loff_t pos, unsigned len, unsigned copied,
1522 struct page *page, void *fsdata) 1602 struct page *page, void *fsdata)
1523{ 1603{
1524 handle_t *handle = ext4_journal_current_handle(); 1604 handle_t *handle = ext4_journal_current_handle();
1525 struct inode *inode = mapping->host; 1605 struct inode *inode = mapping->host;
1526 int ret = 0, ret2; 1606 int ret = 0, ret2;
1527 1607
1528 trace_mark(ext4_ordered_write_end, 1608 trace_ext4_ordered_write_end(inode, pos, len, copied);
1529 "dev %s ino %lu pos %llu len %u copied %u",
1530 inode->i_sb->s_id, inode->i_ino,
1531 (unsigned long long) pos, len, copied);
1532 ret = ext4_jbd2_file_inode(handle, inode); 1609 ret = ext4_jbd2_file_inode(handle, inode);
1533 1610
1534 if (ret == 0) { 1611 if (ret == 0) {
1535 loff_t new_i_size; 1612 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1536
1537 new_i_size = pos + copied;
1538 if (new_i_size > EXT4_I(inode)->i_disksize) {
1539 ext4_update_i_disksize(inode, new_i_size);
1540 /* We need to mark inode dirty even if
1541			/* We need to mark inode dirty even if
1542			 * new_i_size is less than inode->i_size
1543			 * but greater than i_disksize. (hint: delalloc)
1543 */
1544 ext4_mark_inode_dirty(handle, inode);
1545 }
1546
1547 ret2 = generic_write_end(file, mapping, pos, len, copied,
1548 page, fsdata); 1613 page, fsdata);
1549 copied = ret2; 1614 copied = ret2;
1615 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1616 /* if we have allocated more blocks and copied
1617		 * less, we will have blocks allocated outside
1618		 * inode->i_size, so truncate them
1619 */
1620 ext4_orphan_add(handle, inode);
1550 if (ret2 < 0) 1621 if (ret2 < 0)
1551 ret = ret2; 1622 ret = ret2;
1552 } 1623 }
@@ -1554,36 +1625,41 @@ static int ext4_ordered_write_end(struct file *file,
1554 if (!ret) 1625 if (!ret)
1555 ret = ret2; 1626 ret = ret2;
1556 1627
1628 if (pos + len > inode->i_size) {
1629 ext4_truncate(inode);
1630 /*
1631 * If truncate failed early the inode might still be
1632 * on the orphan list; we need to make sure the inode
1633 * is removed from the orphan list in that case.
1634 */
1635 if (inode->i_nlink)
1636 ext4_orphan_del(NULL, inode);
1637 }
1638
1639
1557 return ret ? ret : copied; 1640 return ret ? ret : copied;
1558} 1641}
1559 1642
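All three write_end variants here repeat the same cleanup when the copy comes up short of the requested length. Below is a standalone model of that pattern; every type and helper is a stand-in for the corresponding ext4 call, and a comment marks where the journal handle is closed. The essential ordering: the orphan-list insertion happens inside the still-running transaction, the truncate only after it has been stopped, so a crash in between is repaired by orphan recovery.

    #include <stdio.h>

    /* Stand-ins, not ext4 code. */
    struct minode { long long i_size; int i_nlink; int on_orphan_list; };

    static void orphan_add(struct minode *in) { in->on_orphan_list = 1; }
    static void orphan_del(struct minode *in) { in->on_orphan_list = 0; }
    static void truncate_blocks(struct minode *in)
    {
            (void)in;       /* would drop blocks past i_size */
    }

    static unsigned write_end_pattern(struct minode *inode, long long pos,
                                      unsigned len, unsigned copied)
    {
            if (pos + copied > inode->i_size)
                    inode->i_size = pos + copied;

            /* Blocks for [pos, pos+len) may already be allocated; after a
             * short copy some lie beyond i_size. Add the inode to the
             * orphan list while the transaction is still open, so crash
             * recovery trims the excess if we never reach the truncate. */
            if (pos + len > inode->i_size)
                    orphan_add(inode);

            /* ...ext4_journal_stop(handle) happens here... */

            if (pos + len > inode->i_size) {
                    truncate_blocks(inode);
                    /* If truncate bailed out early, a live inode may still
                     * sit on the orphan list; take it back off. */
                    if (inode->i_nlink)
                            orphan_del(inode);
            }
            return copied;
    }

    int main(void)
    {
            struct minode in = { .i_size = 0, .i_nlink = 1 };

            printf("copied %u, orphan %d\n",
                   write_end_pattern(&in, 0, 4096, 1024), in.on_orphan_list);
            return 0;
    }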
1560static int ext4_writeback_write_end(struct file *file, 1643static int ext4_writeback_write_end(struct file *file,
1561 struct address_space *mapping, 1644 struct address_space *mapping,
1562 loff_t pos, unsigned len, unsigned copied, 1645 loff_t pos, unsigned len, unsigned copied,
1563 struct page *page, void *fsdata) 1646 struct page *page, void *fsdata)
1564{ 1647{
1565 handle_t *handle = ext4_journal_current_handle(); 1648 handle_t *handle = ext4_journal_current_handle();
1566 struct inode *inode = mapping->host; 1649 struct inode *inode = mapping->host;
1567 int ret = 0, ret2; 1650 int ret = 0, ret2;
1568 loff_t new_i_size;
1569 1651
1570 trace_mark(ext4_writeback_write_end, 1652 trace_ext4_writeback_write_end(inode, pos, len, copied);
1571 "dev %s ino %lu pos %llu len %u copied %u", 1653 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1572 inode->i_sb->s_id, inode->i_ino,
1573 (unsigned long long) pos, len, copied);
1574 new_i_size = pos + copied;
1575 if (new_i_size > EXT4_I(inode)->i_disksize) {
1576 ext4_update_i_disksize(inode, new_i_size);
1577 /* We need to mark inode dirty even if
1578 * new_i_size is less that inode->i_size
1579 * bu greater than i_disksize.(hint delalloc)
1580 */
1581 ext4_mark_inode_dirty(handle, inode);
1582 }
1583
1584 ret2 = generic_write_end(file, mapping, pos, len, copied,
1585 page, fsdata); 1654 page, fsdata);
1586 copied = ret2; 1655 copied = ret2;
1656 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1657 /* if we have allocated more blocks and copied
1658		 * less, we will have blocks allocated outside
1659		 * inode->i_size, so truncate them
1660 */
1661 ext4_orphan_add(handle, inode);
1662
1587 if (ret2 < 0) 1663 if (ret2 < 0)
1588 ret = ret2; 1664 ret = ret2;
1589 1665
@@ -1591,13 +1667,24 @@ static int ext4_writeback_write_end(struct file *file,
1591 if (!ret) 1667 if (!ret)
1592 ret = ret2; 1668 ret = ret2;
1593 1669
1670 if (pos + len > inode->i_size) {
1671 ext4_truncate(inode);
1672 /*
1673 * If truncate failed early the inode might still be
1674 * on the orphan list; we need to make sure the inode
1675 * is removed from the orphan list in that case.
1676 */
1677 if (inode->i_nlink)
1678 ext4_orphan_del(NULL, inode);
1679 }
1680
1594 return ret ? ret : copied; 1681 return ret ? ret : copied;
1595} 1682}
1596 1683
1597static int ext4_journalled_write_end(struct file *file, 1684static int ext4_journalled_write_end(struct file *file,
1598 struct address_space *mapping, 1685 struct address_space *mapping,
1599 loff_t pos, unsigned len, unsigned copied, 1686 loff_t pos, unsigned len, unsigned copied,
1600 struct page *page, void *fsdata) 1687 struct page *page, void *fsdata)
1601{ 1688{
1602 handle_t *handle = ext4_journal_current_handle(); 1689 handle_t *handle = ext4_journal_current_handle();
1603 struct inode *inode = mapping->host; 1690 struct inode *inode = mapping->host;
@@ -1606,10 +1693,7 @@ static int ext4_journalled_write_end(struct file *file,
1606 unsigned from, to; 1693 unsigned from, to;
1607 loff_t new_i_size; 1694 loff_t new_i_size;
1608 1695
1609 trace_mark(ext4_journalled_write_end, 1696 trace_ext4_journalled_write_end(inode, pos, len, copied);
1610 "dev %s ino %lu pos %llu len %u copied %u",
1611 inode->i_sb->s_id, inode->i_ino,
1612 (unsigned long long) pos, len, copied);
1613 from = pos & (PAGE_CACHE_SIZE - 1); 1697 from = pos & (PAGE_CACHE_SIZE - 1);
1614 to = from + len; 1698 to = from + len;
1615 1699
@@ -1635,10 +1719,27 @@ static int ext4_journalled_write_end(struct file *file,
1635 } 1719 }
1636 1720
1637 unlock_page(page); 1721 unlock_page(page);
1722 page_cache_release(page);
1723 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1724 /* if we have allocated more blocks and copied
1725		 * less, we will have blocks allocated outside
1726		 * inode->i_size, so truncate them
1727 */
1728 ext4_orphan_add(handle, inode);
1729
1638 ret2 = ext4_journal_stop(handle); 1730 ret2 = ext4_journal_stop(handle);
1639 if (!ret) 1731 if (!ret)
1640 ret = ret2; 1732 ret = ret2;
1641 page_cache_release(page); 1733 if (pos + len > inode->i_size) {
1734 ext4_truncate(inode);
1735 /*
1736 * If truncate failed early the inode might still be
1737 * on the orphan list; we need to make sure the inode
1738 * is removed from the orphan list in that case.
1739 */
1740 if (inode->i_nlink)
1741 ext4_orphan_del(NULL, inode);
1742 }
1642 1743
1643 return ret ? ret : copied; 1744 return ret ? ret : copied;
1644} 1745}
@@ -1738,7 +1839,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1738} 1839}
1739 1840
1740static void ext4_da_page_release_reservation(struct page *page, 1841static void ext4_da_page_release_reservation(struct page *page,
1741 unsigned long offset) 1842 unsigned long offset)
1742{ 1843{
1743 int to_release = 0; 1844 int to_release = 0;
1744 struct buffer_head *head, *bh; 1845 struct buffer_head *head, *bh;
@@ -1852,7 +1953,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
1852 * @logical - first logical block to start assignment with 1953 * @logical - first logical block to start assignment with
1853 * 1954 *
1854 * the function goes through all passed space and put actual disk 1955 * the function goes through all passed space and put actual disk
1855 * block numbers into buffer heads, dropping BH_Delay 1956 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
1856 */ 1957 */
1857static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 1958static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1858 struct buffer_head *exbh) 1959 struct buffer_head *exbh)
@@ -1902,16 +2003,24 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1902 do { 2003 do {
1903 if (cur_logical >= logical + blocks) 2004 if (cur_logical >= logical + blocks)
1904 break; 2005 break;
1905 if (buffer_delay(bh)) { 2006
1906 bh->b_blocknr = pblock; 2007 if (buffer_delay(bh) ||
1907 clear_buffer_delay(bh); 2008 buffer_unwritten(bh)) {
1908 bh->b_bdev = inode->i_sb->s_bdev; 2009
1909 } else if (buffer_unwritten(bh)) { 2010 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
1910 bh->b_blocknr = pblock; 2011
1911 clear_buffer_unwritten(bh); 2012 if (buffer_delay(bh)) {
1912 set_buffer_mapped(bh); 2013 clear_buffer_delay(bh);
1913 set_buffer_new(bh); 2014 bh->b_blocknr = pblock;
1914 bh->b_bdev = inode->i_sb->s_bdev; 2015 } else {
2016 /*
2017				 * an unwritten buffer should already have
2018				 * a blocknr assigned; verify that
2019 */
2020 clear_buffer_unwritten(bh);
2021 BUG_ON(bh->b_blocknr != pblock);
2022 }
2023
1915 } else if (buffer_mapped(bh)) 2024 } else if (buffer_mapped(bh))
1916 BUG_ON(bh->b_blocknr != pblock); 2025 BUG_ON(bh->b_blocknr != pblock);
1917 2026
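All of the BH_Delay/BH_Unwritten handling above is bit manipulation on the buffer's b_state word; the kernel generates helpers like buffer_delay() and clear_buffer_delay() from enum bh_state_bits via the BUFFER_FNS macro. A self-contained model with illustrative bit positions (not the kernel's actual values):

    #include <stdio.h>

    /* Illustrative state bits only; the real ones live in
     * include/linux/buffer_head.h. */
    enum { BH_Mapped = 0, BH_Delay = 1, BH_Unwritten = 2 };

    static int  test_state(unsigned long s, int bit)   { return (s >> bit) & 1; }
    static void set_state(unsigned long *s, int bit)   { *s |=  (1UL << bit); }
    static void clear_state(unsigned long *s, int bit) { *s &= ~(1UL << bit); }

    int main(void)
    {
            unsigned long b_state = 0;

            /* A delayed-allocation buffer: delay set, not yet mapped. */
            set_state(&b_state, BH_Delay);

            /* What mpage_put_bnr_to_bhs() does once a real block exists:
             * drop the delay bit, after which b_blocknr can be trusted. */
            if (test_state(b_state, BH_Delay) ||
                test_state(b_state, BH_Unwritten)) {
                    clear_state(&b_state, BH_Delay);
                    set_state(&b_state, BH_Mapped);
            }
            printf("mapped=%d delay=%d\n",
                   test_state(b_state, BH_Mapped),
                   test_state(b_state, BH_Delay));
            return 0;
    }

The (1 << BH_Delay) tests against mpd->b_state in the surrounding code are the same idiom applied to an accumulated extent rather than a single buffer.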
@@ -1990,51 +2099,6 @@ static void ext4_print_free_blocks(struct inode *inode)
1990 return; 2099 return;
1991} 2100}
1992 2101
1993#define EXT4_DELALLOC_RSVED 1
1994static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
1995 struct buffer_head *bh_result, int create)
1996{
1997 int ret;
1998 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1999 loff_t disksize = EXT4_I(inode)->i_disksize;
2000 handle_t *handle = NULL;
2001
2002 handle = ext4_journal_current_handle();
2003 BUG_ON(!handle);
2004 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2005 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2006 if (ret <= 0)
2007 return ret;
2008
2009 bh_result->b_size = (ret << inode->i_blkbits);
2010
2011 if (ext4_should_order_data(inode)) {
2012 int retval;
2013 retval = ext4_jbd2_file_inode(handle, inode);
2014 if (retval)
2015 /*
2016 * Failed to add inode for ordered mode. Don't
2017 * update file size
2018 */
2019 return retval;
2020 }
2021
2022 /*
2023	 * Update on-disk size along with block allocation; we don't
2024	 * use 'extend_disksize' as the size may change within an already
2025	 * allocated block -bzzz
2026 */
2027 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2028 if (disksize > i_size_read(inode))
2029 disksize = i_size_read(inode);
2030 if (disksize > EXT4_I(inode)->i_disksize) {
2031 ext4_update_i_disksize(inode, disksize);
2032 ret = ext4_mark_inode_dirty(handle, inode);
2033 return ret;
2034 }
2035 return 0;
2036}
2037
2038/* 2102/*
2039 * mpage_da_map_blocks - go through given space 2103 * mpage_da_map_blocks - go through given space
2040 * 2104 *
@@ -2045,29 +2109,57 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2045 */ 2109 */
2046static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2110static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2047{ 2111{
2048 int err = 0; 2112 int err, blks, get_blocks_flags;
2049 struct buffer_head new; 2113 struct buffer_head new;
2050 sector_t next; 2114 sector_t next = mpd->b_blocknr;
2115 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2116 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2117 handle_t *handle = NULL;
2051 2118
2052 /* 2119 /*
2053 * We consider only non-mapped and non-allocated blocks 2120 * We consider only non-mapped and non-allocated blocks
2054 */ 2121 */
2055 if ((mpd->b_state & (1 << BH_Mapped)) && 2122 if ((mpd->b_state & (1 << BH_Mapped)) &&
2056 !(mpd->b_state & (1 << BH_Delay))) 2123 !(mpd->b_state & (1 << BH_Delay)) &&
2124 !(mpd->b_state & (1 << BH_Unwritten)))
2057 return 0; 2125 return 0;
2058 new.b_state = mpd->b_state; 2126
2059 new.b_blocknr = 0;
2060 new.b_size = mpd->b_size;
2061 next = mpd->b_blocknr;
2062 /* 2127 /*
2063 * If we didn't accumulate anything 2128 * If we didn't accumulate anything to write simply return
2064 * to write simply return
2065 */ 2129 */
2066 if (!new.b_size) 2130 if (!mpd->b_size)
2067 return 0; 2131 return 0;
2068 2132
2069 err = ext4_da_get_block_write(mpd->inode, next, &new, 1); 2133 handle = ext4_journal_current_handle();
2070 if (err) { 2134 BUG_ON(!handle);
2135
2136 /*
2137 * Call ext4_get_blocks() to allocate any delayed allocation
2138 * blocks, or to convert an uninitialized extent to be
2139 * initialized (in the case where we have written into
2140 * one or more preallocated blocks).
2141 *
2142 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
2143 * indicate that we are on the delayed allocation path. This
2144 * affects functions in many different parts of the allocation
2145 * call path. This flag exists primarily because we don't
2146 * want to change *many* call functions, so ext4_get_blocks()
2147 * will set the magic i_delalloc_reserved_flag once the
2148 * inode's allocation semaphore is taken.
2149 *
2150	 * If the blocks in question were delalloc blocks, set
2151 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2152 * variables are updated after the blocks have been allocated.
2153 */
2154 new.b_state = 0;
2155 get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
2156 EXT4_GET_BLOCKS_DELALLOC_RESERVE);
2157 if (mpd->b_state & (1 << BH_Delay))
2158 get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
2159 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
2160 &new, get_blocks_flags);
2161 if (blks < 0) {
2162 err = blks;
2071 /* 2163 /*
2072 * If get block returns with error we simply 2164 * If get block returns with error we simply
2073 * return. Later writepage will redirty the page and 2165 * return. Later writepage will redirty the page and
@@ -2100,12 +2192,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2100 if (err == -ENOSPC) { 2192 if (err == -ENOSPC) {
2101 ext4_print_free_blocks(mpd->inode); 2193 ext4_print_free_blocks(mpd->inode);
2102 } 2194 }
2103 /* invlaidate all the pages */ 2195 /* invalidate all the pages */
2104 ext4_da_block_invalidatepages(mpd, next, 2196 ext4_da_block_invalidatepages(mpd, next,
2105 mpd->b_size >> mpd->inode->i_blkbits); 2197 mpd->b_size >> mpd->inode->i_blkbits);
2106 return err; 2198 return err;
2107 } 2199 }
2108 BUG_ON(new.b_size == 0); 2200 BUG_ON(blks == 0);
2201
2202 new.b_size = (blks << mpd->inode->i_blkbits);
2109 2203
2110 if (buffer_new(&new)) 2204 if (buffer_new(&new))
2111 __unmap_underlying_blocks(mpd->inode, &new); 2205 __unmap_underlying_blocks(mpd->inode, &new);
@@ -2118,6 +2212,23 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2118 (mpd->b_state & (1 << BH_Unwritten))) 2212 (mpd->b_state & (1 << BH_Unwritten)))
2119 mpage_put_bnr_to_bhs(mpd, next, &new); 2213 mpage_put_bnr_to_bhs(mpd, next, &new);
2120 2214
2215 if (ext4_should_order_data(mpd->inode)) {
2216 err = ext4_jbd2_file_inode(handle, mpd->inode);
2217 if (err)
2218 return err;
2219 }
2220
2221 /*
2222 * Update on-disk size along with block allocation.
2223 */
2224 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
2225 if (disksize > i_size_read(mpd->inode))
2226 disksize = i_size_read(mpd->inode);
2227 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2228 ext4_update_i_disksize(mpd->inode, disksize);
2229 return ext4_mark_inode_dirty(handle, mpd->inode);
2230 }
2231
2121 return 0; 2232 return 0;
2122} 2233}
2123 2234
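mpage_da_map_blocks() moves between bytes and blocks with shifts by i_blkbits throughout. A standalone check of the conversions used above, assuming 4K blocks (blkbits = 12); the values are arbitrary examples:

    #include <stdio.h>

    int main(void)
    {
            unsigned blkbits = 12;               /* 4096-byte blocks */
            unsigned long long b_size = 16384;   /* bytes gathered in extent */

            /* max_blocks = mpd->b_size >> mpd->inode->i_blkbits */
            unsigned max_blocks = b_size >> blkbits;                /* 4 */

            /* after allocation: new.b_size = blks << i_blkbits */
            int blks = 4;
            unsigned long long bytes = (unsigned long long)blks << blkbits;

            /* disksize = ((loff_t)next + blks) << i_blkbits, clamped to
             * i_size before i_disksize is updated */
            unsigned long long next = 10;        /* first logical block */
            unsigned long long disksize = (next + blks) << blkbits;

            printf("%u blocks, %llu bytes, disksize %llu\n",
                   max_blocks, bytes, disksize);
            return 0;
    }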
@@ -2192,6 +2303,11 @@ flush_it:
2192 return; 2303 return;
2193} 2304}
2194 2305
2306static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2307{
2308 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
2309}
2310
2195/* 2311/*
2196 * __mpage_da_writepage - finds extent of pages and blocks 2312 * __mpage_da_writepage - finds extent of pages and blocks
2197 * 2313 *
@@ -2274,10 +2390,9 @@ static int __mpage_da_writepage(struct page *page,
2274 * We need to try to allocate 2390 * We need to try to allocate
2275 * unmapped blocks in the same page. 2391 * unmapped blocks in the same page.
2276 * Otherwise we won't make progress 2392 * Otherwise we won't make progress
2277 * with the page in ext4_da_writepage 2393 * with the page in ext4_writepage
2278 */ 2394 */
2279 if (buffer_dirty(bh) && 2395 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2280 (!buffer_mapped(bh) || buffer_delay(bh))) {
2281 mpage_add_bh_to_extent(mpd, logical, 2396 mpage_add_bh_to_extent(mpd, logical,
2282 bh->b_size, 2397 bh->b_size,
2283 bh->b_state); 2398 bh->b_state);
@@ -2303,8 +2418,16 @@ static int __mpage_da_writepage(struct page *page,
2303} 2418}
2304 2419
2305/* 2420/*
2306 * this is a special callback for ->write_begin() only	2421 * This is a special get_block_t callback which is used by
2307 * its intention is to return mapped block or	2422 * ext4_da_write_begin(). It will either return mapped block or
2423 * reserve space for a single block.
2424 *
2425 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
2426 * We also have b_blocknr = -1 and b_bdev initialized properly
2427 *
2428 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
2429 * We also have b_blocknr = physical block mapping the unwritten extent and b_bdev
2430 * initialized properly.
2308 */ 2431 */
2309static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2432static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2310 struct buffer_head *bh_result, int create) 2433 struct buffer_head *bh_result, int create)
@@ -2323,7 +2446,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2323	 * preallocated blocks are unmapped but should be treated	2446	 * preallocated blocks are unmapped but should be treated
2324 * the same as allocated blocks. 2447 * the same as allocated blocks.
2325 */ 2448 */
2326 ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); 2449 ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0);
2327 if ((ret == 0) && !buffer_delay(bh_result)) { 2450 if ((ret == 0) && !buffer_delay(bh_result)) {
2328 /* the block isn't (pre)allocated yet, let's reserve space */ 2451 /* the block isn't (pre)allocated yet, let's reserve space */
2329 /* 2452 /*
@@ -2340,40 +2463,52 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2340 set_buffer_delay(bh_result); 2463 set_buffer_delay(bh_result);
2341 } else if (ret > 0) { 2464 } else if (ret > 0) {
2342 bh_result->b_size = (ret << inode->i_blkbits); 2465 bh_result->b_size = (ret << inode->i_blkbits);
2343 /* 2466 if (buffer_unwritten(bh_result)) {
2344 * With sub-block writes into unwritten extents 2467 /* A delayed write to unwritten bh should
2345 * we also need to mark the buffer as new so that 2468 * be marked new and mapped. Mapped ensures
2346 * the unwritten parts of the buffer gets correctly zeroed. 2469 * that we don't do get_block multiple times
2347 */ 2470 * when we write to the same offset and new
2348 if (buffer_unwritten(bh_result)) 2471 * ensures that we do proper zero out for
2472 * partial write.
2473 */
2349 set_buffer_new(bh_result); 2474 set_buffer_new(bh_result);
2475 set_buffer_mapped(bh_result);
2476 }
2350 ret = 0; 2477 ret = 0;
2351 } 2478 }
2352 2479
2353 return ret; 2480 return ret;
2354} 2481}
2355 2482
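Condensed, a get_block-style callback like the one above reports one of three outcomes: an error, a hole (which delayed allocation answers by reserving space and setting BH_Delay), or a found extent. A standalone sketch of that control flow; every type and helper here is a stand-in, not ext4 API:

    #include <stdio.h>

    struct mbh {
            unsigned long long b_size;
            int mapped, is_new, delay, unwritten;
    };

    /* lookup() stands in for ext4_get_blocks(..., create = 0):
     * < 0 error, 0 hole, > 0 contiguous blocks found. */
    static int lookup(long long iblock, struct mbh *bh)
    {
            (void)iblock; (void)bh;
            return 0;                     /* pretend we hit a hole */
    }

    static int da_get_block_model(long long iblock, struct mbh *bh,
                                  unsigned blkbits)
    {
            int ret = lookup(iblock, bh);

            if (ret < 0)
                    return ret;           /* propagate the error */
            if (ret == 0 && !bh->delay) {
                    /* hole: reserve space now, allocate at writeout */
                    bh->is_new = 1;
                    bh->delay = 1;        /* set_buffer_delay() */
                    return 0;
            }
            if (ret > 0) {
                    bh->b_size = (unsigned long long)ret << blkbits;
                    if (bh->unwritten) {
                            /* preallocated extent: new + mapped, so repeat
                             * writes skip get_block and partial writes are
                             * zeroed out correctly */
                            bh->is_new = 1;
                            bh->mapped = 1;
                    }
            }
            return 0;
    }

    int main(void)
    {
            struct mbh bh = { 0 };

            printf("ret=%d delay=%d\n",
                   da_get_block_model(5, &bh, 12), bh.delay);
            return 0;
    }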
2356static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) 2483/*
2357{	2484 * This function is used as a standard get_block_t callback function
2358 /* 2485 * when there is no desire to allocate any blocks. It is used as a
2359 * unmapped buffer is possible for holes. 2486 * callback function for block_prepare_write(), nobh_writepage(), and
2360 * delay buffer is possible with delayed allocation 2487 * block_write_full_page(). These functions should only try to map a
2361 */ 2488 * single block at a time.
2362 return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); 2489 *
2363} 2490 * Since this function doesn't do block allocations even if the caller
2364 2491 * requests it by passing in create=1, it is critically important that
2365static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,	2492 * any caller checks to make sure that any buffer heads returned
2493 * by this function are either all already mapped or marked for
2494 * delayed allocation before calling nobh_writepage() or
2495 * block_write_full_page(). Otherwise, b_blocknr could be left
2496 * uninitialized, and the page write functions will be taken by
2497 * surprise.
2498 */
2499static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2366 struct buffer_head *bh_result, int create) 2500 struct buffer_head *bh_result, int create)
2367{ 2501{
2368 int ret = 0; 2502 int ret = 0;
2369 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 2503 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2370 2504
2505 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2506
2371 /* 2507 /*
2372 * we don't want to do block allocation in writepage 2508 * we don't want to do block allocation in writepage
2373	 * so call get_block_wrap with create = 0	2509	 * so call ext4_get_blocks() with create = 0
2374 */ 2510 */
2375 ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, 2511 ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
2376 bh_result, 0, 0, 0);
2377 if (ret > 0) { 2512 if (ret > 0) {
2378 bh_result->b_size = (ret << inode->i_blkbits); 2513 bh_result->b_size = (ret << inode->i_blkbits);
2379 ret = 0; 2514 ret = 0;
@@ -2381,14 +2516,102 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
2381 return ret; 2516 return ret;
2382} 2517}
2383 2518
2519static int bget_one(handle_t *handle, struct buffer_head *bh)
2520{
2521 get_bh(bh);
2522 return 0;
2523}
2524
2525static int bput_one(handle_t *handle, struct buffer_head *bh)
2526{
2527 put_bh(bh);
2528 return 0;
2529}
2530
2531static int __ext4_journalled_writepage(struct page *page,
2532 struct writeback_control *wbc,
2533 unsigned int len)
2534{
2535 struct address_space *mapping = page->mapping;
2536 struct inode *inode = mapping->host;
2537 struct buffer_head *page_bufs;
2538 handle_t *handle = NULL;
2539 int ret = 0;
2540 int err;
2541
2542 page_bufs = page_buffers(page);
2543 BUG_ON(!page_bufs);
2544 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
2545 /* As soon as we unlock the page, it can go away, but we have
2546 * references to buffers so we are safe */
2547 unlock_page(page);
2548
2549 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
2550 if (IS_ERR(handle)) {
2551 ret = PTR_ERR(handle);
2552 goto out;
2553 }
2554
2555 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2556 do_journal_get_write_access);
2557
2558 err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2559 write_end_fn);
2560 if (ret == 0)
2561 ret = err;
2562 err = ext4_journal_stop(handle);
2563 if (!ret)
2564 ret = err;
2565
2566 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
2567 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
2568out:
2569 return ret;
2570}
2571
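walk_page_buffers(), which __ext4_journalled_writepage() uses four times, simply walks the page's circular b_this_page list and applies a callback to each buffer. A standalone model of the walk together with the bget_one/bput_one pinning bracket used above (types simplified; not the kernel's definitions):

    #include <stdio.h>

    struct bh { int refcount; struct bh *b_this_page; };

    /* Model of walk_page_buffers(): visit each buffer on the page's
     * circular list once, applying fn; stop on the first error. */
    static int walk_bufs(struct bh *head, int (*fn)(struct bh *))
    {
            struct bh *b = head;
            do {
                    int err = fn(b);
                    if (err)
                            return err;
                    b = b->b_this_page;
            } while (b != head);
            return 0;
    }

    static int bget_one(struct bh *b) { b->refcount++; return 0; }
    static int bput_one(struct bh *b) { b->refcount--; return 0; }

    int main(void)
    {
            struct bh a, b, c;

            a.b_this_page = &b; b.b_this_page = &c; c.b_this_page = &a;
            a.refcount = b.refcount = c.refcount = 1;

            walk_bufs(&a, bget_one);   /* pin buffers, then unlock the page */
            /* ...journal the buffers while the page itself may go away... */
            walk_bufs(&a, bput_one);   /* drop the extra references */

            printf("refcount back to %d\n", a.refcount);
            return 0;
    }

Pinning before unlock_page() is what makes it safe for the function above to journal the buffers after the page lock is gone.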
2384/* 2572/*
2385 * get called via ext4_da_writepages after taking page lock (have journal handle)	2573 * Note that we don't need to start a transaction unless we're journaling data
2386 * get called via journal_submit_inode_data_buffers (no journal handle)	2574 * because we should have holes filled from ext4_page_mkwrite(). We don't even
2387 * get called via shrink_page_list via pdflush (no journal handle) 2575 * need to file the inode to the transaction's list in ordered mode because if
2388 * or grab_page_cache when doing write_begin (have journal handle) 2576 * we are writing back data added by write(), the inode is already there and if
2577 * we are writing back data modified via mmap(), no one guarantees in which
2578 * transaction the data will hit the disk. In case we are journaling data, we
2579 * cannot start transaction directly because transaction start ranks above page
2580 * lock so we have to do some magic.
2581 *
2582 * This function can get called via...
2583 * - ext4_da_writepages after taking page lock (have journal handle)
2584 * - journal_submit_inode_data_buffers (no journal handle)
2585 * - shrink_page_list via pdflush (no journal handle)
2586 * - grab_page_cache when doing write_begin (have journal handle)
2587 *
2588 * We don't do any block allocation in this function. If we have a page with
2589 * multiple blocks we need to write those buffer_heads that are mapped. This
2590 * is important for mmap-based writes. So, with a 1K blocksize, if we do
2591 *	truncate(f, 1024);
2592 *	a = mmap(f, 0, 4096);
2593 *	a[0] = 'a';
2594 *	truncate(f, 4096);
2595 * we have the first buffer_head in the page mapped via the page_mkwrite
2596 * callback, but the other buffer_heads would be unmapped yet dirty (dirtied
2597 * via do_wp_page). So writepage should write the first block. If we modify
2598 * the mmap area beyond 1024 we will again get a page fault and the
2599 * page_mkwrite callback will do the block allocation and mark the
2600 * buffer_heads mapped.
2601 *
2602 * We redirty the page if the page has any buffer_heads that are either
2603 * delayed or unwritten.
2604 *
2605 * We can get recursively called as shown below.
2606 *
2607 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
2608 * ext4_writepage()
2609 *
2610 * But since we don't do any block allocation we should not deadlock.
2611 * The page also has the dirty flag cleared so we don't get a recursive page_lock.
2389 */ 2612 */
2390static int ext4_da_writepage(struct page *page, 2613static int ext4_writepage(struct page *page,
2391 struct writeback_control *wbc) 2614 struct writeback_control *wbc)
2392{ 2615{
2393 int ret = 0; 2616 int ret = 0;
2394 loff_t size; 2617 loff_t size;
@@ -2396,9 +2619,7 @@ static int ext4_da_writepage(struct page *page,
2396 struct buffer_head *page_bufs; 2619 struct buffer_head *page_bufs;
2397 struct inode *inode = page->mapping->host; 2620 struct inode *inode = page->mapping->host;
2398 2621
2399 trace_mark(ext4_da_writepage, 2622 trace_ext4_writepage(inode, page);
2400 "dev %s ino %lu page_index %lu",
2401 inode->i_sb->s_id, inode->i_ino, page->index);
2402 size = i_size_read(inode); 2623 size = i_size_read(inode);
2403 if (page->index == size >> PAGE_CACHE_SHIFT) 2624 if (page->index == size >> PAGE_CACHE_SHIFT)
2404 len = size & ~PAGE_CACHE_MASK; 2625 len = size & ~PAGE_CACHE_MASK;
@@ -2408,7 +2629,7 @@ static int ext4_da_writepage(struct page *page,
2408 if (page_has_buffers(page)) { 2629 if (page_has_buffers(page)) {
2409 page_bufs = page_buffers(page); 2630 page_bufs = page_buffers(page);
2410 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2631 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2411 ext4_bh_unmapped_or_delay)) { 2632 ext4_bh_delay_or_unwritten)) {
2412 /* 2633 /*
2413 * We don't want to do block allocation 2634 * We don't want to do block allocation
2414 * So redirty the page and return 2635 * So redirty the page and return
@@ -2435,13 +2656,13 @@ static int ext4_da_writepage(struct page *page,
2435 * all are mapped and non delay. We don't want to 2656 * all are mapped and non delay. We don't want to
2436 * do block allocation here. 2657 * do block allocation here.
2437 */ 2658 */
2438 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 2659 ret = block_prepare_write(page, 0, len,
2439 ext4_normal_get_block_write); 2660 noalloc_get_block_write);
2440 if (!ret) { 2661 if (!ret) {
2441 page_bufs = page_buffers(page); 2662 page_bufs = page_buffers(page);
2442 /* check whether all are mapped and non delay */ 2663 /* check whether all are mapped and non delay */
2443 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2664 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2444 ext4_bh_unmapped_or_delay)) { 2665 ext4_bh_delay_or_unwritten)) {
2445 redirty_page_for_writepage(wbc, page); 2666 redirty_page_for_writepage(wbc, page);
2446 unlock_page(page); 2667 unlock_page(page);
2447 return 0; 2668 return 0;
@@ -2457,15 +2678,23 @@ static int ext4_da_writepage(struct page *page,
2457 return 0; 2678 return 0;
2458 } 2679 }
2459 /* now mark the buffer_heads as dirty and uptodate */ 2680 /* now mark the buffer_heads as dirty and uptodate */
2460 block_commit_write(page, 0, PAGE_CACHE_SIZE); 2681 block_commit_write(page, 0, len);
2682 }
2683
2684 if (PageChecked(page) && ext4_should_journal_data(inode)) {
2685 /*
2686 * It's mmapped pagecache. Add buffers and journal it. There
2687 * doesn't seem much point in redirtying the page here.
2688 */
2689 ClearPageChecked(page);
2690 return __ext4_journalled_writepage(page, wbc, len);
2461 } 2691 }
2462 2692
2463 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2693 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2464 ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); 2694 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2465 else 2695 else
2466 ret = block_write_full_page(page, 2696 ret = block_write_full_page(page, noalloc_get_block_write,
2467 ext4_normal_get_block_write, 2697 wbc);
2468 wbc);
2469 2698
2470 return ret; 2699 return ret;
2471} 2700}
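The truncate/mmap sequence from the comment above can be reproduced verbatim from userspace. A complete test program, assuming a filesystem with a 1K block size so the page holds four buffer_heads (the file name is arbitrary and error handling is minimal):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
            if (fd < 0) { perror("open"); return 1; }

            ftruncate(fd, 1024);                 /* truncate(f, 1024)    */
            char *a = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                           MAP_SHARED, fd, 0);   /* a = mmap(f, 0, 4096) */
            if (a == MAP_FAILED) { perror("mmap"); return 1; }

            a[0] = 'a';              /* page_mkwrite maps only block 0     */
            ftruncate(fd, 4096);     /* rest of the page: dirty, unmapped  */

            /* Writing past offset 1024 faults again; page_mkwrite then
             * allocates and maps the remaining blocks of the page. */
            a[2048] = 'b';

            munmap(a, 4096);
            close(fd);
            return 0;
    }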
@@ -2510,19 +2739,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2510 int needed_blocks, ret = 0, nr_to_writebump = 0; 2739 int needed_blocks, ret = 0, nr_to_writebump = 0;
2511 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2740 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2512 2741
2513 trace_mark(ext4_da_writepages, 2742 trace_ext4_da_writepages(inode, wbc);
2514 "dev %s ino %lu nr_t_write %ld "
2515 "pages_skipped %ld range_start %llu "
2516 "range_end %llu nonblocking %d "
2517 "for_kupdate %d for_reclaim %d "
2518 "for_writepages %d range_cyclic %d",
2519 inode->i_sb->s_id, inode->i_ino,
2520 wbc->nr_to_write, wbc->pages_skipped,
2521 (unsigned long long) wbc->range_start,
2522 (unsigned long long) wbc->range_end,
2523 wbc->nonblocking, wbc->for_kupdate,
2524 wbc->for_reclaim, wbc->for_writepages,
2525 wbc->range_cyclic);
2526 2743
2527 /* 2744 /*
2528 * No pages to write? This is mainly a kludge to avoid starting 2745 * No pages to write? This is mainly a kludge to avoid starting
@@ -2536,13 +2753,13 @@ static int ext4_da_writepages(struct address_space *mapping,
2536 * If the filesystem has aborted, it is read-only, so return 2753 * If the filesystem has aborted, it is read-only, so return
2537 * right away instead of dumping stack traces later on that 2754 * right away instead of dumping stack traces later on that
2538 * will obscure the real source of the problem. We test 2755 * will obscure the real source of the problem. We test
2539 * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because 2756 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2540 * the latter could be true if the filesystem is mounted 2757 * the latter could be true if the filesystem is mounted
2541 * read-only, and in that case, ext4_da_writepages should 2758 * read-only, and in that case, ext4_da_writepages should
2542 * *never* be called, so if that ever happens, we would want 2759 * *never* be called, so if that ever happens, we would want
2543 * the stack trace. 2760 * the stack trace.
2544 */ 2761 */
2545 if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT)) 2762 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2546 return -EROFS; 2763 return -EROFS;
2547 2764
2548 /* 2765 /*
@@ -2688,14 +2905,7 @@ out_writepages:
2688 if (!no_nrwrite_index_update) 2905 if (!no_nrwrite_index_update)
2689 wbc->no_nrwrite_index_update = 0; 2906 wbc->no_nrwrite_index_update = 0;
2690 wbc->nr_to_write -= nr_to_writebump; 2907 wbc->nr_to_write -= nr_to_writebump;
2691 trace_mark(ext4_da_writepage_result, 2908 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2692 "dev %s ino %lu ret %d pages_written %d "
2693 "pages_skipped %ld congestion %d "
2694 "more_io %d no_nrwrite_index_update %d",
2695 inode->i_sb->s_id, inode->i_ino, ret,
2696 pages_written, wbc->pages_skipped,
2697 wbc->encountered_congestion, wbc->more_io,
2698 wbc->no_nrwrite_index_update);
2699 return ret; 2909 return ret;
2700} 2910}
2701 2911
@@ -2727,8 +2937,8 @@ static int ext4_nonda_switch(struct super_block *sb)
2727} 2937}
2728 2938
2729static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2939static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2730 loff_t pos, unsigned len, unsigned flags, 2940 loff_t pos, unsigned len, unsigned flags,
2731 struct page **pagep, void **fsdata) 2941 struct page **pagep, void **fsdata)
2732{ 2942{
2733 int ret, retries = 0; 2943 int ret, retries = 0;
2734 struct page *page; 2944 struct page *page;
@@ -2747,11 +2957,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2747 len, flags, pagep, fsdata); 2957 len, flags, pagep, fsdata);
2748 } 2958 }
2749 *fsdata = (void *)0; 2959 *fsdata = (void *)0;
2750 2960 trace_ext4_da_write_begin(inode, pos, len, flags);
2751 trace_mark(ext4_da_write_begin,
2752 "dev %s ino %lu pos %llu len %u flags %u",
2753 inode->i_sb->s_id, inode->i_ino,
2754 (unsigned long long) pos, len, flags);
2755retry: 2961retry:
2756 /* 2962 /*
2757 * With delayed allocation, we don't log the i_disksize update 2963 * With delayed allocation, we don't log the i_disksize update
@@ -2777,7 +2983,7 @@ retry:
2777 *pagep = page; 2983 *pagep = page;
2778 2984
2779 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 2985 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
2780 ext4_da_get_block_prep); 2986 ext4_da_get_block_prep);
2781 if (ret < 0) { 2987 if (ret < 0) {
2782 unlock_page(page); 2988 unlock_page(page);
2783 ext4_journal_stop(handle); 2989 ext4_journal_stop(handle);
@@ -2788,7 +2994,7 @@ retry:
2788 * i_size_read because we hold i_mutex. 2994 * i_size_read because we hold i_mutex.
2789 */ 2995 */
2790 if (pos + len > inode->i_size) 2996 if (pos + len > inode->i_size)
2791 vmtruncate(inode, inode->i_size); 2997 ext4_truncate(inode);
2792 } 2998 }
2793 2999
2794 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3000 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2802,7 +3008,7 @@ out:
2802 * when write to the end of file but not require block allocation 3008 * when write to the end of file but not require block allocation
2803 */ 3009 */
2804static int ext4_da_should_update_i_disksize(struct page *page, 3010static int ext4_da_should_update_i_disksize(struct page *page,
2805 unsigned long offset) 3011 unsigned long offset)
2806{ 3012{
2807 struct buffer_head *bh; 3013 struct buffer_head *bh;
2808 struct inode *inode = page->mapping->host; 3014 struct inode *inode = page->mapping->host;
@@ -2815,15 +3021,15 @@ static int ext4_da_should_update_i_disksize(struct page *page,
2815 for (i = 0; i < idx; i++) 3021 for (i = 0; i < idx; i++)
2816 bh = bh->b_this_page; 3022 bh = bh->b_this_page;
2817 3023
2818 if (!buffer_mapped(bh) || (buffer_delay(bh))) 3024 if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
2819 return 0; 3025 return 0;
2820 return 1; 3026 return 1;
2821} 3027}
2822 3028
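The buffer lookup in ext4_da_should_update_i_disksize() is plain shift arithmetic: the byte offset inside the page, shifted right by i_blkbits, selects which buffer_head on the page's list covers it. A standalone check, using the 1K block size from the earlier example:

    #include <stdio.h>

    int main(void)
    {
            unsigned page_size = 4096;
            unsigned blkbits = 10;          /* 1K blocks, example value  */
            unsigned long offset = 3000;    /* byte offset within page   */

            /* idx counts how many b_this_page hops the loop above makes */
            unsigned idx = offset >> blkbits;             /* 3000 >> 10 = 2 */
            unsigned bufs_per_page = page_size >> blkbits;            /* 4 */

            printf("buffer %u of %u\n", idx, bufs_per_page);
            return 0;
    }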
2823static int ext4_da_write_end(struct file *file, 3029static int ext4_da_write_end(struct file *file,
2824 struct address_space *mapping, 3030 struct address_space *mapping,
2825 loff_t pos, unsigned len, unsigned copied, 3031 loff_t pos, unsigned len, unsigned copied,
2826 struct page *page, void *fsdata) 3032 struct page *page, void *fsdata)
2827{ 3033{
2828 struct inode *inode = mapping->host; 3034 struct inode *inode = mapping->host;
2829 int ret = 0, ret2; 3035 int ret = 0, ret2;
@@ -2844,10 +3050,7 @@ static int ext4_da_write_end(struct file *file,
2844 } 3050 }
2845 } 3051 }
2846 3052
2847 trace_mark(ext4_da_write_end, 3053 trace_ext4_da_write_end(inode, pos, len, copied);
2848 "dev %s ino %lu pos %llu len %u copied %u",
2849 inode->i_sb->s_id, inode->i_ino,
2850 (unsigned long long) pos, len, copied);
2851 start = pos & (PAGE_CACHE_SIZE - 1); 3054 start = pos & (PAGE_CACHE_SIZE - 1);
2852 end = start + copied - 1; 3055 end = start + copied - 1;
2853 3056
@@ -2924,7 +3127,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
2924 * not strictly speaking necessary (and for users of 3127 * not strictly speaking necessary (and for users of
2925 * laptop_mode, not even desirable). However, to do otherwise 3128 * laptop_mode, not even desirable). However, to do otherwise
2926 * would require replicating code paths in: 3129 * would require replicating code paths in:
2927 * 3130 *
2928 * ext4_da_writepages() -> 3131 * ext4_da_writepages() ->
2929 * write_cache_pages() ---> (via passed in callback function) 3132 * write_cache_pages() ---> (via passed in callback function)
2930 * __mpage_da_writepage() --> 3133 * __mpage_da_writepage() -->
@@ -2944,7 +3147,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
2944 * write out the pages, but rather only collect contiguous 3147 * write out the pages, but rather only collect contiguous
2945 * logical block extents, call the multi-block allocator, and 3148 * logical block extents, call the multi-block allocator, and
2946 * then update the buffer heads with the block allocations. 3149 * then update the buffer heads with the block allocations.
2947 * 3150 *
2948 * For now, though, we'll cheat by calling filemap_flush(), 3151 * For now, though, we'll cheat by calling filemap_flush(),
2949 * which will map the blocks, and start the I/O, but not 3152 * which will map the blocks, and start the I/O, but not
2950 * actually wait for the I/O to complete. 3153 * actually wait for the I/O to complete.
@@ -3014,229 +3217,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3014 return generic_block_bmap(mapping, block, ext4_get_block); 3217 return generic_block_bmap(mapping, block, ext4_get_block);
3015} 3218}
3016 3219
3017static int bget_one(handle_t *handle, struct buffer_head *bh)
3018{
3019 get_bh(bh);
3020 return 0;
3021}
3022
3023static int bput_one(handle_t *handle, struct buffer_head *bh)
3024{
3025 put_bh(bh);
3026 return 0;
3027}
3028
3029/*
3030 * Note that we don't need to start a transaction unless we're journaling data
3031 * because we should have holes filled from ext4_page_mkwrite(). We don't even
3032 * need to file the inode to the transaction's list in ordered mode because if
3033 * we are writing back data added by write(), the inode is already there and if
3034 * we are writing back data modified via mmap(), no one guarantees in which
3035 * transaction the data will hit the disk. In case we are journaling data, we
3036 * cannot start transaction directly because transaction start ranks above page
3037 * lock so we have to do some magic.
3038 *
3039 * In all journaling modes block_write_full_page() will start the I/O.
3040 *
3041 * Problem:
3042 *
3043 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
3044 * ext4_writepage()
3045 *
3046 * Similar for:
3047 *
3048 * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
3049 *
3050 * Same applies to ext4_get_block(). We will deadlock on various things like
3051 * lock_journal and i_data_sem
3052 *
3053 * Setting PF_MEMALLOC here doesn't work - too many internal memory
3054 * allocations fail.
3055 *
3056 * 16May01: If we're reentered then journal_current_handle() will be
3057 * non-zero. We simply *return*.
3058 *
3059 * 1 July 2001: @@@ FIXME:
3060 * In journalled data mode, a data buffer may be metadata against the
3061 * current transaction. But the same file is part of a shared mapping
3062 * and someone does a writepage() on it.
3063 *
3064 * We will move the buffer onto the async_data list, but *after* it has
3065 * been dirtied. So there's a small window where we have dirty data on
3066 * BJ_Metadata.
3067 *
3068 * Note that this only applies to the last partial page in the file. The
3069 * bit which block_write_full_page() uses prepare/commit for. (That's
3070 * broken code anyway: it's wrong for msync()).
3071 *
3072 * It's a rare case: affects the final partial page, for journalled data
3073 * where the file is subject to both write() and writepage() in the same
3074 * transaction. To fix it we'll need a custom block_write_full_page().
3075 * We'll probably need that anyway for journalling writepage() output.
3076 *
3077 * We don't honour synchronous mounts for writepage(). That would be
3078 * disastrous. Any write() or metadata operation will sync the fs for
3079 * us.
3080 *
3081 */
3082static int __ext4_normal_writepage(struct page *page,
3083 struct writeback_control *wbc)
3084{
3085 struct inode *inode = page->mapping->host;
3086
3087 if (test_opt(inode->i_sb, NOBH))
3088 return nobh_writepage(page,
3089 ext4_normal_get_block_write, wbc);
3090 else
3091 return block_write_full_page(page,
3092 ext4_normal_get_block_write,
3093 wbc);
3094}
3095
3096static int ext4_normal_writepage(struct page *page,
3097 struct writeback_control *wbc)
3098{
3099 struct inode *inode = page->mapping->host;
3100 loff_t size = i_size_read(inode);
3101 loff_t len;
3102
3103 trace_mark(ext4_normal_writepage,
3104 "dev %s ino %lu page_index %lu",
3105 inode->i_sb->s_id, inode->i_ino, page->index);
3106 J_ASSERT(PageLocked(page));
3107 if (page->index == size >> PAGE_CACHE_SHIFT)
3108 len = size & ~PAGE_CACHE_MASK;
3109 else
3110 len = PAGE_CACHE_SIZE;
3111
3112 if (page_has_buffers(page)) {
3113		/* if the page has buffers they should all be mapped
3114		 * and allocated. If there are no buffers attached
3115 * to the page we know the page is dirty but it lost
3116 * buffers. That means that at some moment in time
3117 * after write_begin() / write_end() has been called
3118 * all buffers have been clean and thus they must have been
3119 * written at least once. So they are all mapped and we can
3120 * happily proceed with mapping them and writing the page.
3121 */
3122 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
3123 ext4_bh_unmapped_or_delay));
3124 }
3125
3126 if (!ext4_journal_current_handle())
3127 return __ext4_normal_writepage(page, wbc);
3128
3129 redirty_page_for_writepage(wbc, page);
3130 unlock_page(page);
3131 return 0;
3132}
3133
3134static int __ext4_journalled_writepage(struct page *page,
3135 struct writeback_control *wbc)
3136{
3137 struct address_space *mapping = page->mapping;
3138 struct inode *inode = mapping->host;
3139 struct buffer_head *page_bufs;
3140 handle_t *handle = NULL;
3141 int ret = 0;
3142 int err;
3143
3144 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
3145 ext4_normal_get_block_write);
3146 if (ret != 0)
3147 goto out_unlock;
3148
3149 page_bufs = page_buffers(page);
3150 walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
3151 bget_one);
3152 /* As soon as we unlock the page, it can go away, but we have
3153 * references to buffers so we are safe */
3154 unlock_page(page);
3155
3156 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
3157 if (IS_ERR(handle)) {
3158 ret = PTR_ERR(handle);
3159 goto out;
3160 }
3161
3162 ret = walk_page_buffers(handle, page_bufs, 0,
3163 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
3164
3165 err = walk_page_buffers(handle, page_bufs, 0,
3166 PAGE_CACHE_SIZE, NULL, write_end_fn);
3167 if (ret == 0)
3168 ret = err;
3169 err = ext4_journal_stop(handle);
3170 if (!ret)
3171 ret = err;
3172
3173 walk_page_buffers(handle, page_bufs, 0,
3174 PAGE_CACHE_SIZE, NULL, bput_one);
3175 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
3176 goto out;
3177
3178out_unlock:
3179 unlock_page(page);
3180out:
3181 return ret;
3182}
3183
3184static int ext4_journalled_writepage(struct page *page,
3185 struct writeback_control *wbc)
3186{
3187 struct inode *inode = page->mapping->host;
3188 loff_t size = i_size_read(inode);
3189 loff_t len;
3190
3191 trace_mark(ext4_journalled_writepage,
3192 "dev %s ino %lu page_index %lu",
3193 inode->i_sb->s_id, inode->i_ino, page->index);
3194 J_ASSERT(PageLocked(page));
3195 if (page->index == size >> PAGE_CACHE_SHIFT)
3196 len = size & ~PAGE_CACHE_MASK;
3197 else
3198 len = PAGE_CACHE_SIZE;
3199
3200 if (page_has_buffers(page)) {
3201		/* if the page has buffers they should all be mapped
3202		 * and allocated. If there are no buffers attached
3203 * to the page we know the page is dirty but it lost
3204 * buffers. That means that at some moment in time
3205 * after write_begin() / write_end() has been called
3206 * all buffers have been clean and thus they must have been
3207 * written at least once. So they are all mapped and we can
3208 * happily proceed with mapping them and writing the page.
3209 */
3210 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
3211 ext4_bh_unmapped_or_delay));
3212 }
3213
3214 if (ext4_journal_current_handle())
3215 goto no_write;
3216
3217 if (PageChecked(page)) {
3218 /*
3219 * It's mmapped pagecache. Add buffers and journal it. There
3220 * doesn't seem much point in redirtying the page here.
3221 */
3222 ClearPageChecked(page);
3223 return __ext4_journalled_writepage(page, wbc);
3224 } else {
3225 /*
3226 * It may be a page full of checkpoint-mode buffers. We don't
3227 * really know unless we go poke around in the buffer_heads.
3228 * But block_write_full_page will do the right thing.
3229 */
3230 return block_write_full_page(page,
3231 ext4_normal_get_block_write,
3232 wbc);
3233 }
3234no_write:
3235 redirty_page_for_writepage(wbc, page);
3236 unlock_page(page);
3237 return 0;
3238}
3239
3240static int ext4_readpage(struct file *file, struct page *page) 3220static int ext4_readpage(struct file *file, struct page *page)
3241{ 3221{
3242 return mpage_readpage(page, ext4_get_block); 3222 return mpage_readpage(page, ext4_get_block);
@@ -3288,8 +3268,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3288 * VFS code falls back into buffered path in that case so we are safe. 3268 * VFS code falls back into buffered path in that case so we are safe.
3289 */ 3269 */
3290static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3270static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3291 const struct iovec *iov, loff_t offset, 3271 const struct iovec *iov, loff_t offset,
3292 unsigned long nr_segs) 3272 unsigned long nr_segs)
3293{ 3273{
3294 struct file *file = iocb->ki_filp; 3274 struct file *file = iocb->ki_filp;
3295 struct inode *inode = file->f_mapping->host; 3275 struct inode *inode = file->f_mapping->host;
@@ -3383,7 +3363,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
3383static const struct address_space_operations ext4_ordered_aops = { 3363static const struct address_space_operations ext4_ordered_aops = {
3384 .readpage = ext4_readpage, 3364 .readpage = ext4_readpage,
3385 .readpages = ext4_readpages, 3365 .readpages = ext4_readpages,
3386 .writepage = ext4_normal_writepage, 3366 .writepage = ext4_writepage,
3387 .sync_page = block_sync_page, 3367 .sync_page = block_sync_page,
3388 .write_begin = ext4_write_begin, 3368 .write_begin = ext4_write_begin,
3389 .write_end = ext4_ordered_write_end, 3369 .write_end = ext4_ordered_write_end,
@@ -3398,7 +3378,7 @@ static const struct address_space_operations ext4_ordered_aops = {
3398static const struct address_space_operations ext4_writeback_aops = { 3378static const struct address_space_operations ext4_writeback_aops = {
3399 .readpage = ext4_readpage, 3379 .readpage = ext4_readpage,
3400 .readpages = ext4_readpages, 3380 .readpages = ext4_readpages,
3401 .writepage = ext4_normal_writepage, 3381 .writepage = ext4_writepage,
3402 .sync_page = block_sync_page, 3382 .sync_page = block_sync_page,
3403 .write_begin = ext4_write_begin, 3383 .write_begin = ext4_write_begin,
3404 .write_end = ext4_writeback_write_end, 3384 .write_end = ext4_writeback_write_end,
@@ -3413,7 +3393,7 @@ static const struct address_space_operations ext4_writeback_aops = {
3413static const struct address_space_operations ext4_journalled_aops = { 3393static const struct address_space_operations ext4_journalled_aops = {
3414 .readpage = ext4_readpage, 3394 .readpage = ext4_readpage,
3415 .readpages = ext4_readpages, 3395 .readpages = ext4_readpages,
3416 .writepage = ext4_journalled_writepage, 3396 .writepage = ext4_writepage,
3417 .sync_page = block_sync_page, 3397 .sync_page = block_sync_page,
3418 .write_begin = ext4_write_begin, 3398 .write_begin = ext4_write_begin,
3419 .write_end = ext4_journalled_write_end, 3399 .write_end = ext4_journalled_write_end,
@@ -3427,7 +3407,7 @@ static const struct address_space_operations ext4_journalled_aops = {
3427static const struct address_space_operations ext4_da_aops = { 3407static const struct address_space_operations ext4_da_aops = {
3428 .readpage = ext4_readpage, 3408 .readpage = ext4_readpage,
3429 .readpages = ext4_readpages, 3409 .readpages = ext4_readpages,
3430 .writepage = ext4_da_writepage, 3410 .writepage = ext4_writepage,
3431 .writepages = ext4_da_writepages, 3411 .writepages = ext4_da_writepages,
3432 .sync_page = block_sync_page, 3412 .sync_page = block_sync_page,
3433 .write_begin = ext4_da_write_begin, 3413 .write_begin = ext4_da_write_begin,
@@ -3474,7 +3454,8 @@ int ext4_block_truncate_page(handle_t *handle,
3474 struct page *page; 3454 struct page *page;
3475 int err = 0; 3455 int err = 0;
3476 3456
3477 page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); 3457 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3458 mapping_gfp_mask(mapping) & ~__GFP_FS);
3478 if (!page) 3459 if (!page)
3479 return -EINVAL; 3460 return -EINVAL;
3480 3461
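The point of switching from grab_cache_page() to find_or_create_page() above is the gfp mask: masking out __GFP_FS keeps the page allocation from recursing back into filesystem reclaim while a transaction may be held. A sketch of the masking idiom with made-up flag values (the real bits live in include/linux/gfp.h):

    #include <stdio.h>

    /* Stand-ins for the allocation flags, illustrative values only. */
    #define MODEL_GFP_IO (1u << 0)  /* allocation may start low-level I/O */
    #define MODEL_GFP_FS (1u << 1)  /* allocation may re-enter fs code    */

    int main(void)
    {
            /* what mapping_gfp_mask(mapping) might hand back */
            unsigned mask = MODEL_GFP_IO | MODEL_GFP_FS;

            /* find_or_create_page(..., mask & ~__GFP_FS): reclaim for
             * this allocation may still do I/O, but must not call back
             * into the filesystem. */
            unsigned safe = mask & ~MODEL_GFP_FS;

            printf("%#x -> %#x\n", mask, safe);
            return 0;
    }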
@@ -3609,7 +3590,8 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
3609 * (no partially truncated stuff there). */ 3590 * (no partially truncated stuff there). */
3610 3591
3611static Indirect *ext4_find_shared(struct inode *inode, int depth, 3592static Indirect *ext4_find_shared(struct inode *inode, int depth,
3612 ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) 3593 ext4_lblk_t offsets[4], Indirect chain[4],
3594 __le32 *top)
3613{ 3595{
3614 Indirect *partial, *p; 3596 Indirect *partial, *p;
3615 int k, err; 3597 int k, err;
@@ -3665,8 +3647,10 @@ no_top:
3665 * than `count' because there can be holes in there. 3647 * than `count' because there can be holes in there.
3666 */ 3648 */
3667static void ext4_clear_blocks(handle_t *handle, struct inode *inode, 3649static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3668 struct buffer_head *bh, ext4_fsblk_t block_to_free, 3650 struct buffer_head *bh,
3669 unsigned long count, __le32 *first, __le32 *last) 3651 ext4_fsblk_t block_to_free,
3652 unsigned long count, __le32 *first,
3653 __le32 *last)
3670{ 3654{
3671 __le32 *p; 3655 __le32 *p;
3672 if (try_to_extend_transaction(handle, inode)) { 3656 if (try_to_extend_transaction(handle, inode)) {
@@ -3683,10 +3667,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3683 } 3667 }
3684 3668
3685 /* 3669 /*
3686 * Any buffers which are on the journal will be in memory. We find 3670 * Any buffers which are on the journal will be in memory. We
3687 * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget() 3671 * find them on the hash table so jbd2_journal_revoke() will
3688 * on them. We've already detached each block from the file, so 3672 * run jbd2_journal_forget() on them. We've already detached
3689 * bforget() in jbd2_journal_forget() should be safe. 3673 * each block from the file, so bforget() in
3674 * jbd2_journal_forget() should be safe.
3690 * 3675 *
3691 * AKPM: turn on bforget in jbd2_journal_forget()!!! 3676 * AKPM: turn on bforget in jbd2_journal_forget()!!!
3692 */ 3677 */
@@ -3973,7 +3958,8 @@ void ext4_truncate(struct inode *inode)
3973 if (!ext4_can_truncate(inode)) 3958 if (!ext4_can_truncate(inode))
3974 return; 3959 return;
3975 3960
3976 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 3961 if (ei->i_disksize && inode->i_size == 0 &&
3962 !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3977 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 3963 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
3978 3964
3979 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3965 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4057,7 +4043,7 @@ void ext4_truncate(struct inode *inode)
4057 (__le32*)partial->bh->b_data+addr_per_block, 4043 (__le32*)partial->bh->b_data+addr_per_block,
4058 (chain+n-1) - partial); 4044 (chain+n-1) - partial);
4059 BUFFER_TRACE(partial->bh, "call brelse"); 4045 BUFFER_TRACE(partial->bh, "call brelse");
4060 brelse (partial->bh); 4046 brelse(partial->bh);
4061 partial--; 4047 partial--;
4062 } 4048 }
4063do_indirects: 4049do_indirects:
@@ -4298,8 +4284,9 @@ void ext4_get_inode_flags(struct ext4_inode_info *ei)
4298 if (flags & S_DIRSYNC) 4284 if (flags & S_DIRSYNC)
4299 ei->i_flags |= EXT4_DIRSYNC_FL; 4285 ei->i_flags |= EXT4_DIRSYNC_FL;
4300} 4286}
4287
4301static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4288static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
4302 struct ext4_inode_info *ei) 4289 struct ext4_inode_info *ei)
4303{ 4290{
4304 blkcnt_t i_blocks ; 4291 blkcnt_t i_blocks ;
4305 struct inode *inode = &(ei->vfs_inode); 4292 struct inode *inode = &(ei->vfs_inode);
@@ -4338,10 +4325,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4338 return inode; 4325 return inode;
4339 4326
4340 ei = EXT4_I(inode); 4327 ei = EXT4_I(inode);
4341#ifdef CONFIG_EXT4_FS_POSIX_ACL
4342 ei->i_acl = EXT4_ACL_NOT_CACHED;
4343 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
4344#endif
4345 4328
4346 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4329 ret = __ext4_get_inode_loc(inode, &iloc, 0);
4347 if (ret < 0) 4330 if (ret < 0)
@@ -4414,7 +4397,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4414 EXT4_GOOD_OLD_INODE_SIZE + 4397 EXT4_GOOD_OLD_INODE_SIZE +
4415 ei->i_extra_isize; 4398 ei->i_extra_isize;
4416 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 4399 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4417 ei->i_state |= EXT4_STATE_XATTR; 4400 ei->i_state |= EXT4_STATE_XATTR;
4418 } 4401 }
4419 } else 4402 } else
4420 ei->i_extra_isize = 0; 4403 ei->i_extra_isize = 0;
@@ -4433,7 +4416,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4433 4416
4434 ret = 0; 4417 ret = 0;
4435 if (ei->i_file_acl && 4418 if (ei->i_file_acl &&
4436 ((ei->i_file_acl < 4419 ((ei->i_file_acl <
4437 (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + 4420 (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
4438 EXT4_SB(sb)->s_gdb_count)) || 4421 EXT4_SB(sb)->s_gdb_count)) ||
4439 (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { 4422 (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
@@ -4448,15 +4431,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4448 !ext4_inode_is_fast_symlink(inode))) 4431 !ext4_inode_is_fast_symlink(inode)))
4449 /* Validate extent which is part of inode */ 4432 /* Validate extent which is part of inode */
4450 ret = ext4_ext_check_inode(inode); 4433 ret = ext4_ext_check_inode(inode);
4451 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4434 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4452 (S_ISLNK(inode->i_mode) && 4435 (S_ISLNK(inode->i_mode) &&
4453 !ext4_inode_is_fast_symlink(inode))) { 4436 !ext4_inode_is_fast_symlink(inode))) {
4454 /* Validate block references which are part of inode */ 4437 /* Validate block references which are part of inode */
4455 ret = ext4_check_inode_blockref(inode); 4438 ret = ext4_check_inode_blockref(inode);
4456 } 4439 }
4457 if (ret) { 4440 if (ret) {
4458 brelse(bh); 4441 brelse(bh);
4459 goto bad_inode; 4442 goto bad_inode;
4460 } 4443 }
4461 4444
4462 if (S_ISREG(inode->i_mode)) { 4445 if (S_ISREG(inode->i_mode)) {
@@ -4487,7 +4470,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4487 } else { 4470 } else {
4488 brelse(bh); 4471 brelse(bh);
4489 ret = -EIO; 4472 ret = -EIO;
4490 ext4_error(inode->i_sb, __func__, 4473 ext4_error(inode->i_sb, __func__,
4491 "bogus i_mode (%o) for inode=%lu", 4474 "bogus i_mode (%o) for inode=%lu",
4492 inode->i_mode, inode->i_ino); 4475 inode->i_mode, inode->i_ino);
4493 goto bad_inode; 4476 goto bad_inode;
@@ -4640,8 +4623,9 @@ static int ext4_do_update_inode(handle_t *handle,
4640 cpu_to_le32(new_encode_dev(inode->i_rdev)); 4623 cpu_to_le32(new_encode_dev(inode->i_rdev));
4641 raw_inode->i_block[2] = 0; 4624 raw_inode->i_block[2] = 0;
4642 } 4625 }
4643 } else for (block = 0; block < EXT4_N_BLOCKS; block++) 4626 } else
4644 raw_inode->i_block[block] = ei->i_data[block]; 4627 for (block = 0; block < EXT4_N_BLOCKS; block++)
4628 raw_inode->i_block[block] = ei->i_data[block];
4645 4629
4646 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4630 raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
4647 if (ei->i_extra_isize) { 4631 if (ei->i_extra_isize) {
@@ -4715,25 +4699,6 @@ int ext4_write_inode(struct inode *inode, int wait)
4715 return ext4_force_commit(inode->i_sb); 4699 return ext4_force_commit(inode->i_sb);
4716} 4700}
4717 4701
4718int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
4719{
4720 int err = 0;
4721
4722 mark_buffer_dirty(bh);
4723 if (inode && inode_needs_sync(inode)) {
4724 sync_dirty_buffer(bh);
4725 if (buffer_req(bh) && !buffer_uptodate(bh)) {
4726 ext4_error(inode->i_sb, __func__,
4727 "IO error syncing inode, "
4728 "inode=%lu, block=%llu",
4729 inode->i_ino,
4730 (unsigned long long)bh->b_blocknr);
4731 err = -EIO;
4732 }
4733 }
4734 return err;
4735}
4736
4737/* 4702/*
4738 * ext4_setattr() 4703 * ext4_setattr()
4739 * 4704 *
@@ -4930,7 +4895,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4930 */ 4895 */
4931int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4896int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4932{ 4897{
4933 int groups, gdpblocks; 4898 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
4899 int gdpblocks;
4934 int idxblocks; 4900 int idxblocks;
4935 int ret = 0; 4901 int ret = 0;
4936 4902
@@ -4957,8 +4923,8 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4957 groups += nrblocks; 4923 groups += nrblocks;
4958 4924
4959 gdpblocks = groups; 4925 gdpblocks = groups;
4960 if (groups > EXT4_SB(inode->i_sb)->s_groups_count) 4926 if (groups > ngroups)
4961 groups = EXT4_SB(inode->i_sb)->s_groups_count; 4927 groups = ngroups;
4962 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) 4928 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
4963 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; 4929 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
4964 4930
@@ -4998,7 +4964,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
4998 * Calculate the journal credits for a chunk of data modification. 4964 * Calculate the journal credits for a chunk of data modification.
4999 * 4965 *
5000 * This is called from DIO, fallocate or whoever calling 4966 * This is called from DIO, fallocate or whoever calling
5001 * ext4_get_blocks_wrap() to map/allocate a chunk of contiguous disk blocks. 4967 * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
5002 * 4968 *
5003 * journal buffers for data blocks are not included here, as DIO 4969 * journal buffers for data blocks are not included here, as DIO
5004 * and fallocate do not need to journal data buffers. 4970 * and fallocate do not need to journal data buffers.
@@ -5013,7 +4979,7 @@ int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
5013 * Given this, we know that the caller already has write access to iloc->bh. 4979 * Given this, we know that the caller already has write access to iloc->bh.
5014 */ 4980 */
5015int ext4_mark_iloc_dirty(handle_t *handle, 4981int ext4_mark_iloc_dirty(handle_t *handle,
5016 struct inode *inode, struct ext4_iloc *iloc) 4982 struct inode *inode, struct ext4_iloc *iloc)
5017{ 4983{
5018 int err = 0; 4984 int err = 0;
5019 4985
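A recurring conversion in the inode.c hunks above replaces direct reads of EXT4_SB(sb)->s_groups_count with ext4_get_groups_count(sb), so each function samples the group count exactly once. A minimal sketch of the accessor this assumes, per the ext4.h side of this series (the read barrier pairs with the write barrier online resize issues after publishing the enlarged count):

static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
{
	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;

	/* paired with the smp_wmb() in the online-resize path */
	smp_rmb();
	return ngroups;
}

Sampling the count once also keeps the mballoc.c loops converted below self-consistent if a resize lands mid-scan.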
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 91e75f7a9e73..7050a9cd04a4 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -12,8 +12,8 @@
12#include <linux/capability.h> 12#include <linux/capability.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/compat.h> 14#include <linux/compat.h>
15#include <linux/smp_lock.h>
16#include <linux/mount.h> 15#include <linux/mount.h>
16#include <linux/file.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include "ext4_jbd2.h" 18#include "ext4_jbd2.h"
19#include "ext4.h" 19#include "ext4.h"
@@ -191,7 +191,7 @@ setversion_out:
191 case EXT4_IOC_GROUP_EXTEND: { 191 case EXT4_IOC_GROUP_EXTEND: {
192 ext4_fsblk_t n_blocks_count; 192 ext4_fsblk_t n_blocks_count;
193 struct super_block *sb = inode->i_sb; 193 struct super_block *sb = inode->i_sb;
194 int err, err2; 194 int err, err2=0;
195 195
196 if (!capable(CAP_SYS_RESOURCE)) 196 if (!capable(CAP_SYS_RESOURCE))
197 return -EPERM; 197 return -EPERM;
@@ -204,19 +204,56 @@ setversion_out:
204 return err; 204 return err;
205 205
206 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); 206 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
207 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 207 if (EXT4_SB(sb)->s_journal) {
208 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 208 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
209 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 209 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
210 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
211 }
210 if (err == 0) 212 if (err == 0)
211 err = err2; 213 err = err2;
212 mnt_drop_write(filp->f_path.mnt); 214 mnt_drop_write(filp->f_path.mnt);
213 215
214 return err; 216 return err;
215 } 217 }
218
219 case EXT4_IOC_MOVE_EXT: {
220 struct move_extent me;
221 struct file *donor_filp;
222 int err;
223
224 if (copy_from_user(&me,
225 (struct move_extent __user *)arg, sizeof(me)))
226 return -EFAULT;
227
228 donor_filp = fget(me.donor_fd);
229 if (!donor_filp)
230 return -EBADF;
231
232 if (!capable(CAP_DAC_OVERRIDE)) {
233 if ((current->real_cred->fsuid != inode->i_uid) ||
234 !(inode->i_mode & S_IRUSR) ||
235 !(donor_filp->f_dentry->d_inode->i_mode &
236 S_IRUSR)) {
237 fput(donor_filp);
238 return -EACCES;
239 }
240 }
241
242 err = ext4_move_extents(filp, donor_filp, me.orig_start,
243 me.donor_start, me.len, &me.moved_len);
244 fput(donor_filp);
245
246 if (!err)
247 if (copy_to_user((struct move_extent *)arg,
248 &me, sizeof(me)))
249 return -EFAULT;
250 return err;
251 }
252
216 case EXT4_IOC_GROUP_ADD: { 253 case EXT4_IOC_GROUP_ADD: {
217 struct ext4_new_group_data input; 254 struct ext4_new_group_data input;
218 struct super_block *sb = inode->i_sb; 255 struct super_block *sb = inode->i_sb;
219 int err, err2; 256 int err, err2=0;
220 257
221 if (!capable(CAP_SYS_RESOURCE)) 258 if (!capable(CAP_SYS_RESOURCE))
222 return -EPERM; 259 return -EPERM;
@@ -230,9 +267,11 @@ setversion_out:
230 return err; 267 return err;
231 268
232 err = ext4_group_add(sb, &input); 269 err = ext4_group_add(sb, &input);
233 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 270 if (EXT4_SB(sb)->s_journal) {
234 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 271 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
235 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 272 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
273 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
274 }
236 if (err == 0) 275 if (err == 0)
237 err = err2; 276 err = err2;
238 mnt_drop_write(filp->f_path.mnt); 277 mnt_drop_write(filp->f_path.mnt);
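For context on the EXT4_IOC_MOVE_EXT case added above, a hedged userspace sketch of driving it. The structure below mirrors only the fields the handler dereferences (donor_fd, orig_start, donor_start, len, moved_len) plus an assumed padding word; a real program should copy the authoritative definitions from the kernel's fs/ext4/ext4.h instead:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>
#include <linux/types.h>

/* illustrative copy; authoritative layout lives in fs/ext4/ext4.h */
struct move_extent {
	__u32 reserved;		/* assumed alignment pad, should be zero */
	__u32 donor_fd;		/* fd of the donor file */
	__u64 orig_start;	/* logical start block in the original file */
	__u64 donor_start;	/* logical start block in the donor file */
	__u64 len;		/* number of blocks to exchange */
	__u64 moved_len;	/* filled in by the kernel */
};

#define EXT4_IOC_MOVE_EXT	_IOWR('f', 15, struct move_extent)

int main(int argc, char **argv)
{
	struct move_extent me = { 0 };
	int orig_fd, donor_fd;

	if (argc != 3)
		return 1;
	orig_fd = open(argv[1], O_RDWR);
	donor_fd = open(argv[2], O_WRONLY);
	if (orig_fd < 0 || donor_fd < 0)
		return 1;

	me.donor_fd = donor_fd;
	me.orig_start = 0;
	me.donor_start = 0;
	me.len = 1024;		/* hypothetical length, in fs blocks */

	if (ioctl(orig_fd, EXT4_IOC_MOVE_EXT, &me) < 0)
		perror("EXT4_IOC_MOVE_EXT");
	else
		printf("moved %llu blocks\n",
		       (unsigned long long)me.moved_len);
	return 0;
}

Note the permission model visible in the handler: without CAP_DAC_OVERRIDE the caller's fsuid must own the original inode and both files must be owner-readable.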
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f871677a7984..cd258463e2a9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,8 @@
22 */ 22 */
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <trace/events/ext4.h>
26
25/* 27/*
26 * MUSTDO: 28 * MUSTDO:
27 * - test ext4_ext_search_left() and ext4_ext_search_right() 29 * - test ext4_ext_search_left() and ext4_ext_search_right()
@@ -340,8 +342,6 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
340 ext4_group_t group); 342 ext4_group_t group);
341static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); 343static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
342 344
343
344
345static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 345static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
346{ 346{
347#if BITS_PER_LONG == 64 347#if BITS_PER_LONG == 64
@@ -372,24 +372,12 @@ static inline void mb_set_bit(int bit, void *addr)
372 ext4_set_bit(bit, addr); 372 ext4_set_bit(bit, addr);
373} 373}
374 374
375static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
376{
377 addr = mb_correct_addr_and_bit(&bit, addr);
378 ext4_set_bit_atomic(lock, bit, addr);
379}
380
381static inline void mb_clear_bit(int bit, void *addr) 375static inline void mb_clear_bit(int bit, void *addr)
382{ 376{
383 addr = mb_correct_addr_and_bit(&bit, addr); 377 addr = mb_correct_addr_and_bit(&bit, addr);
384 ext4_clear_bit(bit, addr); 378 ext4_clear_bit(bit, addr);
385} 379}
386 380
387static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
388{
389 addr = mb_correct_addr_and_bit(&bit, addr);
390 ext4_clear_bit_atomic(lock, bit, addr);
391}
392
393static inline int mb_find_next_zero_bit(void *addr, int max, int start) 381static inline int mb_find_next_zero_bit(void *addr, int max, int start)
394{ 382{
395 int fix = 0, ret, tmpmax; 383 int fix = 0, ret, tmpmax;
@@ -448,7 +436,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
448 436
449 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 437 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
450 return; 438 return;
451 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); 439 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
452 for (i = 0; i < count; i++) { 440 for (i = 0; i < count; i++) {
453 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 441 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
454 ext4_fsblk_t blocknr; 442 ext4_fsblk_t blocknr;
@@ -472,7 +460,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
472 460
473 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 461 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
474 return; 462 return;
475 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); 463 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
476 for (i = 0; i < count; i++) { 464 for (i = 0; i < count; i++) {
477 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); 465 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
478 mb_set_bit(first + i, e4b->bd_info->bb_bitmap); 466 mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
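The assert_spin_locked() conversions above assume the per-group lock is now reachable as a plain spinlock pointer. A sketch of the helpers this relies on, as defined on the ext4.h side of this series (bgl_lock_ptr() comes from <linux/blockgroup_lock.h>):

static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
					      ext4_group_t group)
{
	/* hashed per-group spinlock shared via the blockgroup_lock */
	return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
}

static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
	spin_lock(ext4_group_lock_ptr(sb, group));
}

static inline void ext4_unlock_group(struct super_block *sb,
				     ext4_group_t group)
{
	spin_unlock(ext4_group_lock_ptr(sb, group));
}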
@@ -669,7 +657,8 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
669 } 657 }
670} 658}
671 659
672static void ext4_mb_generate_buddy(struct super_block *sb, 660static noinline_for_stack
661void ext4_mb_generate_buddy(struct super_block *sb,
673 void *buddy, void *bitmap, ext4_group_t group) 662 void *buddy, void *bitmap, ext4_group_t group)
674{ 663{
675 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 664 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
@@ -739,6 +728,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
739 728
740static int ext4_mb_init_cache(struct page *page, char *incore) 729static int ext4_mb_init_cache(struct page *page, char *incore)
741{ 730{
731 ext4_group_t ngroups;
742 int blocksize; 732 int blocksize;
743 int blocks_per_page; 733 int blocks_per_page;
744 int groups_per_page; 734 int groups_per_page;
@@ -757,6 +747,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
757 747
758 inode = page->mapping->host; 748 inode = page->mapping->host;
759 sb = inode->i_sb; 749 sb = inode->i_sb;
750 ngroups = ext4_get_groups_count(sb);
760 blocksize = 1 << inode->i_blkbits; 751 blocksize = 1 << inode->i_blkbits;
761 blocks_per_page = PAGE_CACHE_SIZE / blocksize; 752 blocks_per_page = PAGE_CACHE_SIZE / blocksize;
762 753
@@ -780,7 +771,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
780 for (i = 0; i < groups_per_page; i++) { 771 for (i = 0; i < groups_per_page; i++) {
781 struct ext4_group_desc *desc; 772 struct ext4_group_desc *desc;
782 773
783 if (first_group + i >= EXT4_SB(sb)->s_groups_count) 774 if (first_group + i >= ngroups)
784 break; 775 break;
785 776
786 err = -EIO; 777 err = -EIO;
@@ -801,17 +792,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
801 unlock_buffer(bh[i]); 792 unlock_buffer(bh[i]);
802 continue; 793 continue;
803 } 794 }
804 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 795 ext4_lock_group(sb, first_group + i);
805 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 796 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
806 ext4_init_block_bitmap(sb, bh[i], 797 ext4_init_block_bitmap(sb, bh[i],
807 first_group + i, desc); 798 first_group + i, desc);
808 set_bitmap_uptodate(bh[i]); 799 set_bitmap_uptodate(bh[i]);
809 set_buffer_uptodate(bh[i]); 800 set_buffer_uptodate(bh[i]);
810 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 801 ext4_unlock_group(sb, first_group + i);
811 unlock_buffer(bh[i]); 802 unlock_buffer(bh[i]);
812 continue; 803 continue;
813 } 804 }
814 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 805 ext4_unlock_group(sb, first_group + i);
815 if (buffer_uptodate(bh[i])) { 806 if (buffer_uptodate(bh[i])) {
816 /* 807 /*
817 * if the group is not uninit and bh is uptodate, 808 * if the group is not uninit and bh is uptodate,
@@ -852,7 +843,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
852 struct ext4_group_info *grinfo; 843 struct ext4_group_info *grinfo;
853 844
854 group = (first_block + i) >> 1; 845 group = (first_block + i) >> 1;
855 if (group >= EXT4_SB(sb)->s_groups_count) 846 if (group >= ngroups)
856 break; 847 break;
857 848
858 /* 849 /*
@@ -1078,7 +1069,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1078 return 0; 1069 return 0;
1079} 1070}
1080 1071
1081static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) 1072static void mb_clear_bits(void *bm, int cur, int len)
1082{ 1073{
1083 __u32 *addr; 1074 __u32 *addr;
1084 1075
@@ -1091,15 +1082,12 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
1091 cur += 32; 1082 cur += 32;
1092 continue; 1083 continue;
1093 } 1084 }
1094 if (lock) 1085 mb_clear_bit(cur, bm);
1095 mb_clear_bit_atomic(lock, cur, bm);
1096 else
1097 mb_clear_bit(cur, bm);
1098 cur++; 1086 cur++;
1099 } 1087 }
1100} 1088}
1101 1089
1102static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) 1090static void mb_set_bits(void *bm, int cur, int len)
1103{ 1091{
1104 __u32 *addr; 1092 __u32 *addr;
1105 1093
@@ -1112,10 +1100,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1112 cur += 32; 1100 cur += 32;
1113 continue; 1101 continue;
1114 } 1102 }
1115 if (lock) 1103 mb_set_bit(cur, bm);
1116 mb_set_bit_atomic(lock, cur, bm);
1117 else
1118 mb_set_bit(cur, bm);
1119 cur++; 1104 cur++;
1120 } 1105 }
1121} 1106}
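Pieced together from the fragments above, the de-atomicized helper reduces to a plain word-at-a-time loop; a sketch of the resulting shape (the aligned-word fast path sits in context lines the diff elides):

static void mb_set_bits(void *bm, int cur, int len)
{
	__u32 *addr;

	len = cur + len;
	while (cur < len) {
		if ((cur & 31) == 0 && (len - cur) >= 32) {
			/* fast path: store one whole aligned 32-bit word */
			addr = bm + (cur >> 3);
			*addr = 0xffffffff;
			cur += 32;
			continue;
		}
		/* slow path: unaligned stragglers, one bit at a time */
		mb_set_bit(cur, bm);
		cur++;
	}
}

mb_clear_bits() is the mirror image, storing 0 in the fast path and calling mb_clear_bit() in the slow one. Dropping the lock argument is safe because every remaining caller runs under ext4_lock_group().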
@@ -1131,7 +1116,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1131 struct super_block *sb = e4b->bd_sb; 1116 struct super_block *sb = e4b->bd_sb;
1132 1117
1133 BUG_ON(first + count > (sb->s_blocksize << 3)); 1118 BUG_ON(first + count > (sb->s_blocksize << 3));
1134 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); 1119 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1135 mb_check_buddy(e4b); 1120 mb_check_buddy(e4b);
1136 mb_free_blocks_double(inode, e4b, first, count); 1121 mb_free_blocks_double(inode, e4b, first, count);
1137 1122
@@ -1212,7 +1197,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1212 int ord; 1197 int ord;
1213 void *buddy; 1198 void *buddy;
1214 1199
1215 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); 1200 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1216 BUG_ON(ex == NULL); 1201 BUG_ON(ex == NULL);
1217 1202
1218 buddy = mb_find_buddy(e4b, order, &max); 1203 buddy = mb_find_buddy(e4b, order, &max);
@@ -1276,7 +1261,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1276 1261
1277 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); 1262 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1278 BUG_ON(e4b->bd_group != ex->fe_group); 1263 BUG_ON(e4b->bd_group != ex->fe_group);
1279 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); 1264 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1280 mb_check_buddy(e4b); 1265 mb_check_buddy(e4b);
1281 mb_mark_used_double(e4b, start, len); 1266 mb_mark_used_double(e4b, start, len);
1282 1267
@@ -1330,8 +1315,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1330 e4b->bd_info->bb_counters[ord]++; 1315 e4b->bd_info->bb_counters[ord]++;
1331 } 1316 }
1332 1317
1333 mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), 1318 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1334 EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1335 mb_check_buddy(e4b); 1319 mb_check_buddy(e4b);
1336 1320
1337 return ret; 1321 return ret;
@@ -1497,7 +1481,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1497 ext4_mb_check_limits(ac, e4b, 0); 1481 ext4_mb_check_limits(ac, e4b, 0);
1498} 1482}
1499 1483
1500static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, 1484static noinline_for_stack
1485int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1501 struct ext4_buddy *e4b) 1486 struct ext4_buddy *e4b)
1502{ 1487{
1503 struct ext4_free_extent ex = ac->ac_b_ex; 1488 struct ext4_free_extent ex = ac->ac_b_ex;
@@ -1524,7 +1509,8 @@ static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1524 return 0; 1509 return 0;
1525} 1510}
1526 1511
1527static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, 1512static noinline_for_stack
1513int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1528 struct ext4_buddy *e4b) 1514 struct ext4_buddy *e4b)
1529{ 1515{
1530 ext4_group_t group = ac->ac_g_ex.fe_group; 1516 ext4_group_t group = ac->ac_g_ex.fe_group;
@@ -1583,7 +1569,8 @@ static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1583 * The routine scans buddy structures (not bitmap!) from given order 1569 * The routine scans buddy structures (not bitmap!) from given order
1584 * to max order and tries to find big enough chunk to satisfy the req 1570 * to max order and tries to find big enough chunk to satisfy the req
1585 */ 1571 */
1586static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, 1572static noinline_for_stack
1573void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1587 struct ext4_buddy *e4b) 1574 struct ext4_buddy *e4b)
1588{ 1575{
1589 struct super_block *sb = ac->ac_sb; 1576 struct super_block *sb = ac->ac_sb;
@@ -1626,7 +1613,8 @@ static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1626 * In order to optimize scanning, caller must pass number of 1613 * In order to optimize scanning, caller must pass number of
1627 * free blocks in the group, so the routine can know upper limit. 1614 * free blocks in the group, so the routine can know upper limit.
1628 */ 1615 */
1629static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, 1616static noinline_for_stack
1617void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1630 struct ext4_buddy *e4b) 1618 struct ext4_buddy *e4b)
1631{ 1619{
1632 struct super_block *sb = ac->ac_sb; 1620 struct super_block *sb = ac->ac_sb;
@@ -1685,7 +1673,8 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1685 * we try to find stripe-aligned chunks for stripe-size requests 1673 * we try to find stripe-aligned chunks for stripe-size requests
1686 * XXX should do so at least for multiples of stripe size as well 1674 * XXX should do so at least for multiples of stripe size as well
1687 */ 1675 */
1688static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, 1676static noinline_for_stack
1677void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1689 struct ext4_buddy *e4b) 1678 struct ext4_buddy *e4b)
1690{ 1679{
1691 struct super_block *sb = ac->ac_sb; 1680 struct super_block *sb = ac->ac_sb;
@@ -1726,7 +1715,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1726 unsigned free, fragments; 1715 unsigned free, fragments;
1727 unsigned i, bits; 1716 unsigned i, bits;
1728 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 1717 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1729 struct ext4_group_desc *desc;
1730 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1718 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1731 1719
1732 BUG_ON(cr < 0 || cr >= 4); 1720 BUG_ON(cr < 0 || cr >= 4);
@@ -1742,10 +1730,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1742 switch (cr) { 1730 switch (cr) {
1743 case 0: 1731 case 0:
1744 BUG_ON(ac->ac_2order == 0); 1732 BUG_ON(ac->ac_2order == 0);
1745 /* If this group is uninitialized, skip it initially */
1746 desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
1747 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1748 return 0;
1749 1733
1750 /* Avoid using the first bg of a flexgroup for data files */ 1734 /* Avoid using the first bg of a flexgroup for data files */
1751 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && 1735 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
@@ -1788,6 +1772,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1788 int block, pnum; 1772 int block, pnum;
1789 int blocks_per_page; 1773 int blocks_per_page;
1790 int groups_per_page; 1774 int groups_per_page;
1775 ext4_group_t ngroups = ext4_get_groups_count(sb);
1791 ext4_group_t first_group; 1776 ext4_group_t first_group;
1792 struct ext4_group_info *grp; 1777 struct ext4_group_info *grp;
1793 1778
@@ -1807,7 +1792,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1807 /* read all groups the page covers into the cache */ 1792 /* read all groups the page covers into the cache */
1808 for (i = 0; i < groups_per_page; i++) { 1793 for (i = 0; i < groups_per_page; i++) {
1809 1794
1810 if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) 1795 if ((first_group + i) >= ngroups)
1811 break; 1796 break;
1812 grp = ext4_get_group_info(sb, first_group + i); 1797 grp = ext4_get_group_info(sb, first_group + i);
1813 /* take all groups write allocation 1798 /* take all groups write allocation
@@ -1852,7 +1837,8 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1852 1837
1853} 1838}
1854 1839
1855static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1840static noinline_for_stack
1841int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1856{ 1842{
1857 1843
1858 int ret; 1844 int ret;
@@ -1945,8 +1931,7 @@ err:
1945static noinline_for_stack int 1931static noinline_for_stack int
1946ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1932ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1947{ 1933{
1948 ext4_group_t group; 1934 ext4_group_t ngroups, group, i;
1949 ext4_group_t i;
1950 int cr; 1935 int cr;
1951 int err = 0; 1936 int err = 0;
1952 int bsbits; 1937 int bsbits;
@@ -1957,6 +1942,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1957 1942
1958 sb = ac->ac_sb; 1943 sb = ac->ac_sb;
1959 sbi = EXT4_SB(sb); 1944 sbi = EXT4_SB(sb);
1945 ngroups = ext4_get_groups_count(sb);
1960 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1946 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1961 1947
1962 /* first, try the goal */ 1948 /* first, try the goal */
@@ -2017,11 +2003,11 @@ repeat:
2017 */ 2003 */
2018 group = ac->ac_g_ex.fe_group; 2004 group = ac->ac_g_ex.fe_group;
2019 2005
2020 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { 2006 for (i = 0; i < ngroups; group++, i++) {
2021 struct ext4_group_info *grp; 2007 struct ext4_group_info *grp;
2022 struct ext4_group_desc *desc; 2008 struct ext4_group_desc *desc;
2023 2009
2024 if (group == EXT4_SB(sb)->s_groups_count) 2010 if (group == ngroups)
2025 group = 0; 2011 group = 0;
2026 2012
2027 /* quick check to skip empty groups */ 2013 /* quick check to skip empty groups */
@@ -2064,9 +2050,7 @@ repeat:
2064 2050
2065 ac->ac_groups_scanned++; 2051 ac->ac_groups_scanned++;
2066 desc = ext4_get_group_desc(sb, group, NULL); 2052 desc = ext4_get_group_desc(sb, group, NULL);
2067 if (cr == 0 || (desc->bg_flags & 2053 if (cr == 0)
2068 cpu_to_le16(EXT4_BG_BLOCK_UNINIT) &&
2069 ac->ac_2order != 0))
2070 ext4_mb_simple_scan_group(ac, &e4b); 2054 ext4_mb_simple_scan_group(ac, &e4b);
2071 else if (cr == 1 && 2055 else if (cr == 1 &&
2072 ac->ac_g_ex.fe_len == sbi->s_stripe) 2056 ac->ac_g_ex.fe_len == sbi->s_stripe)
@@ -2315,12 +2299,10 @@ static struct file_operations ext4_mb_seq_history_fops = {
2315static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2299static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2316{ 2300{
2317 struct super_block *sb = seq->private; 2301 struct super_block *sb = seq->private;
2318 struct ext4_sb_info *sbi = EXT4_SB(sb);
2319 ext4_group_t group; 2302 ext4_group_t group;
2320 2303
2321 if (*pos < 0 || *pos >= sbi->s_groups_count) 2304 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2322 return NULL; 2305 return NULL;
2323
2324 group = *pos + 1; 2306 group = *pos + 1;
2325 return (void *) ((unsigned long) group); 2307 return (void *) ((unsigned long) group);
2326} 2308}
@@ -2328,11 +2310,10 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2328static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2310static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2329{ 2311{
2330 struct super_block *sb = seq->private; 2312 struct super_block *sb = seq->private;
2331 struct ext4_sb_info *sbi = EXT4_SB(sb);
2332 ext4_group_t group; 2313 ext4_group_t group;
2333 2314
2334 ++*pos; 2315 ++*pos;
2335 if (*pos < 0 || *pos >= sbi->s_groups_count) 2316 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2336 return NULL; 2317 return NULL;
2337 group = *pos + 1; 2318 group = *pos + 1;
2338 return (void *) ((unsigned long) group); 2319 return (void *) ((unsigned long) group);
@@ -2420,7 +2401,8 @@ static void ext4_mb_history_release(struct super_block *sb)
2420 2401
2421 if (sbi->s_proc != NULL) { 2402 if (sbi->s_proc != NULL) {
2422 remove_proc_entry("mb_groups", sbi->s_proc); 2403 remove_proc_entry("mb_groups", sbi->s_proc);
2423 remove_proc_entry("mb_history", sbi->s_proc); 2404 if (sbi->s_mb_history_max)
2405 remove_proc_entry("mb_history", sbi->s_proc);
2424 } 2406 }
2425 kfree(sbi->s_mb_history); 2407 kfree(sbi->s_mb_history);
2426} 2408}
@@ -2431,17 +2413,17 @@ static void ext4_mb_history_init(struct super_block *sb)
2431 int i; 2413 int i;
2432 2414
2433 if (sbi->s_proc != NULL) { 2415 if (sbi->s_proc != NULL) {
2434 proc_create_data("mb_history", S_IRUGO, sbi->s_proc, 2416 if (sbi->s_mb_history_max)
2435 &ext4_mb_seq_history_fops, sb); 2417 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2418 &ext4_mb_seq_history_fops, sb);
2436 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2419 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2437 &ext4_mb_seq_groups_fops, sb); 2420 &ext4_mb_seq_groups_fops, sb);
2438 } 2421 }
2439 2422
2440 sbi->s_mb_history_max = 1000;
2441 sbi->s_mb_history_cur = 0; 2423 sbi->s_mb_history_cur = 0;
2442 spin_lock_init(&sbi->s_mb_history_lock); 2424 spin_lock_init(&sbi->s_mb_history_lock);
2443 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); 2425 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2444 sbi->s_mb_history = kzalloc(i, GFP_KERNEL); 2426 sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
2445 /* if we can't allocate history, then we simply won't use it */ 2427 /* if we can't allocate history, then we simply won't use it */
2446} 2428}
2447 2429
@@ -2451,7 +2433,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
2451 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2433 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2452 struct ext4_mb_history h; 2434 struct ext4_mb_history h;
2453 2435
2454 if (unlikely(sbi->s_mb_history == NULL)) 2436 if (sbi->s_mb_history == NULL)
2455 return; 2437 return;
2456 2438
2457 if (!(ac->ac_op & sbi->s_mb_history_filter)) 2439 if (!(ac->ac_op & sbi->s_mb_history_filter))
@@ -2587,6 +2569,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
2587 2569
2588static int ext4_mb_init_backend(struct super_block *sb) 2570static int ext4_mb_init_backend(struct super_block *sb)
2589{ 2571{
2572 ext4_group_t ngroups = ext4_get_groups_count(sb);
2590 ext4_group_t i; 2573 ext4_group_t i;
2591 int metalen; 2574 int metalen;
2592 struct ext4_sb_info *sbi = EXT4_SB(sb); 2575 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2598,7 +2581,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2598 struct ext4_group_desc *desc; 2581 struct ext4_group_desc *desc;
2599 2582
2600 /* This is the number of blocks used by GDT */ 2583 /* This is the number of blocks used by GDT */
2601 num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 2584 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
2602 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); 2585 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
2603 2586
2604 /* 2587 /*
@@ -2644,7 +2627,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2644 for (i = 0; i < num_meta_group_infos; i++) { 2627 for (i = 0; i < num_meta_group_infos; i++) {
2645 if ((i + 1) == num_meta_group_infos) 2628 if ((i + 1) == num_meta_group_infos)
2646 metalen = sizeof(*meta_group_info) * 2629 metalen = sizeof(*meta_group_info) *
2647 (sbi->s_groups_count - 2630 (ngroups -
2648 (i << EXT4_DESC_PER_BLOCK_BITS(sb))); 2631 (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
2649 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2632 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2650 if (meta_group_info == NULL) { 2633 if (meta_group_info == NULL) {
@@ -2655,7 +2638,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2655 sbi->s_group_info[i] = meta_group_info; 2638 sbi->s_group_info[i] = meta_group_info;
2656 } 2639 }
2657 2640
2658 for (i = 0; i < sbi->s_groups_count; i++) { 2641 for (i = 0; i < ngroups; i++) {
2659 desc = ext4_get_group_desc(sb, i, NULL); 2642 desc = ext4_get_group_desc(sb, i, NULL);
2660 if (desc == NULL) { 2643 if (desc == NULL) {
2661 printk(KERN_ERR 2644 printk(KERN_ERR
@@ -2761,7 +2744,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2761 return 0; 2744 return 0;
2762} 2745}
2763 2746
2764/* need to be called with ext4 group lock (ext4_lock_group) */ 2747/* need to be called with the ext4 group lock held */
2765static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) 2748static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2766{ 2749{
2767 struct ext4_prealloc_space *pa; 2750 struct ext4_prealloc_space *pa;
@@ -2781,13 +2764,14 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2781 2764
2782int ext4_mb_release(struct super_block *sb) 2765int ext4_mb_release(struct super_block *sb)
2783{ 2766{
2767 ext4_group_t ngroups = ext4_get_groups_count(sb);
2784 ext4_group_t i; 2768 ext4_group_t i;
2785 int num_meta_group_infos; 2769 int num_meta_group_infos;
2786 struct ext4_group_info *grinfo; 2770 struct ext4_group_info *grinfo;
2787 struct ext4_sb_info *sbi = EXT4_SB(sb); 2771 struct ext4_sb_info *sbi = EXT4_SB(sb);
2788 2772
2789 if (sbi->s_group_info) { 2773 if (sbi->s_group_info) {
2790 for (i = 0; i < sbi->s_groups_count; i++) { 2774 for (i = 0; i < ngroups; i++) {
2791 grinfo = ext4_get_group_info(sb, i); 2775 grinfo = ext4_get_group_info(sb, i);
2792#ifdef DOUBLE_CHECK 2776#ifdef DOUBLE_CHECK
2793 kfree(grinfo->bb_bitmap); 2777 kfree(grinfo->bb_bitmap);
@@ -2797,7 +2781,7 @@ int ext4_mb_release(struct super_block *sb)
2797 ext4_unlock_group(sb, i); 2781 ext4_unlock_group(sb, i);
2798 kfree(grinfo); 2782 kfree(grinfo);
2799 } 2783 }
2800 num_meta_group_infos = (sbi->s_groups_count + 2784 num_meta_group_infos = (ngroups +
2801 EXT4_DESC_PER_BLOCK(sb) - 1) >> 2785 EXT4_DESC_PER_BLOCK(sb) - 1) >>
2802 EXT4_DESC_PER_BLOCK_BITS(sb); 2786 EXT4_DESC_PER_BLOCK_BITS(sb);
2803 for (i = 0; i < num_meta_group_infos; i++) 2787 for (i = 0; i < num_meta_group_infos; i++)
@@ -2882,9 +2866,8 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2882 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2866 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
2883 + entry->start_blk 2867 + entry->start_blk
2884 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2868 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
2885 trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", 2869 trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
2886 sb->s_id, (unsigned long long) discard_block, 2870 entry->count);
2887 entry->count);
2888 sb_issue_discard(sb, discard_block, entry->count); 2871 sb_issue_discard(sb, discard_block, entry->count);
2889 2872
2890 kmem_cache_free(ext4_free_ext_cachep, entry); 2873 kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -2926,7 +2909,11 @@ int __init init_ext4_mballoc(void)
2926 2909
2927void exit_ext4_mballoc(void) 2910void exit_ext4_mballoc(void)
2928{ 2911{
2929 /* XXX: synchronize_rcu(); */ 2912 /*
2913 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2914 * before destroying the slab cache.
2915 */
2916 rcu_barrier();
2930 kmem_cache_destroy(ext4_pspace_cachep); 2917 kmem_cache_destroy(ext4_pspace_cachep);
2931 kmem_cache_destroy(ext4_ac_cachep); 2918 kmem_cache_destroy(ext4_ac_cachep);
2932 kmem_cache_destroy(ext4_free_ext_cachep); 2919 kmem_cache_destroy(ext4_free_ext_cachep);
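The rcu_barrier() conversion above matters because synchronize_rcu() only waits for a grace period to elapse, while the caches being destroyed can still be referenced from call_rcu() callbacks that have not run yet. A minimal standalone sketch of the pattern (hypothetical names, not ext4 code):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct pspace {
	struct rcu_head rcu;
	/* ... payload ... */
};

static struct kmem_cache *pspace_cachep;

static void pspace_free_rcu(struct rcu_head *head)
{
	/* runs from RCU callback context, after the grace period */
	kmem_cache_free(pspace_cachep,
			container_of(head, struct pspace, rcu));
}

static void pspace_release(struct pspace *p)
{
	call_rcu(&p->rcu, pspace_free_rcu);
}

static void pspace_exit(void)
{
	/* wait for every queued pspace_free_rcu() to complete ... */
	rcu_barrier();
	/* ... so the cache is guaranteed empty when destroyed */
	kmem_cache_destroy(pspace_cachep);
}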
@@ -2984,27 +2971,25 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2984 + le32_to_cpu(es->s_first_data_block); 2971 + le32_to_cpu(es->s_first_data_block);
2985 2972
2986 len = ac->ac_b_ex.fe_len; 2973 len = ac->ac_b_ex.fe_len;
2987 if (in_range(ext4_block_bitmap(sb, gdp), block, len) || 2974 if (!ext4_data_block_valid(sbi, block, len)) {
2988 in_range(ext4_inode_bitmap(sb, gdp), block, len) ||
2989 in_range(block, ext4_inode_table(sb, gdp),
2990 EXT4_SB(sb)->s_itb_per_group) ||
2991 in_range(block + len - 1, ext4_inode_table(sb, gdp),
2992 EXT4_SB(sb)->s_itb_per_group)) {
2993 ext4_error(sb, __func__, 2975 ext4_error(sb, __func__,
2994 "Allocating block %llu in system zone of %d group\n", 2976 "Allocating blocks %llu-%llu which overlap "
2995 block, ac->ac_b_ex.fe_group); 2977 "fs metadata\n", block, block+len);
2996 /* File system mounted not to panic on error 2978 /* File system mounted not to panic on error
2997 * Fix the bitmap and repeat the block allocation 2979 * Fix the bitmap and repeat the block allocation
2998 * We leak some of the blocks here. 2980 * We leak some of the blocks here.
2999 */ 2981 */
3000 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), 2982 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3001 bitmap_bh->b_data, ac->ac_b_ex.fe_start, 2983 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
3002 ac->ac_b_ex.fe_len); 2984 ac->ac_b_ex.fe_len);
2985 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3003 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2986 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
3004 if (!err) 2987 if (!err)
3005 err = -EAGAIN; 2988 err = -EAGAIN;
3006 goto out_err; 2989 goto out_err;
3007 } 2990 }
2991
2992 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3008#ifdef AGGRESSIVE_CHECK 2993#ifdef AGGRESSIVE_CHECK
3009 { 2994 {
3010 int i; 2995 int i;
@@ -3014,9 +2999,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3014 } 2999 }
3015 } 3000 }
3016#endif 3001#endif
3017 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 3002 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len);
3018 mb_set_bits(NULL, bitmap_bh->b_data,
3019 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
3020 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 3003 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
3021 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 3004 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3022 ext4_free_blks_set(sb, gdp, 3005 ext4_free_blks_set(sb, gdp,
@@ -3026,7 +3009,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3026 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; 3009 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
3027 ext4_free_blks_set(sb, gdp, len); 3010 ext4_free_blks_set(sb, gdp, len);
3028 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 3011 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
3029 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 3012
3013 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3030 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 3014 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
3031 /* 3015 /*
3032 * Now reduce the dirty block count also. Should not go negative 3016 * Now reduce the dirty block count also. Should not go negative
@@ -3459,7 +3443,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3459 * the function goes through all blocks freed in the group 3443 * the function goes through all blocks freed in the group
3460 * but not yet committed and marks them used in the in-core bitmap. 3444 * but not yet committed and marks them used in the in-core bitmap.
3461 * buddy must be generated from this bitmap 3445 * buddy must be generated from this bitmap
3462 * Need to be called with ext4 group lock (ext4_lock_group) 3446 * Need to be called with the ext4 group lock held
3463 */ 3447 */
3464static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 3448static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3465 ext4_group_t group) 3449 ext4_group_t group)
@@ -3473,9 +3457,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3473 3457
3474 while (n) { 3458 while (n) {
3475 entry = rb_entry(n, struct ext4_free_data, node); 3459 entry = rb_entry(n, struct ext4_free_data, node);
3476 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), 3460 mb_set_bits(bitmap, entry->start_blk, entry->count);
3477 bitmap, entry->start_blk,
3478 entry->count);
3479 n = rb_next(n); 3461 n = rb_next(n);
3480 } 3462 }
3481 return; 3463 return;
@@ -3484,9 +3466,10 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3484/* 3466/*
3485 * the function goes through all preallocations in this group and marks them 3467 * the function goes through all preallocations in this group and marks them
3486 * used in the in-core bitmap. buddy must be generated from this bitmap 3468 * used in the in-core bitmap. buddy must be generated from this bitmap
3487 * Need to be called with ext4 group lock (ext4_lock_group) 3469 * Need to be called with the ext4 group lock held
3488 */ 3470 */
3489static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 3471static noinline_for_stack
3472void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3490 ext4_group_t group) 3473 ext4_group_t group)
3491{ 3474{
3492 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3475 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
@@ -3516,8 +3499,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3516 if (unlikely(len == 0)) 3499 if (unlikely(len == 0))
3517 continue; 3500 continue;
3518 BUG_ON(groupnr != group); 3501 BUG_ON(groupnr != group);
3519 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), 3502 mb_set_bits(bitmap, start, len);
3520 bitmap, start, len);
3521 preallocated += len; 3503 preallocated += len;
3522 count++; 3504 count++;
3523 } 3505 }
@@ -3658,10 +3640,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3658 3640
3659 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3641 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3660 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3642 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3661 trace_mark(ext4_mb_new_inode_pa, 3643 trace_ext4_mb_new_inode_pa(ac, pa);
3662 "dev %s ino %lu pstart %llu len %u lstart %u",
3663 sb->s_id, ac->ac_inode->i_ino,
3664 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3665 3644
3666 ext4_mb_use_inode_pa(ac, pa); 3645 ext4_mb_use_inode_pa(ac, pa);
3667 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3646 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3720,9 +3699,8 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3720 pa->pa_type = MB_GROUP_PA; 3699 pa->pa_type = MB_GROUP_PA;
3721 3700
3722 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3701 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3723 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3702 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3724 trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u", 3703 trace_ext4_mb_new_group_pa(ac, pa);
3725 sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3726 3704
3727 ext4_mb_use_group_pa(ac, pa); 3705 ext4_mb_use_group_pa(ac, pa);
3728 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3706 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3812,10 +3790,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3812 ext4_mb_store_history(ac); 3790 ext4_mb_store_history(ac);
3813 } 3791 }
3814 3792
3815 trace_mark(ext4_mb_release_inode_pa, 3793 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
3816 "dev %s ino %lu block %llu count %u", 3794 next - bit);
3817 sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
3818 next - bit);
3819 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3795 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3820 bit = next + 1; 3796 bit = next + 1;
3821 } 3797 }
@@ -3849,8 +3825,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3849 if (ac) 3825 if (ac)
3850 ac->ac_op = EXT4_MB_HISTORY_DISCARD; 3826 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3851 3827
3852 trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d", 3828 trace_ext4_mb_release_group_pa(ac, pa);
3853 sb->s_id, pa->pa_pstart, pa->pa_len);
3854 BUG_ON(pa->pa_deleted == 0); 3829 BUG_ON(pa->pa_deleted == 0);
3855 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3830 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3856 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3831 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3918,6 +3893,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3918 3893
3919 INIT_LIST_HEAD(&list); 3894 INIT_LIST_HEAD(&list);
3920 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 3895 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
3896 if (ac)
3897 ac->ac_sb = sb;
3921repeat: 3898repeat:
3922 ext4_lock_group(sb, group); 3899 ext4_lock_group(sb, group);
3923 list_for_each_entry_safe(pa, tmp, 3900 list_for_each_entry_safe(pa, tmp,
@@ -4016,12 +3993,15 @@ void ext4_discard_preallocations(struct inode *inode)
4016 } 3993 }
4017 3994
4018 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 3995 mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
4019 trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id, 3996 trace_ext4_discard_preallocations(inode);
4020 inode->i_ino);
4021 3997
4022 INIT_LIST_HEAD(&list); 3998 INIT_LIST_HEAD(&list);
4023 3999
4024 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4000 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4001 if (ac) {
4002 ac->ac_sb = sb;
4003 ac->ac_inode = inode;
4004 }
4025repeat: 4005repeat:
4026 /* first, collect all pa's in the inode */ 4006 /* first, collect all pa's in the inode */
4027 spin_lock(&ei->i_prealloc_lock); 4007 spin_lock(&ei->i_prealloc_lock);
@@ -4121,7 +4101,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
4121static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 4101static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4122{ 4102{
4123 struct super_block *sb = ac->ac_sb; 4103 struct super_block *sb = ac->ac_sb;
4124 ext4_group_t i; 4104 ext4_group_t ngroups, i;
4125 4105
4126 printk(KERN_ERR "EXT4-fs: Can't allocate:" 4106 printk(KERN_ERR "EXT4-fs: Can't allocate:"
4127 " Allocation context details:\n"); 4107 " Allocation context details:\n");
@@ -4145,7 +4125,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4145 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, 4125 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
4146 ac->ac_found); 4126 ac->ac_found);
4147 printk(KERN_ERR "EXT4-fs: groups: \n"); 4127 printk(KERN_ERR "EXT4-fs: groups: \n");
4148 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 4128 ngroups = ext4_get_groups_count(sb);
4129 for (i = 0; i < ngroups; i++) {
4149 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 4130 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4150 struct ext4_prealloc_space *pa; 4131 struct ext4_prealloc_space *pa;
4151 ext4_grpblk_t start; 4132 ext4_grpblk_t start;
@@ -4246,14 +4227,9 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4246 ext4_get_group_no_and_offset(sb, goal, &group, &block); 4227 ext4_get_group_no_and_offset(sb, goal, &group, &block);
4247 4228
4248 /* set up allocation goals */ 4229 /* set up allocation goals */
4230 memset(ac, 0, sizeof(struct ext4_allocation_context));
4249 ac->ac_b_ex.fe_logical = ar->logical; 4231 ac->ac_b_ex.fe_logical = ar->logical;
4250 ac->ac_b_ex.fe_group = 0;
4251 ac->ac_b_ex.fe_start = 0;
4252 ac->ac_b_ex.fe_len = 0;
4253 ac->ac_status = AC_STATUS_CONTINUE; 4232 ac->ac_status = AC_STATUS_CONTINUE;
4254 ac->ac_groups_scanned = 0;
4255 ac->ac_ex_scanned = 0;
4256 ac->ac_found = 0;
4257 ac->ac_sb = sb; 4233 ac->ac_sb = sb;
4258 ac->ac_inode = ar->inode; 4234 ac->ac_inode = ar->inode;
4259 ac->ac_o_ex.fe_logical = ar->logical; 4235 ac->ac_o_ex.fe_logical = ar->logical;
@@ -4264,15 +4240,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4264 ac->ac_g_ex.fe_group = group; 4240 ac->ac_g_ex.fe_group = group;
4265 ac->ac_g_ex.fe_start = block; 4241 ac->ac_g_ex.fe_start = block;
4266 ac->ac_g_ex.fe_len = len; 4242 ac->ac_g_ex.fe_len = len;
4267 ac->ac_f_ex.fe_len = 0;
4268 ac->ac_flags = ar->flags; 4243 ac->ac_flags = ar->flags;
4269 ac->ac_2order = 0;
4270 ac->ac_criteria = 0;
4271 ac->ac_pa = NULL;
4272 ac->ac_bitmap_page = NULL;
4273 ac->ac_buddy_page = NULL;
4274 ac->alloc_semp = NULL;
4275 ac->ac_lg = NULL;
4276 4244
4277 /* we have to define context: will we work with a file or 4245 /* we have to define context: will we work with a file or
4278 * locality group. this is a policy, actually */ 4246 * locality group. this is a policy, actually */
@@ -4304,6 +4272,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4304 4272
4305 INIT_LIST_HEAD(&discard_list); 4273 INIT_LIST_HEAD(&discard_list);
4306 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4274 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4275 if (ac)
4276 ac->ac_sb = sb;
4307 4277
4308 spin_lock(&lg->lg_prealloc_lock); 4278 spin_lock(&lg->lg_prealloc_lock);
4309 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 4279 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4469,13 +4439,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4469 4439
4470static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) 4440static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4471{ 4441{
4472 ext4_group_t i; 4442 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4473 int ret; 4443 int ret;
4474 int freed = 0; 4444 int freed = 0;
4475 4445
4476 trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", 4446 trace_ext4_mb_discard_preallocations(sb, needed);
4477 sb->s_id, needed); 4447 for (i = 0; i < ngroups && needed > 0; i++) {
4478 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
4479 ret = ext4_mb_discard_group_preallocations(sb, i, needed); 4448 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4480 freed += ret; 4449 freed += ret;
4481 needed -= ret; 4450 needed -= ret;
@@ -4503,17 +4472,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4503 sb = ar->inode->i_sb; 4472 sb = ar->inode->i_sb;
4504 sbi = EXT4_SB(sb); 4473 sbi = EXT4_SB(sb);
4505 4474
4506 trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu " 4475 trace_ext4_request_blocks(ar);
4507 "lblk %llu goal %llu lleft %llu lright %llu "
4508 "pleft %llu pright %llu ",
4509 sb->s_id, ar->flags, ar->len,
4510 ar->inode ? ar->inode->i_ino : 0,
4511 (unsigned long long) ar->logical,
4512 (unsigned long long) ar->goal,
4513 (unsigned long long) ar->lleft,
4514 (unsigned long long) ar->lright,
4515 (unsigned long long) ar->pleft,
4516 (unsigned long long) ar->pright);
4517 4476
4518 /* 4477 /*
4519 * For delayed allocation, we could skip the ENOSPC and 4478 * For delayed allocation, we could skip the ENOSPC and
@@ -4622,18 +4581,7 @@ out3:
4622 reserv_blks); 4581 reserv_blks);
4623 } 4582 }
4624 4583
4625 trace_mark(ext4_allocate_blocks, 4584 trace_ext4_allocate_blocks(ar, (unsigned long long)block);
4626 "dev %s block %llu flags %u len %u ino %lu "
4627 "logical %llu goal %llu lleft %llu lright %llu "
4628 "pleft %llu pright %llu ",
4629 sb->s_id, (unsigned long long) block,
4630 ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
4631 (unsigned long long) ar->logical,
4632 (unsigned long long) ar->goal,
4633 (unsigned long long) ar->lleft,
4634 (unsigned long long) ar->lright,
4635 (unsigned long long) ar->pleft,
4636 (unsigned long long) ar->pright);
4637 4585
4638 return block; 4586 return block;
4639} 4587}
@@ -4737,7 +4685,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4737 * Main entry point into mballoc to free blocks 4685 * Main entry point into mballoc to free blocks
4738 */ 4686 */
4739void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, 4687void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4740 unsigned long block, unsigned long count, 4688 ext4_fsblk_t block, unsigned long count,
4741 int metadata, unsigned long *freed) 4689 int metadata, unsigned long *freed)
4742{ 4690{
4743 struct buffer_head *bitmap_bh = NULL; 4691 struct buffer_head *bitmap_bh = NULL;
@@ -4763,15 +4711,12 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4763 block + count > ext4_blocks_count(es)) { 4711 block + count > ext4_blocks_count(es)) {
4764 ext4_error(sb, __func__, 4712 ext4_error(sb, __func__,
4765 "Freeing blocks not in datazone - " 4713 "Freeing blocks not in datazone - "
4766 "block = %lu, count = %lu", block, count); 4714 "block = %llu, count = %lu", block, count);
4767 goto error_return; 4715 goto error_return;
4768 } 4716 }
4769 4717
4770 ext4_debug("freeing block %lu\n", block); 4718 ext4_debug("freeing block %llu\n", block);
4771 trace_mark(ext4_free_blocks, 4719 trace_ext4_free_blocks(inode, block, count, metadata);
4772 "dev %s block %llu count %lu metadata %d ino %lu",
4773 sb->s_id, (unsigned long long) block, count, metadata,
4774 inode ? inode->i_ino : 0);
4775 4720
4776 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4721 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4777 if (ac) { 4722 if (ac) {
@@ -4812,7 +4757,7 @@ do_more:
4812 4757
4813 ext4_error(sb, __func__, 4758 ext4_error(sb, __func__,
4814 "Freeing blocks in system zone - " 4759 "Freeing blocks in system zone - "
4815 "Block = %lu, count = %lu", block, count); 4760 "Block = %llu, count = %lu", block, count);
4816 /* err = 0. ext4_std_error should be a no op */ 4761 /* err = 0. ext4_std_error should be a no op */
4817 goto error_return; 4762 goto error_return;
4818 } 4763 }
@@ -4859,29 +4804,25 @@ do_more:
4859 new_entry->group = block_group; 4804 new_entry->group = block_group;
4860 new_entry->count = count; 4805 new_entry->count = count;
4861 new_entry->t_tid = handle->h_transaction->t_tid; 4806 new_entry->t_tid = handle->h_transaction->t_tid;
4807
4862 ext4_lock_group(sb, block_group); 4808 ext4_lock_group(sb, block_group);
4863 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, 4809 mb_clear_bits(bitmap_bh->b_data, bit, count);
4864 bit, count);
4865 ext4_mb_free_metadata(handle, &e4b, new_entry); 4810 ext4_mb_free_metadata(handle, &e4b, new_entry);
4866 ext4_unlock_group(sb, block_group);
4867 } else { 4811 } else {
4868 ext4_lock_group(sb, block_group);
4869 /* need to update group_info->bb_free and bitmap 4812 /* need to update group_info->bb_free and bitmap
4870 * with group lock held. generate_buddy looks at 4813 * with group lock held. generate_buddy looks at
4871 * them with the group lock held 4814 * them with the group lock held
4872 */ 4815 */
4873 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, 4816 ext4_lock_group(sb, block_group);
4874 bit, count); 4817 mb_clear_bits(bitmap_bh->b_data, bit, count);
4875 mb_free_blocks(inode, &e4b, bit, count); 4818 mb_free_blocks(inode, &e4b, bit, count);
4876 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4819 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4877 ext4_unlock_group(sb, block_group);
4878 } 4820 }
4879 4821
4880 spin_lock(sb_bgl_lock(sbi, block_group));
4881 ret = ext4_free_blks_count(sb, gdp) + count; 4822 ret = ext4_free_blks_count(sb, gdp) + count;
4882 ext4_free_blks_set(sb, gdp, ret); 4823 ext4_free_blks_set(sb, gdp, ret);
4883 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4824 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4884 spin_unlock(sb_bgl_lock(sbi, block_group)); 4825 ext4_unlock_group(sb, block_group);
4885 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4826 percpu_counter_add(&sbi->s_freeblocks_counter, count);
4886 4827
4887 if (sbi->s_log_groups_per_flex) { 4828 if (sbi->s_log_groups_per_flex) {
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index dd9e6cd5f6cf..c96bb19f58f9 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -19,11 +19,9 @@
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/version.h> 20#include <linux/version.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/marker.h>
23#include <linux/mutex.h> 22#include <linux/mutex.h>
24#include "ext4_jbd2.h" 23#include "ext4_jbd2.h"
25#include "ext4.h" 24#include "ext4.h"
26#include "group.h"
27 25
28/* 26/*
29 * with AGGRESSIVE_CHECK allocator runs consistency checks over 27 * with AGGRESSIVE_CHECK allocator runs consistency checks over
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index fe64d9f79852..313a50b39741 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -458,6 +458,7 @@ int ext4_ext_migrate(struct inode *inode)
458 struct inode *tmp_inode = NULL; 458 struct inode *tmp_inode = NULL;
459 struct list_blocks_struct lb; 459 struct list_blocks_struct lb;
460 unsigned long max_entries; 460 unsigned long max_entries;
461 __u32 goal;
461 462
462 /* 463 /*
463 * If the filesystem does not support extents, or the inode 464 * If the filesystem does not support extents, or the inode
@@ -483,9 +484,10 @@ int ext4_ext_migrate(struct inode *inode)
483 retval = PTR_ERR(handle); 484 retval = PTR_ERR(handle);
484 return retval; 485 return retval;
485 } 486 }
486 tmp_inode = ext4_new_inode(handle, 487 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
487 inode->i_sb->s_root->d_inode, 488 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
488 S_IFREG); 489 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
490 S_IFREG, 0, goal);
489 if (IS_ERR(tmp_inode)) { 491 if (IS_ERR(tmp_inode)) {
490 retval = -ENOMEM; 492 retval = -ENOMEM;
491 ext4_journal_stop(handle); 493 ext4_journal_stop(handle);
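A concrete instance of the goal arithmetic above (values assumed for illustration): with EXT4_INODES_PER_GROUP(inode->i_sb) == 8192 and inode->i_ino == 20000, goal = ((20000 - 1) / 8192) * 8192 + 1 = 2 * 8192 + 1 = 16385, the first inode number of block group 2. Passing that goal to ext4_new_inode() steers the temporary inode into the same block group as the inode being migrated.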
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
new file mode 100644
index 000000000000..bbf2dd9404dc
--- /dev/null
+++ b/fs/ext4/move_extent.c
@@ -0,0 +1,1320 @@
1/*
2 * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
3 * Written by Takashi Sato <t-sato@yk.jp.nec.com>
4 * Akira Fujita <a-fujita@rs.jp.nec.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of version 2.1 of the GNU Lesser General Public License
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 */
15
16#include <linux/fs.h>
17#include <linux/quotaops.h>
18#include "ext4_jbd2.h"
19#include "ext4_extents.h"
20#include "ext4.h"
21
22#define get_ext_path(path, inode, block, ret) \
23 do { \
24 path = ext4_ext_find_extent(inode, block, path); \
25 if (IS_ERR(path)) { \
26 ret = PTR_ERR(path); \
27 path = NULL; \
28 } \
29 } while (0)
30
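A hedged usage sketch of the macro above (example_lookup is a hypothetical caller): @path doubles as input and output, so callers start it at NULL, and @ret is only meaningful when the lookup fails:

static int example_lookup(struct inode *inode, ext4_lblk_t block)
{
	struct ext4_ext_path *path = NULL;
	int ret = 0;

	get_ext_path(path, inode, block, ret);
	if (path == NULL)
		return ret;	/* ext4_ext_find_extent() returned ERR_PTR */

	/* ... use path[path->p_depth].p_ext here ... */

	ext4_ext_drop_refs(path);
	kfree(path);
	return 0;
}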
31/**
32 * copy_extent_status - Copy the extent's initialization status
33 *
 34 * @src: an extent whose initialization status is read
 35 * @dest: an extent whose status is set
36 */
37static void
38copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
39{
40 if (ext4_ext_is_uninitialized(src))
41 ext4_ext_mark_uninitialized(dest);
42 else
43 dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
44}
45
46/**
47 * mext_next_extent - Search for the next extent and set it to "extent"
48 *
49 * @inode: inode which is searched
50 * @path: this will obtain data for the next extent
 51 * @extent: pointer to the next extent just found
52 *
 53 * Search for the next extent in the array of ext4_ext_path structures (@path)
 54 * and store it in the ext4_extent structure (@extent). In addition, the member
 55 * of @path (->p_ext) also points to the next extent. Return 0 on success, 1 if
 56 * the ext4_ext_path structure refers to the last extent, or a negative error
57 * value on failure.
58 */
59static int
60mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
61 struct ext4_extent **extent)
62{
63 int ppos, leaf_ppos = path->p_depth;
64
65 ppos = leaf_ppos;
66 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
67 /* leaf block */
68 *extent = ++path[ppos].p_ext;
69 return 0;
70 }
71
72 while (--ppos >= 0) {
73 if (EXT_LAST_INDEX(path[ppos].p_hdr) >
74 path[ppos].p_idx) {
75 int cur_ppos = ppos;
76
77 /* index block */
78 path[ppos].p_idx++;
79 path[ppos].p_block = idx_pblock(path[ppos].p_idx);
80 if (path[ppos+1].p_bh)
81 brelse(path[ppos+1].p_bh);
82 path[ppos+1].p_bh =
83 sb_bread(inode->i_sb, path[ppos].p_block);
84 if (!path[ppos+1].p_bh)
85 return -EIO;
86 path[ppos+1].p_hdr =
87 ext_block_hdr(path[ppos+1].p_bh);
88
89 /* Halfway index block */
90 while (++cur_ppos < leaf_ppos) {
91 path[cur_ppos].p_idx =
92 EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
93 path[cur_ppos].p_block =
94 idx_pblock(path[cur_ppos].p_idx);
95 if (path[cur_ppos+1].p_bh)
96 brelse(path[cur_ppos+1].p_bh);
97 path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
98 path[cur_ppos].p_block);
99 if (!path[cur_ppos+1].p_bh)
100 return -EIO;
101 path[cur_ppos+1].p_hdr =
102 ext_block_hdr(path[cur_ppos+1].p_bh);
103 }
104
105 /* leaf block */
106 path[leaf_ppos].p_ext = *extent =
107 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
108 return 0;
109 }
110 }
111 /* We found the last extent */
112 return 1;
113}
114
115/**
116 * mext_double_down_read - Acquire two inodes' read semaphore
117 *
118 * @orig_inode: original inode structure
119 * @donor_inode: donor inode structure
120 * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
121 */
122static void
123mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
124{
125 struct inode *first = orig_inode, *second = donor_inode;
126
127 BUG_ON(orig_inode == NULL || donor_inode == NULL);
128
129 /*
130 * Use the inode number to provide the stable locking order instead
131 * of its address, because the C language doesn't guarantee you can
132 * compare pointers that don't come from the same array.
133 */
134 if (donor_inode->i_ino < orig_inode->i_ino) {
135 first = donor_inode;
136 second = orig_inode;
137 }
138
139 down_read(&EXT4_I(first)->i_data_sem);
140 down_read(&EXT4_I(second)->i_data_sem);
141}
142
143/**
144 * mext_double_down_write - Acquire two inodes' write semaphore
145 *
146 * @orig_inode: original inode structure
147 * @donor_inode: donor inode structure
148 * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
149 */
150static void
151mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
152{
153 struct inode *first = orig_inode, *second = donor_inode;
154
155 BUG_ON(orig_inode == NULL || donor_inode == NULL);
156
157 /*
158 * Use the inode number to provide the stable locking order instead
159 * of its address, because the C language doesn't guarantee you can
160 * compare pointers that don't come from the same array.
161 */
162 if (donor_inode->i_ino < orig_inode->i_ino) {
163 first = donor_inode;
164 second = orig_inode;
165 }
166
167 down_write(&EXT4_I(first)->i_data_sem);
168 down_write(&EXT4_I(second)->i_data_sem);
169}
170
171/**
172 * mext_double_up_read - Release two inodes' read semaphore
173 *
174 * @orig_inode: original inode structure whose lock is released first
175 * @donor_inode: donor inode structure whose lock is released second
176 * Release read semaphore of two inodes (orig and donor).
177 */
178static void
179mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
180{
181 BUG_ON(orig_inode == NULL || donor_inode == NULL);
182
183 up_read(&EXT4_I(orig_inode)->i_data_sem);
184 up_read(&EXT4_I(donor_inode)->i_data_sem);
185}
186
187/**
188 * mext_double_up_write - Release two inodes' write semaphore
189 *
190 * @orig_inode: original inode structure whose lock is released first
191 * @donor_inode: donor inode structure whose lock is released second
192 * Release write semaphore of two inodes (orig and donor).
193 */
194static void
195mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
196{
197 BUG_ON(orig_inode == NULL || donor_inode == NULL);
198
199 up_write(&EXT4_I(orig_inode)->i_data_sem);
200 up_write(&EXT4_I(donor_inode)->i_data_sem);
201}
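
The four mext_double_* helpers above all rely on the same deadlock-avoidance
rule: when two locks must be taken, take them in a stable order derived from
a key (here i_ino) rather than from pointer values. A minimal userspace
sketch of the discipline, with pthread mutexes standing in for i_data_sem
(hypothetical types and values):

    #include <pthread.h>

    struct obj {
        unsigned long id;         /* stands in for inode->i_ino */
        pthread_mutex_t lock;     /* stands in for i_data_sem */
    };

    static void double_lock(struct obj *a, struct obj *b)
    {
        struct obj *first = a, *second = b;

        /* Order by the stable key, never by pointer comparison. */
        if (b->id < a->id) {
            first = b;
            second = a;
        }
        pthread_mutex_lock(&first->lock);
        pthread_mutex_lock(&second->lock);
    }

    static void double_unlock(struct obj *a, struct obj *b)
    {
        pthread_mutex_unlock(&a->lock);
        pthread_mutex_unlock(&b->lock);
    }

    int main(void)
    {
        struct obj a = { 2, PTHREAD_MUTEX_INITIALIZER };
        struct obj b = { 7, PTHREAD_MUTEX_INITIALIZER };

        /* Locks the pair in the same order as double_lock(&b, &a) would,
         * so two tasks locking (a, b) and (b, a) cannot deadlock. */
        double_lock(&a, &b);
        double_unlock(&a, &b);
        return 0;
    }
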
202
203/**
204 * mext_insert_across_blocks - Insert extents across leaf block
205 *
206 * @handle: journal handle
207 * @orig_inode: original inode
208 * @o_start: first original extent to be changed
209 * @o_end: last original extent to be changed
210 * @start_ext: first new extent to be inserted
211 * @new_ext: middle of new extent to be inserted
212 * @end_ext: last new extent to be inserted
213 *
214 * Allocate a new leaf block and insert extents into it. Return 0 on success,
215 * or a negative error value on failure.
216 */
217static int
218mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
219 struct ext4_extent *o_start, struct ext4_extent *o_end,
220 struct ext4_extent *start_ext, struct ext4_extent *new_ext,
221 struct ext4_extent *end_ext)
222{
223 struct ext4_ext_path *orig_path = NULL;
224 ext4_lblk_t eblock = 0;
225 int new_flag = 0;
226 int end_flag = 0;
227 int err = 0;
228
229 if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) {
230 if (o_start == o_end) {
231
232 /* start_ext new_ext end_ext
233 * donor |---------|-----------|--------|
234 * orig |------------------------------|
235 */
236 end_flag = 1;
237 } else {
238
239 /* start_ext new_ext end_ext
240 * donor |---------|----------|---------|
241 * orig |---------------|--------------|
242 */
243 o_end->ee_block = end_ext->ee_block;
244 o_end->ee_len = end_ext->ee_len;
245 ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
246 }
247
248 o_start->ee_len = start_ext->ee_len;
249 new_flag = 1;
250
251 } else if (start_ext->ee_len && new_ext->ee_len &&
252 !end_ext->ee_len && o_start == o_end) {
253
254 /* start_ext new_ext
255 * donor |--------------|---------------|
256 * orig |------------------------------|
257 */
258 o_start->ee_len = start_ext->ee_len;
259 new_flag = 1;
260
261 } else if (!start_ext->ee_len && new_ext->ee_len &&
262 end_ext->ee_len && o_start == o_end) {
263
264 /* new_ext end_ext
265 * donor |--------------|---------------|
266 * orig |------------------------------|
267 */
268 o_end->ee_block = end_ext->ee_block;
269 o_end->ee_len = end_ext->ee_len;
270 ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
271
272 /*
273 * Leave eblock at 0 if new_ext starts
274 * at the first block of the file.
275 */
276 if (new_ext->ee_block)
277 eblock = le32_to_cpu(new_ext->ee_block);
278
279 new_flag = 1;
280 } else {
281 ext4_debug("ext4 move extent: Unexpected insert case\n");
282 return -EIO;
283 }
284
285 if (new_flag) {
286 get_ext_path(orig_path, orig_inode, eblock, err);
287 if (orig_path == NULL)
288 goto out;
289
290 if (ext4_ext_insert_extent(handle, orig_inode,
291 orig_path, new_ext))
292 goto out;
293 }
294
295 if (end_flag) {
296 get_ext_path(orig_path, orig_inode,
297 le32_to_cpu(end_ext->ee_block) - 1, err);
298 if (orig_path == NULL)
299 goto out;
300
301 if (ext4_ext_insert_extent(handle, orig_inode,
302 orig_path, end_ext))
303 goto out;
304 }
305out:
306 if (orig_path) {
307 ext4_ext_drop_refs(orig_path);
308 kfree(orig_path);
309 }
310
311 return err;
312
313}
314
315/**
316 * mext_insert_inside_block - Insert new extent to the extent block
317 *
318 * @o_start: first original extent to be moved
319 * @o_end: last original extent to be moved
320 * @start_ext: first new extent to be inserted
321 * @new_ext: middle of new extent to be inserted
322 * @end_ext: last new extent to be inserted
323 * @eh: extent header of target leaf block
324 * @range_to_move: used to decide how to insert extent
325 *
326 * Insert extents into the leaf block. The extent (@o_start) is overwritten
327 * by inserted extents.
328 */
329static void
330mext_insert_inside_block(struct ext4_extent *o_start,
331 struct ext4_extent *o_end,
332 struct ext4_extent *start_ext,
333 struct ext4_extent *new_ext,
334 struct ext4_extent *end_ext,
335 struct ext4_extent_header *eh,
336 int range_to_move)
337{
338 int i = 0;
339 unsigned long len;
340
341 /* Move the existing extents */
342 if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
343 len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
344 (unsigned long)(o_end + 1);
345 memmove(o_end + 1 + range_to_move, o_end + 1, len);
346 }
347
348 /* Insert start entry */
349 if (start_ext->ee_len)
350 o_start[i++].ee_len = start_ext->ee_len;
351
352 /* Insert new entry */
353 if (new_ext->ee_len) {
354 o_start[i] = *new_ext;
355 ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext));
356 }
357
358 /* Insert end entry */
359 if (end_ext->ee_len)
360 o_start[i] = *end_ext;
361
362 /* Increment the total entries counter on the extent block */
363 le16_add_cpu(&eh->eh_entries, range_to_move);
364}
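
mext_insert_inside_block() opens a gap in the in-block extent array with a
single memmove() of the tail, then writes the new entries into the gap. The
same pattern in isolation, on a plain int array with hypothetical values:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        int a[8] = {10, 20, 30, 40};
        int used = 4, pos = 1, need = 2;   /* insert two entries at slot 1 */

        /* Shift a[pos..used-1] right by `need` slots (capacity assumed). */
        memmove(&a[pos + need], &a[pos], (used - pos) * sizeof(a[0]));
        a[pos] = 11;
        a[pos + 1] = 12;
        used += need;

        for (int i = 0; i < used; i++)
            printf("%d ", a[i]);           /* 10 11 12 20 30 40 */
        printf("\n");
        return 0;
    }
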
365
366/**
367 * mext_insert_extents - Insert new extent
368 *
369 * @handle: journal handle
370 * @orig_inode: original inode
371 * @orig_path: path indicates first extent to be changed
372 * @o_start: first original extent to be changed
373 * @o_end: last original extent to be changed
374 * @start_ext: first new extent to be inserted
375 * @new_ext: middle of new extent to be inserted
376 * @end_ext: last new extent to be inserted
377 *
378 * Call the function to insert extents. If we cannot add more extents into
379 * the leaf block, we call mext_insert_across_blocks() to create a
380 * new leaf block. Otherwise call mext_insert_inside_block(). Return 0
381 * on success, or a negative error value on failure.
382 */
383static int
384mext_insert_extents(handle_t *handle, struct inode *orig_inode,
385 struct ext4_ext_path *orig_path,
386 struct ext4_extent *o_start,
387 struct ext4_extent *o_end,
388 struct ext4_extent *start_ext,
389 struct ext4_extent *new_ext,
390 struct ext4_extent *end_ext)
391{
392 struct ext4_extent_header *eh;
393 unsigned long need_slots, slots_range;
394 int range_to_move, depth, ret;
395
396 /*
397 * The extents to be inserted are:
398 * start_extent + new_extent + end_extent.
399 */
400 need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) +
401 (new_ext->ee_len ? 1 : 0);
402
403 /* The number of slots between start and end */
404 slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
405 / sizeof(struct ext4_extent);
406
407 /* Range to move the end of extent */
408 range_to_move = need_slots - slots_range;
409 depth = orig_path->p_depth;
410 orig_path += depth;
411 eh = orig_path->p_hdr;
412
413 if (depth) {
414 /* Register to journal */
415 ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
416 if (ret)
417 return ret;
418 }
419
420 /* Expansion */
421 if (range_to_move > 0 &&
422 (range_to_move > le16_to_cpu(eh->eh_max)
423 - le16_to_cpu(eh->eh_entries))) {
424
425 ret = mext_insert_across_blocks(handle, orig_inode, o_start,
426 o_end, start_ext, new_ext, end_ext);
427 if (ret < 0)
428 return ret;
429 } else
430 mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
431 end_ext, eh, range_to_move);
432
433 if (depth) {
434 ret = ext4_handle_dirty_metadata(handle, orig_inode,
435 orig_path->p_bh);
436 if (ret)
437 return ret;
438 } else {
439 ret = ext4_mark_inode_dirty(handle, orig_inode);
440 if (ret < 0)
441 return ret;
442 }
443
444 return 0;
445}
446
447/**
448 * mext_leaf_block - Move one leaf extent block into the inode.
449 *
450 * @handle: journal handle
451 * @orig_inode: original inode
452 * @orig_path: path indicates first extent to be changed
453 * @dext: donor extent
454 * @from: start offset on the target file
455 *
456 * In order to insert extents into the leaf block, we must divide the extent
457 * in the leaf block into three extents: one covers the range where the new
458 * extents are inserted, and the other two lie on either side of it.
459 *
460 * Therefore, this function creates structures to save extents of the leaf
461 * block, and inserts extents by calling mext_insert_extents() with
462 * created extents. Return 0 on success, or a negative error value on failure.
463 */
464static int
465mext_leaf_block(handle_t *handle, struct inode *orig_inode,
466 struct ext4_ext_path *orig_path, struct ext4_extent *dext,
467 ext4_lblk_t *from)
468{
469 struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
470 struct ext4_extent new_ext, start_ext, end_ext;
471 ext4_lblk_t new_ext_end;
472 ext4_fsblk_t new_phys_end;
473 int oext_alen, new_ext_alen, end_ext_alen;
474 int depth = ext_depth(orig_inode);
475 int ret;
476
477 o_start = o_end = oext = orig_path[depth].p_ext;
478 oext_alen = ext4_ext_get_actual_len(oext);
479 start_ext.ee_len = end_ext.ee_len = 0;
480
481 new_ext.ee_block = cpu_to_le32(*from);
482 ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
483 new_ext.ee_len = dext->ee_len;
484 new_ext_alen = ext4_ext_get_actual_len(&new_ext);
485 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
486 new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
487
488 /*
489 * Case: original extent is first
490 * oext |--------|
491 * new_ext |--|
492 * start_ext |--|
493 */
494 if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
495 le32_to_cpu(new_ext.ee_block) <
496 le32_to_cpu(oext->ee_block) + oext_alen) {
497 start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
498 le32_to_cpu(oext->ee_block));
499 copy_extent_status(oext, &start_ext);
500 } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
501 prev_ext = oext - 1;
502 /*
503 * We can merge new_ext into previous extent,
504 * if these are contiguous and same extent type.
505 */
506 if (ext4_can_extents_be_merged(orig_inode, prev_ext,
507 &new_ext)) {
508 o_start = prev_ext;
509 start_ext.ee_len = cpu_to_le16(
510 ext4_ext_get_actual_len(prev_ext) +
511 new_ext_alen);
512 copy_extent_status(prev_ext, &start_ext);
513 new_ext.ee_len = 0;
514 }
515 }
516
517 /*
518 * Case: new_ext_end must be less than oext
519 * oext |-----------|
520 * new_ext |-------|
521 */
522 BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
523
524 /*
525 * Case: new_ext is smaller than original extent
526 * oext |---------------|
527 * new_ext |-----------|
528 * end_ext |---|
529 */
530 if (le32_to_cpu(oext->ee_block) <= new_ext_end &&
531 new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) {
532 end_ext.ee_len =
533 cpu_to_le16(le32_to_cpu(oext->ee_block) +
534 oext_alen - 1 - new_ext_end);
535 copy_extent_status(oext, &end_ext);
536 end_ext_alen = ext4_ext_get_actual_len(&end_ext);
537 ext4_ext_store_pblock(&end_ext,
538 (ext_pblock(o_end) + oext_alen - end_ext_alen));
539 end_ext.ee_block =
540 cpu_to_le32(le32_to_cpu(o_end->ee_block) +
541 oext_alen - end_ext_alen);
542 }
543
544 ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
545 o_end, &start_ext, &new_ext, &end_ext);
546 return ret;
547}
548
549/**
550 * mext_calc_swap_extents - Calculate extents for extent swapping.
551 *
552 * @tmp_dext: the extent that will belong to the original inode
553 * @tmp_oext: the extent that will belong to the donor inode
554 * @orig_off: block offset of original inode
555 * @donor_off: block offset of donor inode
556 * @max_count: the maximum length of the extents
557 */
558static void
559mext_calc_swap_extents(struct ext4_extent *tmp_dext,
560 struct ext4_extent *tmp_oext,
561 ext4_lblk_t orig_off, ext4_lblk_t donor_off,
562 ext4_lblk_t max_count)
563{
564 ext4_lblk_t diff, orig_diff;
565 struct ext4_extent dext_old, oext_old;
566
567 dext_old = *tmp_dext;
568 oext_old = *tmp_oext;
569
570 /* When tmp_dext is too large, pick up the target range. */
571 diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
572
573 ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff);
574 tmp_dext->ee_block =
575 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
576 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
577
578 if (max_count < ext4_ext_get_actual_len(tmp_dext))
579 tmp_dext->ee_len = cpu_to_le16(max_count);
580
581 orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
582 ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff);
583
584 /* Adjust extent length if donor extent is larger than orig */
585 if (ext4_ext_get_actual_len(tmp_dext) >
586 ext4_ext_get_actual_len(tmp_oext) - orig_diff)
587 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
588 orig_diff);
589
590 tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
591
592 copy_extent_status(&oext_old, tmp_dext);
593 copy_extent_status(&dext_old, tmp_oext);
594}
595
596/**
597 * mext_replace_branches - Replace original extents with new extents
598 *
599 * @handle: journal handle
600 * @orig_inode: original inode
601 * @donor_inode: donor inode
602 * @from: block offset of orig_inode
603 * @count: block count to be replaced
604 *
605 * Replace original inode extents and donor inode extents page by page.
606 * We implement this replacement in the following three steps:
607 * 1. Save the block information of original and donor inodes into
608 * dummy extents.
609 * 2. Change the block information of original inode to point at the
610 * donor inode blocks.
611 * 3. Change the block information of donor inode to point at the saved
612 * original inode blocks in the dummy extents.
613 *
614 * Return 0 on success, or a negative error value on failure.
615 */
616static int
617mext_replace_branches(handle_t *handle, struct inode *orig_inode,
618 struct inode *donor_inode, ext4_lblk_t from,
619 ext4_lblk_t count)
620{
621 struct ext4_ext_path *orig_path = NULL;
622 struct ext4_ext_path *donor_path = NULL;
623 struct ext4_extent *oext, *dext;
624 struct ext4_extent tmp_dext, tmp_oext;
625 ext4_lblk_t orig_off = from, donor_off = from;
626 int err = 0;
627 int depth;
628 int replaced_count = 0;
629 int dext_alen;
630
631 mext_double_down_write(orig_inode, donor_inode);
632
633 /* Get the original extent for the block "orig_off" */
634 get_ext_path(orig_path, orig_inode, orig_off, err);
635 if (orig_path == NULL)
636 goto out;
637
638 /* Get the donor extent for the head */
639 get_ext_path(donor_path, donor_inode, donor_off, err);
640 if (donor_path == NULL)
641 goto out;
642 depth = ext_depth(orig_inode);
643 oext = orig_path[depth].p_ext;
644 tmp_oext = *oext;
645
646 depth = ext_depth(donor_inode);
647 dext = donor_path[depth].p_ext;
648 tmp_dext = *dext;
649
650 mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
651 donor_off, count);
652
653 /* Loop for the donor extents */
654 while (1) {
655 /* The extent for donor must be found. */
656 BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
657
658 /* Set donor extent to orig extent */
659 err = mext_leaf_block(handle, orig_inode,
660 orig_path, &tmp_dext, &orig_off);
661 if (err < 0)
662 goto out;
663
664 /* Set orig extent to donor extent */
665 err = mext_leaf_block(handle, donor_inode,
666 donor_path, &tmp_oext, &donor_off);
667 if (err < 0)
668 goto out;
669
670 dext_alen = ext4_ext_get_actual_len(&tmp_dext);
671 replaced_count += dext_alen;
672 donor_off += dext_alen;
673 orig_off += dext_alen;
674
675 /* Already moved the expected blocks */
676 if (replaced_count >= count)
677 break;
678
679 if (orig_path)
680 ext4_ext_drop_refs(orig_path);
681 get_ext_path(orig_path, orig_inode, orig_off, err);
682 if (orig_path == NULL)
683 goto out;
684 depth = ext_depth(orig_inode);
685 oext = orig_path[depth].p_ext;
686 if (le32_to_cpu(oext->ee_block) +
687 ext4_ext_get_actual_len(oext) <= orig_off) {
688 err = 0;
689 goto out;
690 }
691 tmp_oext = *oext;
692
693 if (donor_path)
694 ext4_ext_drop_refs(donor_path);
695 get_ext_path(donor_path, donor_inode,
696 donor_off, err);
697 if (donor_path == NULL)
698 goto out;
699 depth = ext_depth(donor_inode);
700 dext = donor_path[depth].p_ext;
701 if (le32_to_cpu(dext->ee_block) +
702 ext4_ext_get_actual_len(dext) <= donor_off) {
703 err = 0;
704 goto out;
705 }
706 tmp_dext = *dext;
707
708 mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
709 donor_off,
710 count - replaced_count);
711 }
712
713out:
714 if (orig_path) {
715 ext4_ext_drop_refs(orig_path);
716 kfree(orig_path);
717 }
718 if (donor_path) {
719 ext4_ext_drop_refs(donor_path);
720 kfree(donor_path);
721 }
722
723 mext_double_up_write(orig_inode, donor_inode);
724 return err;
725}
726
727/**
728 * move_extent_per_page - Move extent data per page
729 *
730 * @o_filp: file structure of original file
731 * @donor_inode: donor inode
732 * @orig_page_offset: page index on original file
733 * @data_offset_in_page: block index where data swapping starts
734 * @block_len_in_page: the number of blocks to be swapped
735 * @uninit: whether the orig extent is uninitialized
736 *
737 * Save the data in original inode blocks and replace original inode extents
738 * with donor inode extents by calling mext_replace_branches().
739 * Finally, write out the saved data in new original inode blocks. Return 0
740 * on success, or a negative error value on failure.
741 */
742static int
743move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
744 pgoff_t orig_page_offset, int data_offset_in_page,
745 int block_len_in_page, int uninit)
746{
747 struct inode *orig_inode = o_filp->f_dentry->d_inode;
748 struct address_space *mapping = orig_inode->i_mapping;
749 struct buffer_head *bh;
750 struct page *page = NULL;
751 const struct address_space_operations *a_ops = mapping->a_ops;
752 handle_t *handle;
753 ext4_lblk_t orig_blk_offset;
754 long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
755 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
756 unsigned int w_flags = 0;
757 unsigned int tmp_data_len, data_len;
758 void *fsdata;
759 int ret, i, jblocks;
760 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
761
762 /*
763 * We need twice the number of ordinary journal buffers because
764 * orig_inode and donor_inode may each modify different metadata blocks.
765 */
766 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
767 handle = ext4_journal_start(orig_inode, jblocks);
768 if (IS_ERR(handle)) {
769 ret = PTR_ERR(handle);
770 return ret;
771 }
772
773 if (segment_eq(get_fs(), KERNEL_DS))
774 w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
775
776 orig_blk_offset = orig_page_offset * blocks_per_page +
777 data_offset_in_page;
778
779 /*
780 * If the orig extent is uninitialized,
781 * there is no need to force the page into memory
782 * and then force it to be written out again.
783 * Just swap data blocks between orig and donor.
784 */
785 if (uninit) {
786 ret = mext_replace_branches(handle, orig_inode,
787 donor_inode, orig_blk_offset,
788 block_len_in_page);
789
790 /* Clear the inode cache not to refer to the old data */
791 ext4_ext_invalidate_cache(orig_inode);
792 ext4_ext_invalidate_cache(donor_inode);
793 goto out2;
794 }
795
796 offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
797
798 /* Calculate data_len */
799 if ((orig_blk_offset + block_len_in_page - 1) ==
800 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
801 /* Replace the last block */
802 tmp_data_len = orig_inode->i_size & (blocksize - 1);
803 /*
804 * If tmp_data_len is zero, the file size is a multiple of the
805 * blocksize, so use one full block instead.
806 */
807 if (tmp_data_len == 0)
808 tmp_data_len = blocksize;
809
810 data_len = tmp_data_len +
811 ((block_len_in_page - 1) << orig_inode->i_blkbits);
812 } else {
813 data_len = block_len_in_page << orig_inode->i_blkbits;
814 }
815
816 ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
817 &page, &fsdata);
818 if (unlikely(ret < 0))
819 goto out;
820
821 if (!PageUptodate(page)) {
822 mapping->a_ops->readpage(o_filp, page);
823 lock_page(page);
824 }
825
826 /*
827 * try_to_release_page() doesn't call releasepage in writeback mode.
828 * We also need to preserve the write ordering when multiple
829 * move-extent processes write to the same file, so call
830 * wait_on_page_writeback() to wait until the page's
831 * writeback completes.
832 */
833 if (PageWriteback(page))
834 wait_on_page_writeback(page);
835
836 /* Release old bh and drop refs */
837 try_to_release_page(page, 0);
838
839 ret = mext_replace_branches(handle, orig_inode, donor_inode,
840 orig_blk_offset, block_len_in_page);
841 if (ret < 0)
842 goto out;
843
844 /* Clear the inode cache not to refer to the old data */
845 ext4_ext_invalidate_cache(orig_inode);
846 ext4_ext_invalidate_cache(donor_inode);
847
848 if (!page_has_buffers(page))
849 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
850
851 bh = page_buffers(page);
852 for (i = 0; i < data_offset_in_page; i++)
853 bh = bh->b_this_page;
854
855 for (i = 0; i < block_len_in_page; i++) {
856 ret = ext4_get_block(orig_inode,
857 (sector_t)(orig_blk_offset + i), bh, 0);
858 if (ret < 0)
859 goto out;
860
861 if (bh->b_this_page != NULL)
862 bh = bh->b_this_page;
863 }
864
865 ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len,
866 page, fsdata);
867 page = NULL;
868
869out:
870 if (unlikely(page)) {
871 if (PageLocked(page))
872 unlock_page(page);
873 page_cache_release(page);
874 }
875out2:
876 ext4_journal_stop(handle);
877
878 return ret < 0 ? ret : 0;
879}
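
The page/block arithmetic used by move_extent_per_page() converts a page
index plus an in-page block index into a file-relative block number and byte
offset. A standalone sketch with hypothetical sizes (4 KiB pages, 1 KiB
blocks):

    #include <stdio.h>

    int main(void)
    {
        unsigned int page_shift = 12;   /* PAGE_CACHE_SHIFT */
        unsigned int blkbits = 10;      /* inode->i_blkbits */
        unsigned int blocks_per_page = 1u << (page_shift - blkbits); /* 4 */

        unsigned long orig_page_offset = 3;  /* fourth page of the file */
        int data_offset_in_page = 2;         /* third block in that page */

        unsigned long blk = orig_page_offset * blocks_per_page +
                            data_offset_in_page;        /* block 14 */
        long long offs = (long long)blk << blkbits;     /* byte 14336 */

        printf("block %lu, byte offset %lld\n", blk, offs);
        return 0;
    }
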
880
881/**
882 * mext_check_arguments - Check whether move extent can be done
883 *
884 * @orig_inode: original inode
885 * @donor_inode: donor inode
886 * @orig_start: logical start offset in block for orig
887 * @donor_start: logical start offset in block for donor
888 * @len: the number of blocks to be moved
889 * @moved_len: moved block length
890 *
891 * Check the arguments of ext4_move_extents() to see whether the files
892 * can be exchanged with each other.
893 * Return 0 on success, or a negative error value on failure.
894 */
895static int
896mext_check_arguments(struct inode *orig_inode,
897 struct inode *donor_inode, __u64 orig_start,
898 __u64 donor_start, __u64 *len, __u64 moved_len)
899{
900 /* Regular file check */
901 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
902 ext4_debug("ext4 move extent: The argument files should be "
903 "regular file [ino:orig %lu, donor %lu]\n",
904 orig_inode->i_ino, donor_inode->i_ino);
905 return -EINVAL;
906 }
907
908 /* Ext4 move extent does not support swapfile */
909 if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
910 ext4_debug("ext4 move extent: The argument files should "
911 "not be swapfile [ino:orig %lu, donor %lu]\n",
912 orig_inode->i_ino, donor_inode->i_ino);
913 return -EINVAL;
914 }
915
916 /* Files should be in the same ext4 FS */
917 if (orig_inode->i_sb != donor_inode->i_sb) {
918 ext4_debug("ext4 move extent: The argument files "
919 "should be in same FS [ino:orig %lu, donor %lu]\n",
920 orig_inode->i_ino, donor_inode->i_ino);
921 return -EINVAL;
922 }
923
924 /* orig and donor should be different files */
925 if (orig_inode->i_ino == donor_inode->i_ino) {
926 ext4_debug("ext4 move extent: The argument files should not "
927 "be same file [ino:orig %lu, donor %lu]\n",
928 orig_inode->i_ino, donor_inode->i_ino);
929 return -EINVAL;
930 }
931
932 /* Ext4 move extent supports only extent based file */
933 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
934 ext4_debug("ext4 move extent: orig file is not extents "
935 "based file [ino:orig %lu]\n", orig_inode->i_ino);
936 return -EOPNOTSUPP;
937 } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
938 ext4_debug("ext4 move extent: donor file is not extents "
939 "based file [ino:donor %lu]\n", donor_inode->i_ino);
940 return -EOPNOTSUPP;
941 }
942
943 if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
944 ext4_debug("ext4 move extent: File size is 0 byte\n");
945 return -EINVAL;
946 }
947
948 /* Start offsets should be the same */
949 if (orig_start != donor_start) {
950 ext4_debug("ext4 move extent: orig and donor's start "
951 "offset are not same [ino:orig %lu, donor %lu]\n",
952 orig_inode->i_ino, donor_inode->i_ino);
953 return -EINVAL;
954 }
955
956 if (moved_len) {
957 ext4_debug("ext4 move extent: moved_len should be 0 "
958 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
959 donor_inode->i_ino);
960 return -EINVAL;
961 }
962
963 if ((orig_start > MAX_DEFRAG_SIZE) ||
964 (donor_start > MAX_DEFRAG_SIZE) ||
965 (*len > MAX_DEFRAG_SIZE) ||
966 (orig_start + *len > MAX_DEFRAG_SIZE)) {
967 ext4_debug("ext4 move extent: Can't handle over [%lu] blocks "
968 "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE,
969 orig_inode->i_ino, donor_inode->i_ino);
970 return -EINVAL;
971 }
972
973 if (orig_inode->i_size > donor_inode->i_size) {
974 if (orig_start >= donor_inode->i_size) {
975 ext4_debug("ext4 move extent: orig start offset "
976 "[%llu] should be less than donor file size "
977 "[%lld] [ino:orig %lu, donor_inode %lu]\n",
978 orig_start, donor_inode->i_size,
979 orig_inode->i_ino, donor_inode->i_ino);
980 return -EINVAL;
981 }
982
983 if (orig_start + *len > donor_inode->i_size) {
984 ext4_debug("ext4 move extent: End offset [%llu] should "
985 "be less than donor file size [%lld]."
986 "So adjust length from %llu to %lld "
987 "[ino:orig %lu, donor %lu]\n",
988 orig_start + *len, donor_inode->i_size,
989 *len, donor_inode->i_size - orig_start,
990 orig_inode->i_ino, donor_inode->i_ino);
991 *len = donor_inode->i_size - orig_start;
992 }
993 } else {
994 if (orig_start >= orig_inode->i_size) {
995 ext4_debug("ext4 move extent: start offset [%llu] "
996 "should be less than original file size "
997 "[%lld] [inode:orig %lu, donor %lu]\n",
998 orig_start, orig_inode->i_size,
999 orig_inode->i_ino, donor_inode->i_ino);
1000 return -EINVAL;
1001 }
1002
1003 if (orig_start + *len > orig_inode->i_size) {
1004 ext4_debug("ext4 move extent: Adjust length "
1005 "from %llu to %lld. Because it should be "
1006 "less than original file size "
1007 "[ino:orig %lu, donor %lu]\n",
1008 *len, orig_inode->i_size - orig_start,
1009 orig_inode->i_ino, donor_inode->i_ino);
1010 *len = orig_inode->i_size - orig_start;
1011 }
1012 }
1013
1014 if (!*len) {
1015 ext4_debug("ext4 move extent: len shoudld not be 0 "
1016 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
1017 donor_inode->i_ino);
1018 return -EINVAL;
1019 }
1020
1021 return 0;
1022}
1023
1024/**
1025 * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
1026 *
1027 * @inode1: the inode structure
1028 * @inode2: the inode structure
1029 *
1030 * Lock two inodes' i_mutex by i_ino order. This function is moved from
1031 * fs/inode.c.
1032 */
1033static void
1034mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1035{
1036 if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
1037 if (inode1)
1038 mutex_lock(&inode1->i_mutex);
1039 else if (inode2)
1040 mutex_lock(&inode2->i_mutex);
1041 return;
1042 }
1043
1044 if (inode1->i_ino < inode2->i_ino) {
1045 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
1046 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
1047 } else {
1048 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
1049 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
1050 }
1051}
1052
1053/**
1054 * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
1055 *
1056 * @inode1: the inode that is released first
1057 * @inode2: the inode that is released second
1058 *
1059 * This function is moved from fs/inode.c.
1060 */
1061
1062static void
1063mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
1064{
1065 if (inode1)
1066 mutex_unlock(&inode1->i_mutex);
1067
1068 if (inode2 && inode2 != inode1)
1069 mutex_unlock(&inode2->i_mutex);
1070}
1071
1072/**
1073 * ext4_move_extents - Exchange the specified range of a file
1074 *
1075 * @o_filp: file structure of the original file
1076 * @d_filp: file structure of the donor file
1077 * @orig_start: start offset in block for orig
1078 * @donor_start: start offset in block for donor
1079 * @len: the number of blocks to be moved
1080 * @moved_len: moved block length
1081 *
1082 * This function returns 0 and moved block length is set in moved_len
1083 * if succeed, otherwise returns error value.
1084 *
1085 * Note: ext4_move_extents() proceeds in the following order:
1086 * 1:ext4_move_extents() calculates the last block number of the move
1087 *   from the start block number (orig_start) and the number of blocks
1088 *   to be moved (len) given as arguments.
1089 *   If {orig, donor}_start points into a hole, ext_cur (the current
1090 *   extent), holecheck_path and orig_path are advanced to the extent
1091 *   just behind the hole.
1092 * 2:Repeat steps 3 to 5 until holecheck_path points to the last extent
1093 *   or ext_cur exceeds block_end, the last logical block number.
1094 * 3:To get the length of the contiguous area, call mext_next_extent()
1095 *   repeatedly on ext_cur (initially taken from holecheck_path) until a
1096 *   non-contiguous extent is found, the start logical block number
1097 *   exceeds block_end, or the last extent is reached.
1098 * 4:Exchange the original inode data with the donor inode data
1099 *   from orig_page_offset to seq_end_page.
1100 *   The start indexes of the data are passed as arguments.
1101 *   That of the original inode is orig_page_offset,
1102 *   and that of the donor inode is also orig_page_offset
1103 *   (to easily handle the blocksize != pagesize case, the offset for
1104 *   the donor inode is in block units).
1105 * 5:Update holecheck_path and orig_path to point to the next extent,
1106 *   then return to step 2.
1107 * 6:Release holecheck_path and orig_path, and set moved_len to len,
1108 *   the number of moved blocks.
1109 *   moved_len lets the caller compute the file offset at which to
1110 *   start the next move-extent ioctl.
1111 * 7:Return 0 on success, or a negative error value on failure.
1112 */
1113int
1114ext4_move_extents(struct file *o_filp, struct file *d_filp,
1115 __u64 orig_start, __u64 donor_start, __u64 len,
1116 __u64 *moved_len)
1117{
1118 struct inode *orig_inode = o_filp->f_dentry->d_inode;
1119 struct inode *donor_inode = d_filp->f_dentry->d_inode;
1120 struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
1121 struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
1122 ext4_lblk_t block_start = orig_start;
1123 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
1124 ext4_lblk_t rest_blocks;
1125 pgoff_t orig_page_offset = 0, seq_end_page;
1126 int ret, depth, last_extent = 0;
1127 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
1128 int data_offset_in_page;
1129 int block_len_in_page;
1130 int uninit;
1131
1132 /* protect orig and donor against a truncate */
1133 mext_inode_double_lock(orig_inode, donor_inode);
1134
1135 mext_double_down_read(orig_inode, donor_inode);
1136 /* Check whether move_extent can be done in this filesystem environment */
1137 ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
1138 donor_start, &len, *moved_len);
1139 mext_double_up_read(orig_inode, donor_inode);
1140 if (ret)
1141 goto out2;
1142
1143 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
1144 block_end = block_start + len - 1;
1145 if (file_end < block_end)
1146 len -= block_end - file_end;
1147
1148 get_ext_path(orig_path, orig_inode, block_start, ret);
1149 if (orig_path == NULL)
1150 goto out2;
1151
1152 /* Get path structure to check the hole */
1153 get_ext_path(holecheck_path, orig_inode, block_start, ret);
1154 if (holecheck_path == NULL)
1155 goto out;
1156
1157 depth = ext_depth(orig_inode);
1158 ext_cur = holecheck_path[depth].p_ext;
1159 if (ext_cur == NULL) {
1160 ret = -EINVAL;
1161 goto out;
1162 }
1163
1164 /*
1165 * Get the proper extent whose ee_block is beyond block_start
1166 * if block_start falls within a hole.
1167 */
1168 if (le32_to_cpu(ext_cur->ee_block) +
1169 ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
1170 last_extent = mext_next_extent(orig_inode,
1171 holecheck_path, &ext_cur);
1172 if (last_extent < 0) {
1173 ret = last_extent;
1174 goto out;
1175 }
1176 last_extent = mext_next_extent(orig_inode, orig_path,
1177 &ext_dummy);
1178 if (last_extent < 0) {
1179 ret = last_extent;
1180 goto out;
1181 }
1182 }
1183 seq_start = block_start;
1184
1185 /* No blocks within the specified range. */
1186 if (le32_to_cpu(ext_cur->ee_block) > block_end) {
1187 ext4_debug("ext4 move extent: The specified range of file "
1188 "may be the hole\n");
1189 ret = -EINVAL;
1190 goto out;
1191 }
1192
1193 /* Adjust start blocks */
1194 add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
1195 ext4_ext_get_actual_len(ext_cur), block_end + 1) -
1196 max(le32_to_cpu(ext_cur->ee_block), block_start);
1197
1198 while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
1199 seq_blocks += add_blocks;
1200
1201 /* Adjust tail blocks */
1202 if (seq_start + seq_blocks - 1 > block_end)
1203 seq_blocks = block_end - seq_start + 1;
1204
1205 ext_prev = ext_cur;
1206 last_extent = mext_next_extent(orig_inode, holecheck_path,
1207 &ext_cur);
1208 if (last_extent < 0) {
1209 ret = last_extent;
1210 break;
1211 }
1212 add_blocks = ext4_ext_get_actual_len(ext_cur);
1213
1214 /*
1215 * Extend the length of contiguous block (seq_blocks)
1216 * if extents are contiguous.
1217 */
1218 if (ext4_can_extents_be_merged(orig_inode,
1219 ext_prev, ext_cur) &&
1220 block_end >= le32_to_cpu(ext_cur->ee_block) &&
1221 !last_extent)
1222 continue;
1223
1224 /* Is the original extent uninitialized? */
1225 uninit = ext4_ext_is_uninitialized(ext_prev);
1226
1227 data_offset_in_page = seq_start % blocks_per_page;
1228
1229 /*
1230 * Calculate data blocks count that should be swapped
1231 * at the first page.
1232 */
1233 if (data_offset_in_page + seq_blocks > blocks_per_page) {
1234 /* Swapped blocks are across pages */
1235 block_len_in_page =
1236 blocks_per_page - data_offset_in_page;
1237 } else {
1238 /* Swapped blocks are in a page */
1239 block_len_in_page = seq_blocks;
1240 }
1241
1242 orig_page_offset = seq_start >>
1243 (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
1244 seq_end_page = (seq_start + seq_blocks - 1) >>
1245 (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
1246 seq_start = le32_to_cpu(ext_cur->ee_block);
1247 rest_blocks = seq_blocks;
1248
1249 /* Discard preallocations of two inodes */
1250 down_write(&EXT4_I(orig_inode)->i_data_sem);
1251 ext4_discard_preallocations(orig_inode);
1252 up_write(&EXT4_I(orig_inode)->i_data_sem);
1253
1254 down_write(&EXT4_I(donor_inode)->i_data_sem);
1255 ext4_discard_preallocations(donor_inode);
1256 up_write(&EXT4_I(donor_inode)->i_data_sem);
1257
1258 while (orig_page_offset <= seq_end_page) {
1259
1260 /* Swap original branches with new branches */
1261 ret = move_extent_per_page(o_filp, donor_inode,
1262 orig_page_offset,
1263 data_offset_in_page,
1264 block_len_in_page, uninit);
1265 if (ret < 0)
1266 goto out;
1267 orig_page_offset++;
1268 /* Count how many blocks we have exchanged */
1269 *moved_len += block_len_in_page;
1270 BUG_ON(*moved_len > len);
1271
1272 data_offset_in_page = 0;
1273 rest_blocks -= block_len_in_page;
1274 if (rest_blocks > blocks_per_page)
1275 block_len_in_page = blocks_per_page;
1276 else
1277 block_len_in_page = rest_blocks;
1278 }
1279
1280 /* Decrease buffer counter */
1281 if (holecheck_path)
1282 ext4_ext_drop_refs(holecheck_path);
1283 get_ext_path(holecheck_path, orig_inode,
1284 seq_start, ret);
1285 if (holecheck_path == NULL)
1286 break;
1287 depth = holecheck_path->p_depth;
1288
1289 /* Decrease buffer counter */
1290 if (orig_path)
1291 ext4_ext_drop_refs(orig_path);
1292 get_ext_path(orig_path, orig_inode, seq_start, ret);
1293 if (orig_path == NULL)
1294 break;
1295
1296 ext_cur = holecheck_path[depth].p_ext;
1297 add_blocks = ext4_ext_get_actual_len(ext_cur);
1298 seq_blocks = 0;
1299
1300 }
1301out:
1302 if (orig_path) {
1303 ext4_ext_drop_refs(orig_path);
1304 kfree(orig_path);
1305 }
1306 if (holecheck_path) {
1307 ext4_ext_drop_refs(holecheck_path);
1308 kfree(holecheck_path);
1309 }
1310out2:
1311 mext_inode_double_unlock(orig_inode, donor_inode);
1312
1313 if (ret)
1314 return ret;
1315
1316 /* On success, all of the specified blocks must have been exchanged */
1317 BUG_ON(*moved_len != len);
1318
1319 return 0;
1320}
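
ext4_move_extents() is reached from userspace through the EXT4_IOC_MOVE_EXT
ioctl, which is what a defragmenter such as e4defrag drives. The sketch
below shows the expected calling convention; the struct layout and ioctl
number are assumptions for illustration, and the authoritative definitions
live in the kernel's ext4 headers:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/types.h>

    /* Assumed layout; take the real one from the ext4 headers. */
    struct move_extent {
        __u32 reserved;     /* should be zero */
        __u32 donor_fd;     /* donor file descriptor */
        __u64 orig_start;   /* logical start block in the original file */
        __u64 donor_start;  /* logical start block in the donor file */
        __u64 len;          /* number of blocks to move */
        __u64 moved_len;    /* filled in by the kernel */
    };

    #ifndef EXT4_IOC_MOVE_EXT
    #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
    #endif

    int main(int argc, char **argv)
    {
        struct move_extent me;
        int orig_fd, donor_fd;

        if (argc != 3)
            return 1;
        orig_fd = open(argv[1], O_RDWR);
        donor_fd = open(argv[2], O_WRONLY);
        if (orig_fd < 0 || donor_fd < 0)
            return 1;

        memset(&me, 0, sizeof(me));  /* moved_len must start at 0 */
        me.donor_fd = donor_fd;
        me.orig_start = 0;
        me.donor_start = 0;          /* must equal orig_start */
        me.len = 16;                 /* blocks to exchange */

        if (ioctl(orig_fd, EXT4_IOC_MOVE_EXT, &me) < 0) {
            perror("EXT4_IOC_MOVE_EXT");
            return 1;
        }
        printf("moved %llu blocks\n", (unsigned long long)me.moved_len);
        return 0;
    }
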
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 22098e1cd085..de04013d16ff 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -37,7 +37,6 @@
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
-#include "namei.h"
 #include "xattr.h"
 #include "acl.h"
 
@@ -750,7 +749,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
 		ext4fs_dirhash(de->name, de->name_len, &h);
 		map_tail--;
 		map_tail->hash = h.hash;
-		map_tail->offs = (u16) ((char *) de - base);
+		map_tail->offs = ((char *) de - base)>>2;
 		map_tail->size = le16_to_cpu(de->rec_len);
 		count++;
 		cond_resched();
@@ -1148,7 +1147,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
 	unsigned rec_len = 0;
 
 	while (count--) {
-		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs);
+		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
+						(from + (map->offs<<2));
 		rec_len = EXT4_DIR_REC_LEN(de->name_len);
 		memcpy (to, de, rec_len);
 		((struct ext4_dir_entry_2 *) to)->rec_len =
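
The two hunks above change dx_map_entry.offs from a byte offset to a
4-byte-unit offset: directory entries are 4-byte aligned, so the low two
bits are always zero, and storing offs >> 2 lets a 16-bit field address a
256 KiB block instead of 64 KiB. The pack/unpack round trip in isolation:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t offs = 70000;          /* offset of an entry in the block */
        uint16_t packed;

        assert((offs & 3) == 0);        /* 4-byte aligned */
        assert(offs < (1u << 18));      /* fits after the >> 2 packing */

        packed = (uint16_t)(offs >> 2); /* store: map_tail->offs */
        assert(((uint32_t)packed << 2) == offs); /* load: offs << 2 */

        printf("packed %u -> %u\n", offs, packed);
        return 0;
    }
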
@@ -1782,7 +1782,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	inode = ext4_new_inode (handle, dir, mode);
+	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		inode->i_op = &ext4_file_inode_operations;
@@ -1816,7 +1816,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	inode = ext4_new_inode(handle, dir, mode);
+	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, inode->i_mode, rdev);
@@ -1853,7 +1853,8 @@ retry:
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
+	inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
+			       &dentry->d_name, 0);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
@@ -1997,7 +1998,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	if (!ext4_handle_valid(handle))
 		return 0;
 
-	lock_super(sb);
+	mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
 	if (!list_empty(&EXT4_I(inode)->i_orphan))
 		goto out_unlock;
 
@@ -2006,9 +2007,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 
 	/* @@@ FIXME: Observation from aviro:
 	 * I think I can trigger J_ASSERT in ext4_orphan_add().  We block
-	 * here (on lock_super()), so race with ext4_link() which might bump
+	 * here (on s_orphan_lock), so race with ext4_link() which might bump
 	 * ->i_nlink. For, say it, character device. Not a regular file,
 	 * not a directory, not a symlink and ->i_nlink > 0.
+	 *
+	 * tytso, 4/25/2009: I'm not sure how that could happen;
+	 * shouldn't the fs core protect us from these sort of
+	 * unlink()/link() races?
 	 */
 	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
@@ -2045,7 +2050,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	jbd_debug(4, "orphan inode %lu will point to %d\n",
 			inode->i_ino, NEXT_ORPHAN(inode));
 out_unlock:
-	unlock_super(sb);
+	mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
 	ext4_std_error(inode->i_sb, err);
 	return err;
 }
@@ -2066,11 +2071,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 	if (!ext4_handle_valid(handle))
 		return 0;
 
-	lock_super(inode->i_sb);
-	if (list_empty(&ei->i_orphan)) {
-		unlock_super(inode->i_sb);
-		return 0;
-	}
+	mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
+	if (list_empty(&ei->i_orphan))
+		goto out;
 
 	ino_next = NEXT_ORPHAN(inode);
 	prev = ei->i_orphan.prev;
@@ -2120,7 +2123,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 out_err:
 	ext4_std_error(inode->i_sb, err);
 out:
-	unlock_super(inode->i_sb);
+	mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
 	return err;
 
 out_brelse:
@@ -2262,7 +2265,8 @@ retry:
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
+	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
+			       &dentry->d_name, 0);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
@@ -2533,6 +2537,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.permission	= ext4_permission,
+	.fiemap		= ext4_fiemap,
 };
 
 const struct inode_operations ext4_special_inode_operations = {
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
deleted file mode 100644
index 5e4dfff36a00..000000000000
--- a/fs/ext4/namei.h
+++ /dev/null
@@ -1,8 +0,0 @@
1/* linux/fs/ext4/namei.h
2 *
3 * Copyright (C) 2005 Simtec Electronics
4 * Ben Dooks <ben@simtec.co.uk>
5 *
6*/
7
8extern struct dentry *ext4_get_parent(struct dentry *child);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 546c7dd869e1..68b0351fc647 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -15,7 +15,6 @@
 #include <linux/slab.h>
 
 #include "ext4_jbd2.h"
-#include "group.h"
 
 #define outside(b, first, last)	((b) < (first) || (b) >= (last))
 #define inside(b, first, last)	((b) >= (first) && (b) < (last))
@@ -193,7 +192,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
-	lock_super(sb);
+	mutex_lock(&sbi->s_resize_lock);
 	if (input->group != sbi->s_groups_count) {
 		err = -EBUSY;
 		goto exit_journal;
@@ -302,7 +301,7 @@ exit_bh:
 	brelse(bh);
 
 exit_journal:
-	unlock_super(sb);
+	mutex_unlock(&sbi->s_resize_lock);
 	if ((err2 = ext4_journal_stop(handle)) && !err)
 		err = err2;
 
@@ -643,11 +642,12 @@ exit_free:
  * important part is that the new block and inode counts are in the backup
  * superblocks, and the location of the new group metadata in the GDT backups.
  *
- * We do not need lock_super() for this, because these blocks are not
- * otherwise touched by the filesystem code when it is mounted.  We don't
- * need to worry about last changing from sbi->s_groups_count, because the
- * worst that can happen is that we do not copy the full number of backups
- * at this time.  The resize which changed s_groups_count will backup again.
+ * We do not need to take the s_resize_lock for this, because these
+ * blocks are not otherwise touched by the filesystem code when it is
+ * mounted.  We don't need to worry about last changing from
+ * sbi->s_groups_count, because the worst that can happen is that we
+ * do not copy the full number of backups at this time.  The resize
+ * which changed s_groups_count will backup again.
  */
 static void update_backups(struct super_block *sb,
 			   int blk_off, char *data, int size)
@@ -809,7 +809,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 		goto exit_put;
 	}
 
-	lock_super(sb);
+	mutex_lock(&sbi->s_resize_lock);
 	if (input->group != sbi->s_groups_count) {
 		ext4_warning(sb, __func__,
 			     "multiple resizers run on filesystem!");
@@ -840,7 +840,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	/*
 	 * OK, now we've set up the new group.  Time to make it active.
 	 *
-	 * Current kernels don't lock all allocations via lock_super(),
+	 * We do not lock all allocations via s_resize_lock
 	 * so we have to be safe wrt. concurrent accesses the group
 	 * data.  So we need to be careful to set all of the relevant
 	 * group descriptor data etc. *before* we enable the group.
@@ -900,12 +900,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
  *
  * The precise rules we use are:
  *
- * * Writers of s_groups_count *must* hold lock_super
+ * * Writers of s_groups_count *must* hold s_resize_lock
  * AND
 * * Writers must perform a smp_wmb() after updating all dependent
 *   data and before modifying the groups count
 *
- * * Readers must hold lock_super() over the access
+ * * Readers must hold s_resize_lock over the access
 * OR
 * * Readers must perform an smp_rmb() after reading the groups count
 *   and before reading any dependent data.
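
The comment hunk above spells out a publication protocol: a writer makes all
dependent group data visible before it bumps s_groups_count (smp_wmb()), and
a lockless reader pairs that with smp_rmb() after loading the count. A
userspace analogue of the same pattern, using C11 release/acquire in place
of the kernel barriers (hypothetical data layout):

    #include <stdatomic.h>
    #include <stdio.h>

    struct group { long blocks; };

    static struct group groups[128];
    static atomic_int groups_count;

    static void add_group(int idx, long blocks)
    {
        groups[idx].blocks = blocks;      /* dependent data first */
        atomic_store_explicit(&groups_count, idx + 1,
                              memory_order_release); /* then publish */
    }

    static long read_last_group(void)
    {
        int n = atomic_load_explicit(&groups_count,
                                     memory_order_acquire);
        /* A reader that sees the new count also sees the group data. */
        return n ? groups[n - 1].blocks : -1;
    }

    int main(void)
    {
        add_group(0, 32768);
        printf("%ld\n", read_last_group());
        return 0;
    }
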
@@ -948,7 +948,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	sb->s_dirt = 1;
 
 exit_journal:
-	unlock_super(sb);
+	mutex_unlock(&sbi->s_resize_lock);
 	if ((err2 = ext4_journal_stop(handle)) && !err)
 		err = err2;
 	if (!err) {
@@ -986,7 +986,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 
 	/* We don't need to worry about locking wrt other resizers just
 	 * yet: we're going to revalidate es->s_blocks_count after
-	 * taking lock_super() below. */
+	 * taking the s_resize_lock below. */
 	o_blocks_count = ext4_blocks_count(es);
 	o_groups_count = EXT4_SB(sb)->s_groups_count;
 
@@ -1002,7 +1002,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 			" too large to resize to %llu blocks safely\n",
 			sb->s_id, n_blocks_count);
 		if (sizeof(sector_t) < 8)
-			ext4_warning(sb, __func__, "CONFIG_LBD not enabled");
+			ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled");
 		return -EINVAL;
 	}
 
@@ -1056,11 +1056,11 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 		goto exit_put;
 	}
 
-	lock_super(sb);
+	mutex_lock(&EXT4_SB(sb)->s_resize_lock);
 	if (o_blocks_count != ext4_blocks_count(es)) {
 		ext4_warning(sb, __func__,
 			     "multiple resizers run on filesystem!");
-		unlock_super(sb);
+		mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 		ext4_journal_stop(handle);
 		err = -EBUSY;
 		goto exit_put;
@@ -1070,14 +1070,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 			EXT4_SB(sb)->s_sbh))) {
 		ext4_warning(sb, __func__,
 			     "error %d on journal write access", err);
-		unlock_super(sb);
+		mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 		ext4_journal_stop(handle);
 		goto exit_put;
 	}
 	ext4_blocks_count_set(es, o_blocks_count + add);
 	ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
-	unlock_super(sb);
+	mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
 		   o_blocks_count + add);
 	/* We add the blocks to the bitmap and set the group need init bit */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2958f4e6f222..8f4f079e6b9a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -20,6 +20,7 @@
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/time.h>
+#include <linux/vmalloc.h>
 #include <linux/jbd2.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -36,7 +37,6 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 #include <linux/ctype.h>
-#include <linux/marker.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
@@ -45,16 +45,23 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "namei.h"
-#include "group.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/ext4.h>
+
+static int default_mb_history_length = 1000;
+
+module_param_named(default_mb_history_length, default_mb_history_length,
+		   int, 0644);
+MODULE_PARM_DESC(default_mb_history_length,
+		 "Default number of entries saved for mb_history");
 
 struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
-static int ext4_commit_super(struct super_block *sb,
-			     struct ext4_super_block *es, int sync);
+static int ext4_commit_super(struct super_block *sb, int sync);
 static void ext4_mark_recovery_complete(struct super_block *sb,
 					struct ext4_super_block *es);
 static void ext4_clear_journal_err(struct super_block *sb,
@@ -74,7 +81,7 @@ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 {
 	return le32_to_cpu(bg->bg_block_bitmap_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
 }
 
 ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
@@ -82,7 +89,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 {
 	return le32_to_cpu(bg->bg_inode_bitmap_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
 }
 
 ext4_fsblk_t ext4_inode_table(struct super_block *sb,
@@ -90,7 +97,7 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 {
 	return le32_to_cpu(bg->bg_inode_table_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 }
 
 __u32 ext4_free_blks_count(struct super_block *sb,
@@ -98,7 +105,7 @@ __u32 ext4_free_blks_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_free_blocks_count_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
 }
 
 __u32 ext4_free_inodes_count(struct super_block *sb,
@@ -106,7 +113,7 @@ __u32 ext4_free_inodes_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_free_inodes_count_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
 }
 
 __u32 ext4_used_dirs_count(struct super_block *sb,
@@ -114,7 +121,7 @@ __u32 ext4_used_dirs_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_used_dirs_count_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
 }
 
 __u32 ext4_itable_unused_count(struct super_block *sb,
@@ -122,7 +129,7 @@ __u32 ext4_itable_unused_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_itable_unused_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
 }
 
 void ext4_block_bitmap_set(struct super_block *sb,
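
The accessors touched above all follow one pattern: splice a wide value out of two on-disk little-endian halves, folding the high half in only when the block group descriptor is large enough to carry it. A self-contained illustration (the struct here is a simplified stand-in, not the real ext4 descriptor):

	#include <stdint.h>

	struct demo_desc {
		uint32_t lo;	/* low half, always present */
		uint32_t hi;	/* high half, valid only with 64-bit descriptors */
	};

	static uint64_t demo_read(const struct demo_desc *d, int has_64bit_desc)
	{
		/* splice lo and hi; hi contributes nothing on small descriptors */
		return (uint64_t)d->lo |
		       (has_64bit_desc ? (uint64_t)d->hi << 32 : 0);
	}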
@@ -202,8 +209,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 	journal = EXT4_SB(sb)->s_journal;
 	if (journal) {
 		if (is_journal_aborted(journal)) {
-			ext4_abort(sb, __func__,
-				   "Detected aborted journal");
+			ext4_abort(sb, __func__, "Detected aborted journal");
 			return ERR_PTR(-EROFS);
 		}
 		return jbd2_journal_start(journal, nblocks);
@@ -297,15 +303,15 @@ static void ext4_handle_error(struct super_block *sb)
 	if (!test_opt(sb, ERRORS_CONT)) {
 		journal_t *journal = EXT4_SB(sb)->s_journal;
 
-		EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
+		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
 		if (journal)
 			jbd2_journal_abort(journal, -EIO);
 	}
 	if (test_opt(sb, ERRORS_RO)) {
-		printk(KERN_CRIT "Remounting filesystem read-only\n");
+		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 		sb->s_flags |= MS_RDONLY;
 	}
-	ext4_commit_super(sb, es, 1);
+	ext4_commit_super(sb, 1);
 	if (test_opt(sb, ERRORS_PANIC))
 		panic("EXT4-fs (device %s): panic forced after error\n",
 			sb->s_id);
@@ -395,8 +401,6 @@ void ext4_abort(struct super_block *sb, const char *function,
 {
 	va_list args;
 
-	printk(KERN_CRIT "ext4_abort called.\n");
-
 	va_start(args, fmt);
 	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
 	vprintk(fmt, args);
@@ -409,14 +413,26 @@ void ext4_abort(struct super_block *sb, const char *function,
 	if (sb->s_flags & MS_RDONLY)
 		return;
 
-	printk(KERN_CRIT "Remounting filesystem read-only\n");
+	ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 	sb->s_flags |= MS_RDONLY;
-	EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
+	EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
 	if (EXT4_SB(sb)->s_journal)
 		jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
 
+void ext4_msg (struct super_block * sb, const char *prefix,
+		   const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	printk("%sEXT4-fs (%s): ", prefix, sb->s_id);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+}
+
 void ext4_warning(struct super_block *sb, const char *function,
 		  const char *fmt, ...)
 {
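
ext4_msg(), added above, centralizes the "EXT4-fs (<device>):" prefix that the many converted call sites below previously open-coded, and it supplies the trailing newline itself. A hypothetical call site for illustration (the message text is invented):

	/* For sb->s_id == "sda1" this emits, at KERN_INFO level:
	 *   EXT4-fs (sda1): mounted with ordered data mode
	 * Callers pass neither the prefix nor a "\n". */
	ext4_msg(sb, KERN_INFO, "mounted with %s data mode", "ordered");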
@@ -431,7 +447,7 @@ void ext4_warning(struct super_block *sb, const char *function,
 }
 
 void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
-			    const char *function, const char *fmt, ...)
+			   const char *function, const char *fmt, ...)
 __releases(bitlock)
 __acquires(bitlock)
 {
@@ -447,7 +463,7 @@ __acquires(bitlock)
 	if (test_opt(sb, ERRORS_CONT)) {
 		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-		ext4_commit_super(sb, es, 0);
+		ext4_commit_super(sb, 0);
 		return;
 	}
 	ext4_unlock_group(sb, grp);
@@ -467,7 +483,6 @@ __acquires(bitlock)
 	return;
 }
 
-
 void ext4_update_dynamic_rev(struct super_block *sb)
 {
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -496,7 +511,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
 /*
  * Open the external journal device
  */
-static struct block_device *ext4_blkdev_get(dev_t dev)
+static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
 {
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
@@ -507,7 +522,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
 	return bdev;
 
 fail:
-	printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
+	ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
 			__bdevname(dev, b), PTR_ERR(bdev));
 	return NULL;
 }
@@ -543,8 +558,8 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
 {
 	struct list_head *l;
 
-	printk(KERN_ERR "sb orphan head is %d\n",
-	       le32_to_cpu(sbi->s_es->s_last_orphan));
+	ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
+		 le32_to_cpu(sbi->s_es->s_last_orphan));
 
 	printk(KERN_ERR "sb_info orphan list:\n");
 	list_for_each(l, &sbi->s_orphan) {
@@ -563,6 +578,12 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;
 
+	lock_super(sb);
+	lock_kernel();
+	if (sb->s_dirt)
+		ext4_commit_super(sb, 1);
+
+	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
@@ -576,7 +597,7 @@ static void ext4_put_super(struct super_block *sb)
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		ext4_commit_super(sb, es, 1);
+		ext4_commit_super(sb, 1);
 	}
 	if (sbi->s_proc) {
 		remove_proc_entry(sb->s_id, ext4_proc_root);
@@ -586,7 +607,10 @@ static void ext4_put_super(struct super_block *sb)
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
-	kfree(sbi->s_flex_groups);
+	if (is_vmalloc_addr(sbi->s_flex_groups))
+		vfree(sbi->s_flex_groups);
+	else
+		kfree(sbi->s_flex_groups);
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -625,11 +649,8 @@ static void ext4_put_super(struct super_block *sb)
 	unlock_super(sb);
 	kobject_put(&sbi->s_kobj);
 	wait_for_completion(&sbi->s_kobj_unregister);
-	lock_super(sb);
-	lock_kernel();
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-	return;
 }
 
 static struct kmem_cache *ext4_inode_cachep;
@@ -644,10 +665,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
-	ei->i_acl = EXT4_ACL_NOT_CACHED;
-	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
-#endif
+
 	ei->vfs_inode.i_version = 1;
 	ei->vfs_inode.i_data.writeback_index = 0;
 	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
@@ -664,14 +682,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->i_allocated_meta_blocks = 0;
 	ei->i_delalloc_reserved_flag = 0;
 	spin_lock_init(&(ei->i_block_reservation_lock));
+
 	return &ei->vfs_inode;
 }
 
 static void ext4_destroy_inode(struct inode *inode)
 {
 	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
-		printk("EXT4 Inode %p: orphan list check failed!\n",
-			EXT4_I(inode));
+		ext4_msg(inode->i_sb, KERN_ERR,
+			 "Inode %lu (%p): orphan list check failed!",
+			 inode->i_ino, EXT4_I(inode));
 		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
 				EXT4_I(inode), sizeof(struct ext4_inode_info),
 				true);
@@ -711,18 +731,6 @@ static void destroy_inodecache(void)
 
 static void ext4_clear_inode(struct inode *inode)
 {
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
-	if (EXT4_I(inode)->i_acl &&
-	    EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
-		posix_acl_release(EXT4_I(inode)->i_acl);
-		EXT4_I(inode)->i_acl = EXT4_ACL_NOT_CACHED;
-	}
-	if (EXT4_I(inode)->i_default_acl &&
-	    EXT4_I(inode)->i_default_acl != EXT4_ACL_NOT_CACHED) {
-		posix_acl_release(EXT4_I(inode)->i_default_acl);
-		EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
-	}
-#endif
 	ext4_discard_preallocations(inode);
 	if (EXT4_JOURNAL(inode))
 		jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
@@ -870,12 +878,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",noauto_da_alloc");
 
 	ext4_show_quota_options(seq, sb);
+
 	return 0;
 }
 
-
 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
 					u64 ino, u32 generation)
 {
 	struct inode *inode;
 
@@ -904,14 +912,14 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
 }
 
 static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
+					int fh_len, int fh_type)
 {
 	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
 				    ext4_nfs_get_inode);
 }
 
 static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
+					int fh_len, int fh_type)
 {
 	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
 				    ext4_nfs_get_inode);
@@ -923,7 +931,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
  * which would prevent try_to_free_buffers() from freeing them, we must use
  * jbd2 layer's try_to_free_buffers() function to release them.
  */
-static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+				 gfp_t wait)
 {
 	journal_t *journal = EXT4_SB(sb)->s_journal;
 
@@ -992,7 +1001,6 @@ static const struct super_operations ext4_sops = {
 	.dirty_inode = ext4_dirty_inode,
 	.delete_inode = ext4_delete_inode,
 	.put_super = ext4_put_super,
-	.write_super = ext4_write_super,
 	.sync_fs = ext4_sync_fs,
 	.freeze_fs = ext4_freeze,
 	.unfreeze_fs = ext4_unfreeze,
@@ -1007,6 +1015,25 @@ static const struct super_operations ext4_sops = {
 	.bdev_try_to_free_page = bdev_try_to_free_page,
 };
 
+static const struct super_operations ext4_nojournal_sops = {
+	.alloc_inode = ext4_alloc_inode,
+	.destroy_inode = ext4_destroy_inode,
+	.write_inode = ext4_write_inode,
+	.dirty_inode = ext4_dirty_inode,
+	.delete_inode = ext4_delete_inode,
+	.write_super = ext4_write_super,
+	.put_super = ext4_put_super,
+	.statfs = ext4_statfs,
+	.remount_fs = ext4_remount,
+	.clear_inode = ext4_clear_inode,
+	.show_options = ext4_show_options,
+#ifdef CONFIG_QUOTA
+	.quota_read = ext4_quota_read,
+	.quota_write = ext4_quota_write,
+#endif
+	.bdev_try_to_free_page = bdev_try_to_free_page,
+};
+
 static const struct export_operations ext4_export_ops = {
 	.fh_to_dentry = ext4_fh_to_dentry,
 	.fh_to_parent = ext4_fh_to_parent,
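
The new ext4_nojournal_sops table keeps .write_super, which the journaled ext4_sops table drops above: without a journal there is no jbd2 commit to push superblock updates to disk. Presumably the mount path then picks a table based on whether a journal was loaded; a sketch of that selection, which is an assumption and not shown in this hunk:

	/* Assumed selection logic; the actual assignment site is elsewhere. */
	if (EXT4_SB(sb)->s_journal)
		sb->s_op = &ext4_sops;			/* journal flushes the sb */
	else
		sb->s_op = &ext4_nojournal_sops;	/* rely on ->write_super */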
@@ -1023,12 +1050,13 @@ enum {
 	Opt_journal_update, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
-	Opt_data_err_abort, Opt_data_err_ignore,
+	Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
 	Opt_usrquota, Opt_grpquota, Opt_i_version,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+	Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio
 };
 
@@ -1069,6 +1097,7 @@ static const match_table_t tokens = {
 	{Opt_data_writeback, "data=writeback"},
 	{Opt_data_err_abort, "data_err=abort"},
 	{Opt_data_err_ignore, "data_err=ignore"},
+	{Opt_mb_history_length, "mb_history_length=%u"},
 	{Opt_offusrjquota, "usrjquota="},
 	{Opt_usrjquota, "usrjquota=%s"},
 	{Opt_offgrpjquota, "grpjquota="},
@@ -1087,6 +1116,8 @@ static const match_table_t tokens = {
 	{Opt_resize, "resize"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
+	{Opt_block_validity, "block_validity"},
+	{Opt_noblock_validity, "noblock_validity"},
 	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
 	{Opt_journal_ioprio, "journal_ioprio=%u"},
 	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
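
New mount options enter through this match_table_t; match_token() from <linux/parser.h> returns the enum tag and captures any %u argument into the substring array. A condensed sketch of how the new mb_history_length token is consumed, mirroring the parse_options() case added further down:

	substring_t args[MAX_OPT_ARGS];
	int token, option;

	token = match_token(p, tokens, args);	/* p is one "name=value" chunk */
	switch (token) {
	case Opt_mb_history_length:
		if (match_int(&args[0], &option) || option < 0)
			return 0;		/* reject malformed or negative */
		sbi->s_mb_history_max = option;
		break;
	}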
@@ -1102,8 +1133,9 @@ static ext4_fsblk_t get_sb_block(void **data)
 
 	if (!options || strncmp(options, "sb=", 3) != 0)
 		return 1;	/* Default location */
+
 	options += 3;
-	/*todo: use simple_strtoll with >32bit ext4 */
+	/* TODO: use simple_strtoll with >32bit ext4 */
 	sb_block = simple_strtoul(options, &options, 0);
 	if (*options && *options != ',') {
 		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
@@ -1113,6 +1145,7 @@ static ext4_fsblk_t get_sb_block(void **data)
 	if (*options == ',')
 		options++;
 	*data = (void *) options;
+
 	return sb_block;
 }
 
@@ -1206,8 +1239,7 @@ static int parse_options(char *options, struct super_block *sb,
 #else
 		case Opt_user_xattr:
 		case Opt_nouser_xattr:
-			printk(KERN_ERR "EXT4 (no)user_xattr options "
-			       "not supported\n");
+			ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
 			break;
 #endif
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -1220,8 +1252,7 @@ static int parse_options(char *options, struct super_block *sb,
 #else
 		case Opt_acl:
 		case Opt_noacl:
-			printk(KERN_ERR "EXT4 (no)acl options "
-			       "not supported\n");
+			ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
 			break;
 #endif
 		case Opt_journal_update:
@@ -1231,16 +1262,16 @@ static int parse_options(char *options, struct super_block *sb,
 			   user to specify an existing inode to be the
 			   journal file. */
 			if (is_remount) {
-				printk(KERN_ERR "EXT4-fs: cannot specify "
-				       "journal on remount\n");
+				ext4_msg(sb, KERN_ERR,
+					 "Cannot specify journal on remount");
 				return 0;
 			}
 			set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
 			break;
 		case Opt_journal_dev:
 			if (is_remount) {
-				printk(KERN_ERR "EXT4-fs: cannot specify "
-				       "journal on remount\n");
+				ext4_msg(sb, KERN_ERR,
					 "Cannot specify journal on remount");
 				return 0;
 			}
 			if (match_int(&args[0], &option))
@@ -1294,9 +1325,8 @@ static int parse_options(char *options, struct super_block *sb,
 			if (is_remount) {
 				if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
 						!= data_opt) {
-					printk(KERN_ERR
-						"EXT4-fs: cannot change data "
-						"mode on remount\n");
+					ext4_msg(sb, KERN_ERR,
						 "Cannot change data mode on remount");
 					return 0;
 				}
 			} else {
@@ -1310,6 +1340,13 @@ static int parse_options(char *options, struct super_block *sb,
 		case Opt_data_err_ignore:
 			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
 			break;
+		case Opt_mb_history_length:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			sbi->s_mb_history_max = option;
+			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
 			qtype = USRQUOTA;
@@ -1319,31 +1356,31 @@ static int parse_options(char *options, struct super_block *sb,
 set_qf_name:
 			if (sb_any_quota_loaded(sb) &&
 			    !sbi->s_qf_names[qtype]) {
-				printk(KERN_ERR
-				       "EXT4-fs: Cannot change journaled "
-				       "quota options when quota turned on.\n");
+				ext4_msg(sb, KERN_ERR,
+					 "Cannot change journaled "
+					 "quota options when quota turned on");
 				return 0;
 			}
 			qname = match_strdup(&args[0]);
 			if (!qname) {
-				printk(KERN_ERR
-				       "EXT4-fs: not enough memory for "
-				       "storing quotafile name.\n");
+				ext4_msg(sb, KERN_ERR,
+					 "Not enough memory for "
+					 "storing quotafile name");
 				return 0;
 			}
 			if (sbi->s_qf_names[qtype] &&
 			    strcmp(sbi->s_qf_names[qtype], qname)) {
-				printk(KERN_ERR
-				       "EXT4-fs: %s quota file already "
-				       "specified.\n", QTYPE2NAME(qtype));
+				ext4_msg(sb, KERN_ERR,
+					 "%s quota file already "
+					 "specified", QTYPE2NAME(qtype));
 				kfree(qname);
 				return 0;
 			}
 			sbi->s_qf_names[qtype] = qname;
 			if (strchr(sbi->s_qf_names[qtype], '/')) {
-				printk(KERN_ERR
-				       "EXT4-fs: quotafile must be on "
-				       "filesystem root.\n");
+				ext4_msg(sb, KERN_ERR,
+					 "quotafile must be on "
+					 "filesystem root");
 				kfree(sbi->s_qf_names[qtype]);
 				sbi->s_qf_names[qtype] = NULL;
 				return 0;
@@ -1358,9 +1395,9 @@ set_qf_name:
 clear_qf_name:
 			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_qf_names[qtype]) {
-				printk(KERN_ERR "EXT4-fs: Cannot change "
+				ext4_msg(sb, KERN_ERR, "Cannot change "
 					"journaled quota options when "
-					"quota turned on.\n");
+					"quota turned on");
 				return 0;
 			}
 			/*
@@ -1377,9 +1414,9 @@ clear_qf_name:
 set_qf_format:
 			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_jquota_fmt != qfmt) {
-				printk(KERN_ERR "EXT4-fs: Cannot change "
+				ext4_msg(sb, KERN_ERR, "Cannot change "
 					"journaled quota options when "
-					"quota turned on.\n");
+					"quota turned on");
 				return 0;
 			}
 			sbi->s_jquota_fmt = qfmt;
@@ -1395,8 +1432,8 @@ set_qf_format:
 			break;
 		case Opt_noquota:
 			if (sb_any_quota_loaded(sb)) {
-				printk(KERN_ERR "EXT4-fs: Cannot change quota "
-				       "options when quota turned on.\n");
+				ext4_msg(sb, KERN_ERR, "Cannot change quota "
+					 "options when quota turned on");
 				return 0;
 			}
 			clear_opt(sbi->s_mount_opt, QUOTA);
@@ -1407,8 +1444,8 @@ set_qf_format:
 		case Opt_quota:
 		case Opt_usrquota:
 		case Opt_grpquota:
-			printk(KERN_ERR
-			       "EXT4-fs: quota options not supported.\n");
+			ext4_msg(sb, KERN_ERR,
+				 "quota options not supported");
 			break;
 		case Opt_usrjquota:
 		case Opt_grpjquota:
@@ -1416,15 +1453,14 @@ set_qf_format:
 		case Opt_offgrpjquota:
 		case Opt_jqfmt_vfsold:
 		case Opt_jqfmt_vfsv0:
-			printk(KERN_ERR
-			       "EXT4-fs: journaled quota options not "
-			       "supported.\n");
+			ext4_msg(sb, KERN_ERR,
				 "journaled quota options not supported");
 			break;
 		case Opt_noquota:
 			break;
 #endif
 		case Opt_abort:
-			set_opt(sbi->s_mount_opt, ABORT);
+			sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
 			break;
 		case Opt_nobarrier:
 			clear_opt(sbi->s_mount_opt, BARRIER);
@@ -1443,8 +1479,9 @@ set_qf_format:
 			break;
 		case Opt_resize:
 			if (!is_remount) {
-				printk("EXT4-fs: resize option only available "
-				       "for remount\n");
+				ext4_msg(sb, KERN_ERR,
+					 "resize option only available "
+					 "for remount");
 				return 0;
 			}
 			if (match_int(&args[0], &option) != 0)
@@ -1474,14 +1511,21 @@ set_qf_format:
 		case Opt_delalloc:
 			set_opt(sbi->s_mount_opt, DELALLOC);
 			break;
+		case Opt_block_validity:
+			set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+			break;
+		case Opt_noblock_validity:
+			clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+			break;
 		case Opt_inode_readahead_blks:
 			if (match_int(&args[0], &option))
 				return 0;
 			if (option < 0 || option > (1 << 30))
 				return 0;
-			if (option & (option - 1)) {
-				printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
-				       " must be a power of 2\n");
+			if (!is_power_of_2(option)) {
+				ext4_msg(sb, KERN_ERR,
+					 "EXT4-fs: inode_readahead_blks"
+					 " must be a power of 2");
 				return 0;
 			}
 			sbi->s_inode_readahead_blks = option;
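
Both the old open-coded test and is_power_of_2() rest on the same identity: a positive power of two has exactly one set bit, so clearing the lowest set bit with n & (n - 1) yields zero. A standalone restatement:

	/* e.g. n = 8 (1000b): 8 & 7 (0111b) == 0  ->  power of two
	 *      n = 6 (0110b): 6 & 5 (0101b) == 4  ->  not a power of two */
	static inline int demo_is_power_of_2(unsigned long n)
	{
		return n != 0 && (n & (n - 1)) == 0;
	}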
@@ -1508,9 +1552,9 @@ set_qf_format:
 			set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
 			break;
 		default:
-			printk(KERN_ERR
-			       "EXT4-fs: Unrecognized mount option \"%s\" "
-			       "or missing value\n", p);
+			ext4_msg(sb, KERN_ERR,
+				 "Unrecognized mount option \"%s\" "
+				 "or missing value", p);
 			return 0;
 		}
 	}
@@ -1528,21 +1572,21 @@ set_qf_format:
 		     (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
 		    (sbi->s_qf_names[GRPQUOTA] &&
 		     (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
-			printk(KERN_ERR "EXT4-fs: old and new quota "
-			       "format mixing.\n");
+			ext4_msg(sb, KERN_ERR, "old and new quota "
+				 "format mixing");
 			return 0;
 		}
 
 		if (!sbi->s_jquota_fmt) {
-			printk(KERN_ERR "EXT4-fs: journaled quota format "
-			       "not specified.\n");
+			ext4_msg(sb, KERN_ERR, "journaled quota format "
+				 "not specified");
 			return 0;
 		}
 	} else {
 		if (sbi->s_jquota_fmt) {
-			printk(KERN_ERR "EXT4-fs: journaled quota format "
+			ext4_msg(sb, KERN_ERR, "journaled quota format "
 				"specified with no journaling "
-				"enabled.\n");
+				"enabled");
 			return 0;
 		}
 	}
@@ -1557,32 +1601,32 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 	int res = 0;
 
 	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
-		printk(KERN_ERR "EXT4-fs warning: revision level too high, "
-		       "forcing read-only mode\n");
+		ext4_msg(sb, KERN_ERR, "revision level too high, "
+			 "forcing read-only mode");
 		res = MS_RDONLY;
 	}
 	if (read_only)
 		return res;
 	if (!(sbi->s_mount_state & EXT4_VALID_FS))
-		printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
+			 "running e2fsck is recommended");
 	else if ((sbi->s_mount_state & EXT4_ERROR_FS))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: mounting fs with errors, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: mounting fs with errors, "
+			 "running e2fsck is recommended");
 	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
 		 le16_to_cpu(es->s_mnt_count) >=
 		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: maximal mount count reached, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: maximal mount count reached, "
+			 "running e2fsck is recommended");
 	else if (le32_to_cpu(es->s_checkinterval) &&
 		(le32_to_cpu(es->s_lastcheck) +
 		 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: checktime reached, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: checktime reached, "
+			 "running e2fsck is recommended");
 	if (!sbi->s_journal)
 		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
 	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
 		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
@@ -1592,10 +1636,10 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 	if (sbi->s_journal)
 		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 
-	ext4_commit_super(sb, es, 1);
+	ext4_commit_super(sb, 1);
 	if (test_opt(sb, DEBUG))
 		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
-				"bpg=%lu, ipg=%lu, mo=%04lx]\n",
+				"bpg=%lu, ipg=%lu, mo=%04x]\n",
 			sb->s_blocksize,
 			sbi->s_groups_count,
 			EXT4_BLOCKS_PER_GROUP(sb),
@@ -1603,11 +1647,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 			sbi->s_mount_opt);
 
 	if (EXT4_SB(sb)->s_journal) {
-		printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
-		       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
-		       "external", EXT4_SB(sb)->s_journal->j_devname);
+		ext4_msg(sb, KERN_INFO, "%s journal on %s",
+			 EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+			 "external", EXT4_SB(sb)->s_journal->j_devname);
 	} else {
-		printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
+		ext4_msg(sb, KERN_INFO, "no journal");
 	}
 	return res;
 }
@@ -1616,10 +1660,10 @@ static int ext4_fill_flex_info(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_desc *gdp = NULL;
-	struct buffer_head *bh;
 	ext4_group_t flex_group_count;
 	ext4_group_t flex_group;
 	int groups_per_flex = 0;
+	size_t size;
 	int i;
 
 	if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1634,16 +1678,21 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
 			((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
 			      EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
-	sbi->s_flex_groups = kzalloc(flex_group_count *
-				     sizeof(struct flex_groups), GFP_KERNEL);
+	size = flex_group_count * sizeof(struct flex_groups);
+	sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
+	if (sbi->s_flex_groups == NULL) {
+		sbi->s_flex_groups = vmalloc(size);
+		if (sbi->s_flex_groups)
+			memset(sbi->s_flex_groups, 0, size);
+	}
 	if (sbi->s_flex_groups == NULL) {
-		printk(KERN_ERR "EXT4-fs: not enough memory for "
-		       "%u flex groups\n", flex_group_count);
+		ext4_msg(sb, KERN_ERR, "not enough memory for "
+			 "%u flex groups", flex_group_count);
 		goto failed;
 	}
 
 	for (i = 0; i < sbi->s_groups_count; i++) {
-		gdp = ext4_get_group_desc(sb, i, &bh);
+		gdp = ext4_get_group_desc(sb, i, NULL);
 
 		flex_group = ext4_flex_group(sbi, i);
 		atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
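
This allocation pairs with the is_vmalloc_addr()-guarded free added to ext4_put_super() earlier in the patch: kzalloc() is tried first for a physically contiguous buffer, and vmalloc(), which does not zero, is the fallback when the flex-group array grows too large for the slab path. The combined pattern, condensed:

	void *buf = kzalloc(size, GFP_KERNEL);	/* fast path */
	if (buf == NULL) {
		buf = vmalloc(size);		/* large-allocation fallback */
		if (buf)
			memset(buf, 0, size);	/* vmalloc() does not zero */
	}
	/* ... use buf ... */
	if (is_vmalloc_addr(buf))	/* free with the matching routine */
		vfree(buf);
	else
		kfree(buf);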
@@ -1724,44 +1773,44 @@ static int ext4_check_descriptors(struct super_block *sb)
 
 		block_bitmap = ext4_block_bitmap(sb, gdp);
 		if (block_bitmap < first_block || block_bitmap > last_block) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
 			       "Block bitmap for group %u not in group "
-			       "(block %llu)!\n", i, block_bitmap);
+			       "(block %llu)!", i, block_bitmap);
 			return 0;
 		}
 		inode_bitmap = ext4_inode_bitmap(sb, gdp);
 		if (inode_bitmap < first_block || inode_bitmap > last_block) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
 			       "Inode bitmap for group %u not in group "
-			       "(block %llu)!\n", i, inode_bitmap);
+			       "(block %llu)!", i, inode_bitmap);
 			return 0;
 		}
 		inode_table = ext4_inode_table(sb, gdp);
 		if (inode_table < first_block ||
 		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
 			       "Inode table for group %u not in group "
-			       "(block %llu)!\n", i, inode_table);
+			       "(block %llu)!", i, inode_table);
 			return 0;
 		}
-		spin_lock(sb_bgl_lock(sbi, i));
+		ext4_lock_group(sb, i);
 		if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-			       "Checksum for group %u failed (%u!=%u)\n",
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+				 "Checksum for group %u failed (%u!=%u)",
 			       i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
 			            gdp)), le16_to_cpu(gdp->bg_checksum));
 			if (!(sb->s_flags & MS_RDONLY)) {
-				spin_unlock(sb_bgl_lock(sbi, i));
+				ext4_unlock_group(sb, i);
 				return 0;
 			}
 		}
-		spin_unlock(sb_bgl_lock(sbi, i));
+		ext4_unlock_group(sb, i);
 		if (!flexbg_flag)
 			first_block += EXT4_BLOCKS_PER_GROUP(sb);
 	}
 
 	ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
-	sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
+	sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
 	return 1;
 }
 
@@ -1796,8 +1845,8 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 	}
 
 	if (bdev_read_only(sb->s_bdev)) {
-		printk(KERN_ERR "EXT4-fs: write access "
-		       "unavailable, skipping orphan cleanup.\n");
+		ext4_msg(sb, KERN_ERR, "write access "
+			 "unavailable, skipping orphan cleanup");
 		return;
 	}
 
@@ -1811,8 +1860,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 	}
 
 	if (s_flags & MS_RDONLY) {
-		printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n",
-		       sb->s_id);
+		ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
 		sb->s_flags &= ~MS_RDONLY;
 	}
 #ifdef CONFIG_QUOTA
@@ -1823,9 +1871,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		if (EXT4_SB(sb)->s_qf_names[i]) {
 			int ret = ext4_quota_on_mount(sb, i);
 			if (ret < 0)
-				printk(KERN_ERR
-				       "EXT4-fs: Cannot turn on journaled "
-				       "quota: error %d\n", ret);
+				ext4_msg(sb, KERN_ERR,
+					 "Cannot turn on journaled "
+					 "quota: error %d", ret);
 		}
 	}
 #endif
@@ -1842,16 +1890,16 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
 		vfs_dq_init(inode);
 		if (inode->i_nlink) {
-			printk(KERN_DEBUG
-				"%s: truncating inode %lu to %lld bytes\n",
+			ext4_msg(sb, KERN_DEBUG,
+				"%s: truncating inode %lu to %lld bytes",
 				__func__, inode->i_ino, inode->i_size);
 			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
 				  inode->i_ino, inode->i_size);
 			ext4_truncate(inode);
 			nr_truncates++;
 		} else {
-			printk(KERN_DEBUG
-				"%s: deleting unreferenced inode %lu\n",
+			ext4_msg(sb, KERN_DEBUG,
+				"%s: deleting unreferenced inode %lu",
 				__func__, inode->i_ino);
 			jbd_debug(2, "deleting unreferenced inode %lu\n",
 				  inode->i_ino);
@@ -1863,11 +1911,11 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 #define PLURAL(x) (x), ((x) == 1) ? "" : "s"
 
 	if (nr_orphans)
-		printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
-		       sb->s_id, PLURAL(nr_orphans));
+		ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+			 PLURAL(nr_orphans));
 	if (nr_truncates)
-		printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n",
-		       sb->s_id, PLURAL(nr_truncates));
+		ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+			 PLURAL(nr_truncates));
 #ifdef CONFIG_QUOTA
 	/* Turn quotas off */
 	for (i = 0; i < MAXQUOTAS; i++) {
@@ -1877,6 +1925,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 #endif
 	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
 }
+
 /*
  * Maximal extent format file size.
  * Resulting logical blkno at s_maxbytes must fit in our on-disk
@@ -1894,7 +1943,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
 	/* small i_blocks in vfs inode? */
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * CONFIG_LBD is not enabled implies the inode
+		 * CONFIG_LBDAF is not enabled implies the inode
 		 * i_block represent total blocks in 512 bytes
 		 * 32 == size of vfs inode i_blocks * 8
 		 */
@@ -1927,19 +1976,19 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 	loff_t res = EXT4_NDIR_BLOCKS;
 	int meta_blocks;
 	loff_t upper_limit;
-	/* This is calculated to be the largest file size for a
-	 * dense, bitmapped file such that the total number of
-	 * sectors in the file, including data and all indirect blocks,
-	 * does not exceed 2^48 -1
-	 * __u32 i_blocks_lo and _u16 i_blocks_high representing the
-	 * total number of 512 bytes blocks of the file
+	/* This is calculated to be the largest file size for a dense, block
+	 * mapped file such that the file's total number of 512-byte sectors,
+	 * including data and all indirect blocks, does not exceed (2^48 - 1).
+	 *
+	 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
+	 * number of 512-byte sectors of the file.
	 */
 
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * !has_huge_files or CONFIG_LBD is not enabled
-		 * implies the inode i_block represent total blocks in
-		 * 512 bytes 32 == size of vfs inode i_blocks * 8
+		 * !has_huge_files or CONFIG_LBDAF not enabled implies that
+		 * the inode i_block field represents total file blocks in
+		 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
		 */
 		upper_limit = (1LL << 32) - 1;
 
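
The rewritten comment pins down the limit the code below enforces: i_blocks counts 512-byte sectors in 48 bits (or 32 bits without huge-file support). The implied byte ceilings, worked out for reference:

	/* 48-bit sector count: (2^48 - 1) * 512 ~= 2^57 bytes ~= 128 PiB
	 * 32-bit sector count: (2^32 - 1) * 512 ~= 2^41 bytes ~=   2 TiB */
	unsigned long long huge_limit  = ((1ULL << 48) - 1) * 512;
	unsigned long long small_limit = ((1ULL << 32) - 1) * 512;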
@@ -1981,7 +2030,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 }
 
 static ext4_fsblk_t descriptor_loc(struct super_block *sb,
-				ext4_fsblk_t logical_sb_block, int nr)
+				   ext4_fsblk_t logical_sb_block, int nr)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_group_t bg, first_meta_bg;
@@ -1995,6 +2044,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
 	bg = sbi->s_desc_per_block * nr;
 	if (ext4_bg_has_super(sb, bg))
 		has_super = 1;
+
 	return (has_super + ext4_group_first_block_no(sb, bg));
 }
 
@@ -2091,8 +2141,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 	if (parse_strtoul(buf, 0x40000000, &t))
 		return -EINVAL;
 
-	/* inode_readahead_blks must be a power of 2 */
-	if (t & (t-1))
+	if (!is_power_of_2(t))
 		return -EINVAL;
 
 	sbi->s_inode_readahead_blks = t;
@@ -2100,7 +2149,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 }
 
 static ssize_t sbi_ui_show(struct ext4_attr *a,
-				struct ext4_sb_info *sbi, char *buf)
+			   struct ext4_sb_info *sbi, char *buf)
 {
 	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
 
@@ -2141,6 +2190,7 @@ EXT4_RO_ATTR(session_write_kbytes);
 EXT4_RO_ATTR(lifetime_write_kbytes);
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
 		 inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
 EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
 EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
@@ -2153,6 +2203,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(session_write_kbytes),
 	ATTR_LIST(lifetime_write_kbytes),
 	ATTR_LIST(inode_readahead_blks),
+	ATTR_LIST(inode_goal),
 	ATTR_LIST(mb_stats),
 	ATTR_LIST(mb_max_to_scan),
 	ATTR_LIST(mb_min_to_scan),
@@ -2205,7 +2256,6 @@ static struct kobj_type ext4_ktype = {
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		__releases(kernel_lock)
 		__acquires(kernel_lock)
-
 {
 	struct buffer_head *bh;
 	struct ext4_super_block *es = NULL;
@@ -2256,7 +2306,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
 	if (!blocksize) {
-		printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
+		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
 		goto out_fail;
 	}
 
@@ -2272,7 +2322,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	if (!(bh = sb_bread(sb, logical_sb_block))) {
-		printk(KERN_ERR "EXT4-fs: unable to read superblock\n");
+		ext4_msg(sb, KERN_ERR, "unable to read superblock");
 		goto out_fail;
 	}
 	/*
@@ -2321,6 +2371,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
 	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
 	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+	sbi->s_mb_history_max = default_mb_history_length;
 
 	set_opt(sbi->s_mount_opt, BARRIER);
 
@@ -2330,7 +2381,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	set_opt(sbi->s_mount_opt, DELALLOC);
 
-
 	if (!parse_options((char *) data, sb, &journal_devnum,
 			   &journal_ioprio, NULL, 0))
 		goto failed_mount;
@@ -2342,9 +2392,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	    (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
 	     EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
 	     EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: feature flags set on rev 0 fs, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+			 "feature flags set on rev 0 fs, "
+			 "running e2fsck is recommended");
 
 	/*
 	 * Check feature flags regardless of the revision level, since we
@@ -2353,16 +2403,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
 	if (features) {
-		printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
-		       "unsupported optional features (%x).\n", sb->s_id,
+		ext4_msg(sb, KERN_ERR,
+			 "Couldn't mount because of "
+			 "unsupported optional features (%x)",
 			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
 			~EXT4_FEATURE_INCOMPAT_SUPP));
 		goto failed_mount;
 	}
 	features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
 	if (!(sb->s_flags & MS_RDONLY) && features) {
-		printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
-		       "unsupported optional features (%x).\n", sb->s_id,
+		ext4_msg(sb, KERN_ERR,
+			 "Couldn't mount RDWR because of "
+			 "unsupported optional features (%x)",
 			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
 			~EXT4_FEATURE_RO_COMPAT_SUPP));
 		goto failed_mount;
@@ -2372,13 +2424,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (has_huge_files) {
 		/*
 		 * Large file size enabled file system can only be
-		 * mount if kernel is build with CONFIG_LBD
+		 * mount if kernel is build with CONFIG_LBDAF
		 */
 		if (sizeof(root->i_blocks) < sizeof(u64) &&
 				!(sb->s_flags & MS_RDONLY)) {
-			printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
-			       "files cannot be mounted read-write "
-			       "without CONFIG_LBD.\n", sb->s_id);
+			ext4_msg(sb, KERN_ERR, "Filesystem with huge "
+				 "files cannot be mounted read-write "
+				 "without CONFIG_LBDAF");
 			goto failed_mount;
 		}
 	}
@@ -2386,17 +2438,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
 	    blocksize > EXT4_MAX_BLOCK_SIZE) {
-		printk(KERN_ERR
-		       "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n",
-		       blocksize, sb->s_id);
+		ext4_msg(sb, KERN_ERR,
			 "Unsupported filesystem blocksize %d", blocksize);
 		goto failed_mount;
 	}
 
 	if (sb->s_blocksize != blocksize) {
-
 		/* Validate the filesystem blocksize */
 		if (!sb_set_blocksize(sb, blocksize)) {
-			printk(KERN_ERR "EXT4-fs: bad block size %d.\n",
+			ext4_msg(sb, KERN_ERR, "bad block size %d",
 				blocksize);
 			goto failed_mount;
 		}
@@ -2406,15 +2456,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		offset = do_div(logical_sb_block, blocksize);
 		bh = sb_bread(sb, logical_sb_block);
 		if (!bh) {
-			printk(KERN_ERR
-			       "EXT4-fs: Can't read superblock on 2nd try.\n");
+			ext4_msg(sb, KERN_ERR,
+				 "Can't read superblock on 2nd try");
 			goto failed_mount;
 		}
 		es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
 		sbi->s_es = es;
 		if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
-			printk(KERN_ERR
-			       "EXT4-fs: Magic mismatch, very weird !\n");
+			ext4_msg(sb, KERN_ERR,
+				 "Magic mismatch, very weird!");
 			goto failed_mount;
 		}
 	}
@@ -2432,30 +2482,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
 		    (!is_power_of_2(sbi->s_inode_size)) ||
 		    (sbi->s_inode_size > blocksize)) {
-			printk(KERN_ERR
-			       "EXT4-fs: unsupported inode size: %d\n",
+			ext4_msg(sb, KERN_ERR,
+				 "unsupported inode size: %d",
 				sbi->s_inode_size);
 			goto failed_mount;
 		}
 		if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
 			sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
 	}
+
 	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
2445 if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || 2496 if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
2446 sbi->s_desc_size > EXT4_MAX_DESC_SIZE || 2497 sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
2447 !is_power_of_2(sbi->s_desc_size)) { 2498 !is_power_of_2(sbi->s_desc_size)) {
2448 printk(KERN_ERR 2499 ext4_msg(sb, KERN_ERR,
2449 "EXT4-fs: unsupported descriptor size %lu\n", 2500 "unsupported descriptor size %lu",
2450 sbi->s_desc_size); 2501 sbi->s_desc_size);
2451 goto failed_mount; 2502 goto failed_mount;
2452 } 2503 }
2453 } else 2504 } else
2454 sbi->s_desc_size = EXT4_MIN_DESC_SIZE; 2505 sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
2506
2455 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); 2507 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
2456 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); 2508 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
2457 if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) 2509 if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
2458 goto cantfind_ext4; 2510 goto cantfind_ext4;
2511
2459 sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); 2512 sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
2460 if (sbi->s_inodes_per_block == 0) 2513 if (sbi->s_inodes_per_block == 0)
2461 goto cantfind_ext4; 2514 goto cantfind_ext4;
@@ -2466,6 +2519,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2466 sbi->s_mount_state = le16_to_cpu(es->s_state); 2519 sbi->s_mount_state = le16_to_cpu(es->s_state);
2467 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); 2520 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
2468 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); 2521 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
2522
2469 for (i = 0; i < 4; i++) 2523 for (i = 0; i < 4; i++)
2470 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 2524 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
2471 sbi->s_def_hash_version = es->s_def_hash_version; 2525 sbi->s_def_hash_version = es->s_def_hash_version;
@@ -2483,25 +2537,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2483 } 2537 }
2484 2538
2485 if (sbi->s_blocks_per_group > blocksize * 8) { 2539 if (sbi->s_blocks_per_group > blocksize * 8) {
2486 printk(KERN_ERR 2540 ext4_msg(sb, KERN_ERR,
2487 "EXT4-fs: #blocks per group too big: %lu\n", 2541 "#blocks per group too big: %lu",
2488 sbi->s_blocks_per_group); 2542 sbi->s_blocks_per_group);
2489 goto failed_mount; 2543 goto failed_mount;
2490 } 2544 }
2491 if (sbi->s_inodes_per_group > blocksize * 8) { 2545 if (sbi->s_inodes_per_group > blocksize * 8) {
2492 printk(KERN_ERR 2546 ext4_msg(sb, KERN_ERR,
2493 "EXT4-fs: #inodes per group too big: %lu\n", 2547 "#inodes per group too big: %lu",
2494 sbi->s_inodes_per_group); 2548 sbi->s_inodes_per_group);
2495 goto failed_mount; 2549 goto failed_mount;
2496 } 2550 }
2497 2551
2498 if (ext4_blocks_count(es) > 2552 if (ext4_blocks_count(es) >
2499 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 2553 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
2500 printk(KERN_ERR "EXT4-fs: filesystem on %s:" 2554 ext4_msg(sb, KERN_ERR, "filesystem"
2501 " too large to mount safely\n", sb->s_id); 2555 " too large to mount safely");
2502 if (sizeof(sector_t) < 8) 2556 if (sizeof(sector_t) < 8)
2503 printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not " 2557 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
2504 "enabled\n");
2505 goto failed_mount; 2558 goto failed_mount;
2506 } 2559 }
2507 2560
@@ -2511,21 +2564,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2511 /* check blocks count against device size */ 2564 /* check blocks count against device size */
2512 blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; 2565 blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
2513 if (blocks_count && ext4_blocks_count(es) > blocks_count) { 2566 if (blocks_count && ext4_blocks_count(es) > blocks_count) {
2514 printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu " 2567 ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
2515 "exceeds size of device (%llu blocks)\n", 2568 "exceeds size of device (%llu blocks)",
2516 ext4_blocks_count(es), blocks_count); 2569 ext4_blocks_count(es), blocks_count);
2517 goto failed_mount; 2570 goto failed_mount;
2518 } 2571 }
2519 2572
2520 /* 2573 /*
2521 * It makes no sense for the first data block to be beyond the end 2574 * It makes no sense for the first data block to be beyond the end
2522 * of the filesystem. 2575 * of the filesystem.
2523 */ 2576 */
2524 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { 2577 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
2525 printk(KERN_WARNING "EXT4-fs: bad geometry: first data " 2578 ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
2526 "block %u is beyond end of filesystem (%llu)\n", 2579 "block %u is beyond end of filesystem (%llu)",
2527 le32_to_cpu(es->s_first_data_block), 2580 le32_to_cpu(es->s_first_data_block),
2528 ext4_blocks_count(es)); 2581 ext4_blocks_count(es));
2529 goto failed_mount; 2582 goto failed_mount;
2530 } 2583 }
2531 blocks_count = (ext4_blocks_count(es) - 2584 blocks_count = (ext4_blocks_count(es) -
@@ -2533,9 +2586,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2533 EXT4_BLOCKS_PER_GROUP(sb) - 1); 2586 EXT4_BLOCKS_PER_GROUP(sb) - 1);
2534 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); 2587 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
2535 if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { 2588 if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
2536 printk(KERN_WARNING "EXT4-fs: groups count too large: %u " 2589 ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
2537 "(block count %llu, first data block %u, " 2590 "(block count %llu, first data block %u, "
2538 "blocks per group %lu)\n", sbi->s_groups_count, 2591 "blocks per group %lu)", sbi->s_groups_count,
2539 ext4_blocks_count(es), 2592 ext4_blocks_count(es),
2540 le32_to_cpu(es->s_first_data_block), 2593 le32_to_cpu(es->s_first_data_block),
2541 EXT4_BLOCKS_PER_GROUP(sb)); 2594 EXT4_BLOCKS_PER_GROUP(sb));
@@ -2547,7 +2600,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2547 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), 2600 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
2548 GFP_KERNEL); 2601 GFP_KERNEL);
2549 if (sbi->s_group_desc == NULL) { 2602 if (sbi->s_group_desc == NULL) {
2550 printk(KERN_ERR "EXT4-fs: not enough memory\n"); 2603 ext4_msg(sb, KERN_ERR, "not enough memory");
2551 goto failed_mount; 2604 goto failed_mount;
2552 } 2605 }
2553 2606
@@ -2562,21 +2615,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2562 block = descriptor_loc(sb, logical_sb_block, i); 2615 block = descriptor_loc(sb, logical_sb_block, i);
2563 sbi->s_group_desc[i] = sb_bread(sb, block); 2616 sbi->s_group_desc[i] = sb_bread(sb, block);
2564 if (!sbi->s_group_desc[i]) { 2617 if (!sbi->s_group_desc[i]) {
2565 printk(KERN_ERR "EXT4-fs: " 2618 ext4_msg(sb, KERN_ERR,
2566 "can't read group descriptor %d\n", i); 2619 "can't read group descriptor %d", i);
2567 db_count = i; 2620 db_count = i;
2568 goto failed_mount2; 2621 goto failed_mount2;
2569 } 2622 }
2570 } 2623 }
2571 if (!ext4_check_descriptors(sb)) { 2624 if (!ext4_check_descriptors(sb)) {
2572 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); 2625 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
2573 goto failed_mount2; 2626 goto failed_mount2;
2574 } 2627 }
2575 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 2628 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
2576 if (!ext4_fill_flex_info(sb)) { 2629 if (!ext4_fill_flex_info(sb)) {
2577 printk(KERN_ERR 2630 ext4_msg(sb, KERN_ERR,
2578 "EXT4-fs: unable to initialize " 2631 "unable to initialize "
2579 "flex_bg meta info!\n"); 2632 "flex_bg meta info!");
2580 goto failed_mount2; 2633 goto failed_mount2;
2581 } 2634 }
2582 2635
@@ -2598,7 +2651,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2598 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); 2651 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2599 } 2652 }
2600 if (err) { 2653 if (err) {
2601 printk(KERN_ERR "EXT4-fs: insufficient memory\n"); 2654 ext4_msg(sb, KERN_ERR, "insufficient memory");
2602 goto failed_mount3; 2655 goto failed_mount3;
2603 } 2656 }
2604 2657
@@ -2607,7 +2660,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2607 /* 2660 /*
2608 * set up enough so that it can read an inode 2661 * set up enough so that it can read an inode
2609 */ 2662 */
2610 sb->s_op = &ext4_sops; 2663 if (!test_opt(sb, NOLOAD) &&
2664 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
2665 sb->s_op = &ext4_sops;
2666 else
2667 sb->s_op = &ext4_nojournal_sops;
2611 sb->s_export_op = &ext4_export_ops; 2668 sb->s_export_op = &ext4_export_ops;
2612 sb->s_xattr = ext4_xattr_handlers; 2669 sb->s_xattr = ext4_xattr_handlers;
2613#ifdef CONFIG_QUOTA 2670#ifdef CONFIG_QUOTA
@@ -2615,6 +2672,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2615 sb->dq_op = &ext4_quota_operations; 2672 sb->dq_op = &ext4_quota_operations;
2616#endif 2673#endif
2617 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 2674 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
2675 mutex_init(&sbi->s_orphan_lock);
2676 mutex_init(&sbi->s_resize_lock);
2618 2677
2619 sb->s_root = NULL; 2678 sb->s_root = NULL;
2620 2679
@@ -2632,13 +2691,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2632 goto failed_mount3; 2691 goto failed_mount3;
2633 if (!(sb->s_flags & MS_RDONLY) && 2692 if (!(sb->s_flags & MS_RDONLY) &&
2634 EXT4_SB(sb)->s_journal->j_failed_commit) { 2693 EXT4_SB(sb)->s_journal->j_failed_commit) {
2635 printk(KERN_CRIT "EXT4-fs error (device %s): " 2694 ext4_msg(sb, KERN_CRIT, "error: "
2636 "ext4_fill_super: Journal transaction " 2695 "ext4_fill_super: Journal transaction "
2637 "%u is corrupt\n", sb->s_id, 2696 "%u is corrupt",
2638 EXT4_SB(sb)->s_journal->j_failed_commit); 2697 EXT4_SB(sb)->s_journal->j_failed_commit);
2639 if (test_opt(sb, ERRORS_RO)) { 2698 if (test_opt(sb, ERRORS_RO)) {
2640 printk(KERN_CRIT 2699 ext4_msg(sb, KERN_CRIT,
2641 "Mounting filesystem read-only\n"); 2700 "Mounting filesystem read-only");
2642 sb->s_flags |= MS_RDONLY; 2701 sb->s_flags |= MS_RDONLY;
2643 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 2702 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2644 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 2703 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -2646,14 +2705,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2646 if (test_opt(sb, ERRORS_PANIC)) { 2705 if (test_opt(sb, ERRORS_PANIC)) {
2647 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 2706 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2648 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 2707 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2649 ext4_commit_super(sb, es, 1); 2708 ext4_commit_super(sb, 1);
2650 goto failed_mount4; 2709 goto failed_mount4;
2651 } 2710 }
2652 } 2711 }
2653 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && 2712 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
2654 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 2713 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2655 printk(KERN_ERR "EXT4-fs: required journal recovery " 2714 ext4_msg(sb, KERN_ERR, "required journal recovery "
2656 "suppressed and not mounted read-only\n"); 2715 "suppressed and not mounted read-only");
2657 goto failed_mount4; 2716 goto failed_mount4;
2658 } else { 2717 } else {
2659 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 2718 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
@@ -2666,7 +2725,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2666 if (ext4_blocks_count(es) > 0xffffffffULL && 2725 if (ext4_blocks_count(es) > 0xffffffffULL &&
2667 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 2726 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
2668 JBD2_FEATURE_INCOMPAT_64BIT)) { 2727 JBD2_FEATURE_INCOMPAT_64BIT)) {
2669 printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n"); 2728 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
2670 goto failed_mount4; 2729 goto failed_mount4;
2671 } 2730 }
2672 2731
@@ -2704,8 +2763,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2704 case EXT4_MOUNT_WRITEBACK_DATA: 2763 case EXT4_MOUNT_WRITEBACK_DATA:
2705 if (!jbd2_journal_check_available_features 2764 if (!jbd2_journal_check_available_features
2706 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { 2765 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
2707 printk(KERN_ERR "EXT4-fs: Journal does not support " 2766 ext4_msg(sb, KERN_ERR, "Journal does not support "
2708 "requested data journaling mode\n"); 2767 "requested data journaling mode");
2709 goto failed_mount4; 2768 goto failed_mount4;
2710 } 2769 }
2711 default: 2770 default:
@@ -2717,8 +2776,8 @@ no_journal:
2717 2776
2718 if (test_opt(sb, NOBH)) { 2777 if (test_opt(sb, NOBH)) {
2719 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2778 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2720 printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - " 2779 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
2721 "its supported only with writeback mode\n"); 2780 "its supported only with writeback mode");
2722 clear_opt(sbi->s_mount_opt, NOBH); 2781 clear_opt(sbi->s_mount_opt, NOBH);
2723 } 2782 }
2724 } 2783 }
@@ -2729,18 +2788,18 @@ no_journal:
2729 2788
2730 root = ext4_iget(sb, EXT4_ROOT_INO); 2789 root = ext4_iget(sb, EXT4_ROOT_INO);
2731 if (IS_ERR(root)) { 2790 if (IS_ERR(root)) {
2732 printk(KERN_ERR "EXT4-fs: get root inode failed\n"); 2791 ext4_msg(sb, KERN_ERR, "get root inode failed");
2733 ret = PTR_ERR(root); 2792 ret = PTR_ERR(root);
2734 goto failed_mount4; 2793 goto failed_mount4;
2735 } 2794 }
2736 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 2795 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
2737 iput(root); 2796 iput(root);
2738 printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n"); 2797 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
2739 goto failed_mount4; 2798 goto failed_mount4;
2740 } 2799 }
2741 sb->s_root = d_alloc_root(root); 2800 sb->s_root = d_alloc_root(root);
2742 if (!sb->s_root) { 2801 if (!sb->s_root) {
2743 printk(KERN_ERR "EXT4-fs: get root dentry failed\n"); 2802 ext4_msg(sb, KERN_ERR, "get root dentry failed");
2744 iput(root); 2803 iput(root);
2745 ret = -ENOMEM; 2804 ret = -ENOMEM;
2746 goto failed_mount4; 2805 goto failed_mount4;
@@ -2769,22 +2828,29 @@ no_journal:
2769 sbi->s_inode_size) { 2828 sbi->s_inode_size) {
2770 sbi->s_want_extra_isize = sizeof(struct ext4_inode) - 2829 sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
2771 EXT4_GOOD_OLD_INODE_SIZE; 2830 EXT4_GOOD_OLD_INODE_SIZE;
2772 printk(KERN_INFO "EXT4-fs: required extra inode space not " 2831 ext4_msg(sb, KERN_INFO, "required extra inode space not "
2773 "available.\n"); 2832 "available");
2774 } 2833 }
2775 2834
2776 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 2835 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
2777 printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " 2836 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
2778 "requested data journaling mode\n"); 2837 "requested data journaling mode");
2779 clear_opt(sbi->s_mount_opt, DELALLOC); 2838 clear_opt(sbi->s_mount_opt, DELALLOC);
2780 } else if (test_opt(sb, DELALLOC)) 2839 } else if (test_opt(sb, DELALLOC))
2781 printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); 2840 ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
2841
2842 err = ext4_setup_system_zone(sb);
2843 if (err) {
2844 ext4_msg(sb, KERN_ERR, "failed to initialize system "
2845 "zone (%d)\n", err);
2846 goto failed_mount4;
2847 }
2782 2848
2783 ext4_ext_init(sb); 2849 ext4_ext_init(sb);
2784 err = ext4_mb_init(sb, needs_recovery); 2850 err = ext4_mb_init(sb, needs_recovery);
2785 if (err) { 2851 if (err) {
2786 printk(KERN_ERR "EXT4-fs: failed to initialize mballoc (%d)\n", 2852 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
2787 err); 2853 err);
2788 goto failed_mount4; 2854 goto failed_mount4;
2789 } 2855 }
2790 2856
@@ -2798,19 +2864,11 @@ no_journal:
2798 goto failed_mount4; 2864 goto failed_mount4;
2799 }; 2865 };
2800 2866
2801 /*
2802 * akpm: core read_super() calls in here with the superblock locked.
2803 * That deadlocks, because orphan cleanup needs to lock the superblock
2804 * in numerous places. Here we just pop the lock - it's relatively
2805 * harmless, because we are now ready to accept write_super() requests,
2806 * and aviro says that's the only reason for hanging onto the
2807 * superblock lock.
2808 */
2809 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; 2867 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
2810 ext4_orphan_cleanup(sb, es); 2868 ext4_orphan_cleanup(sb, es);
2811 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; 2869 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
2812 if (needs_recovery) { 2870 if (needs_recovery) {
2813 printk(KERN_INFO "EXT4-fs: recovery complete.\n"); 2871 ext4_msg(sb, KERN_INFO, "recovery complete");
2814 ext4_mark_recovery_complete(sb, es); 2872 ext4_mark_recovery_complete(sb, es);
2815 } 2873 }
2816 if (EXT4_SB(sb)->s_journal) { 2874 if (EXT4_SB(sb)->s_journal) {
@@ -2823,25 +2881,30 @@ no_journal:
2823 } else 2881 } else
2824 descr = "out journal"; 2882 descr = "out journal";
2825 2883
2826 printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n", 2884 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
2827 sb->s_id, descr);
2828 2885
2829 lock_kernel(); 2886 lock_kernel();
2830 return 0; 2887 return 0;
2831 2888
2832cantfind_ext4: 2889cantfind_ext4:
2833 if (!silent) 2890 if (!silent)
2834 printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n", 2891 ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
2835 sb->s_id);
2836 goto failed_mount; 2892 goto failed_mount;
2837 2893
2838failed_mount4: 2894failed_mount4:
2839 printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id); 2895 ext4_msg(sb, KERN_ERR, "mount failed");
2896 ext4_release_system_zone(sb);
2840 if (sbi->s_journal) { 2897 if (sbi->s_journal) {
2841 jbd2_journal_destroy(sbi->s_journal); 2898 jbd2_journal_destroy(sbi->s_journal);
2842 sbi->s_journal = NULL; 2899 sbi->s_journal = NULL;
2843 } 2900 }
2844failed_mount3: 2901failed_mount3:
2902 if (sbi->s_flex_groups) {
2903 if (is_vmalloc_addr(sbi->s_flex_groups))
2904 vfree(sbi->s_flex_groups);
2905 else
2906 kfree(sbi->s_flex_groups);
2907 }
2845 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2908 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2846 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2909 percpu_counter_destroy(&sbi->s_freeinodes_counter);
2847 percpu_counter_destroy(&sbi->s_dirs_counter); 2910 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -2862,6 +2925,7 @@ failed_mount:
2862 brelse(bh); 2925 brelse(bh);
2863out_fail: 2926out_fail:
2864 sb->s_fs_info = NULL; 2927 sb->s_fs_info = NULL;
2928 kfree(sbi->s_blockgroup_lock);
2865 kfree(sbi); 2929 kfree(sbi);
2866 lock_kernel(); 2930 lock_kernel();
2867 return ret; 2931 return ret;
@@ -2906,27 +2970,27 @@ static journal_t *ext4_get_journal(struct super_block *sb,
2906 2970
2907 journal_inode = ext4_iget(sb, journal_inum); 2971 journal_inode = ext4_iget(sb, journal_inum);
2908 if (IS_ERR(journal_inode)) { 2972 if (IS_ERR(journal_inode)) {
2909 printk(KERN_ERR "EXT4-fs: no journal found.\n"); 2973 ext4_msg(sb, KERN_ERR, "no journal found");
2910 return NULL; 2974 return NULL;
2911 } 2975 }
2912 if (!journal_inode->i_nlink) { 2976 if (!journal_inode->i_nlink) {
2913 make_bad_inode(journal_inode); 2977 make_bad_inode(journal_inode);
2914 iput(journal_inode); 2978 iput(journal_inode);
2915 printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n"); 2979 ext4_msg(sb, KERN_ERR, "journal inode is deleted");
2916 return NULL; 2980 return NULL;
2917 } 2981 }
2918 2982
2919 jbd_debug(2, "Journal inode found at %p: %lld bytes\n", 2983 jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
2920 journal_inode, journal_inode->i_size); 2984 journal_inode, journal_inode->i_size);
2921 if (!S_ISREG(journal_inode->i_mode)) { 2985 if (!S_ISREG(journal_inode->i_mode)) {
2922 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n"); 2986 ext4_msg(sb, KERN_ERR, "invalid journal inode");
2923 iput(journal_inode); 2987 iput(journal_inode);
2924 return NULL; 2988 return NULL;
2925 } 2989 }
2926 2990
2927 journal = jbd2_journal_init_inode(journal_inode); 2991 journal = jbd2_journal_init_inode(journal_inode);
2928 if (!journal) { 2992 if (!journal) {
2929 printk(KERN_ERR "EXT4-fs: Could not load journal inode\n"); 2993 ext4_msg(sb, KERN_ERR, "Could not load journal inode");
2930 iput(journal_inode); 2994 iput(journal_inode);
2931 return NULL; 2995 return NULL;
2932 } 2996 }
@@ -2950,22 +3014,22 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
2950 3014
2951 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); 3015 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2952 3016
2953 bdev = ext4_blkdev_get(j_dev); 3017 bdev = ext4_blkdev_get(j_dev, sb);
2954 if (bdev == NULL) 3018 if (bdev == NULL)
2955 return NULL; 3019 return NULL;
2956 3020
2957 if (bd_claim(bdev, sb)) { 3021 if (bd_claim(bdev, sb)) {
2958 printk(KERN_ERR 3022 ext4_msg(sb, KERN_ERR,
2959 "EXT4-fs: failed to claim external journal device.\n"); 3023 "failed to claim external journal device");
2960 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 3024 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2961 return NULL; 3025 return NULL;
2962 } 3026 }
2963 3027
2964 blocksize = sb->s_blocksize; 3028 blocksize = sb->s_blocksize;
2965 hblock = bdev_hardsect_size(bdev); 3029 hblock = bdev_logical_block_size(bdev);
2966 if (blocksize < hblock) { 3030 if (blocksize < hblock) {
2967 printk(KERN_ERR 3031 ext4_msg(sb, KERN_ERR,
2968 "EXT4-fs: blocksize too small for journal device.\n"); 3032 "blocksize too small for journal device");
2969 goto out_bdev; 3033 goto out_bdev;
2970 } 3034 }
2971 3035
@@ -2973,8 +3037,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
2973 offset = EXT4_MIN_BLOCK_SIZE % blocksize; 3037 offset = EXT4_MIN_BLOCK_SIZE % blocksize;
2974 set_blocksize(bdev, blocksize); 3038 set_blocksize(bdev, blocksize);
2975 if (!(bh = __bread(bdev, sb_block, blocksize))) { 3039 if (!(bh = __bread(bdev, sb_block, blocksize))) {
2976 printk(KERN_ERR "EXT4-fs: couldn't read superblock of " 3040 ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
2977 "external journal\n"); 3041 "external journal");
2978 goto out_bdev; 3042 goto out_bdev;
2979 } 3043 }
2980 3044
@@ -2982,14 +3046,14 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
2982 if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || 3046 if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
2983 !(le32_to_cpu(es->s_feature_incompat) & 3047 !(le32_to_cpu(es->s_feature_incompat) &
2984 EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { 3048 EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
2985 printk(KERN_ERR "EXT4-fs: external journal has " 3049 ext4_msg(sb, KERN_ERR, "external journal has "
2986 "bad superblock\n"); 3050 "bad superblock");
2987 brelse(bh); 3051 brelse(bh);
2988 goto out_bdev; 3052 goto out_bdev;
2989 } 3053 }
2990 3054
2991 if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { 3055 if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
2992 printk(KERN_ERR "EXT4-fs: journal UUID does not match\n"); 3056 ext4_msg(sb, KERN_ERR, "journal UUID does not match");
2993 brelse(bh); 3057 brelse(bh);
2994 goto out_bdev; 3058 goto out_bdev;
2995 } 3059 }
@@ -3001,25 +3065,26 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
3001 journal = jbd2_journal_init_dev(bdev, sb->s_bdev, 3065 journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
3002 start, len, blocksize); 3066 start, len, blocksize);
3003 if (!journal) { 3067 if (!journal) {
3004 printk(KERN_ERR "EXT4-fs: failed to create device journal\n"); 3068 ext4_msg(sb, KERN_ERR, "failed to create device journal");
3005 goto out_bdev; 3069 goto out_bdev;
3006 } 3070 }
3007 journal->j_private = sb; 3071 journal->j_private = sb;
3008 ll_rw_block(READ, 1, &journal->j_sb_buffer); 3072 ll_rw_block(READ, 1, &journal->j_sb_buffer);
3009 wait_on_buffer(journal->j_sb_buffer); 3073 wait_on_buffer(journal->j_sb_buffer);
3010 if (!buffer_uptodate(journal->j_sb_buffer)) { 3074 if (!buffer_uptodate(journal->j_sb_buffer)) {
3011 printk(KERN_ERR "EXT4-fs: I/O error on journal device\n"); 3075 ext4_msg(sb, KERN_ERR, "I/O error on journal device");
3012 goto out_journal; 3076 goto out_journal;
3013 } 3077 }
3014 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { 3078 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
3015 printk(KERN_ERR "EXT4-fs: External journal has more than one " 3079 ext4_msg(sb, KERN_ERR, "External journal has more than one "
3016 "user (unsupported) - %d\n", 3080 "user (unsupported) - %d",
3017 be32_to_cpu(journal->j_superblock->s_nr_users)); 3081 be32_to_cpu(journal->j_superblock->s_nr_users));
3018 goto out_journal; 3082 goto out_journal;
3019 } 3083 }
3020 EXT4_SB(sb)->journal_bdev = bdev; 3084 EXT4_SB(sb)->journal_bdev = bdev;
3021 ext4_init_journal_params(sb, journal); 3085 ext4_init_journal_params(sb, journal);
3022 return journal; 3086 return journal;
3087
3023out_journal: 3088out_journal:
3024 jbd2_journal_destroy(journal); 3089 jbd2_journal_destroy(journal);
3025out_bdev: 3090out_bdev:
@@ -3041,8 +3106,8 @@ static int ext4_load_journal(struct super_block *sb,
3041 3106
3042 if (journal_devnum && 3107 if (journal_devnum &&
3043 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 3108 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
3044 printk(KERN_INFO "EXT4-fs: external journal device major/minor " 3109 ext4_msg(sb, KERN_INFO, "external journal device major/minor "
3045 "numbers have changed\n"); 3110 "numbers have changed");
3046 journal_dev = new_decode_dev(journal_devnum); 3111 journal_dev = new_decode_dev(journal_devnum);
3047 } else 3112 } else
3048 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); 3113 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
@@ -3054,24 +3119,23 @@ static int ext4_load_journal(struct super_block *sb,
3054 * crash? For recovery, we need to check in advance whether we 3119 * crash? For recovery, we need to check in advance whether we
3055 * can get read-write access to the device. 3120 * can get read-write access to the device.
3056 */ 3121 */
3057
3058 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 3122 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
3059 if (sb->s_flags & MS_RDONLY) { 3123 if (sb->s_flags & MS_RDONLY) {
3060 printk(KERN_INFO "EXT4-fs: INFO: recovery " 3124 ext4_msg(sb, KERN_INFO, "INFO: recovery "
3061 "required on readonly filesystem.\n"); 3125 "required on readonly filesystem");
3062 if (really_read_only) { 3126 if (really_read_only) {
3063 printk(KERN_ERR "EXT4-fs: write access " 3127 ext4_msg(sb, KERN_ERR, "write access "
3064 "unavailable, cannot proceed.\n"); 3128 "unavailable, cannot proceed");
3065 return -EROFS; 3129 return -EROFS;
3066 } 3130 }
3067 printk(KERN_INFO "EXT4-fs: write access will " 3131 ext4_msg(sb, KERN_INFO, "write access will "
3068 "be enabled during recovery.\n"); 3132 "be enabled during recovery");
3069 } 3133 }
3070 } 3134 }
3071 3135
3072 if (journal_inum && journal_dev) { 3136 if (journal_inum && journal_dev) {
3073 printk(KERN_ERR "EXT4-fs: filesystem has both journal " 3137 ext4_msg(sb, KERN_ERR, "filesystem has both journal "
3074 "and inode journals!\n"); 3138 "and inode journals!");
3075 return -EINVAL; 3139 return -EINVAL;
3076 } 3140 }
3077 3141
@@ -3084,14 +3148,14 @@ static int ext4_load_journal(struct super_block *sb,
3084 } 3148 }
3085 3149
3086 if (journal->j_flags & JBD2_BARRIER) 3150 if (journal->j_flags & JBD2_BARRIER)
3087 printk(KERN_INFO "EXT4-fs: barriers enabled\n"); 3151 ext4_msg(sb, KERN_INFO, "barriers enabled");
3088 else 3152 else
3089 printk(KERN_INFO "EXT4-fs: barriers disabled\n"); 3153 ext4_msg(sb, KERN_INFO, "barriers disabled");
3090 3154
3091 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 3155 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
3092 err = jbd2_journal_update_format(journal); 3156 err = jbd2_journal_update_format(journal);
3093 if (err) { 3157 if (err) {
3094 printk(KERN_ERR "EXT4-fs: error updating journal.\n"); 3158 ext4_msg(sb, KERN_ERR, "error updating journal");
3095 jbd2_journal_destroy(journal); 3159 jbd2_journal_destroy(journal);
3096 return err; 3160 return err;
3097 } 3161 }
@@ -3103,7 +3167,7 @@ static int ext4_load_journal(struct super_block *sb,
3103 err = jbd2_journal_load(journal); 3167 err = jbd2_journal_load(journal);
3104 3168
3105 if (err) { 3169 if (err) {
3106 printk(KERN_ERR "EXT4-fs: error loading journal.\n"); 3170 ext4_msg(sb, KERN_ERR, "error loading journal");
3107 jbd2_journal_destroy(journal); 3171 jbd2_journal_destroy(journal);
3108 return err; 3172 return err;
3109 } 3173 }
@@ -3114,18 +3178,17 @@ static int ext4_load_journal(struct super_block *sb,
3114 if (journal_devnum && 3178 if (journal_devnum &&
3115 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 3179 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
3116 es->s_journal_dev = cpu_to_le32(journal_devnum); 3180 es->s_journal_dev = cpu_to_le32(journal_devnum);
3117 sb->s_dirt = 1;
3118 3181
3119 /* Make sure we flush the recovery flag to disk. */ 3182 /* Make sure we flush the recovery flag to disk. */
3120 ext4_commit_super(sb, es, 1); 3183 ext4_commit_super(sb, 1);
3121 } 3184 }
3122 3185
3123 return 0; 3186 return 0;
3124} 3187}
3125 3188
3126static int ext4_commit_super(struct super_block *sb, 3189static int ext4_commit_super(struct super_block *sb, int sync)
3127 struct ext4_super_block *es, int sync)
3128{ 3190{
3191 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
3129 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 3192 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
3130 int error = 0; 3193 int error = 0;
3131 3194
@@ -3140,8 +3203,8 @@ static int ext4_commit_super(struct super_block *sb,
3140 * be remapped. Nothing we can do but to retry the 3203 * be remapped. Nothing we can do but to retry the
3141 * write and hope for the best. 3204 * write and hope for the best.
3142 */ 3205 */
3143 printk(KERN_ERR "EXT4-fs: previous I/O error to " 3206 ext4_msg(sb, KERN_ERR, "previous I/O error to "
3144 "superblock detected for %s.\n", sb->s_id); 3207 "superblock detected");
3145 clear_buffer_write_io_error(sbh); 3208 clear_buffer_write_io_error(sbh);
3146 set_buffer_uptodate(sbh); 3209 set_buffer_uptodate(sbh);
3147 } 3210 }
@@ -3154,7 +3217,7 @@ static int ext4_commit_super(struct super_block *sb,
3154 &EXT4_SB(sb)->s_freeblocks_counter)); 3217 &EXT4_SB(sb)->s_freeblocks_counter));
3155 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 3218 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
3156 &EXT4_SB(sb)->s_freeinodes_counter)); 3219 &EXT4_SB(sb)->s_freeinodes_counter));
3157 3220 sb->s_dirt = 0;
3158 BUFFER_TRACE(sbh, "marking dirty"); 3221 BUFFER_TRACE(sbh, "marking dirty");
3159 mark_buffer_dirty(sbh); 3222 mark_buffer_dirty(sbh);
3160 if (sync) { 3223 if (sync) {
@@ -3164,8 +3227,8 @@ static int ext4_commit_super(struct super_block *sb,
3164 3227
3165 error = buffer_write_io_error(sbh); 3228 error = buffer_write_io_error(sbh);
3166 if (error) { 3229 if (error) {
3167 printk(KERN_ERR "EXT4-fs: I/O error while writing " 3230 ext4_msg(sb, KERN_ERR, "I/O error while writing "
3168 "superblock for %s.\n", sb->s_id); 3231 "superblock");
3169 clear_buffer_write_io_error(sbh); 3232 clear_buffer_write_io_error(sbh);
3170 set_buffer_uptodate(sbh); 3233 set_buffer_uptodate(sbh);
3171 } 3234 }
@@ -3173,7 +3236,6 @@ static int ext4_commit_super(struct super_block *sb,
3173 return error; 3236 return error;
3174} 3237}
3175 3238
3176
3177/* 3239/*
3178 * Have we just finished recovery? If so, and if we are mounting (or 3240 * Have we just finished recovery? If so, and if we are mounting (or
3179 * remounting) the filesystem readonly, then we will end up with a 3241 * remounting) the filesystem readonly, then we will end up with a
@@ -3192,14 +3254,11 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
3192 if (jbd2_journal_flush(journal) < 0) 3254 if (jbd2_journal_flush(journal) < 0)
3193 goto out; 3255 goto out;
3194 3256
3195 lock_super(sb);
3196 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && 3257 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
3197 sb->s_flags & MS_RDONLY) { 3258 sb->s_flags & MS_RDONLY) {
3198 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3259 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3199 sb->s_dirt = 0; 3260 ext4_commit_super(sb, 1);
3200 ext4_commit_super(sb, es, 1);
3201 } 3261 }
3202 unlock_super(sb);
3203 3262
3204out: 3263out:
3205 jbd2_journal_unlock_updates(journal); 3264 jbd2_journal_unlock_updates(journal);
@@ -3238,7 +3297,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
3238 3297
3239 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 3298 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
3240 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 3299 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
3241 ext4_commit_super(sb, es, 1); 3300 ext4_commit_super(sb, 1);
3242 3301
3243 jbd2_journal_clear_err(journal); 3302 jbd2_journal_clear_err(journal);
3244 } 3303 }
@@ -3257,29 +3316,17 @@ int ext4_force_commit(struct super_block *sb)
3257 return 0; 3316 return 0;
3258 3317
3259 journal = EXT4_SB(sb)->s_journal; 3318 journal = EXT4_SB(sb)->s_journal;
3260 if (journal) { 3319 if (journal)
3261 sb->s_dirt = 0;
3262 ret = ext4_journal_force_commit(journal); 3320 ret = ext4_journal_force_commit(journal);
3263 }
3264 3321
3265 return ret; 3322 return ret;
3266} 3323}
3267 3324
3268/*
3269 * Ext4 always journals updates to the superblock itself, so we don't
3270 * have to propagate any other updates to the superblock on disk at this
3271 * point. (We can probably nuke this function altogether, and remove
3272 * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...)
3273 */
3274static void ext4_write_super(struct super_block *sb) 3325static void ext4_write_super(struct super_block *sb)
3275{ 3326{
3276 if (EXT4_SB(sb)->s_journal) { 3327 lock_super(sb);
3277 if (mutex_trylock(&sb->s_lock) != 0) 3328 ext4_commit_super(sb, 1);
3278 BUG(); 3329 unlock_super(sb);
3279 sb->s_dirt = 0;
3280 } else {
3281 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3282 }
3283} 3330}
3284 3331
3285static int ext4_sync_fs(struct super_block *sb, int wait) 3332static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -3287,17 +3334,10 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
3287 int ret = 0; 3334 int ret = 0;
3288 tid_t target; 3335 tid_t target;
3289 3336
3290 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); 3337 trace_ext4_sync_fs(sb, wait);
3291 sb->s_dirt = 0; 3338 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
3292 if (EXT4_SB(sb)->s_journal) { 3339 if (wait)
3293 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, 3340 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
3294 &target)) {
3295 if (wait)
3296 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
3297 target);
3298 }
3299 } else {
3300 ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
3301 } 3341 }
3302 return ret; 3342 return ret;
3303} 3343}
@@ -3310,34 +3350,32 @@ static int ext4_freeze(struct super_block *sb)
3310{ 3350{
3311 int error = 0; 3351 int error = 0;
3312 journal_t *journal; 3352 journal_t *journal;
3313 sb->s_dirt = 0;
3314 3353
3315 if (!(sb->s_flags & MS_RDONLY)) { 3354 if (sb->s_flags & MS_RDONLY)
3316 journal = EXT4_SB(sb)->s_journal; 3355 return 0;
3317 3356
3318 if (journal) { 3357 journal = EXT4_SB(sb)->s_journal;
3319 /* Now we set up the journal barrier. */
3320 jbd2_journal_lock_updates(journal);
3321 3358
3322 /* 3359 /* Now we set up the journal barrier. */
3323 * We don't want to clear needs_recovery flag when we 3360 jbd2_journal_lock_updates(journal);
3324 * failed to flush the journal.
3325 */
3326 error = jbd2_journal_flush(journal);
3327 if (error < 0)
3328 goto out;
3329 }
3330 3361
3331 /* Journal blocked and flushed, clear needs_recovery flag. */ 3362 /*
3332 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3363 * Don't clear the needs_recovery flag if we failed to flush
3333 error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); 3364 * the journal.
3334 if (error) 3365 */
3335 goto out; 3366 error = jbd2_journal_flush(journal);
3367 if (error < 0) {
3368 out:
3369 jbd2_journal_unlock_updates(journal);
3370 return error;
3336 } 3371 }
3372
3373 /* Journal blocked and flushed, clear needs_recovery flag. */
3374 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3375 error = ext4_commit_super(sb, 1);
3376 if (error)
3377 goto out;
3337 return 0; 3378 return 0;
3338out:
3339 jbd2_journal_unlock_updates(journal);
3340 return error;
3341} 3379}
3342 3380
3343/* 3381/*
@@ -3346,14 +3384,15 @@ out:
3346 */ 3384 */
3347static int ext4_unfreeze(struct super_block *sb) 3385static int ext4_unfreeze(struct super_block *sb)
3348{ 3386{
3349 if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) { 3387 if (sb->s_flags & MS_RDONLY)
3350 lock_super(sb); 3388 return 0;
3351 /* Reser the needs_recovery flag before the fs is unlocked. */ 3389
3352 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3390 lock_super(sb);
3353 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); 3391 /* Reset the needs_recovery flag before the fs is unlocked. */
3354 unlock_super(sb); 3392 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3355 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 3393 ext4_commit_super(sb, 1);
3356 } 3394 unlock_super(sb);
3395 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3357 return 0; 3396 return 0;
3358} 3397}
3359 3398
@@ -3371,7 +3410,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3371 int i; 3410 int i;
3372#endif 3411#endif
3373 3412
3413 lock_kernel();
3414
3374 /* Store the original options */ 3415 /* Store the original options */
3416 lock_super(sb);
3375 old_sb_flags = sb->s_flags; 3417 old_sb_flags = sb->s_flags;
3376 old_opts.s_mount_opt = sbi->s_mount_opt; 3418 old_opts.s_mount_opt = sbi->s_mount_opt;
3377 old_opts.s_resuid = sbi->s_resuid; 3419 old_opts.s_resuid = sbi->s_resuid;
@@ -3396,7 +3438,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3396 goto restore_opts; 3438 goto restore_opts;
3397 } 3439 }
3398 3440
3399 if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) 3441 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
3400 ext4_abort(sb, __func__, "Abort forced by user"); 3442 ext4_abort(sb, __func__, "Abort forced by user");
3401 3443
3402 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 3444 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -3411,7 +3453,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3411 3453
3412 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || 3454 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
3413 n_blocks_count > ext4_blocks_count(es)) { 3455 n_blocks_count > ext4_blocks_count(es)) {
3414 if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) { 3456 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
3415 err = -EROFS; 3457 err = -EROFS;
3416 goto restore_opts; 3458 goto restore_opts;
3417 } 3459 }
@@ -3432,22 +3474,15 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3432 (sbi->s_mount_state & EXT4_VALID_FS)) 3474 (sbi->s_mount_state & EXT4_VALID_FS))
3433 es->s_state = cpu_to_le16(sbi->s_mount_state); 3475 es->s_state = cpu_to_le16(sbi->s_mount_state);
3434 3476
3435 /* 3477 if (sbi->s_journal)
3436 * We have to unlock super so that we can wait for
3437 * transactions.
3438 */
3439 if (sbi->s_journal) {
3440 unlock_super(sb);
3441 ext4_mark_recovery_complete(sb, es); 3478 ext4_mark_recovery_complete(sb, es);
3442 lock_super(sb);
3443 }
3444 } else { 3479 } else {
3445 int ret; 3480 int ret;
3446 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 3481 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3447 ~EXT4_FEATURE_RO_COMPAT_SUPP))) { 3482 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
3448 printk(KERN_WARNING "EXT4-fs: %s: couldn't " 3483 ext4_msg(sb, KERN_WARNING, "couldn't "
3449 "remount RDWR because of unsupported " 3484 "remount RDWR because of unsupported "
3450 "optional features (%x).\n", sb->s_id, 3485 "optional features (%x)",
3451 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & 3486 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
3452 ~EXT4_FEATURE_RO_COMPAT_SUPP)); 3487 ~EXT4_FEATURE_RO_COMPAT_SUPP));
3453 err = -EROFS; 3488 err = -EROFS;
@@ -3456,17 +3491,15 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3456 3491
3457 /* 3492 /*
3458 * Make sure the group descriptor checksums 3493 * Make sure the group descriptor checksums
3459 * are sane. If they aren't, refuse to 3494 * are sane. If they aren't, refuse to remount r/w.
3460 * remount r/w.
3461 */ 3495 */
3462 for (g = 0; g < sbi->s_groups_count; g++) { 3496 for (g = 0; g < sbi->s_groups_count; g++) {
3463 struct ext4_group_desc *gdp = 3497 struct ext4_group_desc *gdp =
3464 ext4_get_group_desc(sb, g, NULL); 3498 ext4_get_group_desc(sb, g, NULL);
3465 3499
3466 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { 3500 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
3467 printk(KERN_ERR 3501 ext4_msg(sb, KERN_ERR,
3468 "EXT4-fs: ext4_remount: " 3502 "ext4_remount: Checksum for group %u failed (%u!=%u)",
3469 "Checksum for group %u failed (%u!=%u)\n",
3470 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), 3503 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
3471 le16_to_cpu(gdp->bg_checksum)); 3504 le16_to_cpu(gdp->bg_checksum));
3472 err = -EINVAL; 3505 err = -EINVAL;
@@ -3480,11 +3513,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3480 * require a full umount/remount for now. 3513 * require a full umount/remount for now.
3481 */ 3514 */
3482 if (es->s_last_orphan) { 3515 if (es->s_last_orphan) {
3483 printk(KERN_WARNING "EXT4-fs: %s: couldn't " 3516 ext4_msg(sb, KERN_WARNING, "Couldn't "
3484 "remount RDWR because of unprocessed " 3517 "remount RDWR because of unprocessed "
3485 "orphan inode list. Please " 3518 "orphan inode list. Please "
3486 "umount/remount instead.\n", 3519 "umount/remount instead");
3487 sb->s_id);
3488 err = -EINVAL; 3520 err = -EINVAL;
3489 goto restore_opts; 3521 goto restore_opts;
3490 } 3522 }
@@ -3504,8 +3536,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3504 sb->s_flags &= ~MS_RDONLY; 3536 sb->s_flags &= ~MS_RDONLY;
3505 } 3537 }
3506 } 3538 }
3539 ext4_setup_system_zone(sb);
3507 if (sbi->s_journal == NULL) 3540 if (sbi->s_journal == NULL)
3508 ext4_commit_super(sb, es, 1); 3541 ext4_commit_super(sb, 1);
3509 3542
3510#ifdef CONFIG_QUOTA 3543#ifdef CONFIG_QUOTA
3511 /* Release old quota file names */ 3544 /* Release old quota file names */
@@ -3514,7 +3547,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3514 old_opts.s_qf_names[i] != sbi->s_qf_names[i]) 3547 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
3515 kfree(old_opts.s_qf_names[i]); 3548 kfree(old_opts.s_qf_names[i]);
3516#endif 3549#endif
3550 unlock_super(sb);
3551 unlock_kernel();
3517 return 0; 3552 return 0;
3553
3518restore_opts: 3554restore_opts:
3519 sb->s_flags = old_sb_flags; 3555 sb->s_flags = old_sb_flags;
3520 sbi->s_mount_opt = old_opts.s_mount_opt; 3556 sbi->s_mount_opt = old_opts.s_mount_opt;
@@ -3532,6 +3568,8 @@ restore_opts:
3532 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 3568 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
3533 } 3569 }
3534#endif 3570#endif
3571 unlock_super(sb);
3572 unlock_kernel();
3535 return err; 3573 return err;
3536} 3574}
3537 3575
@@ -3545,9 +3583,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3545 if (test_opt(sb, MINIX_DF)) { 3583 if (test_opt(sb, MINIX_DF)) {
3546 sbi->s_overhead_last = 0; 3584 sbi->s_overhead_last = 0;
3547 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { 3585 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
3548 ext4_group_t ngroups = sbi->s_groups_count, i; 3586 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
3549 ext4_fsblk_t overhead = 0; 3587 ext4_fsblk_t overhead = 0;
3550 smp_rmb();
3551 3588
3552 /* 3589 /*
3553 * Compute the overhead (FS structures). This is constant 3590 * Compute the overhead (FS structures). This is constant
@@ -3599,11 +3636,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3599 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 3636 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
3600 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; 3637 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
3601 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; 3638 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
3639
3602 return 0; 3640 return 0;
3603} 3641}
3604 3642
3605/* Helper function for writing quotas on sync - we need to start a transaction before quota file 3643/* Helper function for writing quotas on sync - we need to start a transaction
3606 * is locked for write. Otherwise there are possible deadlocks: 3644 * before quota file is locked for write. Otherwise there are possible deadlocks:
3607 * Process 1 Process 2 3645 * Process 1 Process 2
3608 * ext4_create() quota_sync() 3646 * ext4_create() quota_sync()
3609 * jbd2_journal_start() write_dquot() 3647 * jbd2_journal_start() write_dquot()
@@ -3627,7 +3665,7 @@ static int ext4_write_dquot(struct dquot *dquot)
3627 3665
3628 inode = dquot_to_inode(dquot); 3666 inode = dquot_to_inode(dquot);
3629 handle = ext4_journal_start(inode, 3667 handle = ext4_journal_start(inode,
3630 EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); 3668 EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
3631 if (IS_ERR(handle)) 3669 if (IS_ERR(handle))
3632 return PTR_ERR(handle); 3670 return PTR_ERR(handle);
3633 ret = dquot_commit(dquot); 3671 ret = dquot_commit(dquot);
@@ -3643,7 +3681,7 @@ static int ext4_acquire_dquot(struct dquot *dquot)
3643 handle_t *handle; 3681 handle_t *handle;
3644 3682
3645 handle = ext4_journal_start(dquot_to_inode(dquot), 3683 handle = ext4_journal_start(dquot_to_inode(dquot),
3646 EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); 3684 EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
3647 if (IS_ERR(handle)) 3685 if (IS_ERR(handle))
3648 return PTR_ERR(handle); 3686 return PTR_ERR(handle);
3649 ret = dquot_acquire(dquot); 3687 ret = dquot_acquire(dquot);
@@ -3659,7 +3697,7 @@ static int ext4_release_dquot(struct dquot *dquot)
3659 handle_t *handle; 3697 handle_t *handle;
3660 3698
3661 handle = ext4_journal_start(dquot_to_inode(dquot), 3699 handle = ext4_journal_start(dquot_to_inode(dquot),
3662 EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); 3700 EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
3663 if (IS_ERR(handle)) { 3701 if (IS_ERR(handle)) {
3664 /* Release dquot anyway to avoid endless cycle in dqput() */ 3702 /* Release dquot anyway to avoid endless cycle in dqput() */
3665 dquot_release(dquot); 3703 dquot_release(dquot);
@@ -3707,7 +3745,7 @@ static int ext4_write_info(struct super_block *sb, int type)
3707static int ext4_quota_on_mount(struct super_block *sb, int type) 3745static int ext4_quota_on_mount(struct super_block *sb, int type)
3708{ 3746{
3709 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], 3747 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
3710 EXT4_SB(sb)->s_jquota_fmt, type); 3748 EXT4_SB(sb)->s_jquota_fmt, type);
3711} 3749}
3712 3750
3713/* 3751/*
@@ -3738,9 +3776,9 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3738 if (EXT4_SB(sb)->s_qf_names[type]) { 3776 if (EXT4_SB(sb)->s_qf_names[type]) {
3739 /* Quotafile not in fs root? */ 3777 /* Quotafile not in fs root? */
3740 if (path.dentry->d_parent != sb->s_root) 3778 if (path.dentry->d_parent != sb->s_root)
3741 printk(KERN_WARNING 3779 ext4_msg(sb, KERN_WARNING,
3742 "EXT4-fs: Quota file not on filesystem root. " 3780 "Quota file not on filesystem root. "
3743 "Journaled quota will not work.\n"); 3781 "Journaled quota will not work");
3744 } 3782 }
3745 3783
3746 /* 3784 /*
@@ -3823,8 +3861,8 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3823 handle_t *handle = journal_current_handle(); 3861 handle_t *handle = journal_current_handle();
3824 3862
3825 if (EXT4_SB(sb)->s_journal && !handle) { 3863 if (EXT4_SB(sb)->s_journal && !handle) {
3826 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" 3864 ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
3827 " cancelled because transaction is not started.\n", 3865 " cancelled because transaction is not started",
3828 (unsigned long long)off, (unsigned long long)len); 3866 (unsigned long long)off, (unsigned long long)len);
3829 return -EIO; 3867 return -EIO;
3830 } 3868 }
@@ -3878,10 +3916,10 @@ out:
3878 3916
3879#endif 3917#endif
3880 3918
3881static int ext4_get_sb(struct file_system_type *fs_type, 3919static int ext4_get_sb(struct file_system_type *fs_type, int flags,
3882 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 3920 const char *dev_name, void *data, struct vfsmount *mnt)
3883{ 3921{
3884 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3922 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3885} 3923}
3886 3924
3887static struct file_system_type ext4_fs_type = { 3925static struct file_system_type ext4_fs_type = {
@@ -3893,14 +3931,14 @@ static struct file_system_type ext4_fs_type = {
3893}; 3931};
3894 3932
3895#ifdef CONFIG_EXT4DEV_COMPAT 3933#ifdef CONFIG_EXT4DEV_COMPAT
3896static int ext4dev_get_sb(struct file_system_type *fs_type, 3934static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
3897 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 3935 const char *dev_name, void *data,struct vfsmount *mnt)
3898{ 3936{
3899 printk(KERN_WARNING "EXT4-fs: Update your userspace programs " 3937 printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
3900 "to mount using ext4\n"); 3938 "to mount using ext4\n", dev_name);
3901 printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility " 3939 printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
3902 "will go away by 2.6.31\n"); 3940 "will go away by 2.6.31\n", dev_name);
3903 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3941 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3904} 3942}
3905 3943
3906static struct file_system_type ext4dev_fs_type = { 3944static struct file_system_type ext4dev_fs_type = {
@@ -3917,13 +3955,16 @@ static int __init init_ext4_fs(void)
3917{ 3955{
3918 int err; 3956 int err;
3919 3957
3958 err = init_ext4_system_zone();
3959 if (err)
3960 return err;
3920 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 3961 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
3921 if (!ext4_kset) 3962 if (!ext4_kset)
3922 return -ENOMEM; 3963 goto out4;
3923 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 3964 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
3924 err = init_ext4_mballoc(); 3965 err = init_ext4_mballoc();
3925 if (err) 3966 if (err)
3926 return err; 3967 goto out3;
3927 3968
3928 err = init_ext4_xattr(); 3969 err = init_ext4_xattr();
3929 if (err) 3970 if (err)
@@ -3948,6 +3989,11 @@ out1:
3948 exit_ext4_xattr(); 3989 exit_ext4_xattr();
3949out2: 3990out2:
3950 exit_ext4_mballoc(); 3991 exit_ext4_mballoc();
3992out3:
3993 remove_proc_entry("fs/ext4", NULL);
3994 kset_unregister(ext4_kset);
3995out4:
3996 exit_ext4_system_zone();
3951 return err; 3997 return err;
3952} 3998}
3953 3999
@@ -3962,6 +4008,7 @@ static void __exit exit_ext4_fs(void)
3962 exit_ext4_mballoc(); 4008 exit_ext4_mballoc();
3963 remove_proc_entry("fs/ext4", NULL); 4009 remove_proc_entry("fs/ext4", NULL);
3964 kset_unregister(ext4_kset); 4010 kset_unregister(ext4_kset);
4011 exit_ext4_system_zone();
3965} 4012}
3966 4013
3967MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 4014MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
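
Nearly every ext4 hunk above performs the same conversion: a bare printk() that interpolated sb->s_id by hand becomes an ext4_msg() call, which is why the converted format strings drop both the "EXT4-fs: " prefix and the trailing \n. The helper itself is defined in another part of this patch; a plausible minimal sketch, assuming it only prepends the log level, subsystem, and device name:

#include <linux/fs.h>
#include <linux/kernel.h>
#include <stdarg.h>

/* Sketch only - the real definition lives in another hunk of this
 * patch. KERN_ERR and friends are plain string prefixes, so they can
 * be forwarded verbatim as "prefix".
 */
void ext4_msg(struct super_block *sb, const char *prefix,
	      const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	printk("%sEXT4-fs (%s): ", prefix, sb->s_id);
	vprintk(fmt, args);	/* caller supplies no trailing newline */
	printk("\n");
	va_end(args);
}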
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index b42602298087..923990e4f16e 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -241,7 +241,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
241 while (*fclus < cluster) { 241 while (*fclus < cluster) {
242 /* prevent the infinite loop of cluster chain */ 242 /* prevent the infinite loop of cluster chain */
243 if (*fclus > limit) { 243 if (*fclus > limit) {
244 fat_fs_panic(sb, "%s: detected the cluster chain loop" 244 fat_fs_error(sb, "%s: detected the cluster chain loop"
245 " (i_pos %lld)", __func__, 245 " (i_pos %lld)", __func__,
246 MSDOS_I(inode)->i_pos); 246 MSDOS_I(inode)->i_pos);
247 nr = -EIO; 247 nr = -EIO;
@@ -252,7 +252,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
252 if (nr < 0) 252 if (nr < 0)
253 goto out; 253 goto out;
254 else if (nr == FAT_ENT_FREE) { 254 else if (nr == FAT_ENT_FREE) {
255 fat_fs_panic(sb, "%s: invalid cluster chain" 255 fat_fs_error(sb, "%s: invalid cluster chain"
256 " (i_pos %lld)", __func__, 256 " (i_pos %lld)", __func__,
257 MSDOS_I(inode)->i_pos); 257 MSDOS_I(inode)->i_pos);
258 nr = -EIO; 258 nr = -EIO;
@@ -285,7 +285,7 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
285 if (ret < 0) 285 if (ret < 0)
286 return ret; 286 return ret;
287 else if (ret == FAT_ENT_EOF) { 287 else if (ret == FAT_ENT_EOF) {
288 fat_fs_panic(sb, "%s: request beyond EOF (i_pos %lld)", 288 fat_fs_error(sb, "%s: request beyond EOF (i_pos %lld)",
289 __func__, MSDOS_I(inode)->i_pos); 289 __func__, MSDOS_I(inode)->i_pos);
290 return -EIO; 290 return -EIO;
291 } 291 }
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 3a7f603b6982..530b4ca01510 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -16,12 +16,24 @@
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/time.h> 18#include <linux/time.h>
19#include <linux/smp_lock.h>
20#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
21#include <linux/compat.h> 20#include <linux/compat.h>
22#include <asm/uaccess.h> 21#include <asm/uaccess.h>
23#include "fat.h" 22#include "fat.h"
24 23
24/*
25 * Maximum buffer size of short name.
26 * [(MSDOS_NAME + '.') * max one char + nul]
27 * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
28 */
29#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
30/*
31 * Maximum buffer size of unicode chars from slots.
32 * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
33 */
34#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1)
35#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t))
36
25static inline loff_t fat_make_i_pos(struct super_block *sb, 37static inline loff_t fat_make_i_pos(struct super_block *sb,
26 struct buffer_head *bh, 38 struct buffer_head *bh,
27 struct msdos_dir_entry *de) 39 struct msdos_dir_entry *de)
@@ -171,7 +183,8 @@ static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni,
171 unsigned char *buf, int size) 183 unsigned char *buf, int size)
172{ 184{
173 if (sbi->options.utf8) 185 if (sbi->options.utf8)
174 return utf8_wcstombs(buf, uni, size); 186 return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS,
187 UTF16_HOST_ENDIAN, buf, size);
175 else 188 else
176 return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate, 189 return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate,
177 sbi->nls_io); 190 sbi->nls_io);
@@ -325,19 +338,6 @@ parse_long:
325} 338}
326 339
327/* 340/*
328 * Maximum buffer size of short name.
329 * [(MSDOS_NAME + '.') * max one char + nul]
330 * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
331 */
332#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
333/*
334 * Maximum buffer size of unicode chars from slots.
335 * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
336 */
337#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1)
338#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t))
339
340/*
341 * Return values: negative -> error, 0 -> not found, positive -> found, 341 * Return values: negative -> error, 0 -> not found, positive -> found,
342 * value is the total amount of slots, including the shortname entry. 342 * value is the total amount of slots, including the shortname entry.
343 */ 343 */
@@ -840,7 +840,7 @@ const struct file_operations fat_dir_operations = {
840#ifdef CONFIG_COMPAT 840#ifdef CONFIG_COMPAT
841 .compat_ioctl = fat_compat_dir_ioctl, 841 .compat_ioctl = fat_compat_dir_ioctl,
842#endif 842#endif
843 .fsync = file_fsync, 843 .fsync = fat_file_fsync,
844}; 844};
845 845
846static int fat_get_short_entry(struct inode *dir, loff_t *pos, 846static int fat_get_short_entry(struct inode *dir, loff_t *pos,
@@ -967,7 +967,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
967 de++; 967 de++;
968 nr_slots--; 968 nr_slots--;
969 } 969 }
970 mark_buffer_dirty(bh); 970 mark_buffer_dirty_inode(bh, dir);
971 if (IS_DIRSYNC(dir)) 971 if (IS_DIRSYNC(dir))
972 err = sync_dirty_buffer(bh); 972 err = sync_dirty_buffer(bh);
973 brelse(bh); 973 brelse(bh);
@@ -1001,7 +1001,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
1001 de--; 1001 de--;
1002 nr_slots--; 1002 nr_slots--;
1003 } 1003 }
1004 mark_buffer_dirty(bh); 1004 mark_buffer_dirty_inode(bh, dir);
1005 if (IS_DIRSYNC(dir)) 1005 if (IS_DIRSYNC(dir))
1006 err = sync_dirty_buffer(bh); 1006 err = sync_dirty_buffer(bh);
1007 brelse(bh); 1007 brelse(bh);
@@ -1051,7 +1051,7 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used,
1051 } 1051 }
1052 memset(bhs[n]->b_data, 0, sb->s_blocksize); 1052 memset(bhs[n]->b_data, 0, sb->s_blocksize);
1053 set_buffer_uptodate(bhs[n]); 1053 set_buffer_uptodate(bhs[n]);
1054 mark_buffer_dirty(bhs[n]); 1054 mark_buffer_dirty_inode(bhs[n], dir);
1055 1055
1056 n++; 1056 n++;
1057 blknr++; 1057 blknr++;
@@ -1131,7 +1131,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
1131 de[0].size = de[1].size = 0; 1131 de[0].size = de[1].size = 0;
1132 memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); 1132 memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
1133 set_buffer_uptodate(bhs[0]); 1133 set_buffer_uptodate(bhs[0]);
1134 mark_buffer_dirty(bhs[0]); 1134 mark_buffer_dirty_inode(bhs[0], dir);
1135 1135
1136 err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); 1136 err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE);
1137 if (err) 1137 if (err)
@@ -1193,7 +1193,7 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
1193 slots += copy; 1193 slots += copy;
1194 size -= copy; 1194 size -= copy;
1195 set_buffer_uptodate(bhs[n]); 1195 set_buffer_uptodate(bhs[n]);
1196 mark_buffer_dirty(bhs[n]); 1196 mark_buffer_dirty_inode(bhs[n], dir);
1197 if (!size) 1197 if (!size)
1198 break; 1198 break;
1199 n++; 1199 n++;
@@ -1293,7 +1293,7 @@ found:
1293 for (i = 0; i < long_bhs; i++) { 1293 for (i = 0; i < long_bhs; i++) {
1294 int copy = min_t(int, sb->s_blocksize - offset, size); 1294 int copy = min_t(int, sb->s_blocksize - offset, size);
1295 memcpy(bhs[i]->b_data + offset, slots, copy); 1295 memcpy(bhs[i]->b_data + offset, slots, copy);
1296 mark_buffer_dirty(bhs[i]); 1296 mark_buffer_dirty_inode(bhs[i], dir);
1297 offset = 0; 1297 offset = 0;
1298 slots += copy; 1298 slots += copy;
1299 size -= copy; 1299 size -= copy;
@@ -1304,7 +1304,7 @@ found:
1304 /* Fill the short name slot. */ 1304 /* Fill the short name slot. */
1305 int copy = min_t(int, sb->s_blocksize - offset, size); 1305 int copy = min_t(int, sb->s_blocksize - offset, size);
1306 memcpy(bhs[i]->b_data + offset, slots, copy); 1306 memcpy(bhs[i]->b_data + offset, slots, copy);
1307 mark_buffer_dirty(bhs[i]); 1307 mark_buffer_dirty_inode(bhs[i], dir);
1308 if (IS_DIRSYNC(dir)) 1308 if (IS_DIRSYNC(dir))
1309 err = sync_dirty_buffer(bhs[i]); 1309 err = sync_dirty_buffer(bhs[i]);
1310 } 1310 }
@@ -1334,7 +1334,7 @@ found:
1334 goto error_remove; 1334 goto error_remove;
1335 } 1335 }
1336 if (dir->i_size & (sbi->cluster_size - 1)) { 1336 if (dir->i_size & (sbi->cluster_size - 1)) {
1337 fat_fs_panic(sb, "Odd directory size"); 1337 fat_fs_error(sb, "Odd directory size");
1338 dir->i_size = (dir->i_size + sbi->cluster_size - 1) 1338 dir->i_size = (dir->i_size + sbi->cluster_size - 1)
1339 & ~((loff_t)sbi->cluster_size - 1); 1339 & ~((loff_t)sbi->cluster_size - 1);
1340 } 1340 }
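
Hoisting the FAT_MAX_SHORT_SIZE/FAT_MAX_UNI_CHARS definitions above fat_uni_to_x8() lets the new utf16s_to_utf8s() call be bounded by the slot-derived character count. For a concrete feel for the numbers, here is the arithmetic spelled out, assuming the usual values MSDOS_NAME = 11, MSDOS_SLOTS = 21 and NLS_MAX_CHARSET_SIZE = 6 (quoted from memory, so treat them as assumptions):

/* Illustrative only; constant values assumed from the msdos_fs.h and
 * nls.h of this kernel generation. */
#define MSDOS_NAME		11	/* 8.3 name, dot not stored */
#define MSDOS_SLOTS		21	/* long-name slots + short-name entry */
#define NLS_MAX_CHARSET_SIZE	6

#define FAT_MAX_SHORT_SIZE	((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
#define FAT_MAX_UNI_CHARS	((MSDOS_SLOTS - 1) * 13 + 1)

int main(void)
{
	/* (11 + 1) * 6 + 1 = 73 bytes for the encoded short name */
	/* (21 - 1) * 13 + 1 = 261 UTF-16 code units from the slots */
	return !(FAT_MAX_SHORT_SIZE == 73 && FAT_MAX_UNI_CHARS == 261);
}
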
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index ea440d65819c..adb0e72a176d 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -17,6 +17,10 @@
17#define VFAT_SFN_CREATE_WIN95 0x0100 /* emulate win95 rule for create */ 17#define VFAT_SFN_CREATE_WIN95 0x0100 /* emulate win95 rule for create */
18#define VFAT_SFN_CREATE_WINNT 0x0200 /* emulate winnt rule for create */ 18#define VFAT_SFN_CREATE_WINNT 0x0200 /* emulate winnt rule for create */
19 19
20#define FAT_ERRORS_CONT 1 /* ignore error and continue */
21#define FAT_ERRORS_PANIC 2 /* panic on error */
22#define FAT_ERRORS_RO 3 /* remount r/o on error */
23
20struct fat_mount_options { 24struct fat_mount_options {
21 uid_t fs_uid; 25 uid_t fs_uid;
22 gid_t fs_gid; 26 gid_t fs_gid;
@@ -26,6 +30,7 @@ struct fat_mount_options {
26 char *iocharset; /* Charset used for filename input/display */ 30 char *iocharset; /* Charset used for filename input/display */
27 unsigned short shortname; /* flags for shortname display/create rule */ 31 unsigned short shortname; /* flags for shortname display/create rule */
28 unsigned char name_check; /* r = relaxed, n = normal, s = strict */ 32 unsigned char name_check; /* r = relaxed, n = normal, s = strict */
33 unsigned char errors; /* On error: continue, panic, remount-ro */
29 unsigned short allow_utime;/* permission for setting the [am]time */ 34 unsigned short allow_utime;/* permission for setting the [am]time */
30 unsigned quiet:1, /* set = fake successful chmods and chowns */ 35 unsigned quiet:1, /* set = fake successful chmods and chowns */
31 showexec:1, /* set = only set x bit for com/exe/bat */ 36 showexec:1, /* set = only set x bit for com/exe/bat */
@@ -74,6 +79,7 @@ struct msdos_sb_info {
74 79
75 int fatent_shift; 80 int fatent_shift;
76 struct fatent_operations *fatent_ops; 81 struct fatent_operations *fatent_ops;
82 struct inode *fat_inode;
77 83
78 spinlock_t inode_hash_lock; 84 spinlock_t inode_hash_lock;
79 struct hlist_head inode_hashtable[FAT_HASH_SIZE]; 85 struct hlist_head inode_hashtable[FAT_HASH_SIZE];
@@ -251,6 +257,7 @@ struct fat_entry {
251 } u; 257 } u;
252 int nr_bhs; 258 int nr_bhs;
253 struct buffer_head *bhs[2]; 259 struct buffer_head *bhs[2];
260 struct inode *fat_inode;
254}; 261};
255 262
256static inline void fatent_init(struct fat_entry *fatent) 263static inline void fatent_init(struct fat_entry *fatent)
@@ -259,6 +266,7 @@ static inline void fatent_init(struct fat_entry *fatent)
259 fatent->entry = 0; 266 fatent->entry = 0;
260 fatent->u.ent32_p = NULL; 267 fatent->u.ent32_p = NULL;
261 fatent->bhs[0] = fatent->bhs[1] = NULL; 268 fatent->bhs[0] = fatent->bhs[1] = NULL;
269 fatent->fat_inode = NULL;
262} 270}
263 271
264static inline void fatent_set_entry(struct fat_entry *fatent, int entry) 272static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
@@ -275,6 +283,7 @@ static inline void fatent_brelse(struct fat_entry *fatent)
275 brelse(fatent->bhs[i]); 283 brelse(fatent->bhs[i]);
276 fatent->nr_bhs = 0; 284 fatent->nr_bhs = 0;
277 fatent->bhs[0] = fatent->bhs[1] = NULL; 285 fatent->bhs[0] = fatent->bhs[1] = NULL;
286 fatent->fat_inode = NULL;
278} 287}
279 288
280extern void fat_ent_access_init(struct super_block *sb); 289extern void fat_ent_access_init(struct super_block *sb);
@@ -296,6 +305,8 @@ extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
296extern void fat_truncate(struct inode *inode); 305extern void fat_truncate(struct inode *inode);
297extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, 306extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
298 struct kstat *stat); 307 struct kstat *stat);
308extern int fat_file_fsync(struct file *file, struct dentry *dentry,
309 int datasync);
299 310
300/* fat/inode.c */ 311/* fat/inode.c */
301extern void fat_attach(struct inode *inode, loff_t i_pos); 312extern void fat_attach(struct inode *inode, loff_t i_pos);
@@ -310,7 +321,7 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
310extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 321extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
311 struct inode *i2); 322 struct inode *i2);
312/* fat/misc.c */ 323/* fat/misc.c */
313extern void fat_fs_panic(struct super_block *s, const char *fmt, ...) 324extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
314 __attribute__ ((format (printf, 2, 3))) __cold; 325 __attribute__ ((format (printf, 2, 3))) __cold;
315extern void fat_clusters_flush(struct super_block *sb); 326extern void fat_clusters_flush(struct super_block *sb);
316extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 327extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index da6eea47872f..a81037721a6f 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -73,6 +73,8 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
73 struct buffer_head **bhs = fatent->bhs; 73 struct buffer_head **bhs = fatent->bhs;
74 74
75 WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); 75 WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
76 fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
77
76 bhs[0] = sb_bread(sb, blocknr); 78 bhs[0] = sb_bread(sb, blocknr);
77 if (!bhs[0]) 79 if (!bhs[0])
78 goto err; 80 goto err;
@@ -103,6 +105,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
103 struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; 105 struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
104 106
105 WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); 107 WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
108 fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
106 fatent->bhs[0] = sb_bread(sb, blocknr); 109 fatent->bhs[0] = sb_bread(sb, blocknr);
107 if (!fatent->bhs[0]) { 110 if (!fatent->bhs[0]) {
108 printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", 111 printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
@@ -167,9 +170,9 @@ static void fat12_ent_put(struct fat_entry *fatent, int new)
167 } 170 }
168 spin_unlock(&fat12_entry_lock); 171 spin_unlock(&fat12_entry_lock);
169 172
170 mark_buffer_dirty(fatent->bhs[0]); 173 mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
171 if (fatent->nr_bhs == 2) 174 if (fatent->nr_bhs == 2)
172 mark_buffer_dirty(fatent->bhs[1]); 175 mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode);
173} 176}
174 177
175static void fat16_ent_put(struct fat_entry *fatent, int new) 178static void fat16_ent_put(struct fat_entry *fatent, int new)
@@ -178,7 +181,7 @@ static void fat16_ent_put(struct fat_entry *fatent, int new)
178 new = EOF_FAT16; 181 new = EOF_FAT16;
179 182
180 *fatent->u.ent16_p = cpu_to_le16(new); 183 *fatent->u.ent16_p = cpu_to_le16(new);
181 mark_buffer_dirty(fatent->bhs[0]); 184 mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
182} 185}
183 186
184static void fat32_ent_put(struct fat_entry *fatent, int new) 187static void fat32_ent_put(struct fat_entry *fatent, int new)
@@ -189,7 +192,7 @@ static void fat32_ent_put(struct fat_entry *fatent, int new)
189 WARN_ON(new & 0xf0000000); 192 WARN_ON(new & 0xf0000000);
190 new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; 193 new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff;
191 *fatent->u.ent32_p = cpu_to_le32(new); 194 *fatent->u.ent32_p = cpu_to_le32(new);
192 mark_buffer_dirty(fatent->bhs[0]); 195 mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
193} 196}
194 197
195static int fat12_ent_next(struct fat_entry *fatent) 198static int fat12_ent_next(struct fat_entry *fatent)
@@ -345,7 +348,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry)
345 348
346 if (entry < FAT_START_ENT || sbi->max_cluster <= entry) { 349 if (entry < FAT_START_ENT || sbi->max_cluster <= entry) {
347 fatent_brelse(fatent); 350 fatent_brelse(fatent);
348 fat_fs_panic(sb, "invalid access to FAT (entry 0x%08x)", entry); 351 fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry);
349 return -EIO; 352 return -EIO;
350 } 353 }
351 354
@@ -381,7 +384,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs,
381 } 384 }
382 memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); 385 memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
383 set_buffer_uptodate(c_bh); 386 set_buffer_uptodate(c_bh);
384 mark_buffer_dirty(c_bh); 387 mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
385 if (sb->s_flags & MS_SYNCHRONOUS) 388 if (sb->s_flags & MS_SYNCHRONOUS)
386 err = sync_dirty_buffer(c_bh); 389 err = sync_dirty_buffer(c_bh);
387 brelse(c_bh); 390 brelse(c_bh);
@@ -557,7 +560,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
557 err = cluster; 560 err = cluster;
558 goto error; 561 goto error;
559 } else if (cluster == FAT_ENT_FREE) { 562 } else if (cluster == FAT_ENT_FREE) {
560 fat_fs_panic(sb, "%s: deleting FAT entry beyond EOF", 563 fat_fs_error(sb, "%s: deleting FAT entry beyond EOF",
561 __func__); 564 __func__);
562 err = -EIO; 565 err = -EIO;
563 goto error; 566 goto error;
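
The mark_buffer_dirty() → mark_buffer_dirty_inode() conversions here (and in the dir.c and namei hunks) all chase the same goal: dirty FAT-block buffers are associated with the new per-superblock sbi->fat_inode, carried along in struct fat_entry, so a later fsync can flush exactly those buffers through that inode's mapping instead of syncing the whole block device. A toy model of the association — simplified types and hypothetical names standing in for mark_buffer_dirty_inode()/sync_mapping_buffers():

#include <stddef.h>

/* Toy model: a "mapping" collects the dirty buffers attached to one
 * inode, so flushing that one mapping writes exactly those buffers. */
struct buffer  { struct buffer *next; int dirty; };
struct mapping { struct buffer *dirty_list; };

static void dirty_buffer_for_inode(struct buffer *bh, struct mapping *m)
{
	bh->dirty = 1;
	bh->next = m->dirty_list;	/* attach to the owning inode's mapping */
	m->dirty_list = bh;
}

static void flush_mapping(struct mapping *m)
{
	struct buffer *bh;

	for (bh = m->dirty_list; bh; bh = bh->next)
		bh->dirty = 0;		/* stand-in for real block I/O */
	m->dirty_list = NULL;
}

int main(void)
{
	struct mapping m = { 0 };
	struct buffer  b = { 0, 0 };

	dirty_buffer_for_inode(&b, &m);
	flush_mapping(&m);
	return b.dirty;			/* 0: the buffer was flushed */
}
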
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 0a7f4a9918b3..f042b965c95c 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -18,106 +18,112 @@
18#include <linux/security.h> 18#include <linux/security.h>
19#include "fat.h" 19#include "fat.h"
20 20
21int fat_generic_ioctl(struct inode *inode, struct file *filp, 21static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
22 unsigned int cmd, unsigned long arg) 22{
23 u32 attr;
24
25 mutex_lock(&inode->i_mutex);
26 attr = fat_make_attrs(inode);
27 mutex_unlock(&inode->i_mutex);
28
29 return put_user(attr, user_attr);
30}
31
32static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
23{ 33{
34 struct inode *inode = file->f_path.dentry->d_inode;
24 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 35 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
25 u32 __user *user_attr = (u32 __user *)arg; 36 int is_dir = S_ISDIR(inode->i_mode);
37 u32 attr, oldattr;
38 struct iattr ia;
39 int err;
26 40
27 switch (cmd) { 41 err = get_user(attr, user_attr);
28 case FAT_IOCTL_GET_ATTRIBUTES: 42 if (err)
29 { 43 goto out;
30 u32 attr;
31 44
32 mutex_lock(&inode->i_mutex); 45 mutex_lock(&inode->i_mutex);
33 attr = fat_make_attrs(inode); 46 err = mnt_want_write(file->f_path.mnt);
34 mutex_unlock(&inode->i_mutex); 47 if (err)
48 goto out_unlock_inode;
35 49
36 return put_user(attr, user_attr); 50 /*
51 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
52 * prevents the user from turning us into a VFAT
53 * longname entry. Also, we obviously can't set
54 * any of the NTFS attributes in the high 24 bits.
55 */
56 attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR);
57 /* Merge in ATTR_VOLUME and ATTR_DIR */
58 attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
59 (is_dir ? ATTR_DIR : 0);
60 oldattr = fat_make_attrs(inode);
61
62 /* Equivalent to a chmod() */
63 ia.ia_valid = ATTR_MODE | ATTR_CTIME;
64 ia.ia_ctime = current_fs_time(inode->i_sb);
65 if (is_dir)
66 ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
67 else {
68 ia.ia_mode = fat_make_mode(sbi, attr,
69 S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
37 } 70 }
38 case FAT_IOCTL_SET_ATTRIBUTES:
39 {
40 u32 attr, oldattr;
41 int err, is_dir = S_ISDIR(inode->i_mode);
42 struct iattr ia;
43 71
44 err = get_user(attr, user_attr); 72 /* The root directory has no attributes */
45 if (err) 73 if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) {
46 return err; 74 err = -EINVAL;
75 goto out_drop_write;
76 }
47 77
48 mutex_lock(&inode->i_mutex); 78 if (sbi->options.sys_immutable &&
49 79 ((attr | oldattr) & ATTR_SYS) &&
50 err = mnt_want_write(filp->f_path.mnt); 80 !capable(CAP_LINUX_IMMUTABLE)) {
51 if (err) 81 err = -EPERM;
52 goto up_no_drop_write; 82 goto out_drop_write;
53 83 }
54 /*
55 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
56 * prevents the user from turning us into a VFAT
57 * longname entry. Also, we obviously can't set
58 * any of the NTFS attributes in the high 24 bits.
59 */
60 attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR);
61 /* Merge in ATTR_VOLUME and ATTR_DIR */
62 attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
63 (is_dir ? ATTR_DIR : 0);
64 oldattr = fat_make_attrs(inode);
65
66 /* Equivalent to a chmod() */
67 ia.ia_valid = ATTR_MODE | ATTR_CTIME;
68 ia.ia_ctime = current_fs_time(inode->i_sb);
69 if (is_dir)
70 ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
71 else {
72 ia.ia_mode = fat_make_mode(sbi, attr,
73 S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
74 }
75 84
76 /* The root directory has no attributes */ 85 /*
77 if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { 86 * The security check is questionable... We single
78 err = -EINVAL; 87 * out the RO attribute for checking by the security
79 goto up; 88 * module, just because it maps to a file mode.
80 } 89 */
90 err = security_inode_setattr(file->f_path.dentry, &ia);
91 if (err)
92 goto out_drop_write;
81 93
82 if (sbi->options.sys_immutable) { 94 /* This MUST be done before doing anything irreversible... */
83 if ((attr | oldattr) & ATTR_SYS) { 95 err = fat_setattr(file->f_path.dentry, &ia);
84 if (!capable(CAP_LINUX_IMMUTABLE)) { 96 if (err)
85 err = -EPERM; 97 goto out_drop_write;
86 goto up; 98
87 } 99 fsnotify_change(file->f_path.dentry, ia.ia_valid);
88 } 100 if (sbi->options.sys_immutable) {
89 } 101 if (attr & ATTR_SYS)
102 inode->i_flags |= S_IMMUTABLE;
103 else
 104 inode->i_flags &= ~S_IMMUTABLE;
105 }
90 106
91 /* 107 fat_save_attrs(inode, attr);
92 * The security check is questionable... We single 108 mark_inode_dirty(inode);
93 * out the RO attribute for checking by the security 109out_drop_write:
94 * module, just because it maps to a file mode. 110 mnt_drop_write(file->f_path.mnt);
95 */ 111out_unlock_inode:
96 err = security_inode_setattr(filp->f_path.dentry, &ia); 112 mutex_unlock(&inode->i_mutex);
97 if (err) 113out:
98 goto up; 114 return err;
99 115}
100 /* This MUST be done before doing anything irreversible... */
101 err = fat_setattr(filp->f_path.dentry, &ia);
102 if (err)
103 goto up;
104
105 fsnotify_change(filp->f_path.dentry, ia.ia_valid);
106 if (sbi->options.sys_immutable) {
107 if (attr & ATTR_SYS)
108 inode->i_flags |= S_IMMUTABLE;
109 else
 110 inode->i_flags &= ~S_IMMUTABLE;
111 }
112 116
113 fat_save_attrs(inode, attr); 117int fat_generic_ioctl(struct inode *inode, struct file *filp,
114 mark_inode_dirty(inode); 118 unsigned int cmd, unsigned long arg)
115up: 119{
116 mnt_drop_write(filp->f_path.mnt); 120 u32 __user *user_attr = (u32 __user *)arg;
117up_no_drop_write: 121
118 mutex_unlock(&inode->i_mutex); 122 switch (cmd) {
119 return err; 123 case FAT_IOCTL_GET_ATTRIBUTES:
120 } 124 return fat_ioctl_get_attributes(inode, user_attr);
125 case FAT_IOCTL_SET_ATTRIBUTES:
126 return fat_ioctl_set_attributes(filp, user_attr);
121 default: 127 default:
122 return -ENOTTY; /* Inappropriate ioctl for device */ 128 return -ENOTTY; /* Inappropriate ioctl for device */
123 } 129 }
@@ -128,11 +134,23 @@ static int fat_file_release(struct inode *inode, struct file *filp)
128 if ((filp->f_mode & FMODE_WRITE) && 134 if ((filp->f_mode & FMODE_WRITE) &&
129 MSDOS_SB(inode->i_sb)->options.flush) { 135 MSDOS_SB(inode->i_sb)->options.flush) {
130 fat_flush_inodes(inode->i_sb, inode, NULL); 136 fat_flush_inodes(inode->i_sb, inode, NULL);
131 congestion_wait(WRITE, HZ/10); 137 congestion_wait(BLK_RW_ASYNC, HZ/10);
132 } 138 }
133 return 0; 139 return 0;
134} 140}
135 141
142int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
143{
144 struct inode *inode = dentry->d_inode;
145 int res, err;
146
147 res = simple_fsync(filp, dentry, datasync);
148 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
149
150 return res ? res : err;
151}
152
153
136const struct file_operations fat_file_operations = { 154const struct file_operations fat_file_operations = {
137 .llseek = generic_file_llseek, 155 .llseek = generic_file_llseek,
138 .read = do_sync_read, 156 .read = do_sync_read,
@@ -142,7 +160,7 @@ const struct file_operations fat_file_operations = {
142 .mmap = generic_file_mmap, 160 .mmap = generic_file_mmap,
143 .release = fat_file_release, 161 .release = fat_file_release,
144 .ioctl = fat_generic_ioctl, 162 .ioctl = fat_generic_ioctl,
145 .fsync = file_fsync, 163 .fsync = fat_file_fsync,
146 .splice_read = generic_file_splice_read, 164 .splice_read = generic_file_splice_read,
147}; 165};
148 166
@@ -213,7 +231,7 @@ static int fat_free(struct inode *inode, int skip)
213 fatent_brelse(&fatent); 231 fatent_brelse(&fatent);
214 return 0; 232 return 0;
215 } else if (ret == FAT_ENT_FREE) { 233 } else if (ret == FAT_ENT_FREE) {
216 fat_fs_panic(sb, 234 fat_fs_error(sb,
217 "%s: invalid cluster chain (i_pos %lld)", 235 "%s: invalid cluster chain (i_pos %lld)",
218 __func__, MSDOS_I(inode)->i_pos); 236 __func__, MSDOS_I(inode)->i_pos);
219 ret = -EIO; 237 ret = -EIO;
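
fat_file_fsync() above layers on top of simple_fsync(): after the inode's own data and metadata are written, it also flushes the buffers attached to the FAT inode's mapping, so the allocation-table blocks the file dirtied reach the disk as well. Note the "run both steps, report the first failure" shape of the return; a minimal sketch with hypothetical helper names:

/* Sketch of the layered-fsync shape; the helpers are hypothetical
 * stand-ins for simple_fsync() and sync_mapping_buffers(). */
static int sync_file_data(void)   { return 0; }	/* data + dir entry */
static int sync_fat_buffers(void) { return 0; }	/* FAT blocks via fat_inode */

static int layered_fsync(void)
{
	int res = sync_file_data();
	int err = sync_fat_buffers();

	/* Always attempt both syncs; the first failure wins. */
	return res ? res : err;
}

int main(void) { return layered_fsync(); }
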
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 296785a0dec8..8970d8c49bb0 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -76,7 +76,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
76 return 0; 76 return 0;
77 77
78 if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) { 78 if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) {
79 fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)", 79 fat_fs_error(sb, "corrupted file size (i_pos %lld, %lld)",
80 MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private); 80 MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private);
81 return -EIO; 81 return -EIO;
82 } 82 }
@@ -441,16 +441,35 @@ static void fat_clear_inode(struct inode *inode)
441 441
442static void fat_write_super(struct super_block *sb) 442static void fat_write_super(struct super_block *sb)
443{ 443{
444 lock_super(sb);
444 sb->s_dirt = 0; 445 sb->s_dirt = 0;
445 446
446 if (!(sb->s_flags & MS_RDONLY)) 447 if (!(sb->s_flags & MS_RDONLY))
447 fat_clusters_flush(sb); 448 fat_clusters_flush(sb);
449 unlock_super(sb);
450}
451
452static int fat_sync_fs(struct super_block *sb, int wait)
453{
454 lock_super(sb);
455 fat_clusters_flush(sb);
456 sb->s_dirt = 0;
457 unlock_super(sb);
458
459 return 0;
448} 460}
449 461
450static void fat_put_super(struct super_block *sb) 462static void fat_put_super(struct super_block *sb)
451{ 463{
452 struct msdos_sb_info *sbi = MSDOS_SB(sb); 464 struct msdos_sb_info *sbi = MSDOS_SB(sb);
453 465
466 lock_kernel();
467
468 if (sb->s_dirt)
469 fat_write_super(sb);
470
471 iput(sbi->fat_inode);
472
454 if (sbi->nls_disk) { 473 if (sbi->nls_disk) {
455 unload_nls(sbi->nls_disk); 474 unload_nls(sbi->nls_disk);
456 sbi->nls_disk = NULL; 475 sbi->nls_disk = NULL;
@@ -467,6 +486,8 @@ static void fat_put_super(struct super_block *sb)
467 486
468 sb->s_fs_info = NULL; 487 sb->s_fs_info = NULL;
469 kfree(sbi); 488 kfree(sbi);
489
490 unlock_kernel();
470} 491}
471 492
472static struct kmem_cache *fat_inode_cachep; 493static struct kmem_cache *fat_inode_cachep;
@@ -632,6 +653,7 @@ static const struct super_operations fat_sops = {
632 .delete_inode = fat_delete_inode, 653 .delete_inode = fat_delete_inode,
633 .put_super = fat_put_super, 654 .put_super = fat_put_super,
634 .write_super = fat_write_super, 655 .write_super = fat_write_super,
656 .sync_fs = fat_sync_fs,
635 .statfs = fat_statfs, 657 .statfs = fat_statfs,
636 .clear_inode = fat_clear_inode, 658 .clear_inode = fat_clear_inode,
637 .remount_fs = fat_remount, 659 .remount_fs = fat_remount,
@@ -834,6 +856,12 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
834 seq_puts(m, ",flush"); 856 seq_puts(m, ",flush");
835 if (opts->tz_utc) 857 if (opts->tz_utc)
836 seq_puts(m, ",tz=UTC"); 858 seq_puts(m, ",tz=UTC");
859 if (opts->errors == FAT_ERRORS_CONT)
860 seq_puts(m, ",errors=continue");
861 else if (opts->errors == FAT_ERRORS_PANIC)
862 seq_puts(m, ",errors=panic");
863 else
864 seq_puts(m, ",errors=remount-ro");
837 865
838 return 0; 866 return 0;
839} 867}
@@ -846,7 +874,8 @@ enum {
846 Opt_charset, Opt_shortname_lower, Opt_shortname_win95, 874 Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
847 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, 875 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
848 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, 876 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
849 Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err, 877 Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
878 Opt_err_panic, Opt_err_ro, Opt_err,
850}; 879};
851 880
852static const match_table_t fat_tokens = { 881static const match_table_t fat_tokens = {
@@ -869,6 +898,11 @@ static const match_table_t fat_tokens = {
869 {Opt_showexec, "showexec"}, 898 {Opt_showexec, "showexec"},
870 {Opt_debug, "debug"}, 899 {Opt_debug, "debug"},
871 {Opt_immutable, "sys_immutable"}, 900 {Opt_immutable, "sys_immutable"},
901 {Opt_flush, "flush"},
902 {Opt_tz_utc, "tz=UTC"},
903 {Opt_err_cont, "errors=continue"},
904 {Opt_err_panic, "errors=panic"},
905 {Opt_err_ro, "errors=remount-ro"},
872 {Opt_obsolate, "conv=binary"}, 906 {Opt_obsolate, "conv=binary"},
873 {Opt_obsolate, "conv=text"}, 907 {Opt_obsolate, "conv=text"},
874 {Opt_obsolate, "conv=auto"}, 908 {Opt_obsolate, "conv=auto"},
@@ -880,8 +914,6 @@ static const match_table_t fat_tokens = {
880 {Opt_obsolate, "cvf_format=%20s"}, 914 {Opt_obsolate, "cvf_format=%20s"},
881 {Opt_obsolate, "cvf_options=%100s"}, 915 {Opt_obsolate, "cvf_options=%100s"},
882 {Opt_obsolate, "posix"}, 916 {Opt_obsolate, "posix"},
883 {Opt_flush, "flush"},
884 {Opt_tz_utc, "tz=UTC"},
885 {Opt_err, NULL}, 917 {Opt_err, NULL},
886}; 918};
887static const match_table_t msdos_tokens = { 919static const match_table_t msdos_tokens = {
@@ -934,7 +966,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
934 966
935 opts->fs_uid = current_uid(); 967 opts->fs_uid = current_uid();
936 opts->fs_gid = current_gid(); 968 opts->fs_gid = current_gid();
937 opts->fs_fmask = current_umask(); 969 opts->fs_fmask = opts->fs_dmask = current_umask();
938 opts->allow_utime = -1; 970 opts->allow_utime = -1;
939 opts->codepage = fat_default_codepage; 971 opts->codepage = fat_default_codepage;
940 opts->iocharset = fat_default_iocharset; 972 opts->iocharset = fat_default_iocharset;
@@ -951,6 +983,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
951 opts->numtail = 1; 983 opts->numtail = 1;
952 opts->usefree = opts->nocase = 0; 984 opts->usefree = opts->nocase = 0;
953 opts->tz_utc = 0; 985 opts->tz_utc = 0;
986 opts->errors = FAT_ERRORS_RO;
954 *debug = 0; 987 *debug = 0;
955 988
956 if (!options) 989 if (!options)
@@ -1043,6 +1076,15 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
1043 case Opt_tz_utc: 1076 case Opt_tz_utc:
1044 opts->tz_utc = 1; 1077 opts->tz_utc = 1;
1045 break; 1078 break;
1079 case Opt_err_cont:
1080 opts->errors = FAT_ERRORS_CONT;
1081 break;
1082 case Opt_err_panic:
1083 opts->errors = FAT_ERRORS_PANIC;
1084 break;
1085 case Opt_err_ro:
1086 opts->errors = FAT_ERRORS_RO;
1087 break;
1046 1088
1047 /* msdos specific */ 1089 /* msdos specific */
1048 case Opt_dots: 1090 case Opt_dots:
@@ -1174,7 +1216,7 @@ static int fat_read_root(struct inode *inode)
1174int fat_fill_super(struct super_block *sb, void *data, int silent, 1216int fat_fill_super(struct super_block *sb, void *data, int silent,
1175 const struct inode_operations *fs_dir_inode_ops, int isvfat) 1217 const struct inode_operations *fs_dir_inode_ops, int isvfat)
1176{ 1218{
1177 struct inode *root_inode = NULL; 1219 struct inode *root_inode = NULL, *fat_inode = NULL;
1178 struct buffer_head *bh; 1220 struct buffer_head *bh;
1179 struct fat_boot_sector *b; 1221 struct fat_boot_sector *b;
1180 struct msdos_sb_info *sbi; 1222 struct msdos_sb_info *sbi;
@@ -1414,6 +1456,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1414 } 1456 }
1415 1457
1416 error = -ENOMEM; 1458 error = -ENOMEM;
1459 fat_inode = new_inode(sb);
1460 if (!fat_inode)
1461 goto out_fail;
1462 MSDOS_I(fat_inode)->i_pos = 0;
1463 sbi->fat_inode = fat_inode;
1417 root_inode = new_inode(sb); 1464 root_inode = new_inode(sb);
1418 if (!root_inode) 1465 if (!root_inode)
1419 goto out_fail; 1466 goto out_fail;
@@ -1439,6 +1486,8 @@ out_invalid:
1439 " on dev %s.\n", sb->s_id); 1486 " on dev %s.\n", sb->s_id);
1440 1487
1441out_fail: 1488out_fail:
1489 if (fat_inode)
1490 iput(fat_inode);
1442 if (root_inode) 1491 if (root_inode)
1443 iput(root_inode); 1492 iput(root_inode);
1444 if (sbi->nls_io) 1493 if (sbi->nls_io)
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index ac39ebcc1496..a6c20473dfd7 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -12,14 +12,19 @@
12#include "fat.h" 12#include "fat.h"
13 13
14/* 14/*
 15 * fat_fs_panic reports a severe file system problem and sets the file system 15 * fat_fs_error reports a file system problem that might indicate a data
 16 * read-only. The file system can be made writable again by remounting it. 16 * corruption/inconsistency. Depending on the 'errors' mount option,
 17 * panic() is called, or an error message is printed and nothing else is
 18 * done, or the filesystem is remounted read-only (default behavior).
 19 * In case the file system is remounted read-only, it can be made writable
 20 * again by remounting it.
17 */ 21 */
18void fat_fs_panic(struct super_block *s, const char *fmt, ...) 22void fat_fs_error(struct super_block *s, const char *fmt, ...)
19{ 23{
24 struct fat_mount_options *opts = &MSDOS_SB(s)->options;
20 va_list args; 25 va_list args;
21 26
22 printk(KERN_ERR "FAT: Filesystem panic (dev %s)\n", s->s_id); 27 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
23 28
24 printk(KERN_ERR " "); 29 printk(KERN_ERR " ");
25 va_start(args, fmt); 30 va_start(args, fmt);
@@ -27,13 +32,14 @@ void fat_fs_panic(struct super_block *s, const char *fmt, ...)
27 va_end(args); 32 va_end(args);
28 printk("\n"); 33 printk("\n");
29 34
30 if (!(s->s_flags & MS_RDONLY)) { 35 if (opts->errors == FAT_ERRORS_PANIC)
36 panic(" FAT fs panic from previous error\n");
37 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
31 s->s_flags |= MS_RDONLY; 38 s->s_flags |= MS_RDONLY;
32 printk(KERN_ERR " File system has been set read-only\n"); 39 printk(KERN_ERR " File system has been set read-only\n");
33 } 40 }
34} 41}
35 42EXPORT_SYMBOL_GPL(fat_fs_error);
36EXPORT_SYMBOL_GPL(fat_fs_panic);
37 43
38/* Flushes the number of free clusters on FAT32 */ 44/* Flushes the number of free clusters on FAT32 */
39/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 45/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
@@ -124,7 +130,7 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
124 mark_inode_dirty(inode); 130 mark_inode_dirty(inode);
125 } 131 }
126 if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) { 132 if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) {
127 fat_fs_panic(sb, "clusters badly computed (%d != %llu)", 133 fat_fs_error(sb, "clusters badly computed (%d != %llu)",
128 new_fclus, 134 new_fclus,
129 (llu)(inode->i_blocks >> (sbi->cluster_bits - 9))); 135 (llu)(inode->i_blocks >> (sbi->cluster_bits - 9)));
130 fat_cache_inval_inode(inode); 136 fat_cache_inval_inode(inode);
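
With this change fat_fs_error() becomes a three-way dispatch on the new mount option: errors=panic calls panic(), errors=continue only logs, and errors=remount-ro (the default) flips the superblock read-only. A compact userspace model of that dispatch — the constants and names merely mirror FAT_ERRORS_*, this is not kernel code:

#include <stdio.h>
#include <stdlib.h>

enum { ERRORS_CONT = 1, ERRORS_PANIC = 2, ERRORS_RO = 3 };

static int sb_rdonly;	/* models the MS_RDONLY superblock flag */

static void fs_error(int errors_mode, const char *msg)
{
	fprintf(stderr, "FAT: Filesystem error: %s\n", msg);

	if (errors_mode == ERRORS_PANIC)
		abort();			/* models panic() */
	else if (errors_mode == ERRORS_RO && !sb_rdonly) {
		sb_rdonly = 1;
		fprintf(stderr, "    File system has been set read-only\n");
	}
	/* ERRORS_CONT: log only, carry on */
}

int main(void)
{
	fs_error(ERRORS_RO, "invalid cluster chain");
	return sb_rdonly ? 0 : 1;
}
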
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index da3f361a37dd..bbc94ae4fd77 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -9,7 +9,6 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/buffer_head.h> 11#include <linux/buffer_head.h>
12#include <linux/smp_lock.h>
13#include "fat.h" 12#include "fat.h"
14 13
15/* Characters that are undesirable in an MS-DOS file name */ 14/* Characters that are undesirable in an MS-DOS file name */
@@ -544,7 +543,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
544 int start = MSDOS_I(new_dir)->i_logstart; 543 int start = MSDOS_I(new_dir)->i_logstart;
545 dotdot_de->start = cpu_to_le16(start); 544 dotdot_de->start = cpu_to_le16(start);
546 dotdot_de->starthi = cpu_to_le16(start >> 16); 545 dotdot_de->starthi = cpu_to_le16(start >> 16);
547 mark_buffer_dirty(dotdot_bh); 546 mark_buffer_dirty_inode(dotdot_bh, old_inode);
548 if (IS_DIRSYNC(new_dir)) { 547 if (IS_DIRSYNC(new_dir)) {
549 err = sync_dirty_buffer(dotdot_bh); 548 err = sync_dirty_buffer(dotdot_bh);
550 if (err) 549 if (err)
@@ -586,7 +585,7 @@ error_dotdot:
586 int start = MSDOS_I(old_dir)->i_logstart; 585 int start = MSDOS_I(old_dir)->i_logstart;
587 dotdot_de->start = cpu_to_le16(start); 586 dotdot_de->start = cpu_to_le16(start);
588 dotdot_de->starthi = cpu_to_le16(start >> 16); 587 dotdot_de->starthi = cpu_to_le16(start >> 16);
589 mark_buffer_dirty(dotdot_bh); 588 mark_buffer_dirty_inode(dotdot_bh, old_inode);
590 corrupt |= sync_dirty_buffer(dotdot_bh); 589 corrupt |= sync_dirty_buffer(dotdot_bh);
591 } 590 }
592error_inode: 591error_inode:
@@ -608,7 +607,7 @@ error_inode:
608 sinfo.bh = NULL; 607 sinfo.bh = NULL;
609 } 608 }
610 if (corrupt < 0) { 609 if (corrupt < 0) {
611 fat_fs_panic(new_dir->i_sb, 610 fat_fs_error(new_dir->i_sb,
612 "%s: Filesystem corrupted (i_pos %lld)", 611 "%s: Filesystem corrupted (i_pos %lld)",
613 __func__, sinfo.i_pos); 612 __func__, sinfo.i_pos);
614 } 613 }
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index a0e00e3a46e9..cb6e83557112 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -19,7 +19,6 @@
19#include <linux/jiffies.h> 19#include <linux/jiffies.h>
20#include <linux/ctype.h> 20#include <linux/ctype.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/smp_lock.h>
23#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
24#include <linux/namei.h> 23#include <linux/namei.h>
25#include "fat.h" 24#include "fat.h"
@@ -502,11 +501,11 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
502 if (utf8) { 501 if (utf8) {
503 int name_len = strlen(name); 502 int name_len = strlen(name);
504 503
505 *outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX); 504 *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname);
506 505
507 /* 506 /*
508 * We stripped '.'s before and set len appropriately, 507 * We stripped '.'s before and set len appropriately,
509 * but utf8_mbstowcs doesn't care about len 508 * but utf8s_to_utf16s doesn't care about len
510 */ 509 */
511 *outlen -= (name_len - len); 510 *outlen -= (name_len - len);
512 511
@@ -965,7 +964,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
965 int start = MSDOS_I(new_dir)->i_logstart; 964 int start = MSDOS_I(new_dir)->i_logstart;
966 dotdot_de->start = cpu_to_le16(start); 965 dotdot_de->start = cpu_to_le16(start);
967 dotdot_de->starthi = cpu_to_le16(start >> 16); 966 dotdot_de->starthi = cpu_to_le16(start >> 16);
968 mark_buffer_dirty(dotdot_bh); 967 mark_buffer_dirty_inode(dotdot_bh, old_inode);
969 if (IS_DIRSYNC(new_dir)) { 968 if (IS_DIRSYNC(new_dir)) {
970 err = sync_dirty_buffer(dotdot_bh); 969 err = sync_dirty_buffer(dotdot_bh);
971 if (err) 970 if (err)
@@ -1009,7 +1008,7 @@ error_dotdot:
1009 int start = MSDOS_I(old_dir)->i_logstart; 1008 int start = MSDOS_I(old_dir)->i_logstart;
1010 dotdot_de->start = cpu_to_le16(start); 1009 dotdot_de->start = cpu_to_le16(start);
1011 dotdot_de->starthi = cpu_to_le16(start >> 16); 1010 dotdot_de->starthi = cpu_to_le16(start >> 16);
1012 mark_buffer_dirty(dotdot_bh); 1011 mark_buffer_dirty_inode(dotdot_bh, old_inode);
1013 corrupt |= sync_dirty_buffer(dotdot_bh); 1012 corrupt |= sync_dirty_buffer(dotdot_bh);
1014 } 1013 }
1015error_inode: 1014error_inode:
@@ -1030,7 +1029,7 @@ error_inode:
1030 sinfo.bh = NULL; 1029 sinfo.bh = NULL;
1031 } 1030 }
1032 if (corrupt < 0) { 1031 if (corrupt < 0) {
1033 fat_fs_panic(new_dir->i_sb, 1032 fat_fs_error(new_dir->i_sb,
1034 "%s: Filesystem corrupted (i_pos %lld)", 1033 "%s: Filesystem corrupted (i_pos %lld)",
1035 __func__, sinfo.i_pos); 1034 __func__, sinfo.i_pos);
1036 } 1035 }
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 1ad703150dee..ae413086db97 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -19,7 +19,6 @@
19#include <linux/signal.h> 19#include <linux/signal.h>
20#include <linux/rcupdate.h> 20#include <linux/rcupdate.h>
21#include <linux/pid_namespace.h> 21#include <linux/pid_namespace.h>
22#include <linux/smp_lock.h>
23 22
24#include <asm/poll.h> 23#include <asm/poll.h>
25#include <asm/siginfo.h> 24#include <asm/siginfo.h>
@@ -198,15 +197,19 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
198} 197}
199 198
200static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, 199static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
201 uid_t uid, uid_t euid, int force) 200 int force)
202{ 201{
203 write_lock_irq(&filp->f_owner.lock); 202 write_lock_irq(&filp->f_owner.lock);
204 if (force || !filp->f_owner.pid) { 203 if (force || !filp->f_owner.pid) {
205 put_pid(filp->f_owner.pid); 204 put_pid(filp->f_owner.pid);
206 filp->f_owner.pid = get_pid(pid); 205 filp->f_owner.pid = get_pid(pid);
207 filp->f_owner.pid_type = type; 206 filp->f_owner.pid_type = type;
208 filp->f_owner.uid = uid; 207
209 filp->f_owner.euid = euid; 208 if (pid) {
209 const struct cred *cred = current_cred();
210 filp->f_owner.uid = cred->uid;
211 filp->f_owner.euid = cred->euid;
212 }
210 } 213 }
211 write_unlock_irq(&filp->f_owner.lock); 214 write_unlock_irq(&filp->f_owner.lock);
212} 215}
@@ -214,14 +217,13 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
214int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, 217int __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
215 int force) 218 int force)
216{ 219{
217 const struct cred *cred = current_cred();
218 int err; 220 int err;
219 221
220 err = security_file_set_fowner(filp); 222 err = security_file_set_fowner(filp);
221 if (err) 223 if (err)
222 return err; 224 return err;
223 225
224 f_modown(filp, pid, type, cred->uid, cred->euid, force); 226 f_modown(filp, pid, type, force);
225 return 0; 227 return 0;
226} 228}
227EXPORT_SYMBOL(__f_setown); 229EXPORT_SYMBOL(__f_setown);
@@ -247,7 +249,7 @@ EXPORT_SYMBOL(f_setown);
247 249
248void f_delown(struct file *filp) 250void f_delown(struct file *filp)
249{ 251{
250 f_modown(filp, NULL, PIDTYPE_PID, 0, 0, 1); 252 f_modown(filp, NULL, PIDTYPE_PID, 1);
251} 253}
252 254
253pid_t f_getown(struct file *filp) 255pid_t f_getown(struct file *filp)
@@ -425,14 +427,20 @@ static inline int sigio_perm(struct task_struct *p,
425} 427}
426 428
427static void send_sigio_to_task(struct task_struct *p, 429static void send_sigio_to_task(struct task_struct *p,
428 struct fown_struct *fown, 430 struct fown_struct *fown,
429 int fd, 431 int fd,
430 int reason) 432 int reason)
431{ 433{
432 if (!sigio_perm(p, fown, fown->signum)) 434 /*
435 * F_SETSIG can change ->signum lockless in parallel, make
436 * sure we read it once and use the same value throughout.
437 */
438 int signum = ACCESS_ONCE(fown->signum);
439
440 if (!sigio_perm(p, fown, signum))
433 return; 441 return;
434 442
435 switch (fown->signum) { 443 switch (signum) {
436 siginfo_t si; 444 siginfo_t si;
437 default: 445 default:
438 /* Queue a rt signal with the appropriate fd as its 446 /* Queue a rt signal with the appropriate fd as its
@@ -441,7 +449,7 @@ static void send_sigio_to_task(struct task_struct *p,
441 delivered even if we can't queue. Failure to 449 delivered even if we can't queue. Failure to
442 queue in this case _should_ be reported; we fall 450 queue in this case _should_ be reported; we fall
443 back to SIGIO in that case. --sct */ 451 back to SIGIO in that case. --sct */
444 si.si_signo = fown->signum; 452 si.si_signo = signum;
445 si.si_errno = 0; 453 si.si_errno = 0;
446 si.si_code = reason; 454 si.si_code = reason;
447 /* Make sure we are called with one of the POLL_* 455 /* Make sure we are called with one of the POLL_*
@@ -453,7 +461,7 @@ static void send_sigio_to_task(struct task_struct *p,
453 else 461 else
454 si.si_band = band_table[reason - POLL_IN]; 462 si.si_band = band_table[reason - POLL_IN];
455 si.si_fd = fd; 463 si.si_fd = fd;
456 if (!group_send_sig_info(fown->signum, &si, p)) 464 if (!group_send_sig_info(signum, &si, p))
457 break; 465 break;
458 /* fall-through: fall back on the old plain SIGIO signal */ 466 /* fall-through: fall back on the old plain SIGIO signal */
459 case 0: 467 case 0:
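
The send_sigio_to_task() change is the classic single-load idiom: ACCESS_ONCE() forces fown->signum to be read exactly once, so the permission check and the eventual delivery agree on one value even if F_SETSIG rewrites the field concurrently. In this kernel ACCESS_ONCE() is, to the best of my recollection, a volatile cast; a hedged userspace sketch of the same idea:

/* Sketch of the single-load idiom; the macro below approximates what
 * ACCESS_ONCE() expands to for an int in this kernel generation. */
#define READ_ONCE_INT(x) (*(volatile int *)&(x))

struct fown { int signum; };

static int deliver(struct fown *f)
{
	/* One load; every later use sees the same snapshot even if
	 * another thread changes f->signum in parallel. */
	int signum = READ_ONCE_INT(f->signum);

	if (signum < 0)		/* stand-in for the permission check */
		return -1;
	return signum;		/* stand-in for queueing the signal */
}

int main(void)
{
	struct fown f = { 29 };	/* arbitrary signal number */
	return deliver(&f) == 29 ? 0 : 1;
}
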
diff --git a/fs/file_table.c b/fs/file_table.c
index 54018fe48840..334ce39881f8 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -214,7 +214,7 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
214 */ 214 */
215 if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { 215 if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
216 file_take_write(file); 216 file_take_write(file);
217 error = mnt_want_write(mnt); 217 error = mnt_clone_write(mnt);
218 WARN_ON(error); 218 WARN_ON(error);
219 } 219 }
220 return error; 220 return error;
@@ -399,6 +399,44 @@ too_bad:
399 return 0; 399 return 0;
400} 400}
401 401
402/**
403 * mark_files_ro - mark all files read-only
404 * @sb: superblock in question
405 *
406 * All files are marked read-only. We don't care about pending
407 * delete files so this should be used in 'force' mode only.
408 */
409void mark_files_ro(struct super_block *sb)
410{
411 struct file *f;
412
413retry:
414 file_list_lock();
415 list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
416 struct vfsmount *mnt;
417 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
418 continue;
419 if (!file_count(f))
420 continue;
421 if (!(f->f_mode & FMODE_WRITE))
422 continue;
423 f->f_mode &= ~FMODE_WRITE;
424 if (file_check_writeable(f) != 0)
425 continue;
426 file_release_write(f);
427 mnt = mntget(f->f_path.mnt);
428 file_list_unlock();
429 /*
430 * This can sleep, so we can't hold
431 * the file_list_lock() spinlock.
432 */
433 mnt_drop_write(mnt);
434 mntput(mnt);
435 goto retry;
436 }
437 file_list_unlock();
438}
439
402void __init files_init(unsigned long mempages) 440void __init files_init(unsigned long mempages)
403{ 441{
404 int n; 442 int n;
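
mark_files_ro() above also shows the drop-lock-and-restart pattern: file_list_lock() is a spinlock, mnt_drop_write() can sleep, so the scan marks the file, drops the lock, does the sleeping work, and then restarts the whole walk because the list may have changed in the meantime. A condensed sketch of just that control flow, with hypothetical names:

/* Sketch: restartable scan that must drop a non-sleeping lock before
 * doing blocking work. All types and helpers here are hypothetical. */
struct item { struct item *next; int needs_work; };

static void lock_list(void)   { /* stands in for file_list_lock() */ }
static void unlock_list(void) { }
static void blocking_work(struct item *it) { (void)it; /* may sleep */ }

static void process_all(struct item *head)
{
retry:
	lock_list();
	for (struct item *it = head; it; it = it->next) {
		if (!it->needs_work)
			continue;
		it->needs_work = 0;	/* mark done before dropping the lock */
		unlock_list();
		blocking_work(it);	/* safe to sleep: lock not held */
		goto retry;		/* list may have changed; rescan */
	}
	unlock_list();
}

int main(void)
{
	struct item b = { 0, 1 }, a = { &b, 1 };

	process_all(&a);
	return a.needs_work | b.needs_work;	/* 0: both processed */
}
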
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 1dacda831577..1e8af939b3e4 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -38,6 +38,7 @@
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/kernel.h> 39#include <linux/kernel.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/smp_lock.h>
41#include <linux/stat.h> 42#include <linux/stat.h>
42#include <linux/vfs.h> 43#include <linux/vfs.h>
43#include <linux/mount.h> 44#include <linux/mount.h>
@@ -80,12 +81,16 @@ vxfs_put_super(struct super_block *sbp)
80{ 81{
81 struct vxfs_sb_info *infp = VXFS_SBI(sbp); 82 struct vxfs_sb_info *infp = VXFS_SBI(sbp);
82 83
84 lock_kernel();
85
83 vxfs_put_fake_inode(infp->vsi_fship); 86 vxfs_put_fake_inode(infp->vsi_fship);
84 vxfs_put_fake_inode(infp->vsi_ilist); 87 vxfs_put_fake_inode(infp->vsi_ilist);
85 vxfs_put_fake_inode(infp->vsi_stilist); 88 vxfs_put_fake_inode(infp->vsi_stilist);
86 89
87 brelse(infp->vsi_bp); 90 brelse(infp->vsi_bp);
88 kfree(infp); 91 kfree(infp);
92
93 unlock_kernel();
89} 94}
90 95
91/** 96/**
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 91013ff7dd53..c54226be5294 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -64,6 +64,28 @@ static void writeback_release(struct backing_dev_info *bdi)
64 clear_bit(BDI_pdflush, &bdi->state); 64 clear_bit(BDI_pdflush, &bdi->state);
65} 65}
66 66
67static noinline void block_dump___mark_inode_dirty(struct inode *inode)
68{
69 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
70 struct dentry *dentry;
71 const char *name = "?";
72
73 dentry = d_find_alias(inode);
74 if (dentry) {
75 spin_lock(&dentry->d_lock);
76 name = (const char *) dentry->d_name.name;
77 }
78 printk(KERN_DEBUG
79 "%s(%d): dirtied inode %lu (%s) on %s\n",
80 current->comm, task_pid_nr(current), inode->i_ino,
81 name, inode->i_sb->s_id);
82 if (dentry) {
83 spin_unlock(&dentry->d_lock);
84 dput(dentry);
85 }
86 }
87}
88
67/** 89/**
68 * __mark_inode_dirty - internal function 90 * __mark_inode_dirty - internal function
69 * @inode: inode to mark 91 * @inode: inode to mark
@@ -114,23 +136,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
114 if ((inode->i_state & flags) == flags) 136 if ((inode->i_state & flags) == flags)
115 return; 137 return;
116 138
117 if (unlikely(block_dump)) { 139 if (unlikely(block_dump))
118 struct dentry *dentry = NULL; 140 block_dump___mark_inode_dirty(inode);
119 const char *name = "?";
120
121 if (!list_empty(&inode->i_dentry)) {
122 dentry = list_entry(inode->i_dentry.next,
123 struct dentry, d_alias);
124 if (dentry && dentry->d_name.name)
125 name = (const char *) dentry->d_name.name;
126 }
127
128 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev"))
129 printk(KERN_DEBUG
130 "%s(%d): dirtied inode %lu (%s) on %s\n",
131 current->comm, task_pid_nr(current), inode->i_ino,
132 name, inode->i_sb->s_id);
133 }
134 141
135 spin_lock(&inode_lock); 142 spin_lock(&inode_lock);
136 if ((inode->i_state & flags) != flags) { 143 if ((inode->i_state & flags) != flags) {
@@ -271,7 +278,26 @@ int sb_has_dirty_inodes(struct super_block *sb)
271EXPORT_SYMBOL(sb_has_dirty_inodes); 278EXPORT_SYMBOL(sb_has_dirty_inodes);
272 279
273/* 280/*
274 * Write a single inode's dirty pages and inode data out to disk. 281 * Wait for writeback on an inode to complete.
282 */
283static void inode_wait_for_writeback(struct inode *inode)
284{
285 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
286 wait_queue_head_t *wqh;
287
288 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
289 do {
290 spin_unlock(&inode_lock);
291 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
292 spin_lock(&inode_lock);
293 } while (inode->i_state & I_SYNC);
294}
295
296/*
297 * Write out an inode's dirty pages. Called under inode_lock. Either the
298 * caller has ref on the inode (either via __iget or via syscall against an fd)
299 * or the inode has I_WILL_FREE set (via generic_forget_inode)
300 *
275 * If `wait' is set, wait on the writeout. 301 * If `wait' is set, wait on the writeout.
276 * 302 *
277 * The whole writeout design is quite complex and fragile. We want to avoid 303 * The whole writeout design is quite complex and fragile. We want to avoid
@@ -281,15 +307,39 @@ EXPORT_SYMBOL(sb_has_dirty_inodes);
281 * Called under inode_lock. 307 * Called under inode_lock.
282 */ 308 */
283static int 309static int
284__sync_single_inode(struct inode *inode, struct writeback_control *wbc) 310writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
285{ 311{
286 unsigned dirty;
287 struct address_space *mapping = inode->i_mapping; 312 struct address_space *mapping = inode->i_mapping;
288 int wait = wbc->sync_mode == WB_SYNC_ALL; 313 int wait = wbc->sync_mode == WB_SYNC_ALL;
314 unsigned dirty;
289 int ret; 315 int ret;
290 316
317 if (!atomic_read(&inode->i_count))
318 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
319 else
320 WARN_ON(inode->i_state & I_WILL_FREE);
321
322 if (inode->i_state & I_SYNC) {
323 /*
324 * If this inode is locked for writeback and we are not doing
325 * writeback-for-data-integrity, move it to s_more_io so that
326 * writeback can proceed with the other inodes on s_io.
327 *
328 * We'll have another go at writing back this inode when we
329 * completed a full scan of s_io.
330 */
331 if (!wait) {
332 requeue_io(inode);
333 return 0;
334 }
335
336 /*
337 * It's a data-integrity sync. We must wait.
338 */
339 inode_wait_for_writeback(inode);
340 }
341
291 BUG_ON(inode->i_state & I_SYNC); 342 BUG_ON(inode->i_state & I_SYNC);
292 WARN_ON(inode->i_state & I_NEW);
293 343
294 /* Set I_SYNC, reset I_DIRTY */ 344 /* Set I_SYNC, reset I_DIRTY */
295 dirty = inode->i_state & I_DIRTY; 345 dirty = inode->i_state & I_DIRTY;
@@ -314,9 +364,8 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
314 } 364 }
315 365
316 spin_lock(&inode_lock); 366 spin_lock(&inode_lock);
317 WARN_ON(inode->i_state & I_NEW);
318 inode->i_state &= ~I_SYNC; 367 inode->i_state &= ~I_SYNC;
319 if (!(inode->i_state & I_FREEING)) { 368 if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
320 if (!(inode->i_state & I_DIRTY) && 369 if (!(inode->i_state & I_DIRTY) &&
321 mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 370 mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
322 /* 371 /*
@@ -385,50 +434,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
385} 434}
386 435
387/* 436/*
388 * Write out an inode's dirty pages. Called under inode_lock. Either the
389 * caller has ref on the inode (either via __iget or via syscall against an fd)
390 * or the inode has I_WILL_FREE set (via generic_forget_inode)
391 */
392static int
393__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
394{
395 wait_queue_head_t *wqh;
396
397 if (!atomic_read(&inode->i_count))
398 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
399 else
400 WARN_ON(inode->i_state & I_WILL_FREE);
401
402 if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_SYNC)) {
403 /*
404 * We're skipping this inode because it's locked, and we're not
405 * doing writeback-for-data-integrity. Move it to s_more_io so
406 * that writeback can proceed with the other inodes on s_io.
407 * We'll have another go at writing back this inode when we
408 * completed a full scan of s_io.
409 */
410 requeue_io(inode);
411 return 0;
412 }
413
414 /*
415 * It's a data-integrity sync. We must wait.
416 */
417 if (inode->i_state & I_SYNC) {
418 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
419
420 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
421 do {
422 spin_unlock(&inode_lock);
423 __wait_on_bit(wqh, &wq, inode_wait,
424 TASK_UNINTERRUPTIBLE);
425 spin_lock(&inode_lock);
426 } while (inode->i_state & I_SYNC);
427 }
428 return __sync_single_inode(inode, wbc);
429}
430
431/*
432 * Write out a superblock's list of dirty inodes. A wait will be performed 437 * Write out a superblock's list of dirty inodes. A wait will be performed
433 * upon no inodes, all inodes or the final one, depending upon sync_mode. 438 * upon no inodes, all inodes or the final one, depending upon sync_mode.
434 * 439 *
@@ -487,7 +492,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
487 break; 492 break;
488 } 493 }
489 494
490 if (inode->i_state & I_NEW) { 495 if (inode->i_state & (I_NEW | I_WILL_FREE)) {
491 requeue_io(inode); 496 requeue_io(inode);
492 continue; 497 continue;
493 } 498 }
@@ -518,10 +523,10 @@ void generic_sync_sb_inodes(struct super_block *sb,
518 if (current_is_pdflush() && !writeback_acquire(bdi)) 523 if (current_is_pdflush() && !writeback_acquire(bdi))
519 break; 524 break;
520 525
521 BUG_ON(inode->i_state & I_FREEING); 526 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
522 __iget(inode); 527 __iget(inode);
523 pages_skipped = wbc->pages_skipped; 528 pages_skipped = wbc->pages_skipped;
524 __writeback_single_inode(inode, wbc); 529 writeback_single_inode(inode, wbc);
525 if (current_is_pdflush()) 530 if (current_is_pdflush())
526 writeback_release(bdi); 531 writeback_release(bdi);
527 if (wbc->pages_skipped != pages_skipped) { 532 if (wbc->pages_skipped != pages_skipped) {
@@ -679,55 +684,6 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 }
 
 /**
- * sync_inodes - writes all inodes to disk
- * @wait: wait for completion
- *
- * sync_inodes() goes through each super block's dirty inode list, writes the
- * inodes out, waits on the writeout and puts the inodes back on the normal
- * list.
- *
- * This is for sys_sync(). fsync_dev() uses the same algorithm. The subtle
- * part of the sync functions is that the blockdev "superblock" is processed
- * last. This is because the write_inode() function of a typical fs will
- * perform no I/O, but will mark buffers in the blockdev mapping as dirty.
- * What we want to do is to perform all that dirtying first, and then write
- * back all those inode blocks via the blockdev mapping in one sweep. So the
- * additional (somewhat redundant) sync_blockdev() calls here are to make
- * sure that really happens. Because if we call sync_inodes_sb(wait=1) with
- * outstanding dirty inodes, the writeback goes block-at-a-time within the
- * filesystem's write_inode(). This is extremely slow.
- */
-static void __sync_inodes(int wait)
-{
-	struct super_block *sb;
-
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		if (sb->s_root) {
-			sync_inodes_sb(sb, wait);
-			sync_blockdev(sb->s_bdev);
-		}
-		up_read(&sb->s_umount);
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-}
-
-void sync_inodes(int wait)
-{
-	__sync_inodes(0);
-
-	if (wait)
-		__sync_inodes(1);
-}
-
-/**
  * write_inode_now - write an inode to disk
  * @inode: inode to write to disk
  * @sync: whether the write should be synchronous or not
@@ -752,7 +708,7 @@ int write_inode_now(struct inode *inode, int sync)
 
 	might_sleep();
 	spin_lock(&inode_lock);
-	ret = __writeback_single_inode(inode, &wbc);
+	ret = writeback_single_inode(inode, &wbc);
 	spin_unlock(&inode_lock);
 	if (sync)
 		inode_sync_wait(inode);
@@ -776,7 +732,7 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
 	int ret;
 
 	spin_lock(&inode_lock);
-	ret = __writeback_single_inode(inode, wbc);
+	ret = writeback_single_inode(inode, wbc);
 	spin_unlock(&inode_lock);
 	return ret;
 }
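With sync_inodes() gone, single-inode flushing funnels through writeback_single_inode() behind the two wrappers above. A minimal sketch of the calling convention, mirroring the writeback_control that write_inode_now() builds; the wrapper function here is hypothetical, not part of the patch:

	#include <linux/fs.h>
	#include <linux/writeback.h>

	/* Hedged sketch: flush one inode for data integrity, the way
	 * write_inode_now(inode, 1) does. */
	static int example_flush_inode(struct inode *inode)
	{
		struct writeback_control wbc = {
			.nr_to_write = LONG_MAX,
			.sync_mode   = WB_SYNC_ALL,	/* wait on I_SYNC */
			.range_start = 0,
			.range_end   = LLONG_MAX,
		};

		/* sync_inode() takes inode_lock and calls
		 * writeback_single_inode() */
		return sync_inode(inode, &wbc);
	}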
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 72437065f6ad..e95eeb445e58 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -3,5 +3,6 @@
 #
 
 obj-$(CONFIG_FUSE_FS) += fuse.o
+obj-$(CONFIG_CUSE) += cuse.o
 
 fuse-objs := dev.o dir.o file.o inode.o control.o
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
new file mode 100644
index 000000000000..de792dcf3274
--- /dev/null
+++ b/fs/fuse/cuse.c
@@ -0,0 +1,610 @@
1/*
2 * CUSE: Character device in Userspace
3 *
4 * Copyright (C) 2008-2009 SUSE Linux Products GmbH
5 * Copyright (C) 2008-2009 Tejun Heo <tj@kernel.org>
6 *
7 * This file is released under the GPLv2.
8 *
9 * CUSE enables character devices to be implemented from userland much
10 * like FUSE allows filesystems. On initialization /dev/cuse is
11 * created. By opening the file and replying to the CUSE_INIT request
12 * the userland CUSE server can create a character device. After that the
13 * operation is very similar to FUSE.
14 *
15 * A CUSE instance involves the following objects.
16 *
17 * cuse_conn : contains fuse_conn and serves as bonding structure
18 * channel : file handle connected to the userland CUSE server
19 * cdev : the implemented character device
20 * dev : generic device for cdev
21 *
22 * Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with
23 * devices, it's called 'channel' to reduce confusion.
24 *
25 * channel determines when the character device dies. When channel is
26 * closed, everything begins to destruct. The cuse_conn is taken off
27 * the lookup table preventing further access from cdev; cdev and
28 * generic device are removed and the base reference of cuse_conn is
29 * put.
30 *
31 * On each open, the matching cuse_conn is looked up and if found an
32 * additional reference is taken which is released when the file is
33 * closed.
34 */
35
36#include <linux/fuse.h>
37#include <linux/cdev.h>
38#include <linux/device.h>
39#include <linux/file.h>
40#include <linux/fs.h>
41#include <linux/kdev_t.h>
42#include <linux/kthread.h>
43#include <linux/list.h>
44#include <linux/magic.h>
45#include <linux/miscdevice.h>
46#include <linux/mutex.h>
47#include <linux/spinlock.h>
48#include <linux/stat.h>
49
50#include "fuse_i.h"
51
52#define CUSE_CONNTBL_LEN 64
53
54struct cuse_conn {
55 struct list_head list; /* linked on cuse_conntbl */
56 struct fuse_conn fc; /* fuse connection */
57 struct cdev *cdev; /* associated character device */
58 struct device *dev; /* device representing @cdev */
59
60 /* init parameters, set once during initialization */
61 bool unrestricted_ioctl;
62};
63
64static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */
65static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN];
66static struct class *cuse_class;
67
68static struct cuse_conn *fc_to_cc(struct fuse_conn *fc)
69{
70 return container_of(fc, struct cuse_conn, fc);
71}
72
73static struct list_head *cuse_conntbl_head(dev_t devt)
74{
75 return &cuse_conntbl[(MAJOR(devt) + MINOR(devt)) % CUSE_CONNTBL_LEN];
76}
77
78
79/**************************************************************************
80 * CUSE frontend operations
81 *
82 * These are file operations for the character device.
83 *
84 * On open, CUSE opens a file from the FUSE mnt and stores it to
85 * private_data of the open file. All other ops call FUSE ops on the
86 * FUSE file.
87 */
88
89static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
90 loff_t *ppos)
91{
92 loff_t pos = 0;
93
94 return fuse_direct_io(file, buf, count, &pos, 0);
95}
96
97static ssize_t cuse_write(struct file *file, const char __user *buf,
98 size_t count, loff_t *ppos)
99{
100 loff_t pos = 0;
101 /*
102 * No locking or generic_write_checks(), the server is
103 * responsible for locking and sanity checks.
104 */
105 return fuse_direct_io(file, buf, count, &pos, 1);
106}
107
108static int cuse_open(struct inode *inode, struct file *file)
109{
110 dev_t devt = inode->i_cdev->dev;
111 struct cuse_conn *cc = NULL, *pos;
112 int rc;
113
114 /* look up and get the connection */
115 spin_lock(&cuse_lock);
116 list_for_each_entry(pos, cuse_conntbl_head(devt), list)
117 if (pos->dev->devt == devt) {
118 fuse_conn_get(&pos->fc);
119 cc = pos;
120 break;
121 }
122 spin_unlock(&cuse_lock);
123
124 /* dead? */
125 if (!cc)
126 return -ENODEV;
127
128 /*
129 * Generic permission check is already done against the chrdev
130 * file, proceed to open.
131 */
132 rc = fuse_do_open(&cc->fc, 0, file, 0);
133 if (rc)
134 fuse_conn_put(&cc->fc);
135 return rc;
136}
137
138static int cuse_release(struct inode *inode, struct file *file)
139{
140 struct fuse_file *ff = file->private_data;
141 struct fuse_conn *fc = ff->fc;
142
143 fuse_sync_release(ff, file->f_flags);
144 fuse_conn_put(fc);
145
146 return 0;
147}
148
149static long cuse_file_ioctl(struct file *file, unsigned int cmd,
150 unsigned long arg)
151{
152 struct fuse_file *ff = file->private_data;
153 struct cuse_conn *cc = fc_to_cc(ff->fc);
154 unsigned int flags = 0;
155
156 if (cc->unrestricted_ioctl)
157 flags |= FUSE_IOCTL_UNRESTRICTED;
158
159 return fuse_do_ioctl(file, cmd, arg, flags);
160}
161
162static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd,
163 unsigned long arg)
164{
165 struct fuse_file *ff = file->private_data;
166 struct cuse_conn *cc = fc_to_cc(ff->fc);
167 unsigned int flags = FUSE_IOCTL_COMPAT;
168
169 if (cc->unrestricted_ioctl)
170 flags |= FUSE_IOCTL_UNRESTRICTED;
171
172 return fuse_do_ioctl(file, cmd, arg, flags);
173}
174
175static const struct file_operations cuse_frontend_fops = {
176 .owner = THIS_MODULE,
177 .read = cuse_read,
178 .write = cuse_write,
179 .open = cuse_open,
180 .release = cuse_release,
181 .unlocked_ioctl = cuse_file_ioctl,
182 .compat_ioctl = cuse_file_compat_ioctl,
183 .poll = fuse_file_poll,
184};
185
186
187/**************************************************************************
188 * CUSE channel initialization and destruction
189 */
190
191struct cuse_devinfo {
192 const char *name;
193};
194
195/**
196 * cuse_parse_one - parse one key=value pair
197 * @pp: i/o parameter for the current position
198 * @end: points to one past the end of the packed string
199 * @keyp: out parameter for key
200 * @valp: out parameter for value
201 *
202 * *@pp points to packed strings - "key0=val0\0key1=val1\0" which ends
203 * at @end - 1. This function parses one pair and sets *@keyp to the
204 * start of the key and *@valp to the start of the value. Note that
205 * the original string is modified such that the key string is
206 * terminated with '\0'. *@pp is updated to point to the next string.
207 *
208 * RETURNS:
209 * 1 on successful parse, 0 on EOF, -errno on failure.
210 */
211static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
212{
213 char *p = *pp;
214 char *key, *val;
215
216 while (p < end && *p == '\0')
217 p++;
218 if (p == end)
219 return 0;
220
221 if (end[-1] != '\0') {
222 printk(KERN_ERR "CUSE: info not properly terminated\n");
223 return -EINVAL;
224 }
225
226 key = val = p;
227 p += strlen(p);
228
229 if (valp) {
230 strsep(&val, "=");
231 if (!val)
232 val = key + strlen(key);
233 key = strstrip(key);
234 val = strstrip(val);
235 } else
236 key = strstrip(key);
237
238 if (!strlen(key)) {
239 printk(KERN_ERR "CUSE: zero length info key specified\n");
240 return -EINVAL;
241 }
242
243 *pp = p;
244 *keyp = key;
245 if (valp)
246 *valp = val;
247
248 return 1;
249}
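/*
 * Illustrative example (not part of the original source): for the packed
 * buffer "DEVNAME=foo\0SUBSYSTEM=bar\0", the first call returns 1 with
 * *keyp == "DEVNAME" and *valp == "foo" (strsep() overwrites '=' with
 * '\0', so the buffer must stay writable), the second call yields
 * "SUBSYSTEM"/"bar", and the third returns 0 for EOF.
 */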
250
251/**
252 * cuse_parse_dev_info - parse device info
253 * @p: device info string
254 * @len: length of device info string
255 * @devinfo: out parameter for parsed device info
256 *
257 * Parse @p to extract device info and store it into @devinfo. String
258 * pointed to by @p is modified by parsing and @devinfo points into
259 * it, so @p shouldn't be freed while @devinfo is in use.
260 *
261 * RETURNS:
262 * 0 on success, -errno on failure.
263 */
264static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
265{
266 char *end = p + len;
267 char *key, *val;
268 int rc;
269
270 while (true) {
271 rc = cuse_parse_one(&p, end, &key, &val);
272 if (rc < 0)
273 return rc;
274 if (!rc)
275 break;
276 if (strcmp(key, "DEVNAME") == 0)
277 devinfo->name = val;
278 else
279 printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n",
280 key);
281 }
282
283 if (!devinfo->name || !strlen(devinfo->name)) {
284 printk(KERN_ERR "CUSE: DEVNAME unspecified\n");
285 return -EINVAL;
286 }
287
288 return 0;
289}
290
291static void cuse_gendev_release(struct device *dev)
292{
293 kfree(dev);
294}
295
296/**
297 * cuse_process_init_reply - finish initializing CUSE channel
298 *
299 * This function creates the character device and sets up all the
300 * required data structures for it. Please read the comment at the
301 * top of this file for high level overview.
302 */
303static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
304{
305 struct cuse_conn *cc = fc_to_cc(fc);
306 struct cuse_init_out *arg = &req->misc.cuse_init_out;
307 struct page *page = req->pages[0];
308 struct cuse_devinfo devinfo = { };
309 struct device *dev;
310 struct cdev *cdev;
311 dev_t devt;
312 int rc;
313
314 if (req->out.h.error ||
315 arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) {
316 goto err;
317 }
318
319 fc->minor = arg->minor;
320 fc->max_read = max_t(unsigned, arg->max_read, 4096);
321 fc->max_write = max_t(unsigned, arg->max_write, 4096);
322
323 /* parse init reply */
324 cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL;
325
326 rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size,
327 &devinfo);
328 if (rc)
329 goto err;
330
331 /* determine and reserve devt */
332 devt = MKDEV(arg->dev_major, arg->dev_minor);
333 if (!MAJOR(devt))
334 rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name);
335 else
336 rc = register_chrdev_region(devt, 1, devinfo.name);
337 if (rc) {
338 printk(KERN_ERR "CUSE: failed to register chrdev region\n");
339 goto err;
340 }
341
342 /* devt determined, create device */
343 rc = -ENOMEM;
344 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
345 if (!dev)
346 goto err_region;
347
348 device_initialize(dev);
349 dev_set_uevent_suppress(dev, 1);
350 dev->class = cuse_class;
351 dev->devt = devt;
352 dev->release = cuse_gendev_release;
353 dev_set_drvdata(dev, cc);
354 dev_set_name(dev, "%s", devinfo.name);
355
356 rc = device_add(dev);
357 if (rc)
358 goto err_device;
359
360 /* register cdev */
361 rc = -ENOMEM;
362 cdev = cdev_alloc();
363 if (!cdev)
364 goto err_device;
365
366 cdev->owner = THIS_MODULE;
367 cdev->ops = &cuse_frontend_fops;
368
369 rc = cdev_add(cdev, devt, 1);
370 if (rc)
371 goto err_cdev;
372
373 cc->dev = dev;
374 cc->cdev = cdev;
375
376 /* make the device available */
377 spin_lock(&cuse_lock);
378 list_add(&cc->list, cuse_conntbl_head(devt));
379 spin_unlock(&cuse_lock);
380
381 /* announce device availability */
382 dev_set_uevent_suppress(dev, 0);
383 kobject_uevent(&dev->kobj, KOBJ_ADD);
384out:
385 __free_page(page);
386 return;
387
388err_cdev:
389 cdev_del(cdev);
390err_device:
391 put_device(dev);
392err_region:
393 unregister_chrdev_region(devt, 1);
394err:
395 fc->conn_error = 1;
396 goto out;
397}
398
399static int cuse_send_init(struct cuse_conn *cc)
400{
401 int rc;
402 struct fuse_req *req;
403 struct page *page;
404 struct fuse_conn *fc = &cc->fc;
405 struct cuse_init_in *arg;
406
407 BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
408
409 req = fuse_get_req(fc);
410 if (IS_ERR(req)) {
411 rc = PTR_ERR(req);
412 goto err;
413 }
414
415 rc = -ENOMEM;
416 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
417 if (!page)
418 goto err_put_req;
419
420 arg = &req->misc.cuse_init_in;
421 arg->major = FUSE_KERNEL_VERSION;
422 arg->minor = FUSE_KERNEL_MINOR_VERSION;
423 arg->flags |= CUSE_UNRESTRICTED_IOCTL;
424 req->in.h.opcode = CUSE_INIT;
425 req->in.numargs = 1;
426 req->in.args[0].size = sizeof(struct cuse_init_in);
427 req->in.args[0].value = arg;
428 req->out.numargs = 2;
429 req->out.args[0].size = sizeof(struct cuse_init_out);
430 req->out.args[0].value = &req->misc.cuse_init_out;
431 req->out.args[1].size = CUSE_INIT_INFO_MAX;
432 req->out.argvar = 1;
433 req->out.argpages = 1;
434 req->pages[0] = page;
435 req->num_pages = 1;
436 req->end = cuse_process_init_reply;
437 fuse_request_send_background(fc, req);
438
439 return 0;
440
441err_put_req:
442 fuse_put_request(fc, req);
443err:
444 return rc;
445}
446
447static void cuse_fc_release(struct fuse_conn *fc)
448{
449 struct cuse_conn *cc = fc_to_cc(fc);
450 kfree(cc);
451}
452
453/**
454 * cuse_channel_open - open method for /dev/cuse
455 * @inode: inode for /dev/cuse
456 * @file: file struct being opened
457 *
458 * Userland CUSE server can create a CUSE device by opening /dev/cuse
459 * and replying to the initialization request the kernel sends. This
460 * function is responsible for handling CUSE device initialization.
461 * Because the fd opened by this function is used during
462 * initialization, this function only creates cuse_conn and sends
463 * init. The rest is delegated to a kthread.
464 *
465 * RETURNS:
466 * 0 on success, -errno on failure.
467 */
468static int cuse_channel_open(struct inode *inode, struct file *file)
469{
470 struct cuse_conn *cc;
471 int rc;
472
473 /* set up cuse_conn */
474 cc = kzalloc(sizeof(*cc), GFP_KERNEL);
475 if (!cc)
476 return -ENOMEM;
477
478 fuse_conn_init(&cc->fc);
479
480 INIT_LIST_HEAD(&cc->list);
481 cc->fc.release = cuse_fc_release;
482
483 cc->fc.connected = 1;
484 cc->fc.blocked = 0;
485 rc = cuse_send_init(cc);
486 if (rc) {
487 fuse_conn_put(&cc->fc);
488 return rc;
489 }
490 file->private_data = &cc->fc; /* channel owns base reference to cc */
491
492 return 0;
493}
494
495/**
496 * cuse_channel_release - release method for /dev/cuse
497 * @inode: inode for /dev/cuse
498 * @file: file struct being closed
499 *
500 * Disconnect the channel, deregister CUSE device and initiate
501 * destruction by putting the default reference.
502 *
503 * RETURNS:
504 * 0 on success, -errno on failure.
505 */
506static int cuse_channel_release(struct inode *inode, struct file *file)
507{
508 struct cuse_conn *cc = fc_to_cc(file->private_data);
509 int rc;
510
511 /* remove from the conntbl, no more access from this point on */
512 spin_lock(&cuse_lock);
513 list_del_init(&cc->list);
514 spin_unlock(&cuse_lock);
515
516 /* remove device */
517 if (cc->dev)
518 device_unregister(cc->dev);
519 if (cc->cdev) {
520 unregister_chrdev_region(cc->cdev->dev, 1);
521 cdev_del(cc->cdev);
522 }
523
524 /* kill connection and shutdown channel */
525 fuse_conn_kill(&cc->fc);
526 rc = fuse_dev_release(inode, file); /* puts the base reference */
527
528 return rc;
529}
530
531static struct file_operations cuse_channel_fops; /* initialized during init */
532
533
534/**************************************************************************
535 * Misc stuff and module initialization
536 *
537 * CUSE exports the same set of attributes to sysfs as fusectl.
538 */
539
540static ssize_t cuse_class_waiting_show(struct device *dev,
541 struct device_attribute *attr, char *buf)
542{
543 struct cuse_conn *cc = dev_get_drvdata(dev);
544
545 return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
546}
547
548static ssize_t cuse_class_abort_store(struct device *dev,
549 struct device_attribute *attr,
550 const char *buf, size_t count)
551{
552 struct cuse_conn *cc = dev_get_drvdata(dev);
553
554 fuse_abort_conn(&cc->fc);
555 return count;
556}
557
558static struct device_attribute cuse_class_dev_attrs[] = {
559 __ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL),
560 __ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store),
561 { }
562};
563
564static struct miscdevice cuse_miscdev = {
565 .minor = MISC_DYNAMIC_MINOR,
566 .name = "cuse",
567 .fops = &cuse_channel_fops,
568};
569
570static int __init cuse_init(void)
571{
572 int i, rc;
573
574 /* init conntbl */
575 for (i = 0; i < CUSE_CONNTBL_LEN; i++)
576 INIT_LIST_HEAD(&cuse_conntbl[i]);
577
578 /* inherit and extend fuse_dev_operations */
579 cuse_channel_fops = fuse_dev_operations;
580 cuse_channel_fops.owner = THIS_MODULE;
581 cuse_channel_fops.open = cuse_channel_open;
582 cuse_channel_fops.release = cuse_channel_release;
583
584 cuse_class = class_create(THIS_MODULE, "cuse");
585 if (IS_ERR(cuse_class))
586 return PTR_ERR(cuse_class);
587
588 cuse_class->dev_attrs = cuse_class_dev_attrs;
589
590 rc = misc_register(&cuse_miscdev);
591 if (rc) {
592 class_destroy(cuse_class);
593 return rc;
594 }
595
596 return 0;
597}
598
599static void __exit cuse_exit(void)
600{
601 misc_deregister(&cuse_miscdev);
602 class_destroy(cuse_class);
603}
604
605module_init(cuse_init);
606module_exit(cuse_exit);
607
608MODULE_AUTHOR("Tejun Heo <tj@kernel.org>");
609MODULE_DESCRIPTION("Character device in Userspace");
610MODULE_LICENSE("GPL");
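For reference, the userland side of the handshake implemented above is typically written against libfuse's cuse_lowlevel API (libfuse 2.8+). A hedged sketch; the device name and the ops shown are illustrative, not part of this patch:

	#include <cuse_lowlevel.h>
	#include <fuse_opt.h>
	#include <stddef.h>

	static void example_open(fuse_req_t req, struct fuse_file_info *fi)
	{
		fuse_reply_open(req, fi);
	}

	static const struct cuse_lowlevel_ops example_clop = {
		.open = example_open,
		/* .read, .write, .ioctl, ... */
	};

	int main(int argc, char **argv)
	{
		/* "DEVNAME=..." becomes the devinfo key parsed by
		 * cuse_parse_devinfo() above */
		const char *dev_info_argv[] = { "DEVNAME=example" };
		struct cuse_info ci = {
			.dev_info_argc = 1,
			.dev_info_argv = dev_info_argv,
			.flags = CUSE_UNRESTRICTED_IOCTL,
		};

		return cuse_lowlevel_main(argc, argv, &ci, &example_clop,
					  NULL);
	}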
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ba76b68c52ff..6484eb75acd6 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -46,6 +46,7 @@ struct fuse_req *fuse_request_alloc(void)
 	fuse_request_init(req);
 	return req;
 }
+EXPORT_SYMBOL_GPL(fuse_request_alloc);
 
 struct fuse_req *fuse_request_alloc_nofs(void)
 {
@@ -124,6 +125,7 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
 	atomic_dec(&fc->num_waiting);
 	return ERR_PTR(err);
 }
+EXPORT_SYMBOL_GPL(fuse_get_req);
 
 /*
  * Return request in fuse_file->reserved_req. However that may
@@ -208,6 +210,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 		fuse_request_free(req);
 	}
 }
+EXPORT_SYMBOL_GPL(fuse_put_request);
 
 static unsigned len_args(unsigned numargs, struct fuse_arg *args)
 {
@@ -282,9 +285,9 @@ __releases(&fc->lock)
 		wake_up_all(&fc->blocked_waitq);
 	}
 	if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
-	    fc->connected) {
-		clear_bdi_congested(&fc->bdi, READ);
-		clear_bdi_congested(&fc->bdi, WRITE);
+	    fc->connected && fc->bdi_initialized) {
+		clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
+		clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
 	}
 	fc->num_background--;
 	fc->active_background--;
@@ -400,6 +403,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 	}
 	spin_unlock(&fc->lock);
 }
+EXPORT_SYMBOL_GPL(fuse_request_send);
 
 static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
 					    struct fuse_req *req)
@@ -408,9 +412,10 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
 	fc->num_background++;
 	if (fc->num_background == FUSE_MAX_BACKGROUND)
 		fc->blocked = 1;
-	if (fc->num_background == FUSE_CONGESTION_THRESHOLD) {
-		set_bdi_congested(&fc->bdi, READ);
-		set_bdi_congested(&fc->bdi, WRITE);
+	if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+	    fc->bdi_initialized) {
+		set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
+		set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
 	}
 	list_add_tail(&req->list, &fc->bg_queue);
 	flush_bg_queue(fc);
@@ -439,6 +444,7 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 	req->isreply = 1;
 	fuse_request_send_nowait(fc, req);
 }
+EXPORT_SYMBOL_GPL(fuse_request_send_background);
 
 /*
  * Called under fc->lock
@@ -843,6 +849,81 @@ err:
 	return err;
 }
 
+static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
+				   struct fuse_copy_state *cs)
+{
+	struct fuse_notify_inval_inode_out outarg;
+	int err = -EINVAL;
+
+	if (size != sizeof(outarg))
+		goto err;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto err;
+	fuse_copy_finish(cs);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (!fc->sb)
+		goto err_unlock;
+
+	err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
+				       outarg.off, outarg.len);
+
+err_unlock:
+	up_read(&fc->killsb);
+	return err;
+
+err:
+	fuse_copy_finish(cs);
+	return err;
+}
+
+static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
+				   struct fuse_copy_state *cs)
+{
+	struct fuse_notify_inval_entry_out outarg;
+	int err = -EINVAL;
+	char buf[FUSE_NAME_MAX+1];
+	struct qstr name;
+
+	if (size < sizeof(outarg))
+		goto err;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto err;
+
+	err = -ENAMETOOLONG;
+	if (outarg.namelen > FUSE_NAME_MAX)
+		goto err;
+
+	name.name = buf;
+	name.len = outarg.namelen;
+	err = fuse_copy_one(cs, buf, outarg.namelen + 1);
+	if (err)
+		goto err;
+	fuse_copy_finish(cs);
+	buf[outarg.namelen] = 0;
+	name.hash = full_name_hash(name.name, name.len);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (!fc->sb)
+		goto err_unlock;
+
+	err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
+
+err_unlock:
+	up_read(&fc->killsb);
+	return err;
+
+err:
+	fuse_copy_finish(cs);
+	return err;
+}
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		       unsigned int size, struct fuse_copy_state *cs)
 {
@@ -850,6 +931,12 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 	case FUSE_NOTIFY_POLL:
 		return fuse_notify_poll(fc, size, cs);
 
+	case FUSE_NOTIFY_INVAL_INODE:
+		return fuse_notify_inval_inode(fc, size, cs);
+
+	case FUSE_NOTIFY_INVAL_ENTRY:
+		return fuse_notify_inval_entry(fc, size, cs);
+
 	default:
 		fuse_copy_finish(cs);
 		return -EINVAL;
@@ -904,7 +991,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
 			      unsigned long nr_segs, loff_t pos)
 {
 	int err;
-	unsigned nbytes = iov_length(iov, nr_segs);
+	size_t nbytes = iov_length(iov, nr_segs);
 	struct fuse_req *req;
 	struct fuse_out_header oh;
 	struct fuse_copy_state cs;
@@ -1105,8 +1192,9 @@ void fuse_abort_conn(struct fuse_conn *fc)
 	}
 	spin_unlock(&fc->lock);
 }
+EXPORT_SYMBOL_GPL(fuse_abort_conn);
 
-static int fuse_dev_release(struct inode *inode, struct file *file)
+int fuse_dev_release(struct inode *inode, struct file *file)
 {
 	struct fuse_conn *fc = fuse_get_conn(file);
 	if (fc) {
@@ -1120,6 +1208,7 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(fuse_dev_release);
 
 static int fuse_dev_fasync(int fd, struct file *file, int on)
 {
@@ -1142,6 +1231,7 @@ const struct file_operations fuse_dev_operations = {
 	.release	= fuse_dev_release,
 	.fasync		= fuse_dev_fasync,
 };
+EXPORT_SYMBOL_GPL(fuse_dev_operations);
 
 static struct miscdevice fuse_miscdevice = {
 	.minor = FUSE_MINOR,
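The two notify handlers added above give servers a way to shoot down kernel caches. In userspace this is exposed by libfuse 2.8+ roughly as follows; the channel, inode numbers and entry name here are illustrative:

	#include <fuse_lowlevel.h>
	#include <string.h>

	/* Hedged sketch: push invalidations from a FUSE server. */
	static void example_invalidate(struct fuse_chan *ch,
				       fuse_ino_t parent, fuse_ino_t ino)
	{
		/* FUSE_NOTIFY_INVAL_INODE: drop cached attrs/pages of ino
		 * (off/len of 0 means the whole range) */
		fuse_lowlevel_notify_inval_inode(ch, ino, 0, 0);

		/* FUSE_NOTIFY_INVAL_ENTRY: drop the dentry "foo" under
		 * parent */
		fuse_lowlevel_notify_inval_entry(ch, parent, "foo",
						 strlen("foo"));
	}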
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8b8eebc5614b..e703654e7f40 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -362,19 +362,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 }
 
 /*
- * Synchronous release for the case when something goes wrong in CREATE_OPEN
- */
-static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
-			      u64 nodeid, int flags)
-{
-	fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
-	ff->reserved_req->force = 1;
-	fuse_request_send(fc, ff->reserved_req);
-	fuse_put_request(fc, ff->reserved_req);
-	kfree(ff);
-}
-
-/*
  * Atomic create+open operation
 *
 * If the filesystem doesn't support this, then fall back to separate
@@ -388,7 +375,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	struct fuse_req *req;
 	struct fuse_req *forget_req;
-	struct fuse_open_in inarg;
+	struct fuse_create_in inarg;
 	struct fuse_open_out outopen;
 	struct fuse_entry_out outentry;
 	struct fuse_file *ff;
@@ -412,15 +399,20 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	if (!ff)
 		goto out_put_request;
 
+	if (!fc->dont_mask)
+		mode &= ~current_umask();
+
 	flags &= ~O_NOCTTY;
 	memset(&inarg, 0, sizeof(inarg));
 	memset(&outentry, 0, sizeof(outentry));
 	inarg.flags = flags;
 	inarg.mode = mode;
+	inarg.umask = current_umask();
 	req->in.h.opcode = FUSE_CREATE;
 	req->in.h.nodeid = get_node_id(dir);
 	req->in.numargs = 2;
-	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) :
+						sizeof(inarg);
 	req->in.args[0].value = &inarg;
 	req->in.args[1].size = entry->d_name.len + 1;
 	req->in.args[1].value = entry->d_name.name;
@@ -445,12 +437,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 		goto out_free_ff;
 
 	fuse_put_request(fc, req);
+	ff->fh = outopen.fh;
+	ff->nodeid = outentry.nodeid;
+	ff->open_flags = outopen.open_flags;
 	inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
 			  &outentry.attr, entry_attr_timeout(&outentry), 0);
 	if (!inode) {
 		flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
-		ff->fh = outopen.fh;
-		fuse_sync_release(fc, ff, outentry.nodeid, flags);
+		fuse_sync_release(ff, flags);
 		fuse_send_forget(fc, forget_req, outentry.nodeid, 1);
 		return -ENOMEM;
 	}
@@ -460,11 +454,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	fuse_invalidate_attr(dir);
 	file = lookup_instantiate_filp(nd, entry, generic_file_open);
 	if (IS_ERR(file)) {
-		ff->fh = outopen.fh;
-		fuse_sync_release(fc, ff, outentry.nodeid, flags);
+		fuse_sync_release(ff, flags);
 		return PTR_ERR(file);
 	}
-	fuse_finish_open(inode, file, ff, &outopen);
+	file->private_data = fuse_file_get(ff);
+	fuse_finish_open(inode, file);
 	return 0;
 
  out_free_ff:
@@ -557,12 +551,17 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
+	if (!fc->dont_mask)
+		mode &= ~current_umask();
+
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mode = mode;
 	inarg.rdev = new_encode_dev(rdev);
+	inarg.umask = current_umask();
 	req->in.h.opcode = FUSE_MKNOD;
 	req->in.numargs = 2;
-	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE :
+						sizeof(inarg);
 	req->in.args[0].value = &inarg;
 	req->in.args[1].size = entry->d_name.len + 1;
 	req->in.args[1].value = entry->d_name.name;
@@ -589,8 +588,12 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
+	if (!fc->dont_mask)
+		mode &= ~current_umask();
+
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mode = mode;
+	inarg.umask = current_umask();
 	req->in.h.opcode = FUSE_MKDIR;
 	req->in.numargs = 2;
 	req->in.args[0].size = sizeof(inarg);
@@ -856,6 +859,43 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
 	return err;
 }
 
+int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+			     struct qstr *name)
+{
+	int err = -ENOTDIR;
+	struct inode *parent;
+	struct dentry *dir;
+	struct dentry *entry;
+
+	parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid);
+	if (!parent)
+		return -ENOENT;
+
+	mutex_lock(&parent->i_mutex);
+	if (!S_ISDIR(parent->i_mode))
+		goto unlock;
+
+	err = -ENOENT;
+	dir = d_find_alias(parent);
+	if (!dir)
+		goto unlock;
+
+	entry = d_lookup(dir, name);
+	dput(dir);
+	if (!entry)
+		goto unlock;
+
+	fuse_invalidate_attr(parent);
+	fuse_invalidate_entry(entry);
+	dput(entry);
+	err = 0;
+
+ unlock:
+	mutex_unlock(&parent->i_mutex);
+	iput(parent);
+	return err;
+}
+
 /*
  * Calling into a user-controlled filesystem gives the filesystem
  * daemon ptrace-like capabilities over the requester process. This
@@ -1035,7 +1075,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
-	fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
+	fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR);
 	fuse_request_send(fc, req);
 	nbytes = req->out.args[0].size;
 	err = req->out.h.error;
@@ -1101,12 +1141,14 @@ static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
 
 static int fuse_dir_open(struct inode *inode, struct file *file)
 {
-	return fuse_open_common(inode, file, 1);
+	return fuse_open_common(inode, file, true);
 }
 
 static int fuse_dir_release(struct inode *inode, struct file *file)
 {
-	return fuse_release_common(inode, file, 1);
+	fuse_release_common(file, FUSE_RELEASEDIR);
+
+	return 0;
 }
 
 static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync)
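Note that the umask handling above only strips the caller's umask in the kernel when the server has not negotiated FUSE_DONT_MASK; the raw umask is always forwarded in fuse_create_in/fuse_mknod_in so the server can apply it itself. A hedged libfuse 2.8+ sketch of a server opting in; the callback name is illustrative:

	#include <fuse_lowlevel.h>

	static void example_init(void *userdata, struct fuse_conn_info *conn)
	{
		/* maps to fc->dont_mask (FUSE_DONT_MASK) in the kernel:
		 * the server, not the kernel, applies the umask */
		conn->want |= FUSE_CAP_DONT_MASK;
	}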
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 06f30e965676..cbc464043b6f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -12,13 +12,13 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/module.h>
15 16
16static const struct file_operations fuse_direct_io_file_operations; 17static const struct file_operations fuse_direct_io_file_operations;
17 18
18static int fuse_send_open(struct inode *inode, struct file *file, int isdir, 19static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
19 struct fuse_open_out *outargp) 20 int opcode, struct fuse_open_out *outargp)
20{ 21{
21 struct fuse_conn *fc = get_fuse_conn(inode);
22 struct fuse_open_in inarg; 22 struct fuse_open_in inarg;
23 struct fuse_req *req; 23 struct fuse_req *req;
24 int err; 24 int err;
@@ -31,8 +31,8 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
31 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); 31 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
32 if (!fc->atomic_o_trunc) 32 if (!fc->atomic_o_trunc)
33 inarg.flags &= ~O_TRUNC; 33 inarg.flags &= ~O_TRUNC;
34 req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; 34 req->in.h.opcode = opcode;
35 req->in.h.nodeid = get_node_id(inode); 35 req->in.h.nodeid = nodeid;
36 req->in.numargs = 1; 36 req->in.numargs = 1;
37 req->in.args[0].size = sizeof(inarg); 37 req->in.args[0].size = sizeof(inarg);
38 req->in.args[0].value = &inarg; 38 req->in.args[0].value = &inarg;
@@ -49,22 +49,27 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
49struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) 49struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
50{ 50{
51 struct fuse_file *ff; 51 struct fuse_file *ff;
52
52 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); 53 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
53 if (ff) { 54 if (unlikely(!ff))
54 ff->reserved_req = fuse_request_alloc(); 55 return NULL;
55 if (!ff->reserved_req) { 56
56 kfree(ff); 57 ff->fc = fc;
57 return NULL; 58 ff->reserved_req = fuse_request_alloc();
58 } else { 59 if (unlikely(!ff->reserved_req)) {
59 INIT_LIST_HEAD(&ff->write_entry); 60 kfree(ff);
60 atomic_set(&ff->count, 0); 61 return NULL;
61 spin_lock(&fc->lock);
62 ff->kh = ++fc->khctr;
63 spin_unlock(&fc->lock);
64 }
65 RB_CLEAR_NODE(&ff->polled_node);
66 init_waitqueue_head(&ff->poll_wait);
67 } 62 }
63
64 INIT_LIST_HEAD(&ff->write_entry);
65 atomic_set(&ff->count, 0);
66 RB_CLEAR_NODE(&ff->polled_node);
67 init_waitqueue_head(&ff->poll_wait);
68
69 spin_lock(&fc->lock);
70 ff->kh = ++fc->khctr;
71 spin_unlock(&fc->lock);
72
68 return ff; 73 return ff;
69} 74}
70 75
@@ -74,7 +79,7 @@ void fuse_file_free(struct fuse_file *ff)
74 kfree(ff); 79 kfree(ff);
75} 80}
76 81
77static struct fuse_file *fuse_file_get(struct fuse_file *ff) 82struct fuse_file *fuse_file_get(struct fuse_file *ff)
78{ 83{
79 atomic_inc(&ff->count); 84 atomic_inc(&ff->count);
80 return ff; 85 return ff;
@@ -82,40 +87,65 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff)
82 87
83static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) 88static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
84{ 89{
85 dput(req->misc.release.dentry); 90 path_put(&req->misc.release.path);
86 mntput(req->misc.release.vfsmount);
87} 91}
88 92
89static void fuse_file_put(struct fuse_file *ff) 93static void fuse_file_put(struct fuse_file *ff)
90{ 94{
91 if (atomic_dec_and_test(&ff->count)) { 95 if (atomic_dec_and_test(&ff->count)) {
92 struct fuse_req *req = ff->reserved_req; 96 struct fuse_req *req = ff->reserved_req;
93 struct inode *inode = req->misc.release.dentry->d_inode; 97
94 struct fuse_conn *fc = get_fuse_conn(inode);
95 req->end = fuse_release_end; 98 req->end = fuse_release_end;
96 fuse_request_send_background(fc, req); 99 fuse_request_send_background(ff->fc, req);
97 kfree(ff); 100 kfree(ff);
98 } 101 }
99} 102}
100 103
101void fuse_finish_open(struct inode *inode, struct file *file, 104int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
102 struct fuse_file *ff, struct fuse_open_out *outarg) 105 bool isdir)
103{ 106{
104 if (outarg->open_flags & FOPEN_DIRECT_IO) 107 struct fuse_open_out outarg;
108 struct fuse_file *ff;
109 int err;
110 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
111
112 ff = fuse_file_alloc(fc);
113 if (!ff)
114 return -ENOMEM;
115
116 err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
117 if (err) {
118 fuse_file_free(ff);
119 return err;
120 }
121
122 if (isdir)
123 outarg.open_flags &= ~FOPEN_DIRECT_IO;
124
125 ff->fh = outarg.fh;
126 ff->nodeid = nodeid;
127 ff->open_flags = outarg.open_flags;
128 file->private_data = fuse_file_get(ff);
129
130 return 0;
131}
132EXPORT_SYMBOL_GPL(fuse_do_open);
133
134void fuse_finish_open(struct inode *inode, struct file *file)
135{
136 struct fuse_file *ff = file->private_data;
137
138 if (ff->open_flags & FOPEN_DIRECT_IO)
105 file->f_op = &fuse_direct_io_file_operations; 139 file->f_op = &fuse_direct_io_file_operations;
106 if (!(outarg->open_flags & FOPEN_KEEP_CACHE)) 140 if (!(ff->open_flags & FOPEN_KEEP_CACHE))
107 invalidate_inode_pages2(inode->i_mapping); 141 invalidate_inode_pages2(inode->i_mapping);
108 if (outarg->open_flags & FOPEN_NONSEEKABLE) 142 if (ff->open_flags & FOPEN_NONSEEKABLE)
109 nonseekable_open(inode, file); 143 nonseekable_open(inode, file);
110 ff->fh = outarg->fh;
111 file->private_data = fuse_file_get(ff);
112} 144}
113 145
114int fuse_open_common(struct inode *inode, struct file *file, int isdir) 146int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
115{ 147{
116 struct fuse_conn *fc = get_fuse_conn(inode); 148 struct fuse_conn *fc = get_fuse_conn(inode);
117 struct fuse_open_out outarg;
118 struct fuse_file *ff;
119 int err; 149 int err;
120 150
121 /* VFS checks this, but only _after_ ->open() */ 151 /* VFS checks this, but only _after_ ->open() */
@@ -126,78 +156,85 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
126 if (err) 156 if (err)
127 return err; 157 return err;
128 158
129 ff = fuse_file_alloc(fc); 159 err = fuse_do_open(fc, get_node_id(inode), file, isdir);
130 if (!ff)
131 return -ENOMEM;
132
133 err = fuse_send_open(inode, file, isdir, &outarg);
134 if (err) 160 if (err)
135 fuse_file_free(ff); 161 return err;
136 else {
137 if (isdir)
138 outarg.open_flags &= ~FOPEN_DIRECT_IO;
139 fuse_finish_open(inode, file, ff, &outarg);
140 }
141 162
142 return err; 163 fuse_finish_open(inode, file);
164
165 return 0;
143} 166}
144 167
145void fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags, int opcode) 168static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
146{ 169{
170 struct fuse_conn *fc = ff->fc;
147 struct fuse_req *req = ff->reserved_req; 171 struct fuse_req *req = ff->reserved_req;
148 struct fuse_release_in *inarg = &req->misc.release.in; 172 struct fuse_release_in *inarg = &req->misc.release.in;
149 173
174 spin_lock(&fc->lock);
175 list_del(&ff->write_entry);
176 if (!RB_EMPTY_NODE(&ff->polled_node))
177 rb_erase(&ff->polled_node, &fc->polled_files);
178 spin_unlock(&fc->lock);
179
180 wake_up_interruptible_sync(&ff->poll_wait);
181
150 inarg->fh = ff->fh; 182 inarg->fh = ff->fh;
151 inarg->flags = flags; 183 inarg->flags = flags;
152 req->in.h.opcode = opcode; 184 req->in.h.opcode = opcode;
153 req->in.h.nodeid = nodeid; 185 req->in.h.nodeid = ff->nodeid;
154 req->in.numargs = 1; 186 req->in.numargs = 1;
155 req->in.args[0].size = sizeof(struct fuse_release_in); 187 req->in.args[0].size = sizeof(struct fuse_release_in);
156 req->in.args[0].value = inarg; 188 req->in.args[0].value = inarg;
157} 189}
158 190
159int fuse_release_common(struct inode *inode, struct file *file, int isdir) 191void fuse_release_common(struct file *file, int opcode)
160{ 192{
161 struct fuse_file *ff = file->private_data; 193 struct fuse_file *ff;
162 if (ff) { 194 struct fuse_req *req;
163 struct fuse_conn *fc = get_fuse_conn(inode);
164 struct fuse_req *req = ff->reserved_req;
165
166 fuse_release_fill(ff, get_node_id(inode), file->f_flags,
167 isdir ? FUSE_RELEASEDIR : FUSE_RELEASE);
168 195
169 /* Hold vfsmount and dentry until release is finished */ 196 ff = file->private_data;
170 req->misc.release.vfsmount = mntget(file->f_path.mnt); 197 if (unlikely(!ff))
171 req->misc.release.dentry = dget(file->f_path.dentry); 198 return;
172 199
173 spin_lock(&fc->lock); 200 req = ff->reserved_req;
174 list_del(&ff->write_entry); 201 fuse_prepare_release(ff, file->f_flags, opcode);
175 if (!RB_EMPTY_NODE(&ff->polled_node))
176 rb_erase(&ff->polled_node, &fc->polled_files);
177 spin_unlock(&fc->lock);
178 202
179 wake_up_interruptible_sync(&ff->poll_wait); 203 /* Hold vfsmount and dentry until release is finished */
180 /* 204 path_get(&file->f_path);
181 * Normally this will send the RELEASE request, 205 req->misc.release.path = file->f_path;
182 * however if some asynchronous READ or WRITE requests
183 * are outstanding, the sending will be delayed
184 */
185 fuse_file_put(ff);
186 }
187 206
188 /* Return value is ignored by VFS */ 207 /*
189 return 0; 208 * Normally this will send the RELEASE request, however if
209 * some asynchronous READ or WRITE requests are outstanding,
210 * the sending will be delayed.
211 */
212 fuse_file_put(ff);
190} 213}
191 214
192static int fuse_open(struct inode *inode, struct file *file) 215static int fuse_open(struct inode *inode, struct file *file)
193{ 216{
194 return fuse_open_common(inode, file, 0); 217 return fuse_open_common(inode, file, false);
195} 218}
196 219
197static int fuse_release(struct inode *inode, struct file *file) 220static int fuse_release(struct inode *inode, struct file *file)
198{ 221{
199 return fuse_release_common(inode, file, 0); 222 fuse_release_common(file, FUSE_RELEASE);
223
224 /* return value is ignored by VFS */
225 return 0;
226}
227
228void fuse_sync_release(struct fuse_file *ff, int flags)
229{
230 WARN_ON(atomic_read(&ff->count) > 1);
231 fuse_prepare_release(ff, flags, FUSE_RELEASE);
232 ff->reserved_req->force = 1;
233 fuse_request_send(ff->fc, ff->reserved_req);
234 fuse_put_request(ff->fc, ff->reserved_req);
235 kfree(ff);
200} 236}
237EXPORT_SYMBOL_GPL(fuse_sync_release);
201 238
202/* 239/*
203 * Scramble the ID space with XTEA, so that the value of the files_struct 240 * Scramble the ID space with XTEA, so that the value of the files_struct
@@ -371,8 +408,8 @@ static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
371 return fuse_fsync_common(file, de, datasync, 0); 408 return fuse_fsync_common(file, de, datasync, 0);
372} 409}
373 410
374void fuse_read_fill(struct fuse_req *req, struct file *file, 411void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
375 struct inode *inode, loff_t pos, size_t count, int opcode) 412 size_t count, int opcode)
376{ 413{
377 struct fuse_read_in *inarg = &req->misc.read.in; 414 struct fuse_read_in *inarg = &req->misc.read.in;
378 struct fuse_file *ff = file->private_data; 415 struct fuse_file *ff = file->private_data;
@@ -382,7 +419,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
382 inarg->size = count; 419 inarg->size = count;
383 inarg->flags = file->f_flags; 420 inarg->flags = file->f_flags;
384 req->in.h.opcode = opcode; 421 req->in.h.opcode = opcode;
385 req->in.h.nodeid = get_node_id(inode); 422 req->in.h.nodeid = ff->nodeid;
386 req->in.numargs = 1; 423 req->in.numargs = 1;
387 req->in.args[0].size = sizeof(struct fuse_read_in); 424 req->in.args[0].size = sizeof(struct fuse_read_in);
388 req->in.args[0].value = inarg; 425 req->in.args[0].value = inarg;
@@ -392,12 +429,12 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
392} 429}
393 430
394static size_t fuse_send_read(struct fuse_req *req, struct file *file, 431static size_t fuse_send_read(struct fuse_req *req, struct file *file,
395 struct inode *inode, loff_t pos, size_t count, 432 loff_t pos, size_t count, fl_owner_t owner)
396 fl_owner_t owner)
397{ 433{
398 struct fuse_conn *fc = get_fuse_conn(inode); 434 struct fuse_file *ff = file->private_data;
435 struct fuse_conn *fc = ff->fc;
399 436
400 fuse_read_fill(req, file, inode, pos, count, FUSE_READ); 437 fuse_read_fill(req, file, pos, count, FUSE_READ);
401 if (owner != NULL) { 438 if (owner != NULL) {
402 struct fuse_read_in *inarg = &req->misc.read.in; 439 struct fuse_read_in *inarg = &req->misc.read.in;
403 440
@@ -455,7 +492,7 @@ static int fuse_readpage(struct file *file, struct page *page)
455 req->out.argpages = 1; 492 req->out.argpages = 1;
456 req->num_pages = 1; 493 req->num_pages = 1;
457 req->pages[0] = page; 494 req->pages[0] = page;
458 num_read = fuse_send_read(req, file, inode, pos, count, NULL); 495 num_read = fuse_send_read(req, file, pos, count, NULL);
459 err = req->out.h.error; 496 err = req->out.h.error;
460 fuse_put_request(fc, req); 497 fuse_put_request(fc, req);
461 498
@@ -504,19 +541,18 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
504 fuse_file_put(req->ff); 541 fuse_file_put(req->ff);
505} 542}
506 543
507static void fuse_send_readpages(struct fuse_req *req, struct file *file, 544static void fuse_send_readpages(struct fuse_req *req, struct file *file)
508 struct inode *inode)
509{ 545{
510 struct fuse_conn *fc = get_fuse_conn(inode); 546 struct fuse_file *ff = file->private_data;
547 struct fuse_conn *fc = ff->fc;
511 loff_t pos = page_offset(req->pages[0]); 548 loff_t pos = page_offset(req->pages[0]);
512 size_t count = req->num_pages << PAGE_CACHE_SHIFT; 549 size_t count = req->num_pages << PAGE_CACHE_SHIFT;
513 550
514 req->out.argpages = 1; 551 req->out.argpages = 1;
515 req->out.page_zeroing = 1; 552 req->out.page_zeroing = 1;
516 fuse_read_fill(req, file, inode, pos, count, FUSE_READ); 553 fuse_read_fill(req, file, pos, count, FUSE_READ);
517 req->misc.read.attr_ver = fuse_get_attr_version(fc); 554 req->misc.read.attr_ver = fuse_get_attr_version(fc);
518 if (fc->async_read) { 555 if (fc->async_read) {
519 struct fuse_file *ff = file->private_data;
520 req->ff = fuse_file_get(ff); 556 req->ff = fuse_file_get(ff);
521 req->end = fuse_readpages_end; 557 req->end = fuse_readpages_end;
522 fuse_request_send_background(fc, req); 558 fuse_request_send_background(fc, req);
@@ -546,7 +582,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
546 (req->num_pages == FUSE_MAX_PAGES_PER_REQ || 582 (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
547 (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || 583 (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
548 req->pages[req->num_pages - 1]->index + 1 != page->index)) { 584 req->pages[req->num_pages - 1]->index + 1 != page->index)) {
549 fuse_send_readpages(req, data->file, inode); 585 fuse_send_readpages(req, data->file);
550 data->req = req = fuse_get_req(fc); 586 data->req = req = fuse_get_req(fc);
551 if (IS_ERR(req)) { 587 if (IS_ERR(req)) {
552 unlock_page(page); 588 unlock_page(page);
@@ -580,7 +616,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
580 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); 616 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
581 if (!err) { 617 if (!err) {
582 if (data.req->num_pages) 618 if (data.req->num_pages)
583 fuse_send_readpages(data.req, file, inode); 619 fuse_send_readpages(data.req, file);
584 else 620 else
585 fuse_put_request(fc, data.req); 621 fuse_put_request(fc, data.req);
586 } 622 }
@@ -607,24 +643,19 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
607 return generic_file_aio_read(iocb, iov, nr_segs, pos); 643 return generic_file_aio_read(iocb, iov, nr_segs, pos);
608} 644}
609 645
610static void fuse_write_fill(struct fuse_req *req, struct file *file, 646static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
611 struct fuse_file *ff, struct inode *inode, 647 loff_t pos, size_t count)
612 loff_t pos, size_t count, int writepage)
613{ 648{
614 struct fuse_conn *fc = get_fuse_conn(inode);
615 struct fuse_write_in *inarg = &req->misc.write.in; 649 struct fuse_write_in *inarg = &req->misc.write.in;
616 struct fuse_write_out *outarg = &req->misc.write.out; 650 struct fuse_write_out *outarg = &req->misc.write.out;
617 651
618 memset(inarg, 0, sizeof(struct fuse_write_in));
619 inarg->fh = ff->fh; 652 inarg->fh = ff->fh;
620 inarg->offset = pos; 653 inarg->offset = pos;
621 inarg->size = count; 654 inarg->size = count;
622 inarg->write_flags = writepage ? FUSE_WRITE_CACHE : 0;
623 inarg->flags = file ? file->f_flags : 0;
624 req->in.h.opcode = FUSE_WRITE; 655 req->in.h.opcode = FUSE_WRITE;
625 req->in.h.nodeid = get_node_id(inode); 656 req->in.h.nodeid = ff->nodeid;
626 req->in.numargs = 2; 657 req->in.numargs = 2;
627 if (fc->minor < 9) 658 if (ff->fc->minor < 9)
628 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; 659 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
629 else 660 else
630 req->in.args[0].size = sizeof(struct fuse_write_in); 661 req->in.args[0].size = sizeof(struct fuse_write_in);
@@ -636,13 +667,15 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file,
636} 667}
637 668
638static size_t fuse_send_write(struct fuse_req *req, struct file *file, 669static size_t fuse_send_write(struct fuse_req *req, struct file *file,
639 struct inode *inode, loff_t pos, size_t count, 670 loff_t pos, size_t count, fl_owner_t owner)
640 fl_owner_t owner)
641{ 671{
642 struct fuse_conn *fc = get_fuse_conn(inode); 672 struct fuse_file *ff = file->private_data;
643 fuse_write_fill(req, file, file->private_data, inode, pos, count, 0); 673 struct fuse_conn *fc = ff->fc;
674 struct fuse_write_in *inarg = &req->misc.write.in;
675
676 fuse_write_fill(req, ff, pos, count);
677 inarg->flags = file->f_flags;
644 if (owner != NULL) { 678 if (owner != NULL) {
645 struct fuse_write_in *inarg = &req->misc.write.in;
646 inarg->write_flags |= FUSE_WRITE_LOCKOWNER; 679 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
647 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 680 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
648 } 681 }
@@ -700,7 +733,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode,
700 req->num_pages = 1; 733 req->num_pages = 1;
701 req->pages[0] = page; 734 req->pages[0] = page;
702 req->page_offset = offset; 735 req->page_offset = offset;
703 nres = fuse_send_write(req, file, inode, pos, count, NULL); 736 nres = fuse_send_write(req, file, pos, count, NULL);
704 err = req->out.h.error; 737 err = req->out.h.error;
705 fuse_put_request(fc, req); 738 fuse_put_request(fc, req);
706 if (!err && !nres) 739 if (!err && !nres)
@@ -741,7 +774,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
741 for (i = 0; i < req->num_pages; i++) 774 for (i = 0; i < req->num_pages; i++)
742 fuse_wait_on_page_writeback(inode, req->pages[i]->index); 775 fuse_wait_on_page_writeback(inode, req->pages[i]->index);
743 776
744 res = fuse_send_write(req, file, inode, pos, count, NULL); 777 res = fuse_send_write(req, file, pos, count, NULL);
745 778
746 offset = req->page_offset; 779 offset = req->page_offset;
747 count = res; 780 count = res;
@@ -979,25 +1012,23 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
979 return 0; 1012 return 0;
980} 1013}
981 1014
982static ssize_t fuse_direct_io(struct file *file, const char __user *buf, 1015ssize_t fuse_direct_io(struct file *file, const char __user *buf,
983 size_t count, loff_t *ppos, int write) 1016 size_t count, loff_t *ppos, int write)
984{ 1017{
985 struct inode *inode = file->f_path.dentry->d_inode; 1018 struct fuse_file *ff = file->private_data;
986 struct fuse_conn *fc = get_fuse_conn(inode); 1019 struct fuse_conn *fc = ff->fc;
987 size_t nmax = write ? fc->max_write : fc->max_read; 1020 size_t nmax = write ? fc->max_write : fc->max_read;
988 loff_t pos = *ppos; 1021 loff_t pos = *ppos;
989 ssize_t res = 0; 1022 ssize_t res = 0;
990 struct fuse_req *req; 1023 struct fuse_req *req;
991 1024
992 if (is_bad_inode(inode))
993 return -EIO;
994
995 req = fuse_get_req(fc); 1025 req = fuse_get_req(fc);
996 if (IS_ERR(req)) 1026 if (IS_ERR(req))
997 return PTR_ERR(req); 1027 return PTR_ERR(req);
998 1028
999 while (count) { 1029 while (count) {
1000 size_t nres; 1030 size_t nres;
1031 fl_owner_t owner = current->files;
1001 size_t nbytes = min(count, nmax); 1032 size_t nbytes = min(count, nmax);
1002 int err = fuse_get_user_pages(req, buf, &nbytes, write); 1033 int err = fuse_get_user_pages(req, buf, &nbytes, write);
1003 if (err) { 1034 if (err) {
@@ -1006,11 +1037,10 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
1006 } 1037 }
1007 1038
1008 if (write) 1039 if (write)
1009 nres = fuse_send_write(req, file, inode, pos, nbytes, 1040 nres = fuse_send_write(req, file, pos, nbytes, owner);
1010 current->files);
1011 else 1041 else
1012 nres = fuse_send_read(req, file, inode, pos, nbytes, 1042 nres = fuse_send_read(req, file, pos, nbytes, owner);
1013 current->files); 1043
1014 fuse_release_user_pages(req, !write); 1044 fuse_release_user_pages(req, !write);
1015 if (req->out.h.error) { 1045 if (req->out.h.error) {
1016 if (!res) 1046 if (!res)
@@ -1034,20 +1064,27 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
1034 } 1064 }
1035 } 1065 }
1036 fuse_put_request(fc, req); 1066 fuse_put_request(fc, req);
1037 if (res > 0) { 1067 if (res > 0)
1038 if (write)
1039 fuse_write_update_size(inode, pos);
1040 *ppos = pos; 1068 *ppos = pos;
1041 }
1042 fuse_invalidate_attr(inode);
1043 1069
1044 return res; 1070 return res;
1045} 1071}
1072EXPORT_SYMBOL_GPL(fuse_direct_io);
1046 1073
1047static ssize_t fuse_direct_read(struct file *file, char __user *buf, 1074static ssize_t fuse_direct_read(struct file *file, char __user *buf,
1048 size_t count, loff_t *ppos) 1075 size_t count, loff_t *ppos)
1049{ 1076{
1050 return fuse_direct_io(file, buf, count, ppos, 0); 1077 ssize_t res;
1078 struct inode *inode = file->f_path.dentry->d_inode;
1079
1080 if (is_bad_inode(inode))
1081 return -EIO;
1082
1083 res = fuse_direct_io(file, buf, count, ppos, 0);
1084
1085 fuse_invalidate_attr(inode);
1086
1087 return res;
1051} 1088}
1052 1089
1053static ssize_t fuse_direct_write(struct file *file, const char __user *buf, 1090static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
@@ -1055,12 +1092,22 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
1055{ 1092{
1056 struct inode *inode = file->f_path.dentry->d_inode; 1093 struct inode *inode = file->f_path.dentry->d_inode;
1057 ssize_t res; 1094 ssize_t res;
1095
1096 if (is_bad_inode(inode))
1097 return -EIO;
1098
1058 /* Don't allow parallel writes to the same file */ 1099 /* Don't allow parallel writes to the same file */
1059 mutex_lock(&inode->i_mutex); 1100 mutex_lock(&inode->i_mutex);
1060 res = generic_write_checks(file, ppos, &count, 0); 1101 res = generic_write_checks(file, ppos, &count, 0);
1061 if (!res) 1102 if (!res) {
1062 res = fuse_direct_io(file, buf, count, ppos, 1); 1103 res = fuse_direct_io(file, buf, count, ppos, 1);
1104 if (res > 0)
1105 fuse_write_update_size(inode, *ppos);
1106 }
1063 mutex_unlock(&inode->i_mutex); 1107 mutex_unlock(&inode->i_mutex);
1108
1109 fuse_invalidate_attr(inode);
1110
1064 return res; 1111 return res;
1065} 1112}
1066 1113
@@ -1177,9 +1224,10 @@ static int fuse_writepage_locked(struct page *page)
1177 req->ff = fuse_file_get(ff); 1224 req->ff = fuse_file_get(ff);
1178 spin_unlock(&fc->lock); 1225 spin_unlock(&fc->lock);
1179 1226
1180 fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1); 1227 fuse_write_fill(req, ff, page_offset(page), 0);
1181 1228
1182 copy_highpage(tmp_page, page); 1229 copy_highpage(tmp_page, page);
1230 req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
1183 req->in.argpages = 1; 1231 req->in.argpages = 1;
1184 req->num_pages = 1; 1232 req->num_pages = 1;
1185 req->pages[0] = tmp_page; 1233 req->pages[0] = tmp_page;
@@ -1603,12 +1651,11 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1603 * limits ioctl data transfers to well-formed ioctls and is the forced 1651 * limits ioctl data transfers to well-formed ioctls and is the forced
1604 * behavior for all FUSE servers. 1652 * behavior for all FUSE servers.
1605 */ 1653 */
1606static long fuse_file_do_ioctl(struct file *file, unsigned int cmd, 1654long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1607 unsigned long arg, unsigned int flags) 1655 unsigned int flags)
1608{ 1656{
1609 struct inode *inode = file->f_dentry->d_inode;
1610 struct fuse_file *ff = file->private_data; 1657 struct fuse_file *ff = file->private_data;
1611 struct fuse_conn *fc = get_fuse_conn(inode); 1658 struct fuse_conn *fc = ff->fc;
1612 struct fuse_ioctl_in inarg = { 1659 struct fuse_ioctl_in inarg = {
1613 .fh = ff->fh, 1660 .fh = ff->fh,
1614 .cmd = cmd, 1661 .cmd = cmd,
@@ -1627,13 +1674,6 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
1627 /* assume all the iovs returned by client always fit in a page */ 1674 /* assume all the iovs returned by client always fit in a page */
1628 BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); 1675 BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1629 1676
1630 if (!fuse_allow_task(fc, current))
1631 return -EACCES;
1632
1633 err = -EIO;
1634 if (is_bad_inode(inode))
1635 goto out;
1636
1637 err = -ENOMEM; 1677 err = -ENOMEM;
1638 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); 1678 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
1639 iov_page = alloc_page(GFP_KERNEL); 1679 iov_page = alloc_page(GFP_KERNEL);
@@ -1694,7 +1734,7 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
1694 1734
1695 /* okay, let's send it to the client */ 1735 /* okay, let's send it to the client */
1696 req->in.h.opcode = FUSE_IOCTL; 1736 req->in.h.opcode = FUSE_IOCTL;
1697 req->in.h.nodeid = get_node_id(inode); 1737 req->in.h.nodeid = ff->nodeid;
1698 req->in.numargs = 1; 1738 req->in.numargs = 1;
1699 req->in.args[0].size = sizeof(inarg); 1739 req->in.args[0].size = sizeof(inarg);
1700 req->in.args[0].value = &inarg; 1740 req->in.args[0].value = &inarg;
@@ -1777,17 +1817,33 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
1777 1817
1778 return err ? err : outarg.result; 1818 return err ? err : outarg.result;
1779} 1819}
1820EXPORT_SYMBOL_GPL(fuse_do_ioctl);
1821
1822static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
1823 unsigned long arg, unsigned int flags)
1824{
1825 struct inode *inode = file->f_dentry->d_inode;
1826 struct fuse_conn *fc = get_fuse_conn(inode);
1827
1828 if (!fuse_allow_task(fc, current))
1829 return -EACCES;
1830
1831 if (is_bad_inode(inode))
1832 return -EIO;
1833
1834 return fuse_do_ioctl(file, cmd, arg, flags);
1835}
1780 1836
1781static long fuse_file_ioctl(struct file *file, unsigned int cmd, 1837static long fuse_file_ioctl(struct file *file, unsigned int cmd,
1782 unsigned long arg) 1838 unsigned long arg)
1783{ 1839{
1784 return fuse_file_do_ioctl(file, cmd, arg, 0); 1840 return fuse_file_ioctl_common(file, cmd, arg, 0);
1785} 1841}
1786 1842
1787static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, 1843static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
1788 unsigned long arg) 1844 unsigned long arg)
1789{ 1845{
1790 return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT); 1846 return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
1791} 1847}
1792 1848
1793/* 1849/*
@@ -1841,11 +1897,10 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
1841 spin_unlock(&fc->lock); 1897 spin_unlock(&fc->lock);
1842} 1898}
1843 1899
1844static unsigned fuse_file_poll(struct file *file, poll_table *wait) 1900unsigned fuse_file_poll(struct file *file, poll_table *wait)
1845{ 1901{
1846 struct inode *inode = file->f_dentry->d_inode;
1847 struct fuse_file *ff = file->private_data; 1902 struct fuse_file *ff = file->private_data;
1848 struct fuse_conn *fc = get_fuse_conn(inode); 1903 struct fuse_conn *fc = ff->fc;
1849 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; 1904 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
1850 struct fuse_poll_out outarg; 1905 struct fuse_poll_out outarg;
1851 struct fuse_req *req; 1906 struct fuse_req *req;
@@ -1867,10 +1922,10 @@ static unsigned fuse_file_poll(struct file *file, poll_table *wait)
1867 1922
1868 req = fuse_get_req(fc); 1923 req = fuse_get_req(fc);
1869 if (IS_ERR(req)) 1924 if (IS_ERR(req))
1870 return PTR_ERR(req); 1925 return POLLERR;
1871 1926
1872 req->in.h.opcode = FUSE_POLL; 1927 req->in.h.opcode = FUSE_POLL;
1873 req->in.h.nodeid = get_node_id(inode); 1928 req->in.h.nodeid = ff->nodeid;
1874 req->in.numargs = 1; 1929 req->in.numargs = 1;
1875 req->in.args[0].size = sizeof(inarg); 1930 req->in.args[0].size = sizeof(inarg);
1876 req->in.args[0].value = &inarg; 1931 req->in.args[0].value = &inarg;
@@ -1889,6 +1944,7 @@ static unsigned fuse_file_poll(struct file *file, poll_table *wait)
1889 } 1944 }
1890 return POLLERR; 1945 return POLLERR;
1891} 1946}
1947EXPORT_SYMBOL_GPL(fuse_file_poll);
1892 1948
1893/* 1949/*
1894 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and 1950 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
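
The thread running through the file.c hunks above: fuse_send_read/fuse_send_write, fuse_direct_io, fuse_do_ioctl and fuse_file_poll no longer touch the inode at all; they work from struct fuse_file alone, while the inode-level concerns (is_bad_inode, fuse_invalidate_attr, fuse_write_update_size, fuse_allow_task) move out into thin VFS-facing wrappers. That is what lets a caller with no backing inode reuse the newly exported helpers. A minimal sketch of such a caller, assuming a CUSE-style character-device handler (the function name here is hypothetical):

    static ssize_t cuse_style_read(struct file *file, char __user *buf,
                                   size_t count, loff_t *ppos)
    {
            loff_t pos = 0;    /* a character device has no file position */

            /*
             * No inode checks needed: fuse_direct_io() works purely from
             * file->private_data (struct fuse_file), which now carries
             * the connection and nodeid itself.
             */
            return fuse_direct_io(file, buf, count, &pos, 0);
    }
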
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6fc5aedaa0d5..52b641fc0faf 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -97,8 +97,13 @@ struct fuse_inode {
97 struct list_head writepages; 97 struct list_head writepages;
98}; 98};
99 99
100struct fuse_conn;
101
100/** FUSE specific file data */ 102/** FUSE specific file data */
101struct fuse_file { 103struct fuse_file {
104 /** Fuse connection for this file */
105 struct fuse_conn *fc;
106
102 /** Request reserved for flush and release */ 107 /** Request reserved for flush and release */
103 struct fuse_req *reserved_req; 108 struct fuse_req *reserved_req;
104 109
@@ -108,9 +113,15 @@ struct fuse_file {
108 /** File handle used by userspace */ 113 /** File handle used by userspace */
109 u64 fh; 114 u64 fh;
110 115
116 /** Node id of this file */
117 u64 nodeid;
118
111 /** Refcount */ 119 /** Refcount */
112 atomic_t count; 120 atomic_t count;
113 121
122 /** FOPEN_* flags returned by open */
123 u32 open_flags;
124
114 /** Entry on inode's write_files list */ 125 /** Entry on inode's write_files list */
115 struct list_head write_entry; 126 struct list_head write_entry;
116 127
@@ -185,8 +196,6 @@ enum fuse_req_state {
185 FUSE_REQ_FINISHED 196 FUSE_REQ_FINISHED
186}; 197};
187 198
188struct fuse_conn;
189
190/** 199/**
191 * A request to the client 200 * A request to the client
192 */ 201 */
@@ -248,11 +257,12 @@ struct fuse_req {
248 struct fuse_forget_in forget_in; 257 struct fuse_forget_in forget_in;
249 struct { 258 struct {
250 struct fuse_release_in in; 259 struct fuse_release_in in;
251 struct vfsmount *vfsmount; 260 struct path path;
252 struct dentry *dentry;
253 } release; 261 } release;
254 struct fuse_init_in init_in; 262 struct fuse_init_in init_in;
255 struct fuse_init_out init_out; 263 struct fuse_init_out init_out;
264 struct cuse_init_in cuse_init_in;
265 struct cuse_init_out cuse_init_out;
256 struct { 266 struct {
257 struct fuse_read_in in; 267 struct fuse_read_in in;
258 u64 attr_ver; 268 u64 attr_ver;
@@ -386,6 +396,9 @@ struct fuse_conn {
386 /** Filesystem supports NFS exporting. Only set in INIT */ 396 /** Filesystem supports NFS exporting. Only set in INIT */
387 unsigned export_support:1; 397 unsigned export_support:1;
388 398
399 /** Set if bdi is valid */
400 unsigned bdi_initialized:1;
401
389 /* 402 /*
390 * The following bitfields are only for optimization purposes 403 * The following bitfields are only for optimization purposes
391 * and hence races in setting them will not cause malfunction 404 * and hence races in setting them will not cause malfunction
@@ -433,6 +446,9 @@ struct fuse_conn {
433 /** Do multi-page cached writes */ 446 /** Do multi-page cached writes */
434 unsigned big_writes:1; 447 unsigned big_writes:1;
435 448
449 /** Don't apply umask to creation modes */
450 unsigned dont_mask:1;
451
436 /** The number of requests waiting for completion */ 452 /** The number of requests waiting for completion */
437 atomic_t num_waiting; 453 atomic_t num_waiting;
438 454
@@ -468,6 +484,12 @@ struct fuse_conn {
468 484
469 /** Called on final put */ 485 /** Called on final put */
470 void (*release)(struct fuse_conn *); 486 void (*release)(struct fuse_conn *);
487
488 /** Super block for this connection. */
489 struct super_block *sb;
490
491 /** Read/write semaphore to hold when accessing sb. */
492 struct rw_semaphore killsb;
471}; 493};
472 494
473static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) 495static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -496,6 +518,11 @@ extern const struct file_operations fuse_dev_operations;
496extern const struct dentry_operations fuse_dentry_operations; 518extern const struct dentry_operations fuse_dentry_operations;
497 519
498/** 520/**
521 * Inode to nodeid comparison.
522 */
523int fuse_inode_eq(struct inode *inode, void *_nodeidp);
524
525/**
499 * Get a filled in inode 526 * Get a filled in inode
500 */ 527 */
501struct inode *fuse_iget(struct super_block *sb, u64 nodeid, 528struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
@@ -515,25 +542,24 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
515 * Initialize READ or READDIR request 542 * Initialize READ or READDIR request
516 */ 543 */
517void fuse_read_fill(struct fuse_req *req, struct file *file, 544void fuse_read_fill(struct fuse_req *req, struct file *file,
518 struct inode *inode, loff_t pos, size_t count, int opcode); 545 loff_t pos, size_t count, int opcode);
519 546
520/** 547/**
521 * Send OPEN or OPENDIR request 548 * Send OPEN or OPENDIR request
522 */ 549 */
523int fuse_open_common(struct inode *inode, struct file *file, int isdir); 550int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
524 551
525struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); 552struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
553struct fuse_file *fuse_file_get(struct fuse_file *ff);
526void fuse_file_free(struct fuse_file *ff); 554void fuse_file_free(struct fuse_file *ff);
527void fuse_finish_open(struct inode *inode, struct file *file, 555void fuse_finish_open(struct inode *inode, struct file *file);
528 struct fuse_file *ff, struct fuse_open_out *outarg);
529 556
530/** Fill in ff->reserved_req with a RELEASE request */ 557void fuse_sync_release(struct fuse_file *ff, int flags);
531void fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags, int opcode);
532 558
533/** 559/**
534 * Send RELEASE or RELEASEDIR request 560 * Send RELEASE or RELEASEDIR request
535 */ 561 */
536int fuse_release_common(struct inode *inode, struct file *file, int isdir); 562void fuse_release_common(struct file *file, int opcode);
537 563
538/** 564/**
539 * Send FSYNC or FSYNCDIR request 565 * Send FSYNC or FSYNCDIR request
@@ -652,10 +678,12 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
652 */ 678 */
653struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); 679struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
654 680
681void fuse_conn_kill(struct fuse_conn *fc);
682
655/** 683/**
656 * Initialize fuse_conn 684 * Initialize fuse_conn
657 */ 685 */
658int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb); 686void fuse_conn_init(struct fuse_conn *fc);
659 687
660/** 688/**
661 * Release reference to fuse_conn 689 * Release reference to fuse_conn
@@ -694,4 +722,26 @@ void fuse_release_nowrite(struct inode *inode);
694 722
695u64 fuse_get_attr_version(struct fuse_conn *fc); 723u64 fuse_get_attr_version(struct fuse_conn *fc);
696 724
725/**
726 * File-system tells the kernel to invalidate cache for the given node id.
727 */
728int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
729 loff_t offset, loff_t len);
730
731/**
732 * File-system tells the kernel to invalidate parent attributes and
733 * the dentry matching parent/name.
734 */
735int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
736 struct qstr *name);
737
738int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
739 bool isdir);
740ssize_t fuse_direct_io(struct file *file, const char __user *buf,
741 size_t count, loff_t *ppos, int write);
742long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
743 unsigned int flags);
744unsigned fuse_file_poll(struct file *file, poll_table *wait);
745int fuse_dev_release(struct inode *inode, struct file *file);
746
697#endif /* _FS_FUSE_I_H */ 747#endif /* _FS_FUSE_I_H */
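
The new killsb semaphore and fc->sb back-pointer exist for the reverse-invalidation calls declared above, which are issued from the /dev/fuse write path and can therefore race with unmount. A minimal sketch of the intended read-side usage, assuming a notify handler and a wire struct along the lines of fuse_notify_inval_inode_out (the handler name and field names are assumptions here):

    static int handle_notify_inval_inode(struct fuse_conn *fc,
                                         struct fuse_notify_inval_inode_out *outarg)
    {
            int err = -ENOENT;

            down_read(&fc->killsb);         /* pin fc->sb against umount */
            if (fc->sb)
                    err = fuse_reverse_inval_inode(fc->sb, outarg->ino,
                                                   outarg->off, outarg->len);
            up_read(&fc->killsb);
            return err;
    }

The write side of the same lock appears below in inode.c, where fuse_kill_sb_anon()/fuse_kill_sb_blk() clear fc->sb under down_write() before the generic teardown runs.
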
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 91f7c85f1ffd..f91ccc4a189d 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -19,7 +19,6 @@
19#include <linux/random.h> 19#include <linux/random.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/exportfs.h> 21#include <linux/exportfs.h>
22#include <linux/smp_lock.h>
23 22
24MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); 23MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
25MODULE_DESCRIPTION("Filesystem in Userspace"); 24MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -207,7 +206,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
207 BUG(); 206 BUG();
208} 207}
209 208
210static int fuse_inode_eq(struct inode *inode, void *_nodeidp) 209int fuse_inode_eq(struct inode *inode, void *_nodeidp)
211{ 210{
212 u64 nodeid = *(u64 *) _nodeidp; 211 u64 nodeid = *(u64 *) _nodeidp;
213 if (get_node_id(inode) == nodeid) 212 if (get_node_id(inode) == nodeid)
@@ -258,11 +257,34 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
258 return inode; 257 return inode;
259} 258}
260 259
260int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
261 loff_t offset, loff_t len)
262{
263 struct inode *inode;
264 pgoff_t pg_start;
265 pgoff_t pg_end;
266
267 inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid);
268 if (!inode)
269 return -ENOENT;
270
271 fuse_invalidate_attr(inode);
272 if (offset >= 0) {
273 pg_start = offset >> PAGE_CACHE_SHIFT;
274 if (len <= 0)
275 pg_end = -1;
276 else
277 pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
278 invalidate_inode_pages2_range(inode->i_mapping,
279 pg_start, pg_end);
280 }
281 iput(inode);
282 return 0;
283}
284
261static void fuse_umount_begin(struct super_block *sb) 285static void fuse_umount_begin(struct super_block *sb)
262{ 286{
263 lock_kernel();
264 fuse_abort_conn(get_fuse_conn_super(sb)); 287 fuse_abort_conn(get_fuse_conn_super(sb));
265 unlock_kernel();
266} 288}
267 289
268static void fuse_send_destroy(struct fuse_conn *fc) 290static void fuse_send_destroy(struct fuse_conn *fc)
@@ -277,11 +299,14 @@ static void fuse_send_destroy(struct fuse_conn *fc)
277 } 299 }
278} 300}
279 301
280static void fuse_put_super(struct super_block *sb) 302static void fuse_bdi_destroy(struct fuse_conn *fc)
281{ 303{
282 struct fuse_conn *fc = get_fuse_conn_super(sb); 304 if (fc->bdi_initialized)
305 bdi_destroy(&fc->bdi);
306}
283 307
284 fuse_send_destroy(fc); 308void fuse_conn_kill(struct fuse_conn *fc)
309{
285 spin_lock(&fc->lock); 310 spin_lock(&fc->lock);
286 fc->connected = 0; 311 fc->connected = 0;
287 fc->blocked = 0; 312 fc->blocked = 0;
@@ -295,7 +320,16 @@ static void fuse_put_super(struct super_block *sb)
295 list_del(&fc->entry); 320 list_del(&fc->entry);
296 fuse_ctl_remove_conn(fc); 321 fuse_ctl_remove_conn(fc);
297 mutex_unlock(&fuse_mutex); 322 mutex_unlock(&fuse_mutex);
298 bdi_destroy(&fc->bdi); 323 fuse_bdi_destroy(fc);
324}
325EXPORT_SYMBOL_GPL(fuse_conn_kill);
326
327static void fuse_put_super(struct super_block *sb)
328{
329 struct fuse_conn *fc = get_fuse_conn_super(sb);
330
331 fuse_send_destroy(fc);
332 fuse_conn_kill(fc);
299 fuse_conn_put(fc); 333 fuse_conn_put(fc);
300} 334}
301 335
@@ -466,13 +500,12 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
466 return 0; 500 return 0;
467} 501}
468 502
469int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb) 503void fuse_conn_init(struct fuse_conn *fc)
470{ 504{
471 int err;
472
473 memset(fc, 0, sizeof(*fc)); 505 memset(fc, 0, sizeof(*fc));
474 spin_lock_init(&fc->lock); 506 spin_lock_init(&fc->lock);
475 mutex_init(&fc->inst_mutex); 507 mutex_init(&fc->inst_mutex);
508 init_rwsem(&fc->killsb);
476 atomic_set(&fc->count, 1); 509 atomic_set(&fc->count, 1);
477 init_waitqueue_head(&fc->waitq); 510 init_waitqueue_head(&fc->waitq);
478 init_waitqueue_head(&fc->blocked_waitq); 511 init_waitqueue_head(&fc->blocked_waitq);
@@ -484,49 +517,12 @@ int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
484 INIT_LIST_HEAD(&fc->bg_queue); 517 INIT_LIST_HEAD(&fc->bg_queue);
485 INIT_LIST_HEAD(&fc->entry); 518 INIT_LIST_HEAD(&fc->entry);
486 atomic_set(&fc->num_waiting, 0); 519 atomic_set(&fc->num_waiting, 0);
487 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
488 fc->bdi.unplug_io_fn = default_unplug_io_fn;
489 /* fuse does its own writeback accounting */
490 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
491 fc->khctr = 0; 520 fc->khctr = 0;
492 fc->polled_files = RB_ROOT; 521 fc->polled_files = RB_ROOT;
493 fc->dev = sb->s_dev;
494 err = bdi_init(&fc->bdi);
495 if (err)
496 goto error_mutex_destroy;
497 if (sb->s_bdev) {
498 err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
499 MAJOR(fc->dev), MINOR(fc->dev));
500 } else {
501 err = bdi_register_dev(&fc->bdi, fc->dev);
502 }
503 if (err)
504 goto error_bdi_destroy;
505 /*
506 * For a single fuse filesystem use max 1% of dirty +
507 * writeback threshold.
508 *
509 * This gives about 1M of write buffer for memory maps on a
510 * machine with 1G and 10% dirty_ratio, which should be more
511 * than enough.
512 *
513 * Privileged users can raise it by writing to
514 *
515 * /sys/class/bdi/<bdi>/max_ratio
516 */
517 bdi_set_max_ratio(&fc->bdi, 1);
518 fc->reqctr = 0; 522 fc->reqctr = 0;
519 fc->blocked = 1; 523 fc->blocked = 1;
520 fc->attr_version = 1; 524 fc->attr_version = 1;
521 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); 525 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
522
523 return 0;
524
525 error_bdi_destroy:
526 bdi_destroy(&fc->bdi);
527 error_mutex_destroy:
528 mutex_destroy(&fc->inst_mutex);
529 return err;
530} 526}
531EXPORT_SYMBOL_GPL(fuse_conn_init); 527EXPORT_SYMBOL_GPL(fuse_conn_init);
532 528
@@ -539,12 +535,14 @@ void fuse_conn_put(struct fuse_conn *fc)
539 fc->release(fc); 535 fc->release(fc);
540 } 536 }
541} 537}
538EXPORT_SYMBOL_GPL(fuse_conn_put);
542 539
543struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) 540struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
544{ 541{
545 atomic_inc(&fc->count); 542 atomic_inc(&fc->count);
546 return fc; 543 return fc;
547} 544}
545EXPORT_SYMBOL_GPL(fuse_conn_get);
548 546
549static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) 547static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
550{ 548{
@@ -753,6 +751,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
753 } 751 }
754 if (arg->flags & FUSE_BIG_WRITES) 752 if (arg->flags & FUSE_BIG_WRITES)
755 fc->big_writes = 1; 753 fc->big_writes = 1;
754 if (arg->flags & FUSE_DONT_MASK)
755 fc->dont_mask = 1;
756 } else { 756 } else {
757 ra_pages = fc->max_read / PAGE_CACHE_SIZE; 757 ra_pages = fc->max_read / PAGE_CACHE_SIZE;
758 fc->no_lock = 1; 758 fc->no_lock = 1;
@@ -776,7 +776,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
776 arg->minor = FUSE_KERNEL_MINOR_VERSION; 776 arg->minor = FUSE_KERNEL_MINOR_VERSION;
777 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; 777 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
778 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | 778 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
779 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES; 779 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK;
780 req->in.h.opcode = FUSE_INIT; 780 req->in.h.opcode = FUSE_INIT;
781 req->in.numargs = 1; 781 req->in.numargs = 1;
782 req->in.args[0].size = sizeof(*arg); 782 req->in.args[0].size = sizeof(*arg);
@@ -797,6 +797,48 @@ static void fuse_free_conn(struct fuse_conn *fc)
797 kfree(fc); 797 kfree(fc);
798} 798}
799 799
800static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
801{
802 int err;
803
804 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
805 fc->bdi.unplug_io_fn = default_unplug_io_fn;
806 /* fuse does its own writeback accounting */
807 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
808
809 err = bdi_init(&fc->bdi);
810 if (err)
811 return err;
812
813 fc->bdi_initialized = 1;
814
815 if (sb->s_bdev) {
816 err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
817 MAJOR(fc->dev), MINOR(fc->dev));
818 } else {
819 err = bdi_register_dev(&fc->bdi, fc->dev);
820 }
821
822 if (err)
823 return err;
824
825 /*
826 * For a single fuse filesystem use max 1% of dirty +
827 * writeback threshold.
828 *
829 * This gives about 1M of write buffer for memory maps on a
830 * machine with 1G and 10% dirty_ratio, which should be more
831 * than enough.
832 *
833 * Privileged users can raise it by writing to
834 *
835 * /sys/class/bdi/<bdi>/max_ratio
836 */
837 bdi_set_max_ratio(&fc->bdi, 1);
838
839 return 0;
840}
841
800static int fuse_fill_super(struct super_block *sb, void *data, int silent) 842static int fuse_fill_super(struct super_block *sb, void *data, int silent)
801{ 843{
802 struct fuse_conn *fc; 844 struct fuse_conn *fc;
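
The 1% figure in the comment above works out as follows (illustrative numbers only): with 1 GiB of RAM and vm.dirty_ratio = 10, the global dirty threshold is roughly 100 MiB; capping this bdi at max_ratio = 1 leaves it about 1 MiB of dirty + writeback headroom, which is the "about 1M of write buffer" the comment cites. A privileged user can still widen that per filesystem through /sys/class/bdi/<bdi>/max_ratio.
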
@@ -843,11 +885,18 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
843 if (!fc) 885 if (!fc)
844 goto err_fput; 886 goto err_fput;
845 887
846 err = fuse_conn_init(fc, sb); 888 fuse_conn_init(fc);
847 if (err) { 889
848 kfree(fc); 890 fc->dev = sb->s_dev;
849 goto err_fput; 891 fc->sb = sb;
850 } 892 err = fuse_bdi_init(fc, sb);
893 if (err)
894 goto err_put_conn;
895
896 /* Handle umasking inside the fuse code */
897 if (sb->s_flags & MS_POSIXACL)
898 fc->dont_mask = 1;
899 sb->s_flags |= MS_POSIXACL;
851 900
852 fc->release = fuse_free_conn; 901 fc->release = fuse_free_conn;
853 fc->flags = d.flags; 902 fc->flags = d.flags;
@@ -911,7 +960,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
911 err_put_root: 960 err_put_root:
912 dput(root_dentry); 961 dput(root_dentry);
913 err_put_conn: 962 err_put_conn:
914 bdi_destroy(&fc->bdi); 963 fuse_bdi_destroy(fc);
915 fuse_conn_put(fc); 964 fuse_conn_put(fc);
916 err_fput: 965 err_fput:
917 fput(file); 966 fput(file);
@@ -926,12 +975,25 @@ static int fuse_get_sb(struct file_system_type *fs_type,
926 return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt); 975 return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
927} 976}
928 977
978static void fuse_kill_sb_anon(struct super_block *sb)
979{
980 struct fuse_conn *fc = get_fuse_conn_super(sb);
981
982 if (fc) {
983 down_write(&fc->killsb);
984 fc->sb = NULL;
985 up_write(&fc->killsb);
986 }
987
988 kill_anon_super(sb);
989}
990
929static struct file_system_type fuse_fs_type = { 991static struct file_system_type fuse_fs_type = {
930 .owner = THIS_MODULE, 992 .owner = THIS_MODULE,
931 .name = "fuse", 993 .name = "fuse",
932 .fs_flags = FS_HAS_SUBTYPE, 994 .fs_flags = FS_HAS_SUBTYPE,
933 .get_sb = fuse_get_sb, 995 .get_sb = fuse_get_sb,
934 .kill_sb = kill_anon_super, 996 .kill_sb = fuse_kill_sb_anon,
935}; 997};
936 998
937#ifdef CONFIG_BLOCK 999#ifdef CONFIG_BLOCK
@@ -943,11 +1005,24 @@ static int fuse_get_sb_blk(struct file_system_type *fs_type,
943 mnt); 1005 mnt);
944} 1006}
945 1007
1008static void fuse_kill_sb_blk(struct super_block *sb)
1009{
1010 struct fuse_conn *fc = get_fuse_conn_super(sb);
1011
1012 if (fc) {
1013 down_write(&fc->killsb);
1014 fc->sb = NULL;
1015 up_write(&fc->killsb);
1016 }
1017
1018 kill_block_super(sb);
1019}
1020
946static struct file_system_type fuseblk_fs_type = { 1021static struct file_system_type fuseblk_fs_type = {
947 .owner = THIS_MODULE, 1022 .owner = THIS_MODULE,
948 .name = "fuseblk", 1023 .name = "fuseblk",
949 .get_sb = fuse_get_sb_blk, 1024 .get_sb = fuse_get_sb_blk,
950 .kill_sb = kill_block_super, 1025 .kill_sb = fuse_kill_sb_blk,
951 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, 1026 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
952}; 1027};
953 1028
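
Two consequences of the restructuring above are easy to miss. First, fuse_conn_init() can no longer fail (it only initializes fields), so callers need no unwind path, and the new bdi_initialized bit lets fuse_bdi_destroy() run safely on a connection that never registered a bdi. Second, setting MS_POSIXACL in fuse_fill_super() stops the VFS from applying the umask on create, which is what the "Handle umasking inside the fuse code" comment refers to: with FUSE_DONT_MASK negotiated the server applies the umask itself, and otherwise the fuse create paths (elsewhere in this series) apply it before sending the request.
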
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 3a981b7f64ca..5971359d2090 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,12 +1,13 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL && (64BIT || LBD) 3 depends on EXPERIMENTAL && (64BIT || LBDAF)
4 select DLM if GFS2_FS_LOCKING_DLM 4 select DLM if GFS2_FS_LOCKING_DLM
5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM 5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
6 select SYSFS if GFS2_FS_LOCKING_DLM 6 select SYSFS if GFS2_FS_LOCKING_DLM
7 select IP_SCTP if DLM_SCTP 7 select IP_SCTP if DLM_SCTP
8 select FS_POSIX_ACL 8 select FS_POSIX_ACL
9 select CRC32 9 select CRC32
10 select SLOW_WORK
10 help 11 help
11 A cluster filesystem. 12 A cluster filesystem.
12 13
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index a851ea4bdf70..3da2f1f4f738 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,8 +1,9 @@
1EXTRA_CFLAGS := -I$(src)
1obj-$(CONFIG_GFS2_FS) += gfs2.o 2obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ 3gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o log.o lops.o main.o meta_io.o \ 4 glops.o inode.o log.o lops.o main.o meta_io.o \
4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ 5 aops.o dentry.o export.o file.o \
5 ops_fstype.o ops_inode.o ops_super.o quota.o \ 6 ops_fstype.o ops_inode.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o 7 recovery.o rgrp.o super.o sys.o trans.o util.o
7 8
8gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o 9gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
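
The new EXTRA_CFLAGS := -I$(src) is presumably there for the tracepoints introduced below: glock.c now defines CREATE_TRACE_POINTS before including trace_gfs2.h, and the generic define_trace.h machinery re-includes the trace header by file name, so the directory defining it has to be on the compiler's include path.
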
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/aops.c
index a6dde1751e17..7ebae9a4ecc0 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/aops.c
@@ -28,7 +28,6 @@
28#include "inode.h" 28#include "inode.h"
29#include "log.h" 29#include "log.h"
30#include "meta_io.h" 30#include "meta_io.h"
31#include "ops_address.h"
32#include "quota.h" 31#include "quota.h"
33#include "trans.h" 32#include "trans.h"
34#include "rgrp.h" 33#include "rgrp.h"
@@ -625,6 +624,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
625{ 624{
626 struct gfs2_inode *ip = GFS2_I(mapping->host); 625 struct gfs2_inode *ip = GFS2_I(mapping->host);
627 struct gfs2_sbd *sdp = GFS2_SB(mapping->host); 626 struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
627 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
628 unsigned int data_blocks = 0, ind_blocks = 0, rblocks; 628 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
629 int alloc_required; 629 int alloc_required;
630 int error = 0; 630 int error = 0;
@@ -638,6 +638,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
638 error = gfs2_glock_nq(&ip->i_gh); 638 error = gfs2_glock_nq(&ip->i_gh);
639 if (unlikely(error)) 639 if (unlikely(error))
640 goto out_uninit; 640 goto out_uninit;
641 if (&ip->i_inode == sdp->sd_rindex) {
642 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
643 GL_NOCACHE, &m_ip->i_gh);
644 if (unlikely(error)) {
645 gfs2_glock_dq(&ip->i_gh);
646 goto out_uninit;
647 }
648 }
641 649
642 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); 650 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
643 if (error) 651 if (error)
@@ -668,6 +676,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
668 rblocks += data_blocks ? data_blocks : 1; 676 rblocks += data_blocks ? data_blocks : 1;
669 if (ind_blocks || data_blocks) 677 if (ind_blocks || data_blocks)
670 rblocks += RES_STATFS + RES_QUOTA; 678 rblocks += RES_STATFS + RES_QUOTA;
679 if (&ip->i_inode == sdp->sd_rindex)
680 rblocks += 2 * RES_STATFS;
671 681
672 error = gfs2_trans_begin(sdp, rblocks, 682 error = gfs2_trans_begin(sdp, rblocks,
673 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); 683 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -713,6 +723,10 @@ out_alloc_put:
713 gfs2_alloc_put(ip); 723 gfs2_alloc_put(ip);
714 } 724 }
715out_unlock: 725out_unlock:
726 if (&ip->i_inode == sdp->sd_rindex) {
727 gfs2_glock_dq(&m_ip->i_gh);
728 gfs2_holder_uninit(&m_ip->i_gh);
729 }
716 gfs2_glock_dq(&ip->i_gh); 730 gfs2_glock_dq(&ip->i_gh);
717out_uninit: 731out_uninit:
718 gfs2_holder_uninit(&ip->i_gh); 732 gfs2_holder_uninit(&ip->i_gh);
@@ -726,14 +740,21 @@ out_uninit:
726static void adjust_fs_space(struct inode *inode) 740static void adjust_fs_space(struct inode *inode)
727{ 741{
728 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; 742 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
743 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
744 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
729 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; 745 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
730 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; 746 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
747 struct buffer_head *m_bh, *l_bh;
731 u64 fs_total, new_free; 748 u64 fs_total, new_free;
732 749
733 /* Total up the file system space, according to the latest rindex. */ 750 /* Total up the file system space, according to the latest rindex. */
734 fs_total = gfs2_ri_total(sdp); 751 fs_total = gfs2_ri_total(sdp);
752 if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0)
753 return;
735 754
736 spin_lock(&sdp->sd_statfs_spin); 755 spin_lock(&sdp->sd_statfs_spin);
756 gfs2_statfs_change_in(m_sc, m_bh->b_data +
757 sizeof(struct gfs2_dinode));
737 if (fs_total > (m_sc->sc_total + l_sc->sc_total)) 758 if (fs_total > (m_sc->sc_total + l_sc->sc_total))
738 new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); 759 new_free = fs_total - (m_sc->sc_total + l_sc->sc_total);
739 else 760 else
@@ -742,6 +763,13 @@ static void adjust_fs_space(struct inode *inode)
742 fs_warn(sdp, "File system extended by %llu blocks.\n", 763 fs_warn(sdp, "File system extended by %llu blocks.\n",
743 (unsigned long long)new_free); 764 (unsigned long long)new_free);
744 gfs2_statfs_change(sdp, new_free, new_free, 0); 765 gfs2_statfs_change(sdp, new_free, new_free, 0);
766
767 if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0)
768 goto out;
769 update_statfs(sdp, m_bh, l_bh);
770 brelse(l_bh);
771out:
772 brelse(m_bh);
745} 773}
746 774
747/** 775/**
@@ -764,6 +792,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
764{ 792{
765 struct gfs2_inode *ip = GFS2_I(inode); 793 struct gfs2_inode *ip = GFS2_I(inode);
766 struct gfs2_sbd *sdp = GFS2_SB(inode); 794 struct gfs2_sbd *sdp = GFS2_SB(inode);
795 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
767 u64 to = pos + copied; 796 u64 to = pos + copied;
768 void *kaddr; 797 void *kaddr;
769 unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); 798 unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
@@ -781,10 +810,12 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
781 unlock_page(page); 810 unlock_page(page);
782 page_cache_release(page); 811 page_cache_release(page);
783 812
784 if (inode->i_size < to) { 813 if (copied) {
785 i_size_write(inode, to); 814 if (inode->i_size < to) {
786 ip->i_disksize = inode->i_size; 815 i_size_write(inode, to);
787 di->di_size = cpu_to_be64(inode->i_size); 816 ip->i_disksize = inode->i_size;
817 }
818 gfs2_dinode_out(ip, di);
788 mark_inode_dirty(inode); 819 mark_inode_dirty(inode);
789 } 820 }
790 821
@@ -793,6 +824,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
793 824
794 brelse(dibh); 825 brelse(dibh);
795 gfs2_trans_end(sdp); 826 gfs2_trans_end(sdp);
827 if (inode == sdp->sd_rindex) {
828 gfs2_glock_dq(&m_ip->i_gh);
829 gfs2_holder_uninit(&m_ip->i_gh);
830 }
796 gfs2_glock_dq(&ip->i_gh); 831 gfs2_glock_dq(&ip->i_gh);
797 gfs2_holder_uninit(&ip->i_gh); 832 gfs2_holder_uninit(&ip->i_gh);
798 return copied; 833 return copied;
@@ -822,9 +857,9 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
822 struct inode *inode = page->mapping->host; 857 struct inode *inode = page->mapping->host;
823 struct gfs2_inode *ip = GFS2_I(inode); 858 struct gfs2_inode *ip = GFS2_I(inode);
824 struct gfs2_sbd *sdp = GFS2_SB(inode); 859 struct gfs2_sbd *sdp = GFS2_SB(inode);
860 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
825 struct buffer_head *dibh; 861 struct buffer_head *dibh;
826 struct gfs2_alloc *al = ip->i_alloc; 862 struct gfs2_alloc *al = ip->i_alloc;
827 struct gfs2_dinode *di;
828 unsigned int from = pos & (PAGE_CACHE_SIZE - 1); 863 unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
829 unsigned int to = from + len; 864 unsigned int to = from + len;
830 int ret; 865 int ret;
@@ -847,11 +882,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
847 gfs2_page_add_databufs(ip, page, from, to); 882 gfs2_page_add_databufs(ip, page, from, to);
848 883
849 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 884 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
850 885 if (ret > 0) {
851 if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) { 886 if (inode->i_size > ip->i_disksize)
852 di = (struct gfs2_dinode *)dibh->b_data; 887 ip->i_disksize = inode->i_size;
853 ip->i_disksize = inode->i_size; 888 gfs2_dinode_out(ip, dibh->b_data);
854 di->di_size = cpu_to_be64(inode->i_size);
855 mark_inode_dirty(inode); 889 mark_inode_dirty(inode);
856 } 890 }
857 891
@@ -866,6 +900,10 @@ failed:
866 gfs2_quota_unlock(ip); 900 gfs2_quota_unlock(ip);
867 gfs2_alloc_put(ip); 901 gfs2_alloc_put(ip);
868 } 902 }
903 if (inode == sdp->sd_rindex) {
904 gfs2_glock_dq(&m_ip->i_gh);
905 gfs2_holder_uninit(&m_ip->i_gh);
906 }
869 gfs2_glock_dq(&ip->i_gh); 907 gfs2_glock_dq(&ip->i_gh);
870 gfs2_holder_uninit(&ip->i_gh); 908 gfs2_holder_uninit(&ip->i_gh);
871 return ret; 909 return ret;
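
The rindex-specific additions above all serve one case: growing the filesystem by writing new resource-group entries into the rindex. For that write, gfs2_write_begin() also acquires the statfs master inode's glock exclusively (master statfs is shared cluster state) and reserves 2 * RES_STATFS extra blocks; adjust_fs_space() re-reads the master statfs buffer from disk and pushes the result out via update_statfs(); and both write_end paths drop the extra glock, so the new capacity becomes visible coherently across nodes.
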
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3a5d3f883e10..6d47379e794b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -25,7 +25,7 @@
25#include "trans.h" 25#include "trans.h"
26#include "dir.h" 26#include "dir.h"
27#include "util.h" 27#include "util.h"
28#include "ops_address.h" 28#include "trace_gfs2.h"
29 29
30/* This doesn't need to be that large as max 64 bit pointers in a 4k 30/* This doesn't need to be that large as max 64 bit pointers in a 4k
31 * block is 512, so __u16 is fine for that. It saves stack space to 31 * block is 512, so __u16 is fine for that. It saves stack space to
@@ -136,7 +136,9 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
136 and write it out to disk */ 136 and write it out to disk */
137 137
138 unsigned int n = 1; 138 unsigned int n = 1;
139 block = gfs2_alloc_block(ip, &n); 139 error = gfs2_alloc_block(ip, &block, &n);
140 if (error)
141 goto out_brelse;
140 if (isdir) { 142 if (isdir) {
141 gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); 143 gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
142 error = gfs2_dir_get_new_buffer(ip, block, &bh); 144 error = gfs2_dir_get_new_buffer(ip, block, &bh);
@@ -476,8 +478,11 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
476 blks = dblks + iblks; 478 blks = dblks + iblks;
477 i = sheight; 479 i = sheight;
478 do { 480 do {
481 int error;
479 n = blks - alloced; 482 n = blks - alloced;
480 bn = gfs2_alloc_block(ip, &n); 483 error = gfs2_alloc_block(ip, &bn, &n);
484 if (error)
485 return error;
481 alloced += n; 486 alloced += n;
482 if (state != ALLOC_DATA || gfs2_is_jdata(ip)) 487 if (state != ALLOC_DATA || gfs2_is_jdata(ip))
483 gfs2_trans_add_unrevoke(sdp, bn, n); 488 gfs2_trans_add_unrevoke(sdp, bn, n);
@@ -585,6 +590,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
585 clear_buffer_mapped(bh_map); 590 clear_buffer_mapped(bh_map);
586 clear_buffer_new(bh_map); 591 clear_buffer_new(bh_map);
587 clear_buffer_boundary(bh_map); 592 clear_buffer_boundary(bh_map);
593 trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
588 if (gfs2_is_dir(ip)) { 594 if (gfs2_is_dir(ip)) {
589 bsize = sdp->sd_jbsize; 595 bsize = sdp->sd_jbsize;
590 arr = sdp->sd_jheightsize; 596 arr = sdp->sd_jheightsize;
@@ -619,6 +625,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
619 ret = 0; 625 ret = 0;
620out: 626out:
621 release_metapath(&mp); 627 release_metapath(&mp);
628 trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
622 bmap_unlock(ip, create); 629 bmap_unlock(ip, create);
623 return ret; 630 return ret;
624 631
@@ -1008,7 +1015,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
1008 gfs2_trans_add_bh(ip->i_gl, bh, 0); 1015 gfs2_trans_add_bh(ip->i_gl, bh, 0);
1009 1016
1010 zero_user(page, offset, length); 1017 zero_user(page, offset, length);
1011 1018 mark_buffer_dirty(bh);
1012unlock: 1019unlock:
1013 unlock_page(page); 1020 unlock_page(page);
1014 page_cache_release(page); 1021 page_cache_release(page);
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/dentry.c
index 022c66cd5606..022c66cd5606 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/dentry.c
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index aef4d0c06748..297d7e5cebad 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -803,13 +803,20 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
803{ 803{
804 struct gfs2_inode *ip = GFS2_I(inode); 804 struct gfs2_inode *ip = GFS2_I(inode);
805 unsigned int n = 1; 805 unsigned int n = 1;
806 u64 bn = gfs2_alloc_block(ip, &n); 806 u64 bn;
807 struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn); 807 int error;
808 struct buffer_head *bh;
808 struct gfs2_leaf *leaf; 809 struct gfs2_leaf *leaf;
809 struct gfs2_dirent *dent; 810 struct gfs2_dirent *dent;
810 struct qstr name = { .name = "", .len = 0, .hash = 0 }; 811 struct qstr name = { .name = "", .len = 0, .hash = 0 };
812
813 error = gfs2_alloc_block(ip, &bn, &n);
814 if (error)
815 return NULL;
816 bh = gfs2_meta_new(ip->i_gl, bn);
811 if (!bh) 817 if (!bh)
812 return NULL; 818 return NULL;
819
813 gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); 820 gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
814 gfs2_trans_add_bh(ip->i_gl, bh, 1); 821 gfs2_trans_add_bh(ip->i_gl, bh, 1);
815 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); 822 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 899763aed217..07ea9529adda 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -582,8 +582,11 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
582 struct gfs2_ea_header *ea; 582 struct gfs2_ea_header *ea;
583 unsigned int n = 1; 583 unsigned int n = 1;
584 u64 block; 584 u64 block;
585 int error;
585 586
586 block = gfs2_alloc_block(ip, &n); 587 error = gfs2_alloc_block(ip, &block, &n);
588 if (error)
589 return error;
587 gfs2_trans_add_unrevoke(sdp, block, 1); 590 gfs2_trans_add_unrevoke(sdp, block, 1);
588 *bhp = gfs2_meta_new(ip->i_gl, block); 591 *bhp = gfs2_meta_new(ip->i_gl, block);
589 gfs2_trans_add_bh(ip->i_gl, *bhp, 1); 592 gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
@@ -617,6 +620,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
617 struct gfs2_ea_request *er) 620 struct gfs2_ea_request *er)
618{ 621{
619 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 622 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
623 int error;
620 624
621 ea->ea_data_len = cpu_to_be32(er->er_data_len); 625 ea->ea_data_len = cpu_to_be32(er->er_data_len);
622 ea->ea_name_len = er->er_name_len; 626 ea->ea_name_len = er->er_name_len;
@@ -642,7 +646,9 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
642 int mh_size = sizeof(struct gfs2_meta_header); 646 int mh_size = sizeof(struct gfs2_meta_header);
643 unsigned int n = 1; 647 unsigned int n = 1;
644 648
645 block = gfs2_alloc_block(ip, &n); 649 error = gfs2_alloc_block(ip, &block, &n);
650 if (error)
651 return error;
646 gfs2_trans_add_unrevoke(sdp, block, 1); 652 gfs2_trans_add_unrevoke(sdp, block, 1);
647 bh = gfs2_meta_new(ip->i_gl, block); 653 bh = gfs2_meta_new(ip->i_gl, block);
648 gfs2_trans_add_bh(ip->i_gl, bh, 1); 654 gfs2_trans_add_bh(ip->i_gl, bh, 1);
@@ -963,7 +969,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
963 } else { 969 } else {
964 u64 blk; 970 u64 blk;
965 unsigned int n = 1; 971 unsigned int n = 1;
966 blk = gfs2_alloc_block(ip, &n); 972 error = gfs2_alloc_block(ip, &blk, &n);
973 if (error)
974 return error;
967 gfs2_trans_add_unrevoke(sdp, blk, 1); 975 gfs2_trans_add_unrevoke(sdp, blk, 1);
968 indbh = gfs2_meta_new(ip->i_gl, blk); 976 indbh = gfs2_meta_new(ip->i_gl, blk);
969 gfs2_trans_add_bh(ip->i_gl, indbh, 1); 977 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
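
The gfs2_alloc_block() change threaded through bmap.c, dir.c and eattr.c above is a signature change: the block number moves to an out-parameter and the return value becomes an error code, so allocation failure can finally be propagated instead of being unreportable. A minimal sketch of the new calling convention, distilled from the call sites above:

    u64 block;
    unsigned int n = 1;     /* in: blocks wanted; out: blocks obtained */
    int error;

    error = gfs2_alloc_block(ip, &block, &n);       /* 0 or -errno */
    if (error)
            return error;                           /* propagate cleanly */
    gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, n);
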
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/export.c
index 9200ef221716..9200ef221716 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/export.c
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/file.c
index 5d82e91887e3..73318a3ce6f1 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/file.c
@@ -39,7 +39,6 @@
39#include "trans.h" 39#include "trans.h"
40#include "util.h" 40#include "util.h"
41#include "eaops.h" 41#include "eaops.h"
42#include "ops_address.h"
43 42
44/** 43/**
45 * gfs2_llseek - seek to a location in a file 44 * gfs2_llseek - seek to a location in a file
@@ -425,33 +424,36 @@ static struct vm_operations_struct gfs2_vm_ops = {
425 .page_mkwrite = gfs2_page_mkwrite, 424 .page_mkwrite = gfs2_page_mkwrite,
426}; 425};
427 426
428
429/** 427/**
430 * gfs2_mmap - 428 * gfs2_mmap -
431 * @file: The file to map 429 * @file: The file to map
432 * @vma: The VMA which described the mapping 430 * @vma: The VMA which described the mapping
433 * 431 *
434 * Returns: 0 or error code 432 * There is no need to get a lock here unless we should be updating
433 * atime. We ignore any locking errors since the only consequence is
434 * a missed atime update (which will just be deferred until later).
435 *
436 * Returns: 0
435 */ 437 */
436 438
437static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) 439static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
438{ 440{
439 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 441 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
440 struct gfs2_holder i_gh;
441 int error;
442 442
443 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); 443 if (!(file->f_flags & O_NOATIME)) {
444 error = gfs2_glock_nq(&i_gh); 444 struct gfs2_holder i_gh;
445 if (error) { 445 int error;
446 gfs2_holder_uninit(&i_gh);
447 return error;
448 }
449 446
447 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
448 error = gfs2_glock_nq(&i_gh);
449 file_accessed(file);
450 if (error == 0)
451 gfs2_glock_dq_uninit(&i_gh);
452 }
450 vma->vm_ops = &gfs2_vm_ops; 453 vma->vm_ops = &gfs2_vm_ops;
454 vma->vm_flags |= VM_CAN_NONLINEAR;
451 455
452 gfs2_glock_dq_uninit(&i_gh); 456 return 0;
453
454 return error;
455} 457}
456 458
457/** 459/**
@@ -692,12 +694,10 @@ static void do_unflock(struct file *file, struct file_lock *fl)
692 694
693static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) 695static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
694{ 696{
695 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
696
697 if (!(fl->fl_flags & FL_FLOCK)) 697 if (!(fl->fl_flags & FL_FLOCK))
698 return -ENOLCK; 698 return -ENOLCK;
699 if (__mandatory_lock(&ip->i_inode)) 699 if (fl->fl_type & LOCK_MAND)
700 return -ENOLCK; 700 return -EOPNOTSUPP;
701 701
702 if (fl->fl_type == F_UNLCK) { 702 if (fl->fl_type == F_UNLCK) {
703 do_unflock(file, fl); 703 do_unflock(file, fl);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ff4981090489..8b674b1f3a55 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -39,6 +39,8 @@
39#include "super.h" 39#include "super.h"
40#include "util.h" 40#include "util.h"
41#include "bmap.h" 41#include "bmap.h"
42#define CREATE_TRACE_POINTS
43#include "trace_gfs2.h"
42 44
43struct gfs2_gl_hash_bucket { 45struct gfs2_gl_hash_bucket {
44 struct hlist_head hb_list; 46 struct hlist_head hb_list;
@@ -61,6 +63,7 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
61static DECLARE_RWSEM(gfs2_umount_flush_sem); 63static DECLARE_RWSEM(gfs2_umount_flush_sem);
62static struct dentry *gfs2_root; 64static struct dentry *gfs2_root;
63static struct workqueue_struct *glock_workqueue; 65static struct workqueue_struct *glock_workqueue;
66struct workqueue_struct *gfs2_delete_workqueue;
64static LIST_HEAD(lru_list); 67static LIST_HEAD(lru_list);
65static atomic_t lru_count = ATOMIC_INIT(0); 68static atomic_t lru_count = ATOMIC_INIT(0);
66static DEFINE_SPINLOCK(lru_lock); 69static DEFINE_SPINLOCK(lru_lock);
@@ -155,7 +158,7 @@ static void glock_free(struct gfs2_glock *gl)
155 158
156 if (aspace) 159 if (aspace)
157 gfs2_aspace_put(aspace); 160 gfs2_aspace_put(aspace);
158 161 trace_gfs2_glock_put(gl);
159 sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl); 162 sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl);
160} 163}
161 164
@@ -165,13 +168,33 @@ static void glock_free(struct gfs2_glock *gl)
165 * 168 *
166 */ 169 */
167 170
168static void gfs2_glock_hold(struct gfs2_glock *gl) 171void gfs2_glock_hold(struct gfs2_glock *gl)
169{ 172{
170 GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0); 173 GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0);
171 atomic_inc(&gl->gl_ref); 174 atomic_inc(&gl->gl_ref);
172} 175}
173 176
174/** 177/**
178 * demote_ok - Check to see if it's ok to unlock a glock
179 * @gl: the glock
180 *
181 * Returns: 1 if it's ok
182 */
183
184static int demote_ok(const struct gfs2_glock *gl)
185{
186 const struct gfs2_glock_operations *glops = gl->gl_ops;
187
188 if (gl->gl_state == LM_ST_UNLOCKED)
189 return 0;
190 if (!list_empty(&gl->gl_holders))
191 return 0;
192 if (glops->go_demote_ok)
193 return glops->go_demote_ok(gl);
194 return 1;
195}
196
197/**
175 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list 198 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
176 * @gl: the glock 199 * @gl: the glock
177 * 200 *
@@ -179,8 +202,13 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
179 202
180static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) 203static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
181{ 204{
205 int may_reclaim;
206 may_reclaim = (demote_ok(gl) &&
207 (atomic_read(&gl->gl_ref) == 1 ||
208 (gl->gl_name.ln_type == LM_TYPE_INODE &&
209 atomic_read(&gl->gl_ref) <= 2)));
182 spin_lock(&lru_lock); 210 spin_lock(&lru_lock);
183 if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) { 211 if (list_empty(&gl->gl_lru) && may_reclaim) {
184 list_add_tail(&gl->gl_lru, &lru_list); 212 list_add_tail(&gl->gl_lru, &lru_list);
185 atomic_inc(&lru_count); 213 atomic_inc(&lru_count);
186 } 214 }
@@ -188,6 +216,21 @@ static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
188} 216}
189 217
190/** 218/**
219 * gfs2_glock_put_nolock() - Decrement reference count on glock
220 * @gl: The glock to put
221 *
222 * This function should only be used if the caller has its own reference
223 * to the glock, in addition to the one it is dropping.
224 */
225
226void gfs2_glock_put_nolock(struct gfs2_glock *gl)
227{
228 if (atomic_dec_and_test(&gl->gl_ref))
229 GLOCK_BUG_ON(gl, 1);
230 gfs2_glock_schedule_for_reclaim(gl);
231}
232
233/**
191 * gfs2_glock_put() - Decrement reference count on glock 234 * gfs2_glock_put() - Decrement reference count on glock
192 * @gl: The glock to put 235 * @gl: The glock to put
193 * 236 *
@@ -212,9 +255,9 @@ int gfs2_glock_put(struct gfs2_glock *gl)
212 rv = 1; 255 rv = 1;
213 goto out; 256 goto out;
214 } 257 }
215 /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */ 258 spin_lock(&gl->gl_spin);
216 if (atomic_read(&gl->gl_ref) == 2) 259 gfs2_glock_schedule_for_reclaim(gl);
217 gfs2_glock_schedule_for_reclaim(gl); 260 spin_unlock(&gl->gl_spin);
218 write_unlock(gl_lock_addr(gl->gl_hash)); 261 write_unlock(gl_lock_addr(gl->gl_hash));
219out: 262out:
220 return rv; 263 return rv;
@@ -317,14 +360,17 @@ restart:
317 return 2; 360 return 2;
318 gh->gh_error = ret; 361 gh->gh_error = ret;
319 list_del_init(&gh->gh_list); 362 list_del_init(&gh->gh_list);
363 trace_gfs2_glock_queue(gh, 0);
320 gfs2_holder_wake(gh); 364 gfs2_holder_wake(gh);
321 goto restart; 365 goto restart;
322 } 366 }
323 set_bit(HIF_HOLDER, &gh->gh_iflags); 367 set_bit(HIF_HOLDER, &gh->gh_iflags);
368 trace_gfs2_promote(gh, 1);
324 gfs2_holder_wake(gh); 369 gfs2_holder_wake(gh);
325 goto restart; 370 goto restart;
326 } 371 }
327 set_bit(HIF_HOLDER, &gh->gh_iflags); 372 set_bit(HIF_HOLDER, &gh->gh_iflags);
373 trace_gfs2_promote(gh, 0);
328 gfs2_holder_wake(gh); 374 gfs2_holder_wake(gh);
329 continue; 375 continue;
330 } 376 }
@@ -354,6 +400,7 @@ static inline void do_error(struct gfs2_glock *gl, const int ret)
354 else 400 else
355 continue; 401 continue;
356 list_del_init(&gh->gh_list); 402 list_del_init(&gh->gh_list);
403 trace_gfs2_glock_queue(gh, 0);
357 gfs2_holder_wake(gh); 404 gfs2_holder_wake(gh);
358 } 405 }
359} 406}
@@ -392,7 +439,7 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
392 if (held2) 439 if (held2)
393 gfs2_glock_hold(gl); 440 gfs2_glock_hold(gl);
394 else 441 else
395 gfs2_glock_put(gl); 442 gfs2_glock_put_nolock(gl);
396 } 443 }
397 444
398 gl->gl_state = new_state; 445 gl->gl_state = new_state;
@@ -422,6 +469,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
422 int rv; 469 int rv;
423 470
424 spin_lock(&gl->gl_spin); 471 spin_lock(&gl->gl_spin);
472 trace_gfs2_glock_state_change(gl, state);
425 state_change(gl, state); 473 state_change(gl, state);
426 gh = find_first_waiter(gl); 474 gh = find_first_waiter(gl);
427 475
@@ -626,12 +674,35 @@ out:
626out_sched: 674out_sched:
627 gfs2_glock_hold(gl); 675 gfs2_glock_hold(gl);
628 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 676 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
629 gfs2_glock_put(gl); 677 gfs2_glock_put_nolock(gl);
630out_unlock: 678out_unlock:
631 clear_bit(GLF_LOCK, &gl->gl_flags); 679 clear_bit(GLF_LOCK, &gl->gl_flags);
632 goto out; 680 goto out;
633} 681}
634 682
683static void delete_work_func(struct work_struct *work)
684{
685 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
686 struct gfs2_sbd *sdp = gl->gl_sbd;
687 struct gfs2_inode *ip = NULL;
688 struct inode *inode;
689 u64 no_addr = 0;
690
691 spin_lock(&gl->gl_spin);
692 ip = (struct gfs2_inode *)gl->gl_object;
693 if (ip)
694 no_addr = ip->i_no_addr;
695 spin_unlock(&gl->gl_spin);
696 if (ip) {
697 inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
698 if (inode) {
699 d_prune_aliases(inode);
700 iput(inode);
701 }
702 }
703 gfs2_glock_put(gl);
704}
705
635static void glock_work_func(struct work_struct *work) 706static void glock_work_func(struct work_struct *work)
636{ 707{
637 unsigned long delay = 0; 708 unsigned long delay = 0;
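
delete_work_func() above is the consumer half of the new gfs2_delete_workqueue: it re-finds the inode by number and uses d_prune_aliases()/iput() so the final deallocation happens in ordinary process context rather than in the lock-callback path that noticed the glock should go. A sketch of the producer half, assuming a glock-ops ->go_callback along these lines (the hook is invoked via gl_ops->go_callback in handle_callback() above; this function name is hypothetical):

    static void example_go_callback(struct gfs2_glock *gl)
    {
            gfs2_glock_hold(gl);            /* reference handed to ... */
            if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
                    gfs2_glock_put(gl);     /* ... delete_work_func(); undo
                                             * the hold if the work was
                                             * already queued */
    }
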
@@ -710,6 +781,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
710 gl->gl_sbd = sdp; 781 gl->gl_sbd = sdp;
711 gl->gl_aspace = NULL; 782 gl->gl_aspace = NULL;
712 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); 783 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
784 INIT_WORK(&gl->gl_delete, delete_work_func);
713 785
714 /* If this glock protects actual on-disk data or metadata blocks, 786 /* If this glock protects actual on-disk data or metadata blocks,
715 create a VFS inode to manage the pages/buffers holding them. */ 787 create a VFS inode to manage the pages/buffers holding them. */
@@ -796,22 +868,37 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
796 gh->gh_ip = 0; 868 gh->gh_ip = 0;
797} 869}
798 870
799static int just_schedule(void *word) 871/**
872 * gfs2_glock_holder_wait
873 * @word: unused
874 *
875 * This function and gfs2_glock_demote_wait both show up in the WCHAN
876 * field. Thus I've separated these otherwise identical functions in
877 * order to be more informative to the user.
878 */
879
880static int gfs2_glock_holder_wait(void *word)
800{ 881{
801 schedule(); 882 schedule();
802 return 0; 883 return 0;
803} 884}
804 885
886static int gfs2_glock_demote_wait(void *word)
887{
888 schedule();
889 return 0;
890}
891
805static void wait_on_holder(struct gfs2_holder *gh) 892static void wait_on_holder(struct gfs2_holder *gh)
806{ 893{
807 might_sleep(); 894 might_sleep();
808 wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); 895 wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE);
809} 896}
810 897
811static void wait_on_demote(struct gfs2_glock *gl) 898static void wait_on_demote(struct gfs2_glock *gl)
812{ 899{
813 might_sleep(); 900 might_sleep();
814 wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE); 901 wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE);
815} 902}
816 903
817/** 904/**
@@ -836,6 +923,9 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
836 gl->gl_demote_state != state) { 923 gl->gl_demote_state != state) {
837 gl->gl_demote_state = LM_ST_UNLOCKED; 924 gl->gl_demote_state = LM_ST_UNLOCKED;
838 } 925 }
926 if (gl->gl_ops->go_callback)
927 gl->gl_ops->go_callback(gl);
928 trace_gfs2_demote_rq(gl);
839} 929}
840 930
841/** 931/**
@@ -921,6 +1011,7 @@ fail:
921 goto do_cancel; 1011 goto do_cancel;
922 return; 1012 return;
923 } 1013 }
1014 trace_gfs2_glock_queue(gh, 1);
924 list_add_tail(&gh->gh_list, insert_pt); 1015 list_add_tail(&gh->gh_list, insert_pt);
925do_cancel: 1016do_cancel:
926 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); 1017 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
@@ -1017,6 +1108,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1017 !test_bit(GLF_DEMOTE, &gl->gl_flags)) 1108 !test_bit(GLF_DEMOTE, &gl->gl_flags))
1018 fast_path = 1; 1109 fast_path = 1;
1019 } 1110 }
1111 trace_gfs2_glock_queue(gh, 0);
1020 spin_unlock(&gl->gl_spin); 1112 spin_unlock(&gl->gl_spin);
1021 if (likely(fast_path)) 1113 if (likely(fast_path))
1022 return; 1114 return;
@@ -1249,33 +1341,12 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1249 gfs2_glock_put(gl); 1341 gfs2_glock_put(gl);
1250} 1342}
1251 1343
1252/**
1253 * demote_ok - Check to see if it's ok to unlock a glock
1254 * @gl: the glock
1255 *
1256 * Returns: 1 if it's ok
1257 */
1258
1259static int demote_ok(const struct gfs2_glock *gl)
1260{
1261 const struct gfs2_glock_operations *glops = gl->gl_ops;
1262
1263 if (gl->gl_state == LM_ST_UNLOCKED)
1264 return 0;
1265 if (!list_empty(&gl->gl_holders))
1266 return 0;
1267 if (glops->go_demote_ok)
1268 return glops->go_demote_ok(gl);
1269 return 1;
1270}
1271
1272 1344
1273static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) 1345static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1274{ 1346{
1275 struct gfs2_glock *gl; 1347 struct gfs2_glock *gl;
1276 int may_demote; 1348 int may_demote;
1277 int nr_skipped = 0; 1349 int nr_skipped = 0;
1278 int got_ref = 0;
1279 LIST_HEAD(skipped); 1350 LIST_HEAD(skipped);
1280 1351
1281 if (nr == 0) 1352 if (nr == 0)
@@ -1290,37 +1361,29 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1290 list_del_init(&gl->gl_lru); 1361 list_del_init(&gl->gl_lru);
1291 atomic_dec(&lru_count); 1362 atomic_dec(&lru_count);
1292 1363
1364 /* Check if glock is about to be freed */
1365 if (atomic_read(&gl->gl_ref) == 0)
1366 continue;
1367
1293 /* Test for being demotable */ 1368 /* Test for being demotable */
1294 if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { 1369 if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
1295 gfs2_glock_hold(gl); 1370 gfs2_glock_hold(gl);
1296 got_ref = 1;
1297 spin_unlock(&lru_lock); 1371 spin_unlock(&lru_lock);
1298 spin_lock(&gl->gl_spin); 1372 spin_lock(&gl->gl_spin);
1299 may_demote = demote_ok(gl); 1373 may_demote = demote_ok(gl);
1300 spin_unlock(&gl->gl_spin);
1301 clear_bit(GLF_LOCK, &gl->gl_flags);
1302 if (may_demote) { 1374 if (may_demote) {
1303 handle_callback(gl, LM_ST_UNLOCKED, 0); 1375 handle_callback(gl, LM_ST_UNLOCKED, 0);
1304 nr--; 1376 nr--;
1305 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1306 gfs2_glock_put(gl);
1307 got_ref = 0;
1308 } 1377 }
1378 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1379 gfs2_glock_put_nolock(gl);
1380 spin_unlock(&gl->gl_spin);
1381 clear_bit(GLF_LOCK, &gl->gl_flags);
1309 spin_lock(&lru_lock); 1382 spin_lock(&lru_lock);
1310 if (may_demote) 1383 continue;
1311 continue;
1312 }
1313 if (list_empty(&gl->gl_lru) &&
1314 (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
1315 nr_skipped++;
1316 list_add(&gl->gl_lru, &skipped);
1317 }
1318 if (got_ref) {
1319 spin_unlock(&lru_lock);
1320 gfs2_glock_put(gl);
1321 spin_lock(&lru_lock);
1322 got_ref = 0;
1323 } 1384 }
1385 nr_skipped++;
1386 list_add(&gl->gl_lru, &skipped);
1324 } 1387 }
1325 list_splice(&skipped, &lru_list); 1388 list_splice(&skipped, &lru_list);
1326 atomic_add(nr_skipped, &lru_count); 1389 atomic_add(nr_skipped, &lru_count);
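
The reworked loop above is written against the 2.6.31-era shrinker contract:
the callback is invoked with nr == 0 to ask how many objects are cached, and
with nr > 0 to ask for that many to be scanned, returning the remaining count
(or -1 if it cannot make progress under this gfp_mask). A minimal
registration sketch, with a hypothetical example_cached counter:

	static atomic_t example_cached = ATOMIC_INIT(0);

	static int example_shrink(int nr, gfp_t gfp_mask)
	{
		if (nr == 0)
			return atomic_read(&example_cached);	/* just report */
		if (!(gfp_mask & __GFP_FS))
			return -1;	/* must not recurse into the fs */
		/* ... free up to nr objects, decrementing example_cached ... */
		return atomic_read(&example_cached);
	}

	static struct shrinker example_shrinker = {
		.shrink = example_shrink,
		.seeks = DEFAULT_SEEKS,
	};

	/* register_shrinker(&example_shrinker) at init,
	   unregister_shrinker(&example_shrinker) at exit */
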
@@ -1702,6 +1765,11 @@ int __init gfs2_glock_init(void)
1702 glock_workqueue = create_workqueue("glock_workqueue"); 1765 glock_workqueue = create_workqueue("glock_workqueue");
1703 if (IS_ERR(glock_workqueue)) 1766 if (IS_ERR(glock_workqueue))
1704 return PTR_ERR(glock_workqueue); 1767 return PTR_ERR(glock_workqueue);
1768 gfs2_delete_workqueue = create_workqueue("delete_workqueue");
1769 if (IS_ERR(gfs2_delete_workqueue)) {
1770 destroy_workqueue(glock_workqueue);
1771 return PTR_ERR(gfs2_delete_workqueue);
1772 }
1705 1773
1706 register_shrinker(&glock_shrinker); 1774 register_shrinker(&glock_shrinker);
1707 1775
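
Creating a second workqueue forces a partial-unwind error path: if the second
allocation fails, the first queue must be destroyed before returning. One
hedge on the code above: in mainline, create_workqueue() conventionally
returns NULL on failure rather than an ERR_PTR, so the usual idiom is a NULL
check, roughly:

	static struct workqueue_struct *wq_a, *wq_b;

	static int __init example_init(void)
	{
		wq_a = create_workqueue("example_a");
		if (!wq_a)
			return -ENOMEM;
		wq_b = create_workqueue("example_b");
		if (!wq_b) {
			destroy_workqueue(wq_a);	/* unwind the first */
			return -ENOMEM;
		}
		return 0;
	}
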
@@ -1712,6 +1780,7 @@ void gfs2_glock_exit(void)
1712{ 1780{
1713 unregister_shrinker(&glock_shrinker); 1781 unregister_shrinker(&glock_shrinker);
1714 destroy_workqueue(glock_workqueue); 1782 destroy_workqueue(glock_workqueue);
1783 destroy_workqueue(gfs2_delete_workqueue);
1715} 1784}
1716 1785
1717static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) 1786static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index a602a28f6f08..c609894ec0d0 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -143,6 +143,7 @@ struct lm_lockops {
143 143
144#define GLR_TRYFAILED 13 144#define GLR_TRYFAILED 13
145 145
146extern struct workqueue_struct *gfs2_delete_workqueue;
146static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) 147static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
147{ 148{
148 struct gfs2_holder *gh; 149 struct gfs2_holder *gh;
@@ -191,6 +192,8 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
191int gfs2_glock_get(struct gfs2_sbd *sdp, 192int gfs2_glock_get(struct gfs2_sbd *sdp,
192 u64 number, const struct gfs2_glock_operations *glops, 193 u64 number, const struct gfs2_glock_operations *glops,
193 int create, struct gfs2_glock **glp); 194 int create, struct gfs2_glock **glp);
195void gfs2_glock_hold(struct gfs2_glock *gl);
196void gfs2_glock_put_nolock(struct gfs2_glock *gl);
194int gfs2_glock_put(struct gfs2_glock *gl); 197int gfs2_glock_put(struct gfs2_glock *gl);
195void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, 198void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
196 struct gfs2_holder *gh); 199 struct gfs2_holder *gh);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 70f87f43afa2..6985eef06c39 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -310,24 +310,6 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
310} 310}
311 311
312/** 312/**
313 * rgrp_go_dump - print out an rgrp
314 * @seq: The iterator
315 * @gl: The glock in question
316 *
317 */
318
319static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
320{
321 const struct gfs2_rgrpd *rgd = gl->gl_object;
322 if (rgd == NULL)
323 return 0;
324 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
325 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
326 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
327 return 0;
328}
329
330/**
331 * trans_go_sync - promote/demote the transaction glock 313 * trans_go_sync - promote/demote the transaction glock
332 * @gl: the glock 314 * @gl: the glock
333 * @state: the requested state 315 * @state: the requested state
@@ -341,6 +323,7 @@ static void trans_go_sync(struct gfs2_glock *gl)
341 323
342 if (gl->gl_state != LM_ST_UNLOCKED && 324 if (gl->gl_state != LM_ST_UNLOCKED &&
343 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { 325 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
326 flush_workqueue(gfs2_delete_workqueue);
344 gfs2_meta_syncfs(sdp); 327 gfs2_meta_syncfs(sdp);
345 gfs2_log_shutdown(sdp); 328 gfs2_log_shutdown(sdp);
346 } 329 }
@@ -390,6 +373,25 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl)
390 return 0; 373 return 0;
391} 374}
392 375
376/**
377 * iopen_go_callback - schedule the dcache entry for the inode to be deleted
378 * @gl: the glock
379 *
 380 * The gl_spin lock is held while calling this function
381 */
382static void iopen_go_callback(struct gfs2_glock *gl)
383{
384 struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
385
386 if (gl->gl_demote_state == LM_ST_UNLOCKED &&
387 gl->gl_state == LM_ST_SHARED &&
388 ip && test_bit(GIF_USER, &ip->i_flags)) {
389 gfs2_glock_hold(gl);
390 if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
391 gfs2_glock_put_nolock(gl);
392 }
393}
394
393const struct gfs2_glock_operations gfs2_meta_glops = { 395const struct gfs2_glock_operations gfs2_meta_glops = {
394 .go_type = LM_TYPE_META, 396 .go_type = LM_TYPE_META,
395}; 397};
@@ -410,7 +412,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
410 .go_demote_ok = rgrp_go_demote_ok, 412 .go_demote_ok = rgrp_go_demote_ok,
411 .go_lock = rgrp_go_lock, 413 .go_lock = rgrp_go_lock,
412 .go_unlock = rgrp_go_unlock, 414 .go_unlock = rgrp_go_unlock,
413 .go_dump = rgrp_go_dump, 415 .go_dump = gfs2_rgrp_dump,
414 .go_type = LM_TYPE_RGRP, 416 .go_type = LM_TYPE_RGRP,
415 .go_min_hold_time = HZ / 5, 417 .go_min_hold_time = HZ / 5,
416}; 418};
@@ -424,6 +426,7 @@ const struct gfs2_glock_operations gfs2_trans_glops = {
424 426
425const struct gfs2_glock_operations gfs2_iopen_glops = { 427const struct gfs2_glock_operations gfs2_iopen_glops = {
426 .go_type = LM_TYPE_IOPEN, 428 .go_type = LM_TYPE_IOPEN,
429 .go_callback = iopen_go_callback,
427}; 430};
428 431
429const struct gfs2_glock_operations gfs2_flock_glops = { 432const struct gfs2_glock_operations gfs2_flock_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 399d1b978049..61801ada36f0 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,6 +12,7 @@
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/workqueue.h> 14#include <linux/workqueue.h>
15#include <linux/slow-work.h>
15#include <linux/dlm.h> 16#include <linux/dlm.h>
16#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
17 18
@@ -63,9 +64,12 @@ struct gfs2_log_element {
63 const struct gfs2_log_operations *le_ops; 64 const struct gfs2_log_operations *le_ops;
64}; 65};
65 66
67#define GBF_FULL 1
68
66struct gfs2_bitmap { 69struct gfs2_bitmap {
67 struct buffer_head *bi_bh; 70 struct buffer_head *bi_bh;
68 char *bi_clone; 71 char *bi_clone;
72 unsigned long bi_flags;
69 u32 bi_offset; 73 u32 bi_offset;
70 u32 bi_start; 74 u32 bi_start;
71 u32 bi_len; 75 u32 bi_len;
@@ -90,10 +94,11 @@ struct gfs2_rgrpd {
90 struct gfs2_sbd *rd_sbd; 94 struct gfs2_sbd *rd_sbd;
91 unsigned int rd_bh_count; 95 unsigned int rd_bh_count;
92 u32 rd_last_alloc; 96 u32 rd_last_alloc;
93 unsigned char rd_flags; 97 u32 rd_flags;
94#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ 98#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
95#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ 99#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
96#define GFS2_RDF_UPTODATE 0x04 /* rg is up to date */ 100#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
101#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */
97}; 102};
98 103
99enum gfs2_state_bits { 104enum gfs2_state_bits {
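
Widening rd_flags to u32 lets the low bits mirror the on-disk rg_flags while
the top nibble (GFS2_RDF_MASK) stays reserved for in-core state, so any
refresh from disk has to preserve the internal bits. A sketch of the merge
this layout implies (the helper name is hypothetical):

	static void rgrp_refresh_flags(struct gfs2_rgrpd *rgd, u32 ondisk)
	{
		/* keep the in-core bits, replace the on-disk ones */
		rgd->rd_flags = (rgd->rd_flags & GFS2_RDF_MASK) |
				(ondisk & ~GFS2_RDF_MASK);
	}
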
@@ -154,6 +159,7 @@ struct gfs2_glock_operations {
154 int (*go_lock) (struct gfs2_holder *gh); 159 int (*go_lock) (struct gfs2_holder *gh);
155 void (*go_unlock) (struct gfs2_holder *gh); 160 void (*go_unlock) (struct gfs2_holder *gh);
156 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); 161 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
162 void (*go_callback) (struct gfs2_glock *gl);
157 const int go_type; 163 const int go_type;
158 const unsigned long go_min_hold_time; 164 const unsigned long go_min_hold_time;
159}; 165};
@@ -223,6 +229,7 @@ struct gfs2_glock {
223 struct list_head gl_ail_list; 229 struct list_head gl_ail_list;
224 atomic_t gl_ail_count; 230 atomic_t gl_ail_count;
225 struct delayed_work gl_work; 231 struct delayed_work gl_work;
232 struct work_struct gl_delete;
226}; 233};
227 234
228#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ 235#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
@@ -376,11 +383,11 @@ struct gfs2_journal_extent {
376struct gfs2_jdesc { 383struct gfs2_jdesc {
377 struct list_head jd_list; 384 struct list_head jd_list;
378 struct list_head extent_list; 385 struct list_head extent_list;
379 386 struct slow_work jd_work;
380 struct inode *jd_inode; 387 struct inode *jd_inode;
388 unsigned long jd_flags;
389#define JDF_RECOVERY 1
381 unsigned int jd_jid; 390 unsigned int jd_jid;
382 int jd_dirty;
383
384 unsigned int jd_blocks; 391 unsigned int jd_blocks;
385}; 392};
386 393
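
Journal recovery moves from the dedicated recoverd thread to the slow-work
infrastructure: each journal descriptor embeds a struct slow_work initialised
against an ops table and enqueued on demand, with JDF_RECOVERY serialising
requests. A minimal sketch of the 2.6.31 slow-work API shape this depends on
(the example_* names are placeholders; the real ops table, gfs2_recover_ops,
lives in recovery.c and is not shown in this diff):

	static const struct slow_work_ops example_ops = {
		.get_ref = example_get_ref,	/* pin the containing object */
		.put_ref = example_put_ref,	/* unpin once the item is done */
		.execute = example_execute,	/* the actual, sleepable work */
	};

	/* setup:    slow_work_init(&jd->jd_work, &example_ops);
	   trigger:  slow_work_enqueue(&jd->jd_work);
	   lifetime: slow_work_register_user() / slow_work_unregister_user() */
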
@@ -390,9 +397,6 @@ struct gfs2_statfs_change_host {
390 s64 sc_dinodes; 397 s64 sc_dinodes;
391}; 398};
392 399
393#define GFS2_GLOCKD_DEFAULT 1
394#define GFS2_GLOCKD_MAX 16
395
396#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF 400#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF
397#define GFS2_QUOTA_OFF 0 401#define GFS2_QUOTA_OFF 0
398#define GFS2_QUOTA_ACCOUNT 1 402#define GFS2_QUOTA_ACCOUNT 1
@@ -418,6 +422,7 @@ struct gfs2_args {
418 unsigned int ar_data:2; /* ordered/writeback */ 422 unsigned int ar_data:2; /* ordered/writeback */
419 unsigned int ar_meta:1; /* mount metafs */ 423 unsigned int ar_meta:1; /* mount metafs */
420 unsigned int ar_discard:1; /* discard requests */ 424 unsigned int ar_discard:1; /* discard requests */
425 int ar_commit; /* Commit interval */
421}; 426};
422 427
423struct gfs2_tune { 428struct gfs2_tune {
@@ -426,7 +431,6 @@ struct gfs2_tune {
426 unsigned int gt_incore_log_blocks; 431 unsigned int gt_incore_log_blocks;
427 unsigned int gt_log_flush_secs; 432 unsigned int gt_log_flush_secs;
428 433
429 unsigned int gt_recoverd_secs;
430 unsigned int gt_logd_secs; 434 unsigned int gt_logd_secs;
431 435
432 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ 436 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -447,6 +451,7 @@ enum {
447 SDF_JOURNAL_LIVE = 1, 451 SDF_JOURNAL_LIVE = 1,
448 SDF_SHUTDOWN = 2, 452 SDF_SHUTDOWN = 2,
449 SDF_NOBARRIERS = 3, 453 SDF_NOBARRIERS = 3,
454 SDF_NORECOVERY = 4,
450}; 455};
451 456
452#define GFS2_FSNAME_LEN 256 457#define GFS2_FSNAME_LEN 256
@@ -493,7 +498,6 @@ struct lm_lockstruct {
493 unsigned long ls_flags; 498 unsigned long ls_flags;
494 dlm_lockspace_t *ls_dlm; 499 dlm_lockspace_t *ls_dlm;
495 500
496 int ls_recover_jid;
497 int ls_recover_jid_done; 501 int ls_recover_jid_done;
498 int ls_recover_jid_status; 502 int ls_recover_jid_status;
499}; 503};
@@ -582,7 +586,6 @@ struct gfs2_sbd {
582 586
583 /* Daemon stuff */ 587 /* Daemon stuff */
584 588
585 struct task_struct *sd_recoverd_process;
586 struct task_struct *sd_logd_process; 589 struct task_struct *sd_logd_process;
587 struct task_struct *sd_quotad_process; 590 struct task_struct *sd_quotad_process;
588 591
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5a31d426116f..2f94bd723698 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -30,7 +30,6 @@
30#include "inode.h" 30#include "inode.h"
31#include "log.h" 31#include "log.h"
32#include "meta_io.h" 32#include "meta_io.h"
33#include "ops_address.h"
34#include "quota.h" 33#include "quota.h"
35#include "rgrp.h" 34#include "rgrp.h"
36#include "trans.h" 35#include "trans.h"
@@ -1047,154 +1046,7 @@ fail:
1047 return ERR_PTR(error); 1046 return ERR_PTR(error);
1048} 1047}
1049 1048
1050/** 1049static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1051 * gfs2_rmdiri - Remove a directory
1052 * @dip: The parent directory of the directory to be removed
1053 * @name: The name of the directory to be removed
1054 * @ip: The GFS2 inode of the directory to be removed
1055 *
1056 * Assumes Glocks on dip and ip are held
1057 *
1058 * Returns: errno
1059 */
1060
1061int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
1062 struct gfs2_inode *ip)
1063{
1064 struct qstr dotname;
1065 int error;
1066
1067 if (ip->i_entries != 2) {
1068 if (gfs2_consist_inode(ip))
1069 gfs2_dinode_print(ip);
1070 return -EIO;
1071 }
1072
1073 error = gfs2_dir_del(dip, name);
1074 if (error)
1075 return error;
1076
1077 error = gfs2_change_nlink(dip, -1);
1078 if (error)
1079 return error;
1080
1081 gfs2_str2qstr(&dotname, ".");
1082 error = gfs2_dir_del(ip, &dotname);
1083 if (error)
1084 return error;
1085
1086 gfs2_str2qstr(&dotname, "..");
1087 error = gfs2_dir_del(ip, &dotname);
1088 if (error)
1089 return error;
1090
1091 /* It looks odd, but it really should be done twice */
1092 error = gfs2_change_nlink(ip, -1);
1093 if (error)
1094 return error;
1095
1096 error = gfs2_change_nlink(ip, -1);
1097 if (error)
1098 return error;
1099
1100 return error;
1101}
1102
1103/*
1104 * gfs2_unlink_ok - check to see that an inode is still in a directory
1105 * @dip: the directory
1106 * @name: the name of the file
1107 * @ip: the inode
1108 *
1109 * Assumes that the lock on (at least) @dip is held.
1110 *
1111 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
1112 */
1113
1114int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
1115 const struct gfs2_inode *ip)
1116{
1117 int error;
1118
1119 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
1120 return -EPERM;
1121
1122 if ((dip->i_inode.i_mode & S_ISVTX) &&
1123 dip->i_inode.i_uid != current_fsuid() &&
1124 ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
1125 return -EPERM;
1126
1127 if (IS_APPEND(&dip->i_inode))
1128 return -EPERM;
1129
1130 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
1131 if (error)
1132 return error;
1133
1134 error = gfs2_dir_check(&dip->i_inode, name, ip);
1135 if (error)
1136 return error;
1137
1138 return 0;
1139}
1140
1141/**
1142 * gfs2_readlinki - return the contents of a symlink
1143 * @ip: the symlink's inode
1144 * @buf: a pointer to the buffer to be filled
1145 * @len: a pointer to the length of @buf
1146 *
1147 * If @buf is too small, a piece of memory is kmalloc()ed and needs
1148 * to be freed by the caller.
1149 *
1150 * Returns: errno
1151 */
1152
1153int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1154{
1155 struct gfs2_holder i_gh;
1156 struct buffer_head *dibh;
1157 unsigned int x;
1158 int error;
1159
1160 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
1161 error = gfs2_glock_nq(&i_gh);
1162 if (error) {
1163 gfs2_holder_uninit(&i_gh);
1164 return error;
1165 }
1166
1167 if (!ip->i_disksize) {
1168 gfs2_consist_inode(ip);
1169 error = -EIO;
1170 goto out;
1171 }
1172
1173 error = gfs2_meta_inode_buffer(ip, &dibh);
1174 if (error)
1175 goto out;
1176
1177 x = ip->i_disksize + 1;
1178 if (x > *len) {
1179 *buf = kmalloc(x, GFP_NOFS);
1180 if (!*buf) {
1181 error = -ENOMEM;
1182 goto out_brelse;
1183 }
1184 }
1185
1186 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1187 *len = x;
1188
1189out_brelse:
1190 brelse(dibh);
1191out:
1192 gfs2_glock_dq_uninit(&i_gh);
1193 return error;
1194}
1195
1196static int
1197__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1198{ 1050{
1199 struct buffer_head *dibh; 1051 struct buffer_head *dibh;
1200 int error; 1052 int error;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c30be2b66580..c341aaf67adb 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -11,8 +11,16 @@
11#define __INODE_DOT_H__ 11#define __INODE_DOT_H__
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
14#include "util.h" 16#include "util.h"
15 17
18extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
19extern int gfs2_internal_read(struct gfs2_inode *ip,
20 struct file_ra_state *ra_state,
21 char *buf, loff_t *pos, unsigned size);
22extern void gfs2_set_aops(struct inode *inode);
23
16static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) 24static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
17{ 25{
18 return !ip->i_height; 26 return !ip->i_height;
@@ -73,30 +81,26 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
73} 81}
74 82
75 83
76void gfs2_set_iop(struct inode *inode); 84extern void gfs2_set_iop(struct inode *inode);
77struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
78 u64 no_addr, u64 no_formal_ino, 86 u64 no_addr, u64 no_formal_ino,
79 int skip_freeing); 87 int skip_freeing);
80struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
81 89
82int gfs2_inode_refresh(struct gfs2_inode *ip); 90extern int gfs2_inode_refresh(struct gfs2_inode *ip);
83 91
84int gfs2_dinode_dealloc(struct gfs2_inode *inode); 92extern int gfs2_dinode_dealloc(struct gfs2_inode *inode);
85int gfs2_change_nlink(struct gfs2_inode *ip, int diff); 93extern int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
86struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, 94extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
87 int is_root); 95 int is_root);
88struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, 96extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
89 unsigned int mode, dev_t dev); 97 const struct qstr *name,
90int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, 98 unsigned int mode, dev_t dev);
91 struct gfs2_inode *ip); 99extern int gfs2_permission(struct inode *inode, int mask);
92int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, 100extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
93 const struct gfs2_inode *ip); 101extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
94int gfs2_permission(struct inode *inode, int mask); 102extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
95int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); 103extern void gfs2_dinode_print(const struct gfs2_inode *ip);
96int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
97struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
98void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
99void gfs2_dinode_print(const struct gfs2_inode *ip);
100 104
101extern const struct inode_operations gfs2_file_iops; 105extern const struct inode_operations gfs2_file_iops;
102extern const struct inode_operations gfs2_dir_iops; 106extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 98918a756410..13c6237c5f67 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -28,6 +28,7 @@
28#include "meta_io.h" 28#include "meta_io.h"
29#include "util.h" 29#include "util.h"
30#include "dir.h" 30#include "dir.h"
31#include "trace_gfs2.h"
31 32
32#define PULL 1 33#define PULL 1
33 34
@@ -120,7 +121,7 @@ __acquires(&sdp->sd_log_lock)
120 lock_buffer(bh); 121 lock_buffer(bh);
121 if (test_clear_buffer_dirty(bh)) { 122 if (test_clear_buffer_dirty(bh)) {
122 bh->b_end_io = end_buffer_write_sync; 123 bh->b_end_io = end_buffer_write_sync;
123 submit_bh(WRITE, bh); 124 submit_bh(WRITE_SYNC_PLUG, bh);
124 } else { 125 } else {
125 unlock_buffer(bh); 126 unlock_buffer(bh);
126 brelse(bh); 127 brelse(bh);
@@ -313,6 +314,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
313 gfs2_log_lock(sdp); 314 gfs2_log_lock(sdp);
314 } 315 }
315 atomic_sub(blks, &sdp->sd_log_blks_free); 316 atomic_sub(blks, &sdp->sd_log_blks_free);
317 trace_gfs2_log_blocks(sdp, -blks);
316 gfs2_log_unlock(sdp); 318 gfs2_log_unlock(sdp);
317 mutex_unlock(&sdp->sd_log_reserve_mutex); 319 mutex_unlock(&sdp->sd_log_reserve_mutex);
318 320
@@ -333,6 +335,7 @@ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
333 335
334 gfs2_log_lock(sdp); 336 gfs2_log_lock(sdp);
335 atomic_add(blks, &sdp->sd_log_blks_free); 337 atomic_add(blks, &sdp->sd_log_blks_free);
338 trace_gfs2_log_blocks(sdp, blks);
336 gfs2_assert_withdraw(sdp, 339 gfs2_assert_withdraw(sdp,
337 atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); 340 atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
338 gfs2_log_unlock(sdp); 341 gfs2_log_unlock(sdp);
@@ -558,6 +561,7 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
558 561
559 gfs2_log_lock(sdp); 562 gfs2_log_lock(sdp);
560 atomic_add(dist, &sdp->sd_log_blks_free); 563 atomic_add(dist, &sdp->sd_log_blks_free);
564 trace_gfs2_log_blocks(sdp, dist);
561 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); 565 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
562 gfs2_log_unlock(sdp); 566 gfs2_log_unlock(sdp);
563 567
@@ -604,7 +608,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
604 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 608 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
605 goto skip_barrier; 609 goto skip_barrier;
606 get_bh(bh); 610 get_bh(bh);
607 submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh); 611 submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh);
608 wait_on_buffer(bh); 612 wait_on_buffer(bh);
609 if (buffer_eopnotsupp(bh)) { 613 if (buffer_eopnotsupp(bh)) {
610 clear_buffer_eopnotsupp(bh); 614 clear_buffer_eopnotsupp(bh);
@@ -664,7 +668,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
664 lock_buffer(bh); 668 lock_buffer(bh);
665 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { 669 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
666 bh->b_end_io = end_buffer_write_sync; 670 bh->b_end_io = end_buffer_write_sync;
667 submit_bh(WRITE, bh); 671 submit_bh(WRITE_SYNC_PLUG, bh);
668 } else { 672 } else {
669 unlock_buffer(bh); 673 unlock_buffer(bh);
670 brelse(bh); 674 brelse(bh);
@@ -715,6 +719,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
715 up_write(&sdp->sd_log_flush_lock); 719 up_write(&sdp->sd_log_flush_lock);
716 return; 720 return;
717 } 721 }
722 trace_gfs2_log_flush(sdp, 1);
718 723
719 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); 724 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
720 INIT_LIST_HEAD(&ai->ai_ail1_list); 725 INIT_LIST_HEAD(&ai->ai_ail1_list);
@@ -746,6 +751,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
746 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ 751 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
747 gfs2_log_lock(sdp); 752 gfs2_log_lock(sdp);
748 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ 753 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
754 trace_gfs2_log_blocks(sdp, -1);
749 gfs2_log_unlock(sdp); 755 gfs2_log_unlock(sdp);
750 log_write_header(sdp, 0, PULL); 756 log_write_header(sdp, 0, PULL);
751 } 757 }
@@ -763,8 +769,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
763 ai = NULL; 769 ai = NULL;
764 } 770 }
765 gfs2_log_unlock(sdp); 771 gfs2_log_unlock(sdp);
766 772 trace_gfs2_log_flush(sdp, 0);
767 sdp->sd_vfs->s_dirt = 0;
768 up_write(&sdp->sd_log_flush_lock); 773 up_write(&sdp->sd_log_flush_lock);
769 774
770 kfree(ai); 775 kfree(ai);
@@ -788,6 +793,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
788 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); 793 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
789 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; 794 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
790 atomic_add(unused, &sdp->sd_log_blks_free); 795 atomic_add(unused, &sdp->sd_log_blks_free);
796 trace_gfs2_log_blocks(sdp, unused);
791 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= 797 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
792 sdp->sd_jdesc->jd_blocks); 798 sdp->sd_jdesc->jd_blocks);
793 sdp->sd_log_blks_reserved = reserved; 799 sdp->sd_log_blks_reserved = reserved;
@@ -823,7 +829,6 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
823 log_refund(sdp, tr); 829 log_refund(sdp, tr);
824 buf_lo_incore_commit(sdp, tr); 830 buf_lo_incore_commit(sdp, tr);
825 831
826 sdp->sd_vfs->s_dirt = 1;
827 up_read(&sdp->sd_log_flush_lock); 832 up_read(&sdp->sd_log_flush_lock);
828 833
829 gfs2_log_lock(sdp); 834 gfs2_log_lock(sdp);
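
The trace_gfs2_log_blocks() calls threaded through this file are tracepoints
whose definitions live in the new trace_gfs2.h header, which is not part of
this hunk. As a hedged sketch of the TRACE_EVENT() shape such a definition
takes (the fields here are illustrative, not the actual GFS2 definition):

	TRACE_EVENT(gfs2_log_blocks,
		TP_PROTO(const struct gfs2_sbd *sdp, int blocks),
		TP_ARGS(sdp, blocks),
		TP_STRUCT__entry(
			__field(dev_t, dev)
			__field(int,   blocks)
		),
		TP_fast_assign(
			__entry->dev	= sdp->sd_vfs->s_dev;
			__entry->blocks	= blocks;
		),
		TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev),
			  MINOR(__entry->dev), __entry->blocks)
	);
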
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 80e4f5f898bb..9969ff062c5b 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -13,6 +13,8 @@
13#include <linux/completion.h> 13#include <linux/completion.h>
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/bio.h>
17#include <linux/fs.h>
16 18
17#include "gfs2.h" 19#include "gfs2.h"
18#include "incore.h" 20#include "incore.h"
@@ -25,6 +27,7 @@
25#include "rgrp.h" 27#include "rgrp.h"
26#include "trans.h" 28#include "trans.h"
27#include "util.h" 29#include "util.h"
30#include "trace_gfs2.h"
28 31
29/** 32/**
30 * gfs2_pin - Pin a buffer in memory 33 * gfs2_pin - Pin a buffer in memory
@@ -51,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
51 if (bd->bd_ail) 54 if (bd->bd_ail)
52 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); 55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
53 get_bh(bh); 56 get_bh(bh);
57 trace_gfs2_pin(bd, 1);
54} 58}
55 59
56/** 60/**
@@ -87,6 +91,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
87 bd->bd_ail = ai; 91 bd->bd_ail = ai;
88 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); 92 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
89 clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); 93 clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
94 trace_gfs2_pin(bd, 0);
90 gfs2_log_unlock(sdp); 95 gfs2_log_unlock(sdp);
91 unlock_buffer(bh); 96 unlock_buffer(bh);
92} 97}
@@ -189,7 +194,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
189 } 194 }
190 195
191 gfs2_log_unlock(sdp); 196 gfs2_log_unlock(sdp);
192 submit_bh(WRITE, bh); 197 submit_bh(WRITE_SYNC_PLUG, bh);
193 gfs2_log_lock(sdp); 198 gfs2_log_lock(sdp);
194 199
195 n = 0; 200 n = 0;
@@ -199,7 +204,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
199 gfs2_log_unlock(sdp); 204 gfs2_log_unlock(sdp);
200 lock_buffer(bd2->bd_bh); 205 lock_buffer(bd2->bd_bh);
201 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); 206 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
202 submit_bh(WRITE, bh); 207 submit_bh(WRITE_SYNC_PLUG, bh);
203 gfs2_log_lock(sdp); 208 gfs2_log_lock(sdp);
204 if (++n >= num) 209 if (++n >= num)
205 break; 210 break;
@@ -341,7 +346,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
341 sdp->sd_log_num_revoke--; 346 sdp->sd_log_num_revoke--;
342 347
343 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { 348 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
344 submit_bh(WRITE, bh); 349 submit_bh(WRITE_SYNC_PLUG, bh);
345 350
346 bh = gfs2_log_get_buf(sdp); 351 bh = gfs2_log_get_buf(sdp);
347 mh = (struct gfs2_meta_header *)bh->b_data; 352 mh = (struct gfs2_meta_header *)bh->b_data;
@@ -358,7 +363,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
358 } 363 }
359 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 364 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
360 365
361 submit_bh(WRITE, bh); 366 submit_bh(WRITE_SYNC_PLUG, bh);
362} 367}
363 368
364static void revoke_lo_before_scan(struct gfs2_jdesc *jd, 369static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -560,7 +565,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
560 ptr = bh_log_ptr(bh); 565 ptr = bh_log_ptr(bh);
561 566
562 get_bh(bh); 567 get_bh(bh);
563 submit_bh(WRITE, bh); 568 submit_bh(WRITE_SYNC_PLUG, bh);
564 gfs2_log_lock(sdp); 569 gfs2_log_lock(sdp);
565 while(!list_empty(list)) { 570 while(!list_empty(list)) {
566 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); 571 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
@@ -586,7 +591,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
586 } else { 591 } else {
587 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); 592 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
588 } 593 }
589 submit_bh(WRITE, bh1); 594 submit_bh(WRITE_SYNC_PLUG, bh1);
590 gfs2_log_lock(sdp); 595 gfs2_log_lock(sdp);
591 ptr += 2; 596 ptr += 2;
592 } 597 }
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a6892ed0840a..eacd78a5d082 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -15,6 +15,7 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <asm/atomic.h> 17#include <asm/atomic.h>
18#include <linux/slow-work.h>
18 19
19#include "gfs2.h" 20#include "gfs2.h"
20#include "incore.h" 21#include "incore.h"
@@ -113,12 +114,18 @@ static int __init init_gfs2_fs(void)
113 if (error) 114 if (error)
114 goto fail_unregister; 115 goto fail_unregister;
115 116
117 error = slow_work_register_user();
118 if (error)
119 goto fail_slow;
120
116 gfs2_register_debugfs(); 121 gfs2_register_debugfs();
117 122
118 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__); 123 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
119 124
120 return 0; 125 return 0;
121 126
127fail_slow:
128 unregister_filesystem(&gfs2meta_fs_type);
122fail_unregister: 129fail_unregister:
123 unregister_filesystem(&gfs2_fs_type); 130 unregister_filesystem(&gfs2_fs_type);
124fail: 131fail:
@@ -156,6 +163,7 @@ static void __exit exit_gfs2_fs(void)
156 gfs2_unregister_debugfs(); 163 gfs2_unregister_debugfs();
157 unregister_filesystem(&gfs2_fs_type); 164 unregister_filesystem(&gfs2_fs_type);
158 unregister_filesystem(&gfs2meta_fs_type); 165 unregister_filesystem(&gfs2meta_fs_type);
166 slow_work_unregister_user();
159 167
160 kmem_cache_destroy(gfs2_quotad_cachep); 168 kmem_cache_destroy(gfs2_quotad_cachep);
161 kmem_cache_destroy(gfs2_rgrpd_cachep); 169 kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 8d6f13256b26..cb8d7a93d5ec 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -31,19 +31,66 @@
31#include "rgrp.h" 31#include "rgrp.h"
32#include "trans.h" 32#include "trans.h"
33#include "util.h" 33#include "util.h"
34#include "ops_address.h"
35 34
36static int aspace_get_block(struct inode *inode, sector_t lblock, 35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
37 struct buffer_head *bh_result, int create)
38{ 36{
39 gfs2_assert_warn(inode->i_sb->s_fs_info, 0); 37 int err;
40 return -EOPNOTSUPP; 38 struct buffer_head *bh, *head;
41} 39 int nr_underway = 0;
40 int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
41 WRITE_SYNC_PLUG : WRITE));
42
43 BUG_ON(!PageLocked(page));
44 BUG_ON(!page_has_buffers(page));
45
46 head = page_buffers(page);
47 bh = head;
48
49 do {
50 if (!buffer_mapped(bh))
51 continue;
52 /*
53 * If it's a fully non-blocking write attempt and we cannot
54 * lock the buffer then redirty the page. Note that this can
55 * potentially cause a busy-wait loop from pdflush and kswapd
56 * activity, but those code paths have their own higher-level
57 * throttling.
58 */
59 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
60 lock_buffer(bh);
61 } else if (!trylock_buffer(bh)) {
62 redirty_page_for_writepage(wbc, page);
63 continue;
64 }
65 if (test_clear_buffer_dirty(bh)) {
66 mark_buffer_async_write(bh);
67 } else {
68 unlock_buffer(bh);
69 }
70 } while ((bh = bh->b_this_page) != head);
71
72 /*
73 * The page and its buffers are protected by PageWriteback(), so we can
74 * drop the bh refcounts early.
75 */
76 BUG_ON(PageWriteback(page));
77 set_page_writeback(page);
78
79 do {
80 struct buffer_head *next = bh->b_this_page;
81 if (buffer_async_write(bh)) {
82 submit_bh(write_op, bh);
83 nr_underway++;
84 }
85 bh = next;
86 } while (bh != head);
87 unlock_page(page);
42 88
43static int gfs2_aspace_writepage(struct page *page, 89 err = 0;
44 struct writeback_control *wbc) 90 if (nr_underway == 0)
45{ 91 end_page_writeback(page);
46 return block_write_full_page(page, aspace_get_block, wbc); 92
93 return err;
47} 94}
48 95
49static const struct address_space_operations aspace_aops = { 96static const struct address_space_operations aspace_aops = {
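
The open-coded writepage above replaces block_write_full_page() chiefly so
metadata writes can carry the BIO_RW_META hint and, under WB_SYNC_ALL
writeback, the synchronous plugged-write flag. The request-flag choice at the
top of the function condenses to the following (a restatement of the logic
shown, not new behaviour):

	static inline int meta_write_op(const struct writeback_control *wbc)
	{
		int op = 1 << BIO_RW_META;	/* tag the I/O as metadata */

		op |= (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC_PLUG : WRITE;
		return op;
	}
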
@@ -201,16 +248,32 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
201int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, 248int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
202 struct buffer_head **bhp) 249 struct buffer_head **bhp)
203{ 250{
204 *bhp = gfs2_getbuf(gl, blkno, CREATE); 251 struct gfs2_sbd *sdp = gl->gl_sbd;
205 if (!buffer_uptodate(*bhp)) { 252 struct buffer_head *bh;
206 ll_rw_block(READ_META, 1, bhp); 253
207 if (flags & DIO_WAIT) { 254 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
208 int error = gfs2_meta_wait(gl->gl_sbd, *bhp); 255 return -EIO;
209 if (error) { 256
210 brelse(*bhp); 257 *bhp = bh = gfs2_getbuf(gl, blkno, CREATE);
211 return error; 258
212 } 259 lock_buffer(bh);
213 } 260 if (buffer_uptodate(bh)) {
261 unlock_buffer(bh);
262 return 0;
263 }
264 bh->b_end_io = end_buffer_read_sync;
265 get_bh(bh);
266 submit_bh(READ_SYNC | (1 << BIO_RW_META), bh);
267 if (!(flags & DIO_WAIT))
268 return 0;
269
270 wait_on_buffer(bh);
271 if (unlikely(!buffer_uptodate(bh))) {
272 struct gfs2_trans *tr = current->journal_info;
273 if (tr && tr->tr_touched)
274 gfs2_io_error_bh(sdp, bh);
275 brelse(bh);
276 return -EIO;
214 } 277 }
215 278
216 return 0; 279 return 0;
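
gfs2_meta_read() now drives the I/O itself rather than going through
ll_rw_block(), which lets it bail out early on a shut-down filesystem and
flag read errors that hit a dirty transaction. The core is the classic
synchronous buffer-read pattern; a minimal restatement:

	static int read_bh_sync(struct buffer_head *bh)
	{
		lock_buffer(bh);
		if (buffer_uptodate(bh)) {	/* raced with another reader */
			unlock_buffer(bh);
			return 0;
		}
		bh->b_end_io = end_buffer_read_sync;	/* unlocks bh, drops our ref */
		get_bh(bh);
		submit_bh(READ_SYNC | (1 << BIO_RW_META), bh);
		wait_on_buffer(bh);
		return buffer_uptodate(bh) ? 0 : -EIO;
	}
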
@@ -404,7 +467,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
404 if (buffer_uptodate(first_bh)) 467 if (buffer_uptodate(first_bh))
405 goto out; 468 goto out;
406 if (!buffer_locked(first_bh)) 469 if (!buffer_locked(first_bh))
407 ll_rw_block(READ_META, 1, &first_bh); 470 ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh);
408 471
409 dblock++; 472 dblock++;
410 extlen--; 473 extlen--;
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
deleted file mode 100644
index f7e8527a21e0..000000000000
--- a/fs/gfs2/mount.c
+++ /dev/null
@@ -1,185 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/slab.h>
11#include <linux/spinlock.h>
12#include <linux/completion.h>
13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h>
15#include <linux/parser.h>
16
17#include "gfs2.h"
18#include "incore.h"
19#include "super.h"
20#include "sys.h"
21#include "util.h"
22
23enum {
24 Opt_lockproto,
25 Opt_locktable,
26 Opt_hostdata,
27 Opt_spectator,
28 Opt_ignore_local_fs,
29 Opt_localflocks,
30 Opt_localcaching,
31 Opt_debug,
32 Opt_nodebug,
33 Opt_upgrade,
34 Opt_acl,
35 Opt_noacl,
36 Opt_quota_off,
37 Opt_quota_account,
38 Opt_quota_on,
39 Opt_quota,
40 Opt_noquota,
41 Opt_suiddir,
42 Opt_nosuiddir,
43 Opt_data_writeback,
44 Opt_data_ordered,
45 Opt_meta,
46 Opt_discard,
47 Opt_nodiscard,
48 Opt_err,
49};
50
51static const match_table_t tokens = {
52 {Opt_lockproto, "lockproto=%s"},
53 {Opt_locktable, "locktable=%s"},
54 {Opt_hostdata, "hostdata=%s"},
55 {Opt_spectator, "spectator"},
56 {Opt_ignore_local_fs, "ignore_local_fs"},
57 {Opt_localflocks, "localflocks"},
58 {Opt_localcaching, "localcaching"},
59 {Opt_debug, "debug"},
60 {Opt_nodebug, "nodebug"},
61 {Opt_upgrade, "upgrade"},
62 {Opt_acl, "acl"},
63 {Opt_noacl, "noacl"},
64 {Opt_quota_off, "quota=off"},
65 {Opt_quota_account, "quota=account"},
66 {Opt_quota_on, "quota=on"},
67 {Opt_quota, "quota"},
68 {Opt_noquota, "noquota"},
69 {Opt_suiddir, "suiddir"},
70 {Opt_nosuiddir, "nosuiddir"},
71 {Opt_data_writeback, "data=writeback"},
72 {Opt_data_ordered, "data=ordered"},
73 {Opt_meta, "meta"},
74 {Opt_discard, "discard"},
75 {Opt_nodiscard, "nodiscard"},
76 {Opt_err, NULL}
77};
78
79/**
80 * gfs2_mount_args - Parse mount options
 81 * @sdp: the filesystem
 82 * @args: the args structure to fill in from the @options string
 83 *
 84 * Returns: errno
85 */
86
87int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
88{
89 char *o;
90 int token;
91 substring_t tmp[MAX_OPT_ARGS];
92
93 /* Split the options into tokens with the "," character and
94 process them */
95
96 while (1) {
97 o = strsep(&options, ",");
98 if (o == NULL)
99 break;
100 if (*o == '\0')
101 continue;
102
103 token = match_token(o, tokens, tmp);
104 switch (token) {
105 case Opt_lockproto:
106 match_strlcpy(args->ar_lockproto, &tmp[0],
107 GFS2_LOCKNAME_LEN);
108 break;
109 case Opt_locktable:
110 match_strlcpy(args->ar_locktable, &tmp[0],
111 GFS2_LOCKNAME_LEN);
112 break;
113 case Opt_hostdata:
114 match_strlcpy(args->ar_hostdata, &tmp[0],
115 GFS2_LOCKNAME_LEN);
116 break;
117 case Opt_spectator:
118 args->ar_spectator = 1;
119 break;
120 case Opt_ignore_local_fs:
121 args->ar_ignore_local_fs = 1;
122 break;
123 case Opt_localflocks:
124 args->ar_localflocks = 1;
125 break;
126 case Opt_localcaching:
127 args->ar_localcaching = 1;
128 break;
129 case Opt_debug:
130 args->ar_debug = 1;
131 break;
132 case Opt_nodebug:
133 args->ar_debug = 0;
134 break;
135 case Opt_upgrade:
136 args->ar_upgrade = 1;
137 break;
138 case Opt_acl:
139 args->ar_posix_acl = 1;
140 break;
141 case Opt_noacl:
142 args->ar_posix_acl = 0;
143 break;
144 case Opt_quota_off:
145 case Opt_noquota:
146 args->ar_quota = GFS2_QUOTA_OFF;
147 break;
148 case Opt_quota_account:
149 args->ar_quota = GFS2_QUOTA_ACCOUNT;
150 break;
151 case Opt_quota_on:
152 case Opt_quota:
153 args->ar_quota = GFS2_QUOTA_ON;
154 break;
155 case Opt_suiddir:
156 args->ar_suiddir = 1;
157 break;
158 case Opt_nosuiddir:
159 args->ar_suiddir = 0;
160 break;
161 case Opt_data_writeback:
162 args->ar_data = GFS2_DATA_WRITEBACK;
163 break;
164 case Opt_data_ordered:
165 args->ar_data = GFS2_DATA_ORDERED;
166 break;
167 case Opt_meta:
168 args->ar_meta = 1;
169 break;
170 case Opt_discard:
171 args->ar_discard = 1;
172 break;
173 case Opt_nodiscard:
174 args->ar_discard = 0;
175 break;
176 case Opt_err:
177 default:
178 fs_info(sdp, "invalid mount option: %s\n", o);
179 return -EINVAL;
180 }
181 }
182
183 return 0;
184}
185
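
With mount.c deleted, option parsing moves into super.c (outside this diff),
where the new ar_commit field implies a commit=<secs> mount option. A hedged
sketch of how an integer option is typically handled with match_int() in such
a parser; the Opt_commit token and surrounding switch are assumptions about
the relocated code, not lines from this patch:

	/* inside the match_token() switch, with "int rv;" declared above */
	case Opt_commit:
		rv = match_int(&tmp[0], &args->ar_commit);
		if (rv || args->ar_commit <= 0) {
			fs_info(sdp, "commit mount option requires "
				"a positive numeric argument\n");
			return rv ? rv : -EINVAL;
		}
		break;
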
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
deleted file mode 100644
index 5da21285bba4..000000000000
--- a/fs/gfs2/ops_address.h
+++ /dev/null
@@ -1,23 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_ADDRESS_DOT_H__
11#define __OPS_ADDRESS_DOT_H__
12
13#include <linux/fs.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16
17extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
18extern int gfs2_internal_read(struct gfs2_inode *ip,
19 struct file_ra_state *ra_state,
20 char *buf, loff_t *pos, unsigned size);
21extern void gfs2_set_aops(struct inode *inode);
22
23#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1ff9473ea753..7bc3c45cd676 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,6 +17,7 @@
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/slow-work.h>
20 21
21#include "gfs2.h" 22#include "gfs2.h"
22#include "incore.h" 23#include "incore.h"
@@ -32,6 +33,7 @@
32#include "log.h" 33#include "log.h"
33#include "quota.h" 34#include "quota.h"
34#include "dir.h" 35#include "dir.h"
36#include "trace_gfs2.h"
35 37
36#define DO 0 38#define DO 0
37#define UNDO 1 39#define UNDO 1
@@ -55,8 +57,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
55 spin_lock_init(&gt->gt_spin); 57 spin_lock_init(&gt->gt_spin);
56 58
57 gt->gt_incore_log_blocks = 1024; 59 gt->gt_incore_log_blocks = 1024;
58 gt->gt_log_flush_secs = 60;
59 gt->gt_recoverd_secs = 60;
60 gt->gt_logd_secs = 1; 60 gt->gt_logd_secs = 1;
61 gt->gt_quota_simul_sync = 64; 61 gt->gt_quota_simul_sync = 64;
62 gt->gt_quota_warn_period = 10; 62 gt->gt_quota_warn_period = 10;
@@ -526,11 +526,11 @@ static int init_sb(struct gfs2_sbd *sdp, int silent)
526 } 526 }
527 527
528 /* Set up the buffer cache and SB for real */ 528 /* Set up the buffer cache and SB for real */
529 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { 529 if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) {
530 ret = -EINVAL; 530 ret = -EINVAL;
531 fs_err(sdp, "FS block size (%u) is too small for device " 531 fs_err(sdp, "FS block size (%u) is too small for device "
532 "block size (%u)\n", 532 "block size (%u)\n",
533 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); 533 sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev));
534 goto out; 534 goto out;
535 } 535 }
536 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { 536 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
@@ -676,6 +676,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
676 break; 676 break;
677 677
678 INIT_LIST_HEAD(&jd->extent_list); 678 INIT_LIST_HEAD(&jd->extent_list);
679 slow_work_init(&jd->jd_work, &gfs2_recover_ops);
679 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); 680 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
680 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { 681 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
681 if (!jd->jd_inode) 682 if (!jd->jd_inode)
@@ -701,14 +702,13 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
701{ 702{
702 struct inode *master = sdp->sd_master_dir->d_inode; 703 struct inode *master = sdp->sd_master_dir->d_inode;
703 struct gfs2_holder ji_gh; 704 struct gfs2_holder ji_gh;
704 struct task_struct *p;
705 struct gfs2_inode *ip; 705 struct gfs2_inode *ip;
706 int jindex = 1; 706 int jindex = 1;
707 int error = 0; 707 int error = 0;
708 708
709 if (undo) { 709 if (undo) {
710 jindex = 0; 710 jindex = 0;
711 goto fail_recoverd; 711 goto fail_jinode_gh;
712 } 712 }
713 713
714 sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); 714 sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
@@ -776,6 +776,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
776 /* Map the extents for this journal's blocks */ 776 /* Map the extents for this journal's blocks */
777 map_journal_extents(sdp); 777 map_journal_extents(sdp);
778 } 778 }
779 trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free));
779 780
780 if (sdp->sd_lockstruct.ls_first) { 781 if (sdp->sd_lockstruct.ls_first) {
781 unsigned int x; 782 unsigned int x;
@@ -801,18 +802,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
801 gfs2_glock_dq_uninit(&ji_gh); 802 gfs2_glock_dq_uninit(&ji_gh);
802 jindex = 0; 803 jindex = 0;
803 804
804 p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
805 error = IS_ERR(p);
806 if (error) {
807 fs_err(sdp, "can't start recoverd thread: %d\n", error);
808 goto fail_jinode_gh;
809 }
810 sdp->sd_recoverd_process = p;
811
812 return 0; 805 return 0;
813 806
814fail_recoverd:
815 kthread_stop(sdp->sd_recoverd_process);
816fail_jinode_gh: 807fail_jinode_gh:
817 if (!sdp->sd_args.ar_spectator) 808 if (!sdp->sd_args.ar_spectator)
818 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); 809 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
@@ -1165,6 +1156,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1165 1156
1166 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; 1157 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
1167 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; 1158 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
1159 sdp->sd_args.ar_commit = 60;
1168 1160
1169 error = gfs2_mount_args(sdp, &sdp->sd_args, data); 1161 error = gfs2_mount_args(sdp, &sdp->sd_args, data);
1170 if (error) { 1162 if (error) {
@@ -1172,8 +1164,10 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1172 goto fail; 1164 goto fail;
1173 } 1165 }
1174 1166
1175 if (sdp->sd_args.ar_spectator) 1167 if (sdp->sd_args.ar_spectator) {
1176 sb->s_flags |= MS_RDONLY; 1168 sb->s_flags |= MS_RDONLY;
1169 set_bit(SDF_NORECOVERY, &sdp->sd_flags);
1170 }
1177 if (sdp->sd_args.ar_posix_acl) 1171 if (sdp->sd_args.ar_posix_acl)
1178 sb->s_flags |= MS_POSIXACL; 1172 sb->s_flags |= MS_POSIXACL;
1179 1173
@@ -1191,6 +1185,8 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1191 GFS2_BASIC_BLOCK_SHIFT; 1185 GFS2_BASIC_BLOCK_SHIFT;
1192 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; 1186 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
1193 1187
1188 sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
1189
1194 error = init_names(sdp, silent); 1190 error = init_names(sdp, silent);
1195 if (error) 1191 if (error)
1196 goto fail; 1192 goto fail;
@@ -1279,9 +1275,22 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1279 return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); 1275 return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
1280} 1276}
1281 1277
1282static struct super_block *get_gfs2_sb(const char *dev_name) 1278static int test_meta_super(struct super_block *s, void *ptr)
1279{
1280 struct block_device *bdev = ptr;
1281 return (bdev == s->s_bdev);
1282}
1283
1284static int set_meta_super(struct super_block *s, void *ptr)
1283{ 1285{
1284 struct super_block *sb; 1286 return -EINVAL;
1287}
1288
1289static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1290 const char *dev_name, void *data, struct vfsmount *mnt)
1291{
1292 struct super_block *s;
1293 struct gfs2_sbd *sdp;
1285 struct path path; 1294 struct path path;
1286 int error; 1295 int error;
1287 1296
@@ -1289,30 +1298,17 @@ static struct super_block *get_gfs2_sb(const char *dev_name)
1289 if (error) { 1298 if (error) {
1290 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", 1299 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
1291 dev_name, error); 1300 dev_name, error);
1292 return NULL; 1301 return error;
1293 } 1302 }
1294 sb = path.dentry->d_inode->i_sb; 1303 s = sget(&gfs2_fs_type, test_meta_super, set_meta_super,
1295 if (sb && (sb->s_type == &gfs2_fs_type)) 1304 path.dentry->d_inode->i_sb->s_bdev);
1296 atomic_inc(&sb->s_active);
1297 else
1298 sb = NULL;
1299 path_put(&path); 1305 path_put(&path);
1300 return sb; 1306 if (IS_ERR(s)) {
1301}
1302
1303static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1304 const char *dev_name, void *data, struct vfsmount *mnt)
1305{
1306 struct super_block *sb = NULL;
1307 struct gfs2_sbd *sdp;
1308
1309 sb = get_gfs2_sb(dev_name);
1310 if (!sb) {
1311 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1307 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
1312 return -ENOENT; 1308 return PTR_ERR(s);
1313 } 1309 }
1314 sdp = sb->s_fs_info; 1310 sdp = s->s_fs_info;
1315 mnt->mnt_sb = sb; 1311 mnt->mnt_sb = s;
1316 mnt->mnt_root = dget(sdp->sd_master_dir); 1312 mnt->mnt_root = dget(sdp->sd_master_dir);
1317 return 0; 1313 return 0;
1318} 1314}
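
The meta mount no longer pokes s_active by hand; it goes through sget(),
whose test callback recognises an existing superblock (here: same backing
block device) and whose set callback would initialise a brand-new one.
Because set_meta_super() returns -EINVAL, sget() can only ever hand back an
already-mounted gfs2 superblock, never create one. The contract, roughly:

	/* sget(type, test, set, data):
	 *  - walks type->fs_supers calling test(sb, data); a match is
	 *    returned with s_active elevated,
	 *  - otherwise a new sb is allocated and set(sb, data) is called;
	 *    a negative return from set aborts with that error.
	 */
	struct super_block *s = sget(&gfs2_fs_type, test_meta_super,
				     set_meta_super, bdev);
	if (IS_ERR(s))
		return PTR_ERR(s);
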
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1c70fa5168d6..f8bd20baf99c 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -262,6 +262,44 @@ out_parent:
262 return error; 262 return error;
263} 263}
264 264
265/*
266 * gfs2_unlink_ok - check to see that an inode is still in a directory
267 * @dip: the directory
268 * @name: the name of the file
269 * @ip: the inode
270 *
271 * Assumes that the lock on (at least) @dip is held.
272 *
273 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
274 */
275
276static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
277 const struct gfs2_inode *ip)
278{
279 int error;
280
281 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
282 return -EPERM;
283
284 if ((dip->i_inode.i_mode & S_ISVTX) &&
285 dip->i_inode.i_uid != current_fsuid() &&
286 ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
287 return -EPERM;
288
289 if (IS_APPEND(&dip->i_inode))
290 return -EPERM;
291
292 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
293 if (error)
294 return error;
295
296 error = gfs2_dir_check(&dip->i_inode, name, ip);
297 if (error)
298 return error;
299
300 return 0;
301}
302
265/** 303/**
266 * gfs2_unlink - Unlink a file 304 * gfs2_unlink - Unlink a file
267 * @dir: The inode of the directory containing the file to unlink 305 * @dir: The inode of the directory containing the file to unlink
@@ -473,6 +511,59 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
473} 511}
474 512
475/** 513/**
514 * gfs2_rmdiri - Remove a directory
515 * @dip: The parent directory of the directory to be removed
516 * @name: The name of the directory to be removed
517 * @ip: The GFS2 inode of the directory to be removed
518 *
519 * Assumes Glocks on dip and ip are held
520 *
521 * Returns: errno
522 */
523
524static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
525 struct gfs2_inode *ip)
526{
527 struct qstr dotname;
528 int error;
529
530 if (ip->i_entries != 2) {
531 if (gfs2_consist_inode(ip))
532 gfs2_dinode_print(ip);
533 return -EIO;
534 }
535
536 error = gfs2_dir_del(dip, name);
537 if (error)
538 return error;
539
540 error = gfs2_change_nlink(dip, -1);
541 if (error)
542 return error;
543
544 gfs2_str2qstr(&dotname, ".");
545 error = gfs2_dir_del(ip, &dotname);
546 if (error)
547 return error;
548
549 gfs2_str2qstr(&dotname, "..");
550 error = gfs2_dir_del(ip, &dotname);
551 if (error)
552 return error;
553
554 /* It looks odd, but it really should be done twice */
555 error = gfs2_change_nlink(ip, -1);
556 if (error)
557 return error;
558
559 error = gfs2_change_nlink(ip, -1);
560 if (error)
561 return error;
562
563 return error;
564}
565
566/**
476 * gfs2_rmdir - Remove a directory 567 * gfs2_rmdir - Remove a directory
477 * @dir: The parent directory of the directory to be removed 568 * @dir: The parent directory of the directory to be removed
478 * @dentry: The dentry of the directory to remove 569 * @dentry: The dentry of the directory to remove
@@ -885,6 +976,61 @@ out:
885} 976}
886 977
887/** 978/**
979 * gfs2_readlinki - return the contents of a symlink
980 * @ip: the symlink's inode
981 * @buf: a pointer to the buffer to be filled
982 * @len: a pointer to the length of @buf
983 *
984 * If @buf is too small, a piece of memory is kmalloc()ed and needs
985 * to be freed by the caller.
986 *
987 * Returns: errno
988 */
989
990static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
991{
992 struct gfs2_holder i_gh;
993 struct buffer_head *dibh;
994 unsigned int x;
995 int error;
996
997 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
998 error = gfs2_glock_nq(&i_gh);
999 if (error) {
1000 gfs2_holder_uninit(&i_gh);
1001 return error;
1002 }
1003
1004 if (!ip->i_disksize) {
1005 gfs2_consist_inode(ip);
1006 error = -EIO;
1007 goto out;
1008 }
1009
1010 error = gfs2_meta_inode_buffer(ip, &dibh);
1011 if (error)
1012 goto out;
1013
1014 x = ip->i_disksize + 1;
1015 if (x > *len) {
1016 *buf = kmalloc(x, GFP_NOFS);
1017 if (!*buf) {
1018 error = -ENOMEM;
1019 goto out_brelse;
1020 }
1021 }
1022
1023 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1024 *len = x;
1025
1026out_brelse:
1027 brelse(dibh);
1028out:
1029 gfs2_glock_dq_uninit(&i_gh);
1030 return error;
1031}
1032
1033/**
888 * gfs2_readlink - Read the value of a symlink 1034 * gfs2_readlink - Read the value of a symlink
889 * @dentry: the symlink 1035 * @dentry: the symlink
890 * @buf: the buffer to read the symlink data into 1036 * @buf: the buffer to read the symlink data into
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
deleted file mode 100644
index 458019569dcb..000000000000
--- a/fs/gfs2/ops_super.c
+++ /dev/null
@@ -1,723 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/statfs.h>
16#include <linux/seq_file.h>
17#include <linux/mount.h>
18#include <linux/kthread.h>
19#include <linux/delay.h>
20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h>
22#include <linux/time.h>
23
24#include "gfs2.h"
25#include "incore.h"
26#include "glock.h"
27#include "inode.h"
28#include "log.h"
29#include "quota.h"
30#include "recovery.h"
31#include "rgrp.h"
32#include "super.h"
33#include "sys.h"
34#include "util.h"
35#include "trans.h"
36#include "dir.h"
37#include "eattr.h"
38#include "bmap.h"
39#include "meta_io.h"
40
41#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
42
43/**
44 * gfs2_write_inode - Make sure the inode is stable on the disk
45 * @inode: The inode
46 * @sync: synchronous write flag
47 *
48 * Returns: errno
49 */
50
51static int gfs2_write_inode(struct inode *inode, int sync)
52{
53 struct gfs2_inode *ip = GFS2_I(inode);
54 struct gfs2_sbd *sdp = GFS2_SB(inode);
55 struct gfs2_holder gh;
56 struct buffer_head *bh;
57 struct timespec atime;
58 struct gfs2_dinode *di;
59 int ret = 0;
60
61 /* Check this is a "normal" inode, etc */
62 if (!test_bit(GIF_USER, &ip->i_flags) ||
63 (current->flags & PF_MEMALLOC))
64 return 0;
65 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
66 if (ret)
67 goto do_flush;
68 ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
69 if (ret)
70 goto do_unlock;
71 ret = gfs2_meta_inode_buffer(ip, &bh);
72 if (ret == 0) {
73 di = (struct gfs2_dinode *)bh->b_data;
74 atime.tv_sec = be64_to_cpu(di->di_atime);
75 atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
76 if (timespec_compare(&inode->i_atime, &atime) > 0) {
77 gfs2_trans_add_bh(ip->i_gl, bh, 1);
78 gfs2_dinode_out(ip, bh->b_data);
79 }
80 brelse(bh);
81 }
82 gfs2_trans_end(sdp);
83do_unlock:
84 gfs2_glock_dq_uninit(&gh);
85do_flush:
86 if (sync != 0)
87 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
88 return ret;
89}
90
91/**
92 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
93 * @sdp: the filesystem
94 *
95 * Returns: errno
96 */
97
98static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
99{
100 struct gfs2_holder t_gh;
101 int error;
102
103 gfs2_quota_sync(sdp);
104 gfs2_statfs_sync(sdp);
105
106 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
107 &t_gh);
108 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
109 return error;
110
111 gfs2_meta_syncfs(sdp);
112 gfs2_log_shutdown(sdp);
113
114 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
115
116 if (t_gh.gh_gl)
117 gfs2_glock_dq_uninit(&t_gh);
118
119 gfs2_quota_cleanup(sdp);
120
121 return error;
122}
123
124/**
125 * gfs2_put_super - Unmount the filesystem
126 * @sb: The VFS superblock
127 *
128 */
129
130static void gfs2_put_super(struct super_block *sb)
131{
132 struct gfs2_sbd *sdp = sb->s_fs_info;
133 int error;
134
135 /* Unfreeze the filesystem, if we need to */
136
137 mutex_lock(&sdp->sd_freeze_lock);
138 if (sdp->sd_freeze_count)
139 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
140 mutex_unlock(&sdp->sd_freeze_lock);
141
142 kthread_stop(sdp->sd_quotad_process);
143 kthread_stop(sdp->sd_logd_process);
144 kthread_stop(sdp->sd_recoverd_process);
145
146 if (!(sb->s_flags & MS_RDONLY)) {
147 error = gfs2_make_fs_ro(sdp);
148 if (error)
149 gfs2_io_error(sdp);
150 }
151 /* At this point, we're through modifying the disk */
152
153 /* Release stuff */
154
155 iput(sdp->sd_jindex);
156 iput(sdp->sd_inum_inode);
157 iput(sdp->sd_statfs_inode);
158 iput(sdp->sd_rindex);
159 iput(sdp->sd_quota_inode);
160
161 gfs2_glock_put(sdp->sd_rename_gl);
162 gfs2_glock_put(sdp->sd_trans_gl);
163
164 if (!sdp->sd_args.ar_spectator) {
165 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
166 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
167 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
168 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
169 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
170 iput(sdp->sd_ir_inode);
171 iput(sdp->sd_sc_inode);
172 iput(sdp->sd_qc_inode);
173 }
174
175 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
176 gfs2_clear_rgrpd(sdp);
177 gfs2_jindex_free(sdp);
178 /* Take apart glock structures and buffer lists */
179 gfs2_gl_hash_clear(sdp);
180 /* Unmount the locking protocol */
181 gfs2_lm_unmount(sdp);
182
183 /* At this point, we're through participating in the lockspace */
184 gfs2_sys_fs_del(sdp);
185}
186
187/**
188 * gfs2_write_super
189 * @sb: the superblock
190 *
191 */
192
193static void gfs2_write_super(struct super_block *sb)
194{
195 sb->s_dirt = 0;
196}
197
198/**
199 * gfs2_sync_fs - sync the filesystem
200 * @sb: the superblock
201 *
202 * Flushes the log to disk.
203 */
204
205static int gfs2_sync_fs(struct super_block *sb, int wait)
206{
207 sb->s_dirt = 0;
208 if (wait && sb->s_fs_info)
209 gfs2_log_flush(sb->s_fs_info, NULL);
210 return 0;
211}
212
213/**
214 * gfs2_freeze - prevent further writes to the filesystem
215 * @sb: the VFS structure for the filesystem
216 *
217 */
218
219static int gfs2_freeze(struct super_block *sb)
220{
221 struct gfs2_sbd *sdp = sb->s_fs_info;
222 int error;
223
224 if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
225 return -EINVAL;
226
227 for (;;) {
228 error = gfs2_freeze_fs(sdp);
229 if (!error)
230 break;
231
232 switch (error) {
233 case -EBUSY:
234 fs_err(sdp, "waiting for recovery before freeze\n");
235 break;
236
237 default:
238 fs_err(sdp, "error freezing FS: %d\n", error);
239 break;
240 }
241
242 fs_err(sdp, "retrying...\n");
243 msleep(1000);
244 }
245 return 0;
246}
247
248/**
249 * gfs2_unfreeze - reallow writes to the filesystem
250 * @sb: the VFS structure for the filesystem
251 *
252 */
253
254static int gfs2_unfreeze(struct super_block *sb)
255{
256 gfs2_unfreeze_fs(sb->s_fs_info);
257 return 0;
258}
259
260/**
261 * statfs_slow_fill - fill in the sc for a given RG
262 * @rgd: the RG
263 * @sc: the sc structure
264 *
265 * Returns: 0 on success, -ESTALE if the LVB is invalid
266 */
267
268static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
269 struct gfs2_statfs_change_host *sc)
270{
271 gfs2_rgrp_verify(rgd);
272 sc->sc_total += rgd->rd_data;
273 sc->sc_free += rgd->rd_free;
274 sc->sc_dinodes += rgd->rd_dinodes;
275 return 0;
276}
277
278/**
279 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
280 * @sdp: the filesystem
281 * @sc: the sc info that will be returned
282 *
283 * Any error (other than a signal) will cause this routine to fall back
284 * to the synchronous version.
285 *
286 * FIXME: This really shouldn't busy wait like this.
287 *
288 * Returns: errno
289 */
290
291static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
292{
293 struct gfs2_holder ri_gh;
294 struct gfs2_rgrpd *rgd_next;
295 struct gfs2_holder *gha, *gh;
296 unsigned int slots = 64;
297 unsigned int x;
298 int done;
299 int error = 0, err;
300
301 memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
302 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
303 if (!gha)
304 return -ENOMEM;
305
306 error = gfs2_rindex_hold(sdp, &ri_gh);
307 if (error)
308 goto out;
309
310 rgd_next = gfs2_rgrpd_get_first(sdp);
311
312 for (;;) {
313 done = 1;
314
315 for (x = 0; x < slots; x++) {
316 gh = gha + x;
317
318 if (gh->gh_gl && gfs2_glock_poll(gh)) {
319 err = gfs2_glock_wait(gh);
320 if (err) {
321 gfs2_holder_uninit(gh);
322 error = err;
323 } else {
324 if (!error)
325 error = statfs_slow_fill(
326 gh->gh_gl->gl_object, sc);
327 gfs2_glock_dq_uninit(gh);
328 }
329 }
330
331 if (gh->gh_gl)
332 done = 0;
333 else if (rgd_next && !error) {
334 error = gfs2_glock_nq_init(rgd_next->rd_gl,
335 LM_ST_SHARED,
336 GL_ASYNC,
337 gh);
338 rgd_next = gfs2_rgrpd_get_next(rgd_next);
339 done = 0;
340 }
341
342 if (signal_pending(current))
343 error = -ERESTARTSYS;
344 }
345
346 if (done)
347 break;
348
349 yield();
350 }
351
352 gfs2_glock_dq_uninit(&ri_gh);
353
354out:
355 kfree(gha);
356 return error;
357}
358
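gfs2_statfs_slow() above keeps up to 64 GL_ASYNC lock requests in flight at once: poll each slot, harvest finished grants, refill from the remaining rgrps, and stop when every slot is idle. A standalone model of that control flow, with fake countdown latencies standing in for lock grants:

#include <stdio.h>

#define SLOTS 4

int main(void)
{
        int pending[SLOTS] = { 0 };     /* 0 = idle, >0 = ticks left */
        int next_rgrp = 0, total = 10, done;

        do {
                int x;
                done = 1;
                for (x = 0; x < SLOTS; x++) {
                        if (pending[x] && --pending[x] == 0)
                                printf("harvested rgrp stats\n");
                        if (pending[x])
                                done = 0;
                        else if (next_rgrp < total) {
                                pending[x] = 1 + next_rgrp % 3;
                                next_rgrp++;
                                done = 0;
                        }
                }
        } while (!done);
        return 0;
}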
359/**
360 * gfs2_statfs_i - Do a statfs
361 * @sdp: the filesystem
362 * @sc: the sc structure to fill in
363 *
364 * Returns: errno
365 */
366
367static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
368{
369 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
370 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
371
372 spin_lock(&sdp->sd_statfs_spin);
373
374 *sc = *m_sc;
375 sc->sc_total += l_sc->sc_total;
376 sc->sc_free += l_sc->sc_free;
377 sc->sc_dinodes += l_sc->sc_dinodes;
378
379 spin_unlock(&sdp->sd_statfs_spin);
380
381 if (sc->sc_free < 0)
382 sc->sc_free = 0;
383 if (sc->sc_free > sc->sc_total)
384 sc->sc_free = sc->sc_total;
385 if (sc->sc_dinodes < 0)
386 sc->sc_dinodes = 0;
387
388 return 0;
389}
390
391/**
392 * gfs2_statfs - Gather and return stats about the filesystem
393 * @sb: The superblock
394 * @statfsbuf: The buffer
395 *
396 * Returns: 0 on success or error code
397 */
398
399static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
400{
401 struct super_block *sb = dentry->d_inode->i_sb;
402 struct gfs2_sbd *sdp = sb->s_fs_info;
403 struct gfs2_statfs_change_host sc;
404 int error;
405
406 if (gfs2_tune_get(sdp, gt_statfs_slow))
407 error = gfs2_statfs_slow(sdp, &sc);
408 else
409 error = gfs2_statfs_i(sdp, &sc);
410
411 if (error)
412 return error;
413
414 buf->f_type = GFS2_MAGIC;
415 buf->f_bsize = sdp->sd_sb.sb_bsize;
416 buf->f_blocks = sc.sc_total;
417 buf->f_bfree = sc.sc_free;
418 buf->f_bavail = sc.sc_free;
419 buf->f_files = sc.sc_dinodes + sc.sc_free;
420 buf->f_ffree = sc.sc_free;
421 buf->f_namelen = GFS2_FNAMESIZE;
422
423 return 0;
424}
425
426/**
427 * gfs2_remount_fs - called when the FS is remounted
428 * @sb: the filesystem
429 * @flags: the remount flags
430 * @data: extra data passed in (not used right now)
431 *
432 * Returns: errno
433 */
434
435static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
436{
437 struct gfs2_sbd *sdp = sb->s_fs_info;
438 struct gfs2_args args = sdp->sd_args; /* Default to current settings */
439 int error;
440
441 error = gfs2_mount_args(sdp, &args, data);
442 if (error)
443 return error;
444
445 /* Not allowed to change locking details */
446 if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
447 strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
448 strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
449 return -EINVAL;
450
451 /* Some flags must not be changed */
452 if (args_neq(&args, &sdp->sd_args, spectator) ||
453 args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
454 args_neq(&args, &sdp->sd_args, localflocks) ||
455 args_neq(&args, &sdp->sd_args, localcaching) ||
456 args_neq(&args, &sdp->sd_args, meta))
457 return -EINVAL;
458
459 if (sdp->sd_args.ar_spectator)
460 *flags |= MS_RDONLY;
461
462 if ((sb->s_flags ^ *flags) & MS_RDONLY) {
463 if (*flags & MS_RDONLY)
464 error = gfs2_make_fs_ro(sdp);
465 else
466 error = gfs2_make_fs_rw(sdp);
467 if (error)
468 return error;
469 }
470
471 sdp->sd_args = args;
472 if (sdp->sd_args.ar_posix_acl)
473 sb->s_flags |= MS_POSIXACL;
474 else
475 sb->s_flags &= ~MS_POSIXACL;
476 return 0;
477}
478
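The args_neq() checks above rely on token pasting to build each member name, so every "must not change on remount" test stays a one-liner. A standalone model with a two-field stand-in for struct gfs2_args:

#include <stdio.h>

struct args {
        int ar_spectator;
        int ar_meta;
};

#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)

int main(void)
{
        struct args cur = { 0, 1 }, req = { 1, 1 };

        if (args_neq(&cur, &req, spectator))
                printf("spectator may not change across a remount\n");
        if (!args_neq(&cur, &req, meta))
                printf("meta is unchanged, so that check passes\n");
        return 0;
}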
479/**
480 * gfs2_drop_inode - Drop an inode (test for remote unlink)
481 * @inode: The inode to drop
482 *
483 * If we've received a callback on an iopen lock then it's because a
484 * remote node tried to deallocate the inode but failed due to this node
485 * still having the inode open. Here we mark the link count zero
486 * since we know that it must have reached zero if the GLF_DEMOTE flag
487 * is set on the iopen glock. If we didn't do a disk read since the
488 * remote node removed the final link then we might otherwise miss
489 * this event. This check ensures that this node will deallocate the
490 * inode's blocks, or alternatively pass the baton on to another
491 * node for later deallocation.
492 */
493
494static void gfs2_drop_inode(struct inode *inode)
495{
496 struct gfs2_inode *ip = GFS2_I(inode);
497
498 if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
499 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
500 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
501 clear_nlink(inode);
502 }
503 generic_drop_inode(inode);
504}
505
506/**
507 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
508 * @inode: The VFS inode
509 *
510 */
511
512static void gfs2_clear_inode(struct inode *inode)
513{
514 struct gfs2_inode *ip = GFS2_I(inode);
515
516	/* This tells us it's a "real" inode and not one which only
517 * serves to contain an address space (see rgrp.c, meta_io.c)
518 * which therefore doesn't have its own glocks.
519 */
520 if (test_bit(GIF_USER, &ip->i_flags)) {
521 ip->i_gl->gl_object = NULL;
522 gfs2_glock_put(ip->i_gl);
523 ip->i_gl = NULL;
524 if (ip->i_iopen_gh.gh_gl) {
525 ip->i_iopen_gh.gh_gl->gl_object = NULL;
526 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
527 }
528 }
529}
530
531static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
532{
533 do {
534 if (d1 == d2)
535 return 1;
536 d1 = d1->d_parent;
537 } while (!IS_ROOT(d1));
538 return 0;
539}
540
541/**
542 * gfs2_show_options - Show mount options for /proc/mounts
543 * @s: seq_file structure
544 * @mnt: vfsmount
545 *
546 * Returns: 0 on success or error code
547 */
548
549static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
550{
551 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
552 struct gfs2_args *args = &sdp->sd_args;
553
554 if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
555 seq_printf(s, ",meta");
556 if (args->ar_lockproto[0])
557 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
558 if (args->ar_locktable[0])
559 seq_printf(s, ",locktable=%s", args->ar_locktable);
560 if (args->ar_hostdata[0])
561 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
562 if (args->ar_spectator)
563 seq_printf(s, ",spectator");
564 if (args->ar_ignore_local_fs)
565 seq_printf(s, ",ignore_local_fs");
566 if (args->ar_localflocks)
567 seq_printf(s, ",localflocks");
568 if (args->ar_localcaching)
569 seq_printf(s, ",localcaching");
570 if (args->ar_debug)
571 seq_printf(s, ",debug");
572 if (args->ar_upgrade)
573 seq_printf(s, ",upgrade");
574 if (args->ar_posix_acl)
575 seq_printf(s, ",acl");
576 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
577 char *state;
578 switch (args->ar_quota) {
579 case GFS2_QUOTA_OFF:
580 state = "off";
581 break;
582 case GFS2_QUOTA_ACCOUNT:
583 state = "account";
584 break;
585 case GFS2_QUOTA_ON:
586 state = "on";
587 break;
588 default:
589 state = "unknown";
590 break;
591 }
592 seq_printf(s, ",quota=%s", state);
593 }
594 if (args->ar_suiddir)
595 seq_printf(s, ",suiddir");
596 if (args->ar_data != GFS2_DATA_DEFAULT) {
597 char *state;
598 switch (args->ar_data) {
599 case GFS2_DATA_WRITEBACK:
600 state = "writeback";
601 break;
602 case GFS2_DATA_ORDERED:
603 state = "ordered";
604 break;
605 default:
606 state = "unknown";
607 break;
608 }
609 seq_printf(s, ",data=%s", state);
610 }
611 if (args->ar_discard)
612 seq_printf(s, ",discard");
613
614 return 0;
615}
616
617/*
618 * We have to (at the moment) hold the inode's main lock to cover
619 * the gap between unlocking the shared lock on the iopen lock and
620 * taking the exclusive lock. I'd rather do a shared -> exclusive
621 * conversion on the iopen lock, but we can change that later. This
622 * is safe, just less efficient.
623 */
624
625static void gfs2_delete_inode(struct inode *inode)
626{
627 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
628 struct gfs2_inode *ip = GFS2_I(inode);
629 struct gfs2_holder gh;
630 int error;
631
632 if (!test_bit(GIF_USER, &ip->i_flags))
633 goto out;
634
635 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
636 if (unlikely(error)) {
637 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
638 goto out;
639 }
640
641 gfs2_glock_dq_wait(&ip->i_iopen_gh);
642 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
643 error = gfs2_glock_nq(&ip->i_iopen_gh);
644 if (error)
645 goto out_truncate;
646
647 if (S_ISDIR(inode->i_mode) &&
648 (ip->i_diskflags & GFS2_DIF_EXHASH)) {
649 error = gfs2_dir_exhash_dealloc(ip);
650 if (error)
651 goto out_unlock;
652 }
653
654 if (ip->i_eattr) {
655 error = gfs2_ea_dealloc(ip);
656 if (error)
657 goto out_unlock;
658 }
659
660 if (!gfs2_is_stuffed(ip)) {
661 error = gfs2_file_dealloc(ip);
662 if (error)
663 goto out_unlock;
664 }
665
666 error = gfs2_dinode_dealloc(ip);
667 if (error)
668 goto out_unlock;
669
670out_truncate:
671 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
672 if (error)
673 goto out_unlock;
674 /* Needs to be done before glock release & also in a transaction */
675 truncate_inode_pages(&inode->i_data, 0);
676 gfs2_trans_end(sdp);
677
678out_unlock:
679 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
680 gfs2_glock_dq(&ip->i_iopen_gh);
681 gfs2_holder_uninit(&ip->i_iopen_gh);
682 gfs2_glock_dq_uninit(&gh);
683 if (error && error != GLR_TRYFAILED)
684 fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
685out:
686 truncate_inode_pages(&inode->i_data, 0);
687 clear_inode(inode);
688}
689
690static struct inode *gfs2_alloc_inode(struct super_block *sb)
691{
692 struct gfs2_inode *ip;
693
694 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
695 if (ip) {
696 ip->i_flags = 0;
697 ip->i_gl = NULL;
698 }
699 return &ip->i_inode;
700}
701
702static void gfs2_destroy_inode(struct inode *inode)
703{
704 kmem_cache_free(gfs2_inode_cachep, inode);
705}
706
707const struct super_operations gfs2_super_ops = {
708 .alloc_inode = gfs2_alloc_inode,
709 .destroy_inode = gfs2_destroy_inode,
710 .write_inode = gfs2_write_inode,
711 .delete_inode = gfs2_delete_inode,
712 .put_super = gfs2_put_super,
713 .write_super = gfs2_write_super,
714 .sync_fs = gfs2_sync_fs,
715 .freeze_fs = gfs2_freeze,
716 .unfreeze_fs = gfs2_unfreeze,
717 .statfs = gfs2_statfs,
718 .remount_fs = gfs2_remount_fs,
719 .clear_inode = gfs2_clear_inode,
720 .drop_inode = gfs2_drop_inode,
721 .show_options = gfs2_show_options,
722};
723
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 152e6c4a0dca..2e9b9326bfc9 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -60,7 +60,6 @@
60#include "super.h" 60#include "super.h"
61#include "trans.h" 61#include "trans.h"
62#include "inode.h" 62#include "inode.h"
63#include "ops_address.h"
64#include "util.h" 63#include "util.h"
65 64
66#define QUOTA_USER 1 65#define QUOTA_USER 1
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 247e8f7d6b3d..59d2695509d3 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -13,8 +13,7 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/kthread.h> 16#include <linux/slow-work.h>
17#include <linux/freezer.h>
18 17
19#include "gfs2.h" 18#include "gfs2.h"
20#include "incore.h" 19#include "incore.h"
@@ -441,18 +440,25 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
441 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); 440 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
442} 441}
443 442
444/** 443static int gfs2_recover_get_ref(struct slow_work *work)
445 * gfs2_recover_journal - recover a given journal 444{
446 * @jd: the struct gfs2_jdesc describing the journal 445 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
447 * 446 if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
448 * Acquire the journal's lock, check to see if the journal is clean, and 447 return -EBUSY;
449 * do recovery if necessary. 448 return 0;
450 * 449}
451 * Returns: errno
452 */
453 450
454int gfs2_recover_journal(struct gfs2_jdesc *jd) 451static void gfs2_recover_put_ref(struct slow_work *work)
452{
453 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
454 clear_bit(JDF_RECOVERY, &jd->jd_flags);
455 smp_mb__after_clear_bit();
456 wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
457}
458
459static void gfs2_recover_work(struct slow_work *work)
455{ 460{
461 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
456 struct gfs2_inode *ip = GFS2_I(jd->jd_inode); 462 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
457 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); 463 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
458 struct gfs2_log_header_host head; 464 struct gfs2_log_header_host head;
@@ -569,7 +575,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
569 gfs2_glock_dq_uninit(&j_gh); 575 gfs2_glock_dq_uninit(&j_gh);
570 576
571 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); 577 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
572 return 0; 578 return;
573 579
574fail_gunlock_tr: 580fail_gunlock_tr:
575 gfs2_glock_dq_uninit(&t_gh); 581 gfs2_glock_dq_uninit(&t_gh);
@@ -584,70 +590,28 @@ fail_gunlock_j:
584 590
585fail: 591fail:
586 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); 592 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
587 return error;
588} 593}
589 594
590static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) 595struct slow_work_ops gfs2_recover_ops = {
591{ 596 .get_ref = gfs2_recover_get_ref,
592 struct gfs2_jdesc *jd; 597 .put_ref = gfs2_recover_put_ref,
593 int found = 0; 598 .execute = gfs2_recover_work,
594 599};
595 spin_lock(&sdp->sd_jindex_spin);
596 600
597 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
598 if (jd->jd_dirty) {
599 jd->jd_dirty = 0;
600 found = 1;
601 break;
602 }
603 }
604 spin_unlock(&sdp->sd_jindex_spin);
605
606 if (!found)
607 jd = NULL;
608 601
609 return jd; 602static int gfs2_recovery_wait(void *word)
610}
611
612/**
613 * gfs2_check_journals - Recover any dirty journals
614 * @sdp: the filesystem
615 *
616 */
617
618static void gfs2_check_journals(struct gfs2_sbd *sdp)
619{ 603{
620 struct gfs2_jdesc *jd; 604 schedule();
621 605 return 0;
622 for (;;) {
623 jd = gfs2_jdesc_find_dirty(sdp);
624 if (!jd)
625 break;
626
627 if (jd != sdp->sd_jdesc)
628 gfs2_recover_journal(jd);
629 }
630} 606}
631 607
632/** 608int gfs2_recover_journal(struct gfs2_jdesc *jd)
633 * gfs2_recoverd - Recover dead machine's journals
634 * @sdp: Pointer to GFS2 superblock
635 *
636 */
637
638int gfs2_recoverd(void *data)
639{ 609{
640 struct gfs2_sbd *sdp = data; 610 int rv;
641 unsigned long t; 611 rv = slow_work_enqueue(&jd->jd_work);
642 612 if (rv)
643 while (!kthread_should_stop()) { 613 return rv;
644 gfs2_check_journals(sdp); 614 wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE);
645 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
646 if (freezing(current))
647 refrigerator();
648 schedule_timeout_interruptible(t);
649 }
650
651 return 0; 615 return 0;
652} 616}
653 617
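The hunk above retires the recoverd kthread: recovery becomes a slow-work item whose get_ref doubles as an "already queued" guard and whose put_ref wakes anyone sleeping in gfs2_recover_journal(). A sketch of the same pattern for a generic item, assuming the 2.6.30-era slow-work API; the my_* names are illustrative:

#include <linux/slow-work.h>
#include <linux/sched.h>
#include <linux/wait.h>

#define MYF_BUSY 0

struct my_item {
        unsigned long flags;            /* bit MYF_BUSY: work in flight */
        struct slow_work work;          /* set up with slow_work_init() */
};

static int my_get_ref(struct slow_work *work)
{
        struct my_item *it = container_of(work, struct my_item, work);
        /* refuse a second enqueue while the first is still running */
        return test_and_set_bit(MYF_BUSY, &it->flags) ? -EBUSY : 0;
}

static void my_put_ref(struct slow_work *work)
{
        struct my_item *it = container_of(work, struct my_item, work);
        clear_bit(MYF_BUSY, &it->flags);
        smp_mb__after_clear_bit();
        wake_up_bit(&it->flags, MYF_BUSY);
}

static void my_execute(struct slow_work *work)
{
        /* the long-running part runs here, in process context */
}

static struct slow_work_ops my_ops = {
        .get_ref = my_get_ref,
        .put_ref = my_put_ref,
        .execute = my_execute,
};

static int my_wait(void *word)
{
        schedule();
        return 0;
}

/* enqueue and wait, as gfs2_recover_journal() now does; assumes
 * slow_work_init(&it->work, &my_ops) already ran */
static int my_run(struct my_item *it)
{
        int rv = slow_work_enqueue(&it->work);
        if (rv)
                return rv;
        wait_on_bit(&it->flags, MYF_BUSY, my_wait, TASK_UNINTERRUPTIBLE);
        return 0;
}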
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index a8218ea15b57..1616ac22569a 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -28,7 +28,7 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
28extern int gfs2_find_jhead(struct gfs2_jdesc *jd, 28extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
29 struct gfs2_log_header_host *head); 29 struct gfs2_log_header_host *head);
30extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); 30extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
31extern int gfs2_recoverd(void *data); 31extern struct slow_work_ops gfs2_recover_ops;
32 32
33#endif /* __RECOVERY_DOT_H__ */ 33#endif /* __RECOVERY_DOT_H__ */
34 34
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 565038243fa2..fba795798d3a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -29,7 +29,7 @@
29#include "util.h" 29#include "util.h"
30#include "log.h" 30#include "log.h"
31#include "inode.h" 31#include "inode.h"
32#include "ops_address.h" 32#include "trace_gfs2.h"
33 33
34#define BFITNOENT ((u32)~0) 34#define BFITNOENT ((u32)~0)
35#define NO_BLOCK ((u64)~0) 35#define NO_BLOCK ((u64)~0)
@@ -285,27 +285,19 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
285 } 285 }
286 286
287 tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; 287 tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
288 if (count[1] + count[2] != tmp) { 288 if (count[1] != tmp) {
289 if (gfs2_consist_rgrpd(rgd)) 289 if (gfs2_consist_rgrpd(rgd))
290 fs_err(sdp, "used data mismatch: %u != %u\n", 290 fs_err(sdp, "used data mismatch: %u != %u\n",
291 count[1], tmp); 291 count[1], tmp);
292 return; 292 return;
293 } 293 }
294 294
295 if (count[3] != rgd->rd_dinodes) { 295 if (count[2] + count[3] != rgd->rd_dinodes) {
296 if (gfs2_consist_rgrpd(rgd)) 296 if (gfs2_consist_rgrpd(rgd))
297 fs_err(sdp, "used metadata mismatch: %u != %u\n", 297 fs_err(sdp, "used metadata mismatch: %u != %u\n",
298 count[3], rgd->rd_dinodes); 298 count[2] + count[3], rgd->rd_dinodes);
299 return; 299 return;
300 } 300 }
301
302 if (count[2] > count[3]) {
303 if (gfs2_consist_rgrpd(rgd))
304 fs_err(sdp, "unlinked inodes > inodes: %u\n",
305 count[2]);
306 return;
307 }
308
309} 301}
310 302
311static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) 303static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
@@ -442,6 +434,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
442 for (x = 0; x < length; x++) { 434 for (x = 0; x < length; x++) {
443 bi = rgd->rd_bits + x; 435 bi = rgd->rd_bits + x;
444 436
437 bi->bi_flags = 0;
445 /* small rgrp; bitmap stored completely in header block */ 438 /* small rgrp; bitmap stored completely in header block */
446 if (length == 1) { 439 if (length == 1) {
447 bytes = bytes_left; 440 bytes = bytes_left;
@@ -580,7 +573,6 @@ static int read_rindex_entry(struct gfs2_inode *ip,
580 573
581 rgd->rd_gl->gl_object = rgd; 574 rgd->rd_gl->gl_object = rgd;
582 rgd->rd_flags &= ~GFS2_RDF_UPTODATE; 575 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
583 rgd->rd_flags |= GFS2_RDF_CHECK;
584 return error; 576 return error;
585} 577}
586 578
@@ -701,10 +693,9 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
701 u32 rg_flags; 693 u32 rg_flags;
702 694
703 rg_flags = be32_to_cpu(str->rg_flags); 695 rg_flags = be32_to_cpu(str->rg_flags);
704 if (rg_flags & GFS2_RGF_NOALLOC) 696 rg_flags &= ~GFS2_RDF_MASK;
705 rgd->rd_flags |= GFS2_RDF_NOALLOC; 697 rgd->rd_flags &= GFS2_RDF_MASK;
706 else 698 rgd->rd_flags |= rg_flags;
707 rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
708 rgd->rd_free = be32_to_cpu(str->rg_free); 699 rgd->rd_free = be32_to_cpu(str->rg_free);
709 rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); 700 rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
710 rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); 701 rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
@@ -713,11 +704,8 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
713static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) 704static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
714{ 705{
715 struct gfs2_rgrp *str = buf; 706 struct gfs2_rgrp *str = buf;
716 u32 rg_flags = 0;
717 707
718 if (rgd->rd_flags & GFS2_RDF_NOALLOC) 708 str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
719 rg_flags |= GFS2_RGF_NOALLOC;
720 str->rg_flags = cpu_to_be32(rg_flags);
721 str->rg_free = cpu_to_be32(rgd->rd_free); 709 str->rg_free = cpu_to_be32(rgd->rd_free);
722 str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); 710 str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
723 str->__pad = cpu_to_be32(0); 711 str->__pad = cpu_to_be32(0);
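The rewritten gfs2_rgrp_in()/gfs2_rgrp_out() pair above splits rd_flags behind one mask: in-core-only bits survive a re-read and never reach the media on write-out. A standalone model of the round trip; the HOST_ONLY value is made up for the demo and plays the role of GFS2_RDF_MASK:

#include <stdio.h>

#define HOST_ONLY 0xf0000000u   /* made-up: bits that never hit disk */

int main(void)
{
        unsigned int rd_flags = 0x10000002;     /* one host bit, one disk bit */
        unsigned int rg_flags = 0x00000004;     /* freshly read from disk */

        /* "in": distrust host bits from disk, keep our own, merge */
        rg_flags &= ~HOST_ONLY;
        rd_flags &= HOST_ONLY;
        rd_flags |= rg_flags;
        printf("in-core: 0x%08x\n", rd_flags);  /* 0x10000004 */

        /* "out": strip host bits before writing back */
        printf("on-disk: 0x%08x\n", rd_flags & ~HOST_ONLY);
        return 0;
}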
@@ -775,8 +763,10 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
775 } 763 }
776 764
777 if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { 765 if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
766 for (x = 0; x < length; x++)
767 clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
778 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); 768 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
779 rgd->rd_flags |= GFS2_RDF_UPTODATE; 769 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
780 } 770 }
781 771
782 spin_lock(&sdp->sd_rindex_spin); 772 spin_lock(&sdp->sd_rindex_spin);
@@ -845,7 +835,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
845 struct super_block *sb = sdp->sd_vfs; 835 struct super_block *sb = sdp->sd_vfs;
846 struct block_device *bdev = sb->s_bdev; 836 struct block_device *bdev = sb->s_bdev;
847 const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / 837 const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize /
848 bdev_hardsect_size(sb->s_bdev); 838 bdev_logical_block_size(sb->s_bdev);
849 u64 blk; 839 u64 blk;
850 sector_t start = 0; 840 sector_t start = 0;
851 sector_t nr_sects = 0; 841 sector_t nr_sects = 0;
@@ -903,6 +893,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
903 continue; 893 continue;
904 if (sdp->sd_args.ar_discard) 894 if (sdp->sd_args.ar_discard)
905 gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); 895 gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi);
896 clear_bit(GBF_FULL, &bi->bi_flags);
906 memcpy(bi->bi_clone + bi->bi_offset, 897 memcpy(bi->bi_clone + bi->bi_offset,
907 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); 898 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
908 } 899 }
@@ -942,7 +933,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
942 struct gfs2_sbd *sdp = rgd->rd_sbd; 933 struct gfs2_sbd *sdp = rgd->rd_sbd;
943 int ret = 0; 934 int ret = 0;
944 935
945 if (rgd->rd_flags & GFS2_RDF_NOALLOC) 936 if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
946 return 0; 937 return 0;
947 938
948 spin_lock(&sdp->sd_rindex_spin); 939 spin_lock(&sdp->sd_rindex_spin);
@@ -962,7 +953,8 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
962 * Returns: The inode, if one has been found 953 * Returns: The inode, if one has been found
963 */ 954 */
964 955
965static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) 956static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
957 u64 skip)
966{ 958{
967 struct inode *inode; 959 struct inode *inode;
968 u32 goal = 0, block; 960 u32 goal = 0, block;
@@ -986,6 +978,8 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
986 goal++; 978 goal++;
987 if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked) 979 if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked)
988 continue; 980 continue;
981 if (no_addr == skip)
982 continue;
989 *last_unlinked = no_addr; 983 *last_unlinked = no_addr;
990 inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, 984 inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
991 no_addr, -1, 1); 985 no_addr, -1, 1);
@@ -1105,7 +1099,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1105 if (try_rgrp_fit(rgd, al)) 1099 if (try_rgrp_fit(rgd, al))
1106 goto out; 1100 goto out;
1107 if (rgd->rd_flags & GFS2_RDF_CHECK) 1101 if (rgd->rd_flags & GFS2_RDF_CHECK)
1108 inode = try_rgrp_unlink(rgd, last_unlinked); 1102 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1109 if (!rg_locked) 1103 if (!rg_locked)
1110 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1104 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1111 if (inode) 1105 if (inode)
@@ -1139,7 +1133,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1139 if (try_rgrp_fit(rgd, al)) 1133 if (try_rgrp_fit(rgd, al))
1140 goto out; 1134 goto out;
1141 if (rgd->rd_flags & GFS2_RDF_CHECK) 1135 if (rgd->rd_flags & GFS2_RDF_CHECK)
1142 inode = try_rgrp_unlink(rgd, last_unlinked); 1136 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1143 if (!rg_locked) 1137 if (!rg_locked)
1144 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1138 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1145 if (inode) 1139 if (inode)
@@ -1315,30 +1309,37 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1315{ 1309{
1316 struct gfs2_bitmap *bi = NULL; 1310 struct gfs2_bitmap *bi = NULL;
1317 const u32 length = rgd->rd_length; 1311 const u32 length = rgd->rd_length;
1318 u32 blk = 0; 1312 u32 blk = BFITNOENT;
1319 unsigned int buf, x; 1313 unsigned int buf, x;
1320 const unsigned int elen = *n; 1314 const unsigned int elen = *n;
1321 const u8 *buffer; 1315 const u8 *buffer = NULL;
1322 1316
1323 *n = 0; 1317 *n = 0;
1324 /* Find bitmap block that contains bits for goal block */ 1318 /* Find bitmap block that contains bits for goal block */
1325 for (buf = 0; buf < length; buf++) { 1319 for (buf = 0; buf < length; buf++) {
1326 bi = rgd->rd_bits + buf; 1320 bi = rgd->rd_bits + buf;
1327 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) 1321 /* Convert scope of "goal" from rgrp-wide to within found bit block */
1328 break; 1322 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
1323 goal -= bi->bi_start * GFS2_NBBY;
1324 goto do_search;
1325 }
1329 } 1326 }
1327 buf = 0;
1328 goal = 0;
1330 1329
1331 gfs2_assert(rgd->rd_sbd, buf < length); 1330do_search:
1332
1333 /* Convert scope of "goal" from rgrp-wide to within found bit block */
1334 goal -= bi->bi_start * GFS2_NBBY;
1335
1336 /* Search (up to entire) bitmap in this rgrp for allocatable block. 1331 /* Search (up to entire) bitmap in this rgrp for allocatable block.
1337 "x <= length", instead of "x < length", because we typically start 1332 "x <= length", instead of "x < length", because we typically start
1338 the search in the middle of a bit block, but if we can't find an 1333 the search in the middle of a bit block, but if we can't find an
1339 allocatable block anywhere else, we want to be able wrap around and 1334 allocatable block anywhere else, we want to be able wrap around and
1340 search in the first part of our first-searched bit block. */ 1335 search in the first part of our first-searched bit block. */
1341 for (x = 0; x <= length; x++) { 1336 for (x = 0; x <= length; x++) {
1337 bi = rgd->rd_bits + buf;
1338
1339 if (test_bit(GBF_FULL, &bi->bi_flags) &&
1340 (old_state == GFS2_BLKST_FREE))
1341 goto skip;
1342
1342 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone 1343 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
1343 bitmaps, so we must search the originals for that. */ 1344 bitmaps, so we must search the originals for that. */
1344 buffer = bi->bi_bh->b_data + bi->bi_offset; 1345 buffer = bi->bi_bh->b_data + bi->bi_offset;
@@ -1349,33 +1350,39 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1349 if (blk != BFITNOENT) 1350 if (blk != BFITNOENT)
1350 break; 1351 break;
1351 1352
1353 if ((goal == 0) && (old_state == GFS2_BLKST_FREE))
1354 set_bit(GBF_FULL, &bi->bi_flags);
1355
1352 /* Try next bitmap block (wrap back to rgrp header if at end) */ 1356 /* Try next bitmap block (wrap back to rgrp header if at end) */
1353 buf = (buf + 1) % length; 1357skip:
1354 bi = rgd->rd_bits + buf; 1358 buf++;
1359 buf %= length;
1355 goal = 0; 1360 goal = 0;
1356 } 1361 }
1357 1362
1358 if (blk != BFITNOENT && old_state != new_state) { 1363 if (blk == BFITNOENT)
1359 *n = 1; 1364 return blk;
1360 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); 1365 *n = 1;
1366 if (old_state == new_state)
1367 goto out;
1368
1369 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1370 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
1371 bi->bi_len, blk, new_state);
1372 goal = blk;
1373 while (*n < elen) {
1374 goal++;
1375 if (goal >= (bi->bi_len * GFS2_NBBY))
1376 break;
1377 if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
1378 GFS2_BLKST_FREE)
1379 break;
1361 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, 1380 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
1362 bi->bi_len, blk, new_state); 1381 bi->bi_len, goal, new_state);
1363 goal = blk; 1382 (*n)++;
1364 while (*n < elen) {
1365 goal++;
1366 if (goal >= (bi->bi_len * GFS2_NBBY))
1367 break;
1368 if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
1369 GFS2_BLKST_FREE)
1370 break;
1371 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone,
1372 bi->bi_offset, bi->bi_len, goal,
1373 new_state);
1374 (*n)++;
1375 }
1376 } 1383 }
1377 1384out:
1378 return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk; 1385 return (bi->bi_start * GFS2_NBBY) + blk;
1379} 1386}
1380 1387
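Two behaviours drive the restructured rgblk_search() loop above: bitmap blocks already known to hold no free blocks are skipped via GBF_FULL, and when the goal's block is exhausted the scan wraps to block 0. The circular-skip shape in isolation, with booleans standing in for the full bits:

#include <stdbool.h>
#include <stdio.h>

#define NBLOCKS 4

static int find_slot(const bool full[NBLOCKS], int buf)
{
        int x;
        /* x <= NBLOCKS mirrors the kernel's "x <= length" wrap pass */
        for (x = 0; x <= NBLOCKS; x++) {
                if (!full[buf])
                        return buf;     /* candidate bitmap block */
                buf = (buf + 1) % NBLOCKS;
        }
        return -1;                      /* every block marked full */
}

int main(void)
{
        bool full[NBLOCKS] = { true, true, false, true };
        printf("search from 3 -> block %d\n", find_slot(full, 3));
        return 0;
}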
1381/** 1388/**
@@ -1435,13 +1442,33 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
1435} 1442}
1436 1443
1437/** 1444/**
1438 * gfs2_alloc_block - Allocate a block 1445 * gfs2_rgrp_dump - print out an rgrp
1446 * @seq: The iterator
1447 * @gl: The glock in question
1448 *
1449 */
1450
1451int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
1452{
1453 const struct gfs2_rgrpd *rgd = gl->gl_object;
1454 if (rgd == NULL)
1455 return 0;
1456 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
1457 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
1458 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
1459 return 0;
1460}
1461
1462/**
1463 * gfs2_alloc_block - Allocate one or more blocks
1439 * @ip: the inode to allocate the block for 1464 * @ip: the inode to allocate the block for
1465 * @bn: Used to return the starting block number
1466 * @n: requested number of blocks/extent length (value/result)
1440 * 1467 *
1441 * Returns: the allocated block 1468 * Returns: 0 or error
1442 */ 1469 */
1443 1470
1444u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) 1471int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
1445{ 1472{
1446 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1473 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1447 struct buffer_head *dibh; 1474 struct buffer_head *dibh;
@@ -1457,7 +1484,10 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
1457 goal = rgd->rd_last_alloc; 1484 goal = rgd->rd_last_alloc;
1458 1485
1459 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n); 1486 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n);
1460 BUG_ON(blk == BFITNOENT); 1487
1488 /* Since all blocks are reserved in advance, this shouldn't happen */
1489 if (blk == BFITNOENT)
1490 goto rgrp_error;
1461 1491
1462 rgd->rd_last_alloc = blk; 1492 rgd->rd_last_alloc = blk;
1463 block = rgd->rd_data0 + blk; 1493 block = rgd->rd_data0 + blk;
@@ -1469,7 +1499,9 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
1469 di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); 1499 di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal);
1470 brelse(dibh); 1500 brelse(dibh);
1471 } 1501 }
1472 gfs2_assert_withdraw(sdp, rgd->rd_free >= *n); 1502 if (rgd->rd_free < *n)
1503 goto rgrp_error;
1504
1473 rgd->rd_free -= *n; 1505 rgd->rd_free -= *n;
1474 1506
1475 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1507 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1483,8 +1515,17 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
1483 spin_lock(&sdp->sd_rindex_spin); 1515 spin_lock(&sdp->sd_rindex_spin);
1484 rgd->rd_free_clone -= *n; 1516 rgd->rd_free_clone -= *n;
1485 spin_unlock(&sdp->sd_rindex_spin); 1517 spin_unlock(&sdp->sd_rindex_spin);
1518 trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED);
1519 *bn = block;
1520 return 0;
1486 1521
1487 return block; 1522rgrp_error:
1523 fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
1524 (unsigned long long)rgd->rd_addr);
1525 fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
1526 gfs2_rgrp_dump(NULL, rgd->rd_gl);
1527 rgd->rd_flags |= GFS2_RDF_ERROR;
1528 return -EIO;
1488} 1529}
1489 1530
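Alongside the rgrp_error path, the hunk above changes gfs2_alloc_block()'s calling convention: the block number now travels through an out parameter and the return value is 0 or a negative errno, so a failure can no longer be mistaken for a valid 64-bit block address. A userspace model of the new shape; the numbers are made up:

#include <errno.h>
#include <stdio.h>

static int alloc_block(unsigned long long *bn, unsigned int *n)
{
        if (*n == 0)
                return -EINVAL;
        if (*n > 4)
                *n = 4;         /* value/result: extent trimmed to fit */
        *bn = 12345;            /* made-up starting block */
        return 0;
}

int main(void)
{
        unsigned long long bn;
        unsigned int n = 8;

        if (alloc_block(&bn, &n) == 0)
                printf("allocated %u blocks at %llu\n", n, bn);
        return 0;
}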
1490/** 1531/**
@@ -1526,7 +1567,7 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1526 spin_lock(&sdp->sd_rindex_spin); 1567 spin_lock(&sdp->sd_rindex_spin);
1527 rgd->rd_free_clone--; 1568 rgd->rd_free_clone--;
1528 spin_unlock(&sdp->sd_rindex_spin); 1569 spin_unlock(&sdp->sd_rindex_spin);
1529 1570 trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
1530 return block; 1571 return block;
1531} 1572}
1532 1573
@@ -1546,7 +1587,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1546 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); 1587 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1547 if (!rgd) 1588 if (!rgd)
1548 return; 1589 return;
1549 1590 trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE);
1550 rgd->rd_free += blen; 1591 rgd->rd_free += blen;
1551 1592
1552 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1593 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1574,7 +1615,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1574 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); 1615 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1575 if (!rgd) 1616 if (!rgd)
1576 return; 1617 return;
1577 1618 trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE);
1578 rgd->rd_free += blen; 1619 rgd->rd_free += blen;
1579 1620
1580 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1621 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1597,6 +1638,7 @@ void gfs2_unlink_di(struct inode *inode)
1597 rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); 1638 rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
1598 if (!rgd) 1639 if (!rgd)
1599 return; 1640 return;
1641 trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED);
1600 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1642 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1601 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1643 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1602 gfs2_trans_add_rg(rgd); 1644 gfs2_trans_add_rg(rgd);
@@ -1628,6 +1670,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
1628void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) 1670void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1629{ 1671{
1630 gfs2_free_uninit_di(rgd, ip->i_no_addr); 1672 gfs2_free_uninit_di(rgd, ip->i_no_addr);
1673 trace_gfs2_block_alloc(ip, ip->i_no_addr, 1, GFS2_BLKST_FREE);
1631 gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid); 1674 gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
1632 gfs2_meta_wipe(ip, ip->i_no_addr, 1); 1675 gfs2_meta_wipe(ip, ip->i_no_addr, 1);
1633} 1676}
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 3181c7e624bf..1e76ff0f3e00 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -14,22 +14,22 @@ struct gfs2_rgrpd;
14struct gfs2_sbd; 14struct gfs2_sbd;
15struct gfs2_holder; 15struct gfs2_holder;
16 16
17void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); 17extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
18 18
19struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); 19struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
20struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); 20struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
21struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); 21struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
22 22
23void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); 23extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
24int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); 24extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
25 25
26int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); 26extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
27void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); 27extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
28void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); 28extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
29 29
30void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); 30extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
31 31
32struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); 32extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
33static inline void gfs2_alloc_put(struct gfs2_inode *ip) 33static inline void gfs2_alloc_put(struct gfs2_inode *ip)
34{ 34{
35 BUG_ON(ip->i_alloc == NULL); 35 BUG_ON(ip->i_alloc == NULL);
@@ -37,22 +37,22 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
37 ip->i_alloc = NULL; 37 ip->i_alloc = NULL;
38} 38}
39 39
40int gfs2_inplace_reserve_i(struct gfs2_inode *ip, 40extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file,
41 char *file, unsigned int line); 41 unsigned int line);
42#define gfs2_inplace_reserve(ip) \ 42#define gfs2_inplace_reserve(ip) \
43gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) 43gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
44 44
45void gfs2_inplace_release(struct gfs2_inode *ip); 45extern void gfs2_inplace_release(struct gfs2_inode *ip);
46 46
47unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); 47extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
48 48
49u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n); 49extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
50u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); 50extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
51 51
52void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); 52extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
53void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); 53extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
54void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); 54extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
55void gfs2_unlink_di(struct inode *inode); 55extern void gfs2_unlink_di(struct inode *inode);
56 56
57struct gfs2_rgrp_list { 57struct gfs2_rgrp_list {
58 unsigned int rl_rgrps; 58 unsigned int rl_rgrps;
@@ -61,10 +61,11 @@ struct gfs2_rgrp_list {
61 struct gfs2_holder *rl_ghs; 61 struct gfs2_holder *rl_ghs;
62}; 62};
63 63
64void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, 64extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
65 u64 block); 65 u64 block);
66void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); 66extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
67void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); 67extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
68u64 gfs2_ri_total(struct gfs2_sbd *sdp); 68extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
69extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
69 70
70#endif /* __RGRP_DOT_H__ */ 71#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 601913e0a482..f522bb017973 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -7,14 +7,20 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/bio.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/slab.h> 12#include <linux/slab.h>
12#include <linux/spinlock.h> 13#include <linux/spinlock.h>
13#include <linux/completion.h> 14#include <linux/completion.h>
14#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
15#include <linux/crc32.h> 16#include <linux/statfs.h>
17#include <linux/seq_file.h>
18#include <linux/mount.h>
19#include <linux/kthread.h>
20#include <linux/delay.h>
16#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
17#include <linux/bio.h> 22#include <linux/crc32.h>
23#include <linux/time.h>
18 24
19#include "gfs2.h" 25#include "gfs2.h"
20#include "incore.h" 26#include "incore.h"
@@ -31,6 +37,183 @@
31#include "super.h" 37#include "super.h"
32#include "trans.h" 38#include "trans.h"
33#include "util.h" 39#include "util.h"
40#include "sys.h"
41#include "eattr.h"
42
43#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
44
45enum {
46 Opt_lockproto,
47 Opt_locktable,
48 Opt_hostdata,
49 Opt_spectator,
50 Opt_ignore_local_fs,
51 Opt_localflocks,
52 Opt_localcaching,
53 Opt_debug,
54 Opt_nodebug,
55 Opt_upgrade,
56 Opt_acl,
57 Opt_noacl,
58 Opt_quota_off,
59 Opt_quota_account,
60 Opt_quota_on,
61 Opt_quota,
62 Opt_noquota,
63 Opt_suiddir,
64 Opt_nosuiddir,
65 Opt_data_writeback,
66 Opt_data_ordered,
67 Opt_meta,
68 Opt_discard,
69 Opt_nodiscard,
70 Opt_commit,
71 Opt_error,
72};
73
74static const match_table_t tokens = {
75 {Opt_lockproto, "lockproto=%s"},
76 {Opt_locktable, "locktable=%s"},
77 {Opt_hostdata, "hostdata=%s"},
78 {Opt_spectator, "spectator"},
79 {Opt_ignore_local_fs, "ignore_local_fs"},
80 {Opt_localflocks, "localflocks"},
81 {Opt_localcaching, "localcaching"},
82 {Opt_debug, "debug"},
83 {Opt_nodebug, "nodebug"},
84 {Opt_upgrade, "upgrade"},
85 {Opt_acl, "acl"},
86 {Opt_noacl, "noacl"},
87 {Opt_quota_off, "quota=off"},
88 {Opt_quota_account, "quota=account"},
89 {Opt_quota_on, "quota=on"},
90 {Opt_quota, "quota"},
91 {Opt_noquota, "noquota"},
92 {Opt_suiddir, "suiddir"},
93 {Opt_nosuiddir, "nosuiddir"},
94 {Opt_data_writeback, "data=writeback"},
95 {Opt_data_ordered, "data=ordered"},
96 {Opt_meta, "meta"},
97 {Opt_discard, "discard"},
98 {Opt_nodiscard, "nodiscard"},
99 {Opt_commit, "commit=%d"},
100 {Opt_error, NULL}
101};
102
103/**
104 * gfs2_mount_args - Parse mount options
105 * @sdp: the filesystem
106 * @options: the mount options string to parse into @args
107 *
108 * Returns: errno
109 */
110
111int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
112{
113 char *o;
114 int token;
115 substring_t tmp[MAX_OPT_ARGS];
116 int rv;
117
118 /* Split the options into tokens with the "," character and
119 process them */
120
121 while (1) {
122 o = strsep(&options, ",");
123 if (o == NULL)
124 break;
125 if (*o == '\0')
126 continue;
127
128 token = match_token(o, tokens, tmp);
129 switch (token) {
130 case Opt_lockproto:
131 match_strlcpy(args->ar_lockproto, &tmp[0],
132 GFS2_LOCKNAME_LEN);
133 break;
134 case Opt_locktable:
135 match_strlcpy(args->ar_locktable, &tmp[0],
136 GFS2_LOCKNAME_LEN);
137 break;
138 case Opt_hostdata:
139 match_strlcpy(args->ar_hostdata, &tmp[0],
140 GFS2_LOCKNAME_LEN);
141 break;
142 case Opt_spectator:
143 args->ar_spectator = 1;
144 break;
145 case Opt_ignore_local_fs:
146 args->ar_ignore_local_fs = 1;
147 break;
148 case Opt_localflocks:
149 args->ar_localflocks = 1;
150 break;
151 case Opt_localcaching:
152 args->ar_localcaching = 1;
153 break;
154 case Opt_debug:
155 args->ar_debug = 1;
156 break;
157 case Opt_nodebug:
158 args->ar_debug = 0;
159 break;
160 case Opt_upgrade:
161 args->ar_upgrade = 1;
162 break;
163 case Opt_acl:
164 args->ar_posix_acl = 1;
165 break;
166 case Opt_noacl:
167 args->ar_posix_acl = 0;
168 break;
169 case Opt_quota_off:
170 case Opt_noquota:
171 args->ar_quota = GFS2_QUOTA_OFF;
172 break;
173 case Opt_quota_account:
174 args->ar_quota = GFS2_QUOTA_ACCOUNT;
175 break;
176 case Opt_quota_on:
177 case Opt_quota:
178 args->ar_quota = GFS2_QUOTA_ON;
179 break;
180 case Opt_suiddir:
181 args->ar_suiddir = 1;
182 break;
183 case Opt_nosuiddir:
184 args->ar_suiddir = 0;
185 break;
186 case Opt_data_writeback:
187 args->ar_data = GFS2_DATA_WRITEBACK;
188 break;
189 case Opt_data_ordered:
190 args->ar_data = GFS2_DATA_ORDERED;
191 break;
192 case Opt_meta:
193 args->ar_meta = 1;
194 break;
195 case Opt_discard:
196 args->ar_discard = 1;
197 break;
198 case Opt_nodiscard:
199 args->ar_discard = 0;
200 break;
201 case Opt_commit:
202 rv = match_int(&tmp[0], &args->ar_commit);
203 if (rv || args->ar_commit <= 0) {
204 fs_info(sdp, "commit mount option requires a positive numeric argument\n");
205 return rv ? rv : -EINVAL;
206 }
207 break;
208 case Opt_error:
209 default:
210 fs_info(sdp, "invalid mount option: %s\n", o);
211 return -EINVAL;
212 }
213 }
214
215 return 0;
216}
34 217
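gfs2_mount_args() above is a textbook strsep() loop: split on commas, skip empty tokens, dispatch per option. A userspace model of the tokenising half; in the kernel, match_token() and match_strlcpy() from <linux/parser.h> replace the strcmp() calls:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char opts[] = "lockproto=lock_dlm,,quota=on,debug";
        char *options = opts, *o;

        while ((o = strsep(&options, ",")) != NULL) {
                if (*o == '\0')
                        continue;       /* tolerate ",," runs */
                if (strncmp(o, "lockproto=", 10) == 0)
                        printf("lock protocol: %s\n", o + 10);
                else if (strcmp(o, "quota=on") == 0)
                        printf("quota enabled\n");
                else
                        printf("other option: %s\n", o);
        }
        return 0;
}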
35/** 218/**
36 * gfs2_jindex_free - Clear all the journal index information 219 * gfs2_jindex_free - Clear all the journal index information
@@ -170,7 +353,7 @@ fail:
170 return error; 353 return error;
171} 354}
172 355
173static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) 356void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
174{ 357{
175 const struct gfs2_statfs_change *str = buf; 358 const struct gfs2_statfs_change *str = buf;
176 359
@@ -258,6 +441,29 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
258 brelse(l_bh); 441 brelse(l_bh);
259} 442}
260 443
444void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
445 struct buffer_head *l_bh)
446{
447 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
448 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
449 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
450 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
451
452 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
453
454 spin_lock(&sdp->sd_statfs_spin);
455 m_sc->sc_total += l_sc->sc_total;
456 m_sc->sc_free += l_sc->sc_free;
457 m_sc->sc_dinodes += l_sc->sc_dinodes;
458 memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
459 memset(l_bh->b_data + sizeof(struct gfs2_dinode),
460 0, sizeof(struct gfs2_statfs_change));
461 spin_unlock(&sdp->sd_statfs_spin);
462
463 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
464 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
465}
466
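update_statfs() above factors out the fold of each node's local statfs deltas into the master record, zeroing the local copy once consumed. The arithmetic on its own, with the glock, transaction and buffer bookkeeping elided:

#include <stdio.h>

struct sc { long total, free, dinodes; };

static void fold_local_into_master(struct sc *m, struct sc *l)
{
        m->total   += l->total;
        m->free    += l->free;
        m->dinodes += l->dinodes;
        *l = (struct sc){ 0, 0, 0 };    /* local deltas consumed */
}

int main(void)
{
        struct sc master = { 1000, 400, 50 };
        struct sc local  = { 0, -3, 1 };        /* 3 blocks used, 1 dinode made */

        fold_local_into_master(&master, &local);
        printf("free=%ld dinodes=%ld\n", master.free, master.dinodes);
        return 0;
}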
261int gfs2_statfs_sync(struct gfs2_sbd *sdp) 467int gfs2_statfs_sync(struct gfs2_sbd *sdp)
262{ 468{
263 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); 469 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
@@ -294,19 +500,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
294 if (error) 500 if (error)
295 goto out_bh2; 501 goto out_bh2;
296 502
297 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); 503 update_statfs(sdp, m_bh, l_bh);
298
299 spin_lock(&sdp->sd_statfs_spin);
300 m_sc->sc_total += l_sc->sc_total;
301 m_sc->sc_free += l_sc->sc_free;
302 m_sc->sc_dinodes += l_sc->sc_dinodes;
303 memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
304 memset(l_bh->b_data + sizeof(struct gfs2_dinode),
305 0, sizeof(struct gfs2_statfs_change));
306 spin_unlock(&sdp->sd_statfs_spin);
307
308 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
309 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
310 504
311 gfs2_trans_end(sdp); 505 gfs2_trans_end(sdp);
312 506
@@ -436,3 +630,707 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
436 mutex_unlock(&sdp->sd_freeze_lock); 630 mutex_unlock(&sdp->sd_freeze_lock);
437} 631}
438 632
633
634/**
635 * gfs2_write_inode - Make sure the inode is stable on the disk
636 * @inode: The inode
637 * @sync: synchronous write flag
638 *
639 * Returns: errno
640 */
641
642static int gfs2_write_inode(struct inode *inode, int sync)
643{
644 struct gfs2_inode *ip = GFS2_I(inode);
645 struct gfs2_sbd *sdp = GFS2_SB(inode);
646 struct gfs2_holder gh;
647 struct buffer_head *bh;
648 struct timespec atime;
649 struct gfs2_dinode *di;
650 int ret = 0;
651
652 /* Check this is a "normal" inode, etc */
653 if (!test_bit(GIF_USER, &ip->i_flags) ||
654 (current->flags & PF_MEMALLOC))
655 return 0;
656 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
657 if (ret)
658 goto do_flush;
659 ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
660 if (ret)
661 goto do_unlock;
662 ret = gfs2_meta_inode_buffer(ip, &bh);
663 if (ret == 0) {
664 di = (struct gfs2_dinode *)bh->b_data;
665 atime.tv_sec = be64_to_cpu(di->di_atime);
666 atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
667 if (timespec_compare(&inode->i_atime, &atime) > 0) {
668 gfs2_trans_add_bh(ip->i_gl, bh, 1);
669 gfs2_dinode_out(ip, bh->b_data);
670 }
671 brelse(bh);
672 }
673 gfs2_trans_end(sdp);
674do_unlock:
675 gfs2_glock_dq_uninit(&gh);
676do_flush:
677 if (sync != 0)
678 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
679 return ret;
680}
681
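gfs2_write_inode() above only rewrites the dinode when the in-core atime is strictly newer than the on-disk copy, which is what the timespec_compare() test decides. The comparison in isolation:

#include <stdio.h>
#include <time.h>

static int timespec_newer(const struct timespec *a, const struct timespec *b)
{
        if (a->tv_sec != b->tv_sec)
                return a->tv_sec > b->tv_sec;
        return a->tv_nsec > b->tv_nsec;
}

int main(void)
{
        struct timespec disk = { 100, 0 }, core = { 100, 500 };

        if (timespec_newer(&core, &disk))
                printf("dinode needs writing (atime advanced)\n");
        return 0;
}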
682/**
683 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
684 * @sdp: the filesystem
685 *
686 * Returns: errno
687 */
688
689static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
690{
691 struct gfs2_holder t_gh;
692 int error;
693
694 flush_workqueue(gfs2_delete_workqueue);
695 gfs2_quota_sync(sdp);
696 gfs2_statfs_sync(sdp);
697
698 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
699 &t_gh);
700 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
701 return error;
702
703 gfs2_meta_syncfs(sdp);
704 gfs2_log_shutdown(sdp);
705
706 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
707
708 if (t_gh.gh_gl)
709 gfs2_glock_dq_uninit(&t_gh);
710
711 gfs2_quota_cleanup(sdp);
712
713 return error;
714}
715
716static int gfs2_umount_recovery_wait(void *word)
717{
718 schedule();
719 return 0;
720}
721
722/**
723 * gfs2_put_super - Unmount the filesystem
724 * @sb: The VFS superblock
725 *
726 */
727
728static void gfs2_put_super(struct super_block *sb)
729{
730 struct gfs2_sbd *sdp = sb->s_fs_info;
731 int error;
732 struct gfs2_jdesc *jd;
733
734 /* Unfreeze the filesystem, if we need to */
735
736 mutex_lock(&sdp->sd_freeze_lock);
737 if (sdp->sd_freeze_count)
738 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
739 mutex_unlock(&sdp->sd_freeze_lock);
740
741 /* No more recovery requests */
742 set_bit(SDF_NORECOVERY, &sdp->sd_flags);
743 smp_mb();
744
745 /* Wait on outstanding recovery */
746restart:
747 spin_lock(&sdp->sd_jindex_spin);
748 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
749 if (!test_bit(JDF_RECOVERY, &jd->jd_flags))
750 continue;
751 spin_unlock(&sdp->sd_jindex_spin);
752 wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
753 gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE);
754 goto restart;
755 }
756 spin_unlock(&sdp->sd_jindex_spin);
757
758 kthread_stop(sdp->sd_quotad_process);
759 kthread_stop(sdp->sd_logd_process);
760
761 if (!(sb->s_flags & MS_RDONLY)) {
762 error = gfs2_make_fs_ro(sdp);
763 if (error)
764 gfs2_io_error(sdp);
765 }
766 /* At this point, we're through modifying the disk */
767
768 /* Release stuff */
769
770 iput(sdp->sd_jindex);
771 iput(sdp->sd_inum_inode);
772 iput(sdp->sd_statfs_inode);
773 iput(sdp->sd_rindex);
774 iput(sdp->sd_quota_inode);
775
776 gfs2_glock_put(sdp->sd_rename_gl);
777 gfs2_glock_put(sdp->sd_trans_gl);
778
779 if (!sdp->sd_args.ar_spectator) {
780 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
781 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
782 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
783 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
784 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
785 iput(sdp->sd_ir_inode);
786 iput(sdp->sd_sc_inode);
787 iput(sdp->sd_qc_inode);
788 }
789
790 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
791 gfs2_clear_rgrpd(sdp);
792 gfs2_jindex_free(sdp);
793 /* Take apart glock structures and buffer lists */
794 gfs2_gl_hash_clear(sdp);
795 /* Unmount the locking protocol */
796 gfs2_lm_unmount(sdp);
797
798 /* At this point, we're through participating in the lockspace */
799 gfs2_sys_fs_del(sdp);
800}
801
802/**
803 * gfs2_sync_fs - sync the filesystem
804 * @sb: the superblock
805 *
806 * Flushes the log to disk.
807 */
808
809static int gfs2_sync_fs(struct super_block *sb, int wait)
810{
811 if (wait && sb->s_fs_info)
812 gfs2_log_flush(sb->s_fs_info, NULL);
813 return 0;
814}
815
816/**
817 * gfs2_freeze - prevent further writes to the filesystem
818 * @sb: the VFS structure for the filesystem
819 *
820 */
821
822static int gfs2_freeze(struct super_block *sb)
823{
824 struct gfs2_sbd *sdp = sb->s_fs_info;
825 int error;
826
827 if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
828 return -EINVAL;
829
830 for (;;) {
831 error = gfs2_freeze_fs(sdp);
832 if (!error)
833 break;
834
835 switch (error) {
836 case -EBUSY:
837 fs_err(sdp, "waiting for recovery before freeze\n");
838 break;
839
840 default:
841 fs_err(sdp, "error freezing FS: %d\n", error);
842 break;
843 }
844
845 fs_err(sdp, "retrying...\n");
846 msleep(1000);
847 }
848 return 0;
849}
850
851/**
852 * gfs2_unfreeze - reallow writes to the filesystem
853 * @sb: the VFS structure for the filesystem
854 *
855 */
856
857static int gfs2_unfreeze(struct super_block *sb)
858{
859 gfs2_unfreeze_fs(sb->s_fs_info);
860 return 0;
861}
862
863/**
864 * statfs_slow_fill - fill in the sc for a given RG
865 * @rgd: the RG
866 * @sc: the sc structure
867 *
868 * Returns: 0 on success, -ESTALE if the LVB is invalid
869 */
870
871static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
872 struct gfs2_statfs_change_host *sc)
873{
874 gfs2_rgrp_verify(rgd);
875 sc->sc_total += rgd->rd_data;
876 sc->sc_free += rgd->rd_free;
877 sc->sc_dinodes += rgd->rd_dinodes;
878 return 0;
879}
880
881/**
882 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
883 * @sdp: the filesystem
884 * @sc: the sc info that will be returned
885 *
886 * Any error (other than a signal) will cause this routine to fall back
887 * to the synchronous version.
888 *
889 * FIXME: This really shouldn't busy wait like this.
890 *
891 * Returns: errno
892 */
893
894static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
895{
896 struct gfs2_holder ri_gh;
897 struct gfs2_rgrpd *rgd_next;
898 struct gfs2_holder *gha, *gh;
899 unsigned int slots = 64;
900 unsigned int x;
901 int done;
902 int error = 0, err;
903
904 memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
905 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
906 if (!gha)
907 return -ENOMEM;
908
909 error = gfs2_rindex_hold(sdp, &ri_gh);
910 if (error)
911 goto out;
912
913 rgd_next = gfs2_rgrpd_get_first(sdp);
914
915 for (;;) {
916 done = 1;
917
918 for (x = 0; x < slots; x++) {
919 gh = gha + x;
920
921 if (gh->gh_gl && gfs2_glock_poll(gh)) {
922 err = gfs2_glock_wait(gh);
923 if (err) {
924 gfs2_holder_uninit(gh);
925 error = err;
926 } else {
927 if (!error)
928 error = statfs_slow_fill(
929 gh->gh_gl->gl_object, sc);
930 gfs2_glock_dq_uninit(gh);
931 }
932 }
933
934 if (gh->gh_gl)
935 done = 0;
936 else if (rgd_next && !error) {
937 error = gfs2_glock_nq_init(rgd_next->rd_gl,
938 LM_ST_SHARED,
939 GL_ASYNC,
940 gh);
941 rgd_next = gfs2_rgrpd_get_next(rgd_next);
942 done = 0;
943 }
944
945 if (signal_pending(current))
946 error = -ERESTARTSYS;
947 }
948
949 if (done)
950 break;
951
952 yield();
953 }
954
955 gfs2_glock_dq_uninit(&ri_gh);
956
957out:
958 kfree(gha);
959 return error;
960}
961
962/**
963 * gfs2_statfs_i - Do a statfs
964 * @sdp: the filesystem
965 * @sc: the sc structure
966 *
967 * Returns: errno
968 */
969
970static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
971{
972 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
973 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
974
975 spin_lock(&sdp->sd_statfs_spin);
976
977 *sc = *m_sc;
978 sc->sc_total += l_sc->sc_total;
979 sc->sc_free += l_sc->sc_free;
980 sc->sc_dinodes += l_sc->sc_dinodes;
981
982 spin_unlock(&sdp->sd_statfs_spin);
983
984 if (sc->sc_free < 0)
985 sc->sc_free = 0;
986 if (sc->sc_free > sc->sc_total)
987 sc->sc_free = sc->sc_total;
988 if (sc->sc_dinodes < 0)
989 sc->sc_dinodes = 0;
990
991 return 0;
992}
993
994/**
995 * gfs2_statfs - Gather and return stats about the filesystem
996 * @dentry: The dentry whose filesystem is being queried
997 * @buf: The kstatfs buffer to fill
998 *
999 * Returns: 0 on success or error code
1000 */
1001
1002static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
1003{
1004 struct super_block *sb = dentry->d_inode->i_sb;
1005 struct gfs2_sbd *sdp = sb->s_fs_info;
1006 struct gfs2_statfs_change_host sc;
1007 int error;
1008
1009 if (gfs2_tune_get(sdp, gt_statfs_slow))
1010 error = gfs2_statfs_slow(sdp, &sc);
1011 else
1012 error = gfs2_statfs_i(sdp, &sc);
1013
1014 if (error)
1015 return error;
1016
1017 buf->f_type = GFS2_MAGIC;
1018 buf->f_bsize = sdp->sd_sb.sb_bsize;
1019 buf->f_blocks = sc.sc_total;
1020 buf->f_bfree = sc.sc_free;
1021 buf->f_bavail = sc.sc_free;
1022 buf->f_files = sc.sc_dinodes + sc.sc_free;
1023 buf->f_ffree = sc.sc_free;
1024 buf->f_namelen = GFS2_FNAMESIZE;
1025
1026 return 0;
1027}
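
The values filled in above surface directly through statfs(2); since GFS2 has no
fixed inode table, f_files is reported as sc_dinodes + sc_free and f_ffree simply
mirrors the free block count. A minimal userspace check, with a hypothetical
mount point:

#include <stdio.h>
#include <sys/vfs.h>

int main(void)
{
	struct statfs st;

	if (statfs("/mnt/gfs2", &st) != 0) {	/* illustrative mount point */
		perror("statfs");
		return 1;
	}
	/* f_type should read back as GFS2_MAGIC (0x01161970) */
	printf("bsize %ld blocks %ld free %ld files %ld\n",
	       (long)st.f_bsize, (long)st.f_blocks,
	       (long)st.f_bfree, (long)st.f_files);
	return 0;
}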
1028
1029/**
1030 * gfs2_remount_fs - called when the FS is remounted
1031 * @sb: the filesystem
1032 * @flags: the remount flags
1033 * @data: extra data passed in (not used right now)
1034 *
1035 * Returns: errno
1036 */
1037
1038static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1039{
1040 struct gfs2_sbd *sdp = sb->s_fs_info;
1041 struct gfs2_args args = sdp->sd_args; /* Default to current settings */
1042 struct gfs2_tune *gt = &sdp->sd_tune;
1043 int error;
1044
1045 spin_lock(&gt->gt_spin);
1046 args.ar_commit = gt->gt_log_flush_secs;
1047 spin_unlock(&gt->gt_spin);
1048 error = gfs2_mount_args(sdp, &args, data);
1049 if (error)
1050 return error;
1051
1052 /* Not allowed to change locking details */
1053 if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
1054 strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
1055 strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
1056 return -EINVAL;
1057
1058 /* Some flags must not be changed */
1059 if (args_neq(&args, &sdp->sd_args, spectator) ||
1060 args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
1061 args_neq(&args, &sdp->sd_args, localflocks) ||
1062 args_neq(&args, &sdp->sd_args, localcaching) ||
1063 args_neq(&args, &sdp->sd_args, meta))
1064 return -EINVAL;
1065
1066 if (sdp->sd_args.ar_spectator)
1067 *flags |= MS_RDONLY;
1068
1069 if ((sb->s_flags ^ *flags) & MS_RDONLY) {
1070 if (*flags & MS_RDONLY)
1071 error = gfs2_make_fs_ro(sdp);
1072 else
1073 error = gfs2_make_fs_rw(sdp);
1074 if (error)
1075 return error;
1076 }
1077
1078 sdp->sd_args = args;
1079 if (sdp->sd_args.ar_posix_acl)
1080 sb->s_flags |= MS_POSIXACL;
1081 else
1082 sb->s_flags &= ~MS_POSIXACL;
1083 spin_lock(&gt->gt_spin);
1084 gt->gt_log_flush_secs = args.ar_commit;
1085 spin_unlock(&gt->gt_spin);
1086
1087 return 0;
1088}
1089
1090/**
1091 * gfs2_drop_inode - Drop an inode (test for remote unlink)
1092 * @inode: The inode to drop
1093 *
1094 * If we've received a callback on an iopen lock then it's because a
1095 * remote node tried to deallocate the inode but failed due to this node
1096 * still having the inode open. Here we mark the link count zero
1097 * since we know that it must have reached zero if the GLF_DEMOTE flag
1098 * is set on the iopen glock. If we didn't do a disk read since the
1099 * remote node removed the final link then we might otherwise miss
1100 * this event. This check ensures that this node will deallocate the
1101 * inode's blocks, or alternatively pass the baton on to another
1102 * node for later deallocation.
1103 */
1104
1105static void gfs2_drop_inode(struct inode *inode)
1106{
1107 struct gfs2_inode *ip = GFS2_I(inode);
1108
1109 if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
1110 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
1111 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
1112 clear_nlink(inode);
1113 }
1114 generic_drop_inode(inode);
1115}
1116
1117/**
1118 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
1119 * @inode: The VFS inode
1120 *
1121 */
1122
1123static void gfs2_clear_inode(struct inode *inode)
1124{
1125 struct gfs2_inode *ip = GFS2_I(inode);
1126
1127 /* This tells us it's a "real" inode and not one which only
1128 * serves to contain an address space (see rgrp.c, meta_io.c)
1129 * which therefore doesn't have its own glocks.
1130 */
1131 if (test_bit(GIF_USER, &ip->i_flags)) {
1132 ip->i_gl->gl_object = NULL;
1133 gfs2_glock_put(ip->i_gl);
1134 ip->i_gl = NULL;
1135 if (ip->i_iopen_gh.gh_gl) {
1136 ip->i_iopen_gh.gh_gl->gl_object = NULL;
1137 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1138 }
1139 }
1140}
1141
1142static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
1143{
1144 do {
1145 if (d1 == d2)
1146 return 1;
1147 d1 = d1->d_parent;
1148 } while (!IS_ROOT(d1));
1149 return 0;
1150}
1151
1152/**
1153 * gfs2_show_options - Show mount options for /proc/mounts
1154 * @s: seq_file structure
1155 * @mnt: vfsmount
1156 *
1157 * Returns: 0 on success or error code
1158 */
1159
1160static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1161{
1162 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
1163 struct gfs2_args *args = &sdp->sd_args;
1164 int lfsecs;
1165
1166 if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
1167 seq_printf(s, ",meta");
1168 if (args->ar_lockproto[0])
1169 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
1170 if (args->ar_locktable[0])
1171 seq_printf(s, ",locktable=%s", args->ar_locktable);
1172 if (args->ar_hostdata[0])
1173 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
1174 if (args->ar_spectator)
1175 seq_printf(s, ",spectator");
1176 if (args->ar_ignore_local_fs)
1177 seq_printf(s, ",ignore_local_fs");
1178 if (args->ar_localflocks)
1179 seq_printf(s, ",localflocks");
1180 if (args->ar_localcaching)
1181 seq_printf(s, ",localcaching");
1182 if (args->ar_debug)
1183 seq_printf(s, ",debug");
1184 if (args->ar_upgrade)
1185 seq_printf(s, ",upgrade");
1186 if (args->ar_posix_acl)
1187 seq_printf(s, ",acl");
1188 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
1189 char *state;
1190 switch (args->ar_quota) {
1191 case GFS2_QUOTA_OFF:
1192 state = "off";
1193 break;
1194 case GFS2_QUOTA_ACCOUNT:
1195 state = "account";
1196 break;
1197 case GFS2_QUOTA_ON:
1198 state = "on";
1199 break;
1200 default:
1201 state = "unknown";
1202 break;
1203 }
1204 seq_printf(s, ",quota=%s", state);
1205 }
1206 if (args->ar_suiddir)
1207 seq_printf(s, ",suiddir");
1208 if (args->ar_data != GFS2_DATA_DEFAULT) {
1209 char *state;
1210 switch (args->ar_data) {
1211 case GFS2_DATA_WRITEBACK:
1212 state = "writeback";
1213 break;
1214 case GFS2_DATA_ORDERED:
1215 state = "ordered";
1216 break;
1217 default:
1218 state = "unknown";
1219 break;
1220 }
1221 seq_printf(s, ",data=%s", state);
1222 }
1223 if (args->ar_discard)
1224 seq_printf(s, ",discard");
1225 lfsecs = sdp->sd_tune.gt_log_flush_secs;
1226 if (lfsecs != 60)
1227 seq_printf(s, ",commit=%d", lfsecs);
1228 return 0;
1229}
1230
1231/*
1232 * We have to (at the moment) hold the inode's main lock to cover
1233 * the gap between unlocking the shared lock on the iopen lock and
1234 * taking the exclusive lock. I'd rather do a shared -> exclusive
1235 * conversion on the iopen lock, but we can change that later. This
1236 * is safe, just less efficient.
1237 */
1238
1239static void gfs2_delete_inode(struct inode *inode)
1240{
1241 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
1242 struct gfs2_inode *ip = GFS2_I(inode);
1243 struct gfs2_holder gh;
1244 int error;
1245
1246 if (!test_bit(GIF_USER, &ip->i_flags))
1247 goto out;
1248
1249 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1250 if (unlikely(error)) {
1251 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1252 goto out;
1253 }
1254
1255 gfs2_glock_dq_wait(&ip->i_iopen_gh);
1256 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
1257 error = gfs2_glock_nq(&ip->i_iopen_gh);
1258 if (error)
1259 goto out_truncate;
1260
1261 if (S_ISDIR(inode->i_mode) &&
1262 (ip->i_diskflags & GFS2_DIF_EXHASH)) {
1263 error = gfs2_dir_exhash_dealloc(ip);
1264 if (error)
1265 goto out_unlock;
1266 }
1267
1268 if (ip->i_eattr) {
1269 error = gfs2_ea_dealloc(ip);
1270 if (error)
1271 goto out_unlock;
1272 }
1273
1274 if (!gfs2_is_stuffed(ip)) {
1275 error = gfs2_file_dealloc(ip);
1276 if (error)
1277 goto out_unlock;
1278 }
1279
1280 error = gfs2_dinode_dealloc(ip);
1281 if (error)
1282 goto out_unlock;
1283
1284out_truncate:
1285 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1286 if (error)
1287 goto out_unlock;
1288 /* Needs to be done before glock release & also in a transaction */
1289 truncate_inode_pages(&inode->i_data, 0);
1290 gfs2_trans_end(sdp);
1291
1292out_unlock:
1293 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
1294 gfs2_glock_dq(&ip->i_iopen_gh);
1295 gfs2_holder_uninit(&ip->i_iopen_gh);
1296 gfs2_glock_dq_uninit(&gh);
1297 if (error && error != GLR_TRYFAILED && error != -EROFS)
1298 fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
1299out:
1300 truncate_inode_pages(&inode->i_data, 0);
1301 clear_inode(inode);
1302}
1303
1304static struct inode *gfs2_alloc_inode(struct super_block *sb)
1305{
1306 struct gfs2_inode *ip;
1307
1308 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
1309 if (ip) {
1310 ip->i_flags = 0;
1311 ip->i_gl = NULL;
1312 }
1313 return &ip->i_inode;
1314}
1315
1316static void gfs2_destroy_inode(struct inode *inode)
1317{
1318 kmem_cache_free(gfs2_inode_cachep, inode);
1319}
1320
1321const struct super_operations gfs2_super_ops = {
1322 .alloc_inode = gfs2_alloc_inode,
1323 .destroy_inode = gfs2_destroy_inode,
1324 .write_inode = gfs2_write_inode,
1325 .delete_inode = gfs2_delete_inode,
1326 .put_super = gfs2_put_super,
1327 .sync_fs = gfs2_sync_fs,
1328 .freeze_fs = gfs2_freeze,
1329 .unfreeze_fs = gfs2_unfreeze,
1330 .statfs = gfs2_statfs,
1331 .remount_fs = gfs2_remount_fs,
1332 .clear_inode = gfs2_clear_inode,
1333 .drop_inode = gfs2_drop_inode,
1334 .show_options = gfs2_show_options,
1335};
1336
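For orientation (not part of this diff): a table like gfs2_super_ops takes effect
when the mount path stores it in the superblock, after which the VFS dispatches
through it. A minimal sketch with an illustrative fill_super; the real gfs2 mount
code does far more:

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	sb->s_magic = GFS2_MAGIC;
	sb->s_op = &gfs2_super_ops;	/* VFS now routes put_super, statfs,
					   sync_fs, etc. through this table */
	return 0;
}
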
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index b56413e3e40d..22e0417ed996 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -40,6 +40,10 @@ extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
 extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
 extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 			       s64 dinodes);
+extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
+				  const void *buf);
+extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
+			  struct buffer_head *l_bh);
 extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
 
 extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 7655f5025fec..a7cbfbd340c7 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -26,6 +26,36 @@
 #include "util.h"
 #include "glops.h"
 
+struct gfs2_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct gfs2_sbd *, char *);
+	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
+};
+
+static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
+			      char *buf)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+	return a->show ? a->show(sdp, buf) : 0;
+}
+
+static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+	return a->store ? a->store(sdp, buf, len) : len;
+}
+
+static struct sysfs_ops gfs2_attr_ops = {
+	.show = gfs2_attr_show,
+	.store = gfs2_attr_store,
+};
+
+
+static struct kset *gfs2_kset;
+
 static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%u:%u\n",
@@ -212,11 +242,6 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
 	return len;
 }
 
-struct gfs2_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
-};
 
 #define GFS2_ATTR(name, mode, show, store) \
 static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
@@ -246,58 +271,11 @@ static struct attribute *gfs2_attrs[] = {
 	NULL,
 };
 
-static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
-			      char *buf)
-{
-	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
-	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
-	return a->show ? a->show(sdp, buf) : 0;
-}
-
-static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
-			       const char *buf, size_t len)
-{
-	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
-	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
-	return a->store ? a->store(sdp, buf, len) : len;
-}
-
-static struct sysfs_ops gfs2_attr_ops = {
-	.show = gfs2_attr_show,
-	.store = gfs2_attr_store,
-};
-
 static struct kobj_type gfs2_ktype = {
 	.default_attrs = gfs2_attrs,
 	.sysfs_ops = &gfs2_attr_ops,
 };
 
-static struct kset *gfs2_kset;
-
-/*
- * display struct lm_lockstruct fields
- */
-
-struct lockstruct_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-};
-
-#define LOCKSTRUCT_ATTR(name, fmt) \
-static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
-{ \
-	return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \
-} \
-static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
-
-LOCKSTRUCT_ATTR(jid, "%u\n");
-LOCKSTRUCT_ATTR(first, "%u\n");
-
-static struct attribute *lockstruct_attrs[] = {
-	&lockstruct_attr_jid.attr,
-	&lockstruct_attr_first.attr,
-	NULL,
-};
-
 /*
  * lock_module. Originally from lock_dlm
@@ -359,34 +337,33 @@ static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
 	return sprintf(buf, "%d\n", ls->ls_first_done);
 }
 
-static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf)
-{
-	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	return sprintf(buf, "%d\n", ls->ls_recover_jid);
-}
-
-static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
+static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 {
+	unsigned jid;
 	struct gfs2_jdesc *jd;
+	int rv;
+
+	rv = sscanf(buf, "%u", &jid);
+	if (rv != 1)
+		return -EINVAL;
 
+	rv = -ESHUTDOWN;
 	spin_lock(&sdp->sd_jindex_spin);
+	if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
+		goto out;
+	rv = -EBUSY;
+	if (sdp->sd_jdesc->jd_jid == jid)
+		goto out;
+	rv = -ENOENT;
 	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
 		if (jd->jd_jid != jid)
 			continue;
-		jd->jd_dirty = 1;
+		rv = slow_work_enqueue(&jd->jd_work);
 		break;
 	}
+out:
 	spin_unlock(&sdp->sd_jindex_spin);
-}
-
-static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
-{
-	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	ls->ls_recover_jid = simple_strtol(buf, NULL, 0);
-	gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid);
-	if (sdp->sd_recoverd_process)
-		wake_up_process(sdp->sd_recoverd_process);
-	return len;
+	return rv ? rv : len;
 }
 
 static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf)
@@ -401,31 +378,31 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
 	return sprintf(buf, "%d\n", ls->ls_recover_jid_status);
 }
 
-struct gdlm_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *sdp, char *);
-	ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t);
-};
+static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
+{
+	return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
+}
 
 #define GDLM_ATTR(_name,_mode,_show,_store) \
-static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
+static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 
 GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
 GDLM_ATTR(block, 0644, block_show, block_store);
 GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
 GDLM_ATTR(id, 0444, lkid_show, NULL);
-GDLM_ATTR(first, 0444, lkfirst_show, NULL);
-GDLM_ATTR(first_done, 0444, first_done_show, NULL);
-GDLM_ATTR(recover, 0644, recover_show, recover_store);
-GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
-GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
+GDLM_ATTR(jid, 0444, jid_show, NULL);
+GDLM_ATTR(first, 0444, lkfirst_show, NULL);
+GDLM_ATTR(first_done, 0444, first_done_show, NULL);
+GDLM_ATTR(recover, 0600, NULL, recover_store);
+GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
+GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
 
 static struct attribute *lock_module_attrs[] = {
 	&gdlm_attr_proto_name.attr,
 	&gdlm_attr_block.attr,
 	&gdlm_attr_withdraw.attr,
 	&gdlm_attr_id.attr,
-	&lockstruct_attr_jid.attr,
+	&gdlm_attr_jid.attr,
 	&gdlm_attr_first.attr,
 	&gdlm_attr_first_done.attr,
 	&gdlm_attr_recover.attr,
@@ -435,53 +412,6 @@ static struct attribute *lock_module_attrs[] = {
 };
 
 /*
- * display struct gfs2_args fields
- */
-
-struct args_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-};
-
-#define ARGS_ATTR(name, fmt) \
-static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
-{ \
-	return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name); \
-} \
-static struct args_attr args_attr_##name = __ATTR_RO(name)
-
-ARGS_ATTR(lockproto, "%s\n");
-ARGS_ATTR(locktable, "%s\n");
-ARGS_ATTR(hostdata, "%s\n");
-ARGS_ATTR(spectator, "%d\n");
-ARGS_ATTR(ignore_local_fs, "%d\n");
-ARGS_ATTR(localcaching, "%d\n");
-ARGS_ATTR(localflocks, "%d\n");
-ARGS_ATTR(debug, "%d\n");
-ARGS_ATTR(upgrade, "%d\n");
-ARGS_ATTR(posix_acl, "%d\n");
-ARGS_ATTR(quota, "%u\n");
-ARGS_ATTR(suiddir, "%d\n");
-ARGS_ATTR(data, "%d\n");
-
-static struct attribute *args_attrs[] = {
-	&args_attr_lockproto.attr,
-	&args_attr_locktable.attr,
-	&args_attr_hostdata.attr,
-	&args_attr_spectator.attr,
-	&args_attr_ignore_local_fs.attr,
-	&args_attr_localcaching.attr,
-	&args_attr_localflocks.attr,
-	&args_attr_debug.attr,
-	&args_attr_upgrade.attr,
-	&args_attr_posix_acl.attr,
-	&args_attr_quota.attr,
-	&args_attr_suiddir.attr,
-	&args_attr_data.attr,
-	NULL,
-};
-
-/*
  * get and set struct gfs2_tune fields
  */
 
@@ -531,14 +461,8 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
 	return len;
 }
 
-struct tune_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
-};
-
 #define TUNE_ATTR_3(name, show, store) \
-static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
+static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store)
 
 #define TUNE_ATTR_2(name, store) \
 static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
@@ -554,15 +478,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
 } \
 TUNE_ATTR_2(name, name##_store)
 
-#define TUNE_ATTR_DAEMON(name, process) \
-static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
-{ \
-	ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \
-	wake_up_process(sdp->sd_##process); \
-	return r; \
-} \
-TUNE_ATTR_2(name, name##_store)
-
 TUNE_ATTR(incore_log_blocks, 0);
 TUNE_ATTR(log_flush_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
@@ -574,8 +489,6 @@ TUNE_ATTR(new_files_jdata, 0);
 TUNE_ATTR(quota_simul_sync, 1);
 TUNE_ATTR(stall_secs, 1);
 TUNE_ATTR(statfs_quantum, 1);
-TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
-TUNE_ATTR_DAEMON(logd_secs, logd_process);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 
 static struct attribute *tune_attrs[] = {
@@ -589,23 +502,11 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_quota_simul_sync.attr,
 	&tune_attr_stall_secs.attr,
 	&tune_attr_statfs_quantum.attr,
-	&tune_attr_recoverd_secs.attr,
-	&tune_attr_logd_secs.attr,
 	&tune_attr_quota_scale.attr,
 	&tune_attr_new_files_jdata.attr,
 	NULL,
 };
 
-static struct attribute_group lockstruct_group = {
-	.name = "lockstruct",
-	.attrs = lockstruct_attrs,
-};
-
-static struct attribute_group args_group = {
-	.name = "args",
-	.attrs = args_attrs,
-};
-
 static struct attribute_group tune_group = {
 	.name = "tune",
 	.attrs = tune_attrs,
@@ -626,17 +527,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail;
 
-	error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
-	if (error)
-		goto fail_reg;
-
-	error = sysfs_create_group(&sdp->sd_kobj, &args_group);
-	if (error)
-		goto fail_lockstruct;
-
 	error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
 	if (error)
-		goto fail_args;
+		goto fail_reg;
 
 	error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group);
 	if (error)
@@ -647,10 +540,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 
 fail_tune:
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
-fail_args:
-	sysfs_remove_group(&sdp->sd_kobj, &args_group);
-fail_lockstruct:
-	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
fail_reg:
 	kobject_put(&sdp->sd_kobj);
 fail:
@@ -661,8 +550,6 @@ fail:
 void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 {
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
-	sysfs_remove_group(&sdp->sd_kobj, &args_group);
-	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
 	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
 	kobject_put(&sdp->sd_kobj);
 }
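
With recover now a write-only (0600) attribute backed by recover_store(),
requesting recovery of a journal becomes a plain sysfs write. A hypothetical
userspace trigger; the lockspace name in the path is illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* recover_store() parses a jid with sscanf() and enqueues that
	   journal's slow-work item; writing our own jid yields -EBUSY. */
	int fd = open("/sys/fs/gfs2/mycluster:gfs0/lock_module/recover",
		      O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1", 1) != 1)	/* ask for recovery of jid 1 */
		perror("write");
	close(fd);
	return 0;
}
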
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
new file mode 100644
index 000000000000..148d55c14171
--- /dev/null
+++ b/fs/gfs2/trace_gfs2.h
@@ -0,0 +1,407 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM gfs2
3
4#if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_GFS2_H
6
7#include <linux/tracepoint.h>
8
9#include <linux/fs.h>
10#include <linux/buffer_head.h>
11#include <linux/dlmconstants.h>
12#include <linux/gfs2_ondisk.h>
13#include "incore.h"
14#include "glock.h"
15
16#define dlm_state_name(nn) { DLM_LOCK_##nn, #nn }
17#define glock_trace_name(x) __print_symbolic(x, \
18 dlm_state_name(IV), \
19 dlm_state_name(NL), \
20 dlm_state_name(CR), \
21 dlm_state_name(CW), \
22 dlm_state_name(PR), \
23 dlm_state_name(PW), \
24 dlm_state_name(EX))
25
26#define block_state_name(x) __print_symbolic(x, \
27 { GFS2_BLKST_FREE, "free" }, \
28 { GFS2_BLKST_USED, "used" }, \
29 { GFS2_BLKST_DINODE, "dinode" }, \
30 { GFS2_BLKST_UNLINKED, "unlinked" })
31
32#define show_glock_flags(flags) __print_flags(flags, "", \
33 {(1UL << GLF_LOCK), "l" }, \
34 {(1UL << GLF_DEMOTE), "D" }, \
35 {(1UL << GLF_PENDING_DEMOTE), "d" }, \
36 {(1UL << GLF_DEMOTE_IN_PROGRESS), "p" }, \
37 {(1UL << GLF_DIRTY), "y" }, \
38 {(1UL << GLF_LFLUSH), "f" }, \
39 {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \
40 {(1UL << GLF_REPLY_PENDING), "r" }, \
41 {(1UL << GLF_INITIAL), "I" }, \
42 {(1UL << GLF_FROZEN), "F" })
43
44#ifndef NUMPTY
45#define NUMPTY
46static inline u8 glock_trace_state(unsigned int state)
47{
48 switch(state) {
49 case LM_ST_SHARED:
50 return DLM_LOCK_PR;
51 case LM_ST_DEFERRED:
52 return DLM_LOCK_CW;
53 case LM_ST_EXCLUSIVE:
54 return DLM_LOCK_EX;
55 }
56 return DLM_LOCK_NL;
57}
58#endif
59
60/* Section 1 - Locking
61 *
62 * Objectives:
63 * Latency: Remote demote request to state change
64 * Latency: Local lock request to state change
65 * Latency: State change to lock grant
66 * Correctness: Ordering of local lock state vs. I/O requests
67 * Correctness: Responses to remote demote requests
68 */
69
70/* General glock state change (DLM lock request completes) */
71TRACE_EVENT(gfs2_glock_state_change,
72
73 TP_PROTO(const struct gfs2_glock *gl, unsigned int new_state),
74
75 TP_ARGS(gl, new_state),
76
77 TP_STRUCT__entry(
78 __field( dev_t, dev )
79 __field( u64, glnum )
80 __field( u32, gltype )
81 __field( u8, cur_state )
82 __field( u8, new_state )
83 __field( u8, dmt_state )
84 __field( u8, tgt_state )
85 __field( unsigned long, flags )
86 ),
87
88 TP_fast_assign(
89 __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
90 __entry->glnum = gl->gl_name.ln_number;
91 __entry->gltype = gl->gl_name.ln_type;
92 __entry->cur_state = glock_trace_state(gl->gl_state);
93 __entry->new_state = glock_trace_state(new_state);
94 __entry->tgt_state = glock_trace_state(gl->gl_target);
95 __entry->dmt_state = glock_trace_state(gl->gl_demote_state);
96 __entry->flags = gl->gl_flags;
97 ),
98
99 TP_printk("%u,%u glock %d:%lld state %s to %s tgt:%s dmt:%s flags:%s",
100 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
101 (unsigned long long)__entry->glnum,
102 glock_trace_name(__entry->cur_state),
103 glock_trace_name(__entry->new_state),
104 glock_trace_name(__entry->tgt_state),
105 glock_trace_name(__entry->dmt_state),
106 show_glock_flags(__entry->flags))
107);
108
109/* State change -> unlocked, glock is being deallocated */
110TRACE_EVENT(gfs2_glock_put,
111
112 TP_PROTO(const struct gfs2_glock *gl),
113
114 TP_ARGS(gl),
115
116 TP_STRUCT__entry(
117 __field( dev_t, dev )
118 __field( u64, glnum )
119 __field( u32, gltype )
120 __field( u8, cur_state )
121 __field( unsigned long, flags )
122 ),
123
124 TP_fast_assign(
125 __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
126 __entry->gltype = gl->gl_name.ln_type;
127 __entry->glnum = gl->gl_name.ln_number;
128 __entry->cur_state = glock_trace_state(gl->gl_state);
129 __entry->flags = gl->gl_flags;
130 ),
131
132 TP_printk("%u,%u glock %d:%lld state %s => %s flags:%s",
133 MAJOR(__entry->dev), MINOR(__entry->dev),
134 __entry->gltype, (unsigned long long)__entry->glnum,
135 glock_trace_name(__entry->cur_state),
136 glock_trace_name(DLM_LOCK_IV),
137 show_glock_flags(__entry->flags))
138
139);
140
141/* Callback (local or remote) requesting lock demotion */
142TRACE_EVENT(gfs2_demote_rq,
143
144 TP_PROTO(const struct gfs2_glock *gl),
145
146 TP_ARGS(gl),
147
148 TP_STRUCT__entry(
149 __field( dev_t, dev )
150 __field( u64, glnum )
151 __field( u32, gltype )
152 __field( u8, cur_state )
153 __field( u8, dmt_state )
154 __field( unsigned long, flags )
155 ),
156
157 TP_fast_assign(
158 __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
159 __entry->gltype = gl->gl_name.ln_type;
160 __entry->glnum = gl->gl_name.ln_number;
161 __entry->cur_state = glock_trace_state(gl->gl_state);
162 __entry->dmt_state = glock_trace_state(gl->gl_demote_state);
163 __entry->flags = gl->gl_flags;
164 ),
165
166 TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s",
167 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
168 (unsigned long long)__entry->glnum,
169 glock_trace_name(__entry->cur_state),
170 glock_trace_name(__entry->dmt_state),
171 show_glock_flags(__entry->flags))
172
173);
174
175/* Promotion/grant of a glock */
176TRACE_EVENT(gfs2_promote,
177
178 TP_PROTO(const struct gfs2_holder *gh, int first),
179
180 TP_ARGS(gh, first),
181
182 TP_STRUCT__entry(
183 __field( dev_t, dev )
184 __field( u64, glnum )
185 __field( u32, gltype )
186 __field( int, first )
187 __field( u8, state )
188 ),
189
190 TP_fast_assign(
191 __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
192 __entry->glnum = gh->gh_gl->gl_name.ln_number;
193 __entry->gltype = gh->gh_gl->gl_name.ln_type;
194 __entry->first = first;
195 __entry->state = glock_trace_state(gh->gh_state);
196 ),
197
198 TP_printk("%u,%u glock %u:%llu promote %s %s",
199 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
200 (unsigned long long)__entry->glnum,
201 __entry->first ? "first": "other",
202 glock_trace_name(__entry->state))
203);
204
205/* Queue/dequeue a lock request */
206TRACE_EVENT(gfs2_glock_queue,
207
208 TP_PROTO(const struct gfs2_holder *gh, int queue),
209
210 TP_ARGS(gh, queue),
211
212 TP_STRUCT__entry(
213 __field( dev_t, dev )
214 __field( u64, glnum )
215 __field( u32, gltype )
216 __field( int, queue )
217 __field( u8, state )
218 ),
219
220 TP_fast_assign(
221 __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
222 __entry->glnum = gh->gh_gl->gl_name.ln_number;
223 __entry->gltype = gh->gh_gl->gl_name.ln_type;
224 __entry->queue = queue;
225 __entry->state = glock_trace_state(gh->gh_state);
226 ),
227
228 TP_printk("%u,%u glock %u:%llu %squeue %s",
229 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
230 (unsigned long long)__entry->glnum,
231 __entry->queue ? "" : "de",
232 glock_trace_name(__entry->state))
233);
234
235/* Section 2 - Log/journal
236 *
237 * Objectives:
238 * Latency: Log flush time
239 * Correctness: pin/unpin vs. disk I/O ordering
240 * Performance: Log usage stats
241 */
242
243/* Pin/unpin a block in the log */
244TRACE_EVENT(gfs2_pin,
245
246 TP_PROTO(const struct gfs2_bufdata *bd, int pin),
247
248 TP_ARGS(bd, pin),
249
250 TP_STRUCT__entry(
251 __field( dev_t, dev )
252 __field( int, pin )
253 __field( u32, len )
254 __field( sector_t, block )
255 __field( u64, ino )
256 ),
257
258 TP_fast_assign(
259 __entry->dev = bd->bd_gl->gl_sbd->sd_vfs->s_dev;
260 __entry->pin = pin;
261 __entry->len = bd->bd_bh->b_size;
262 __entry->block = bd->bd_bh->b_blocknr;
263 __entry->ino = bd->bd_gl->gl_name.ln_number;
264 ),
265
266 TP_printk("%u,%u log %s %llu/%lu inode %llu",
267 MAJOR(__entry->dev), MINOR(__entry->dev),
268 __entry->pin ? "pin" : "unpin",
269 (unsigned long long)__entry->block,
270 (unsigned long)__entry->len,
271 (unsigned long long)__entry->ino)
272);
273
274/* Flushing the log */
275TRACE_EVENT(gfs2_log_flush,
276
277 TP_PROTO(const struct gfs2_sbd *sdp, int start),
278
279 TP_ARGS(sdp, start),
280
281 TP_STRUCT__entry(
282 __field( dev_t, dev )
283 __field( int, start )
284 __field( u64, log_seq )
285 ),
286
287 TP_fast_assign(
288 __entry->dev = sdp->sd_vfs->s_dev;
289 __entry->start = start;
290 __entry->log_seq = sdp->sd_log_sequence;
291 ),
292
293 TP_printk("%u,%u log flush %s %llu",
294 MAJOR(__entry->dev), MINOR(__entry->dev),
295 __entry->start ? "start" : "end",
296 (unsigned long long)__entry->log_seq)
297);
298
299/* Reserving/releasing blocks in the log */
300TRACE_EVENT(gfs2_log_blocks,
301
302 TP_PROTO(const struct gfs2_sbd *sdp, int blocks),
303
304 TP_ARGS(sdp, blocks),
305
306 TP_STRUCT__entry(
307 __field( dev_t, dev )
308 __field( int, blocks )
309 ),
310
311 TP_fast_assign(
312 __entry->dev = sdp->sd_vfs->s_dev;
313 __entry->blocks = blocks;
314 ),
315
316 TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev),
317 MINOR(__entry->dev), __entry->blocks)
318);
319
320/* Section 3 - bmap
321 *
322 * Objectives:
323 * Latency: Bmap request time
324 * Performance: Block allocator tracing
325 * Correctness: Test of discard generation vs. blocks allocated
326 */
327
328/* Map an extent of blocks, possibly a new allocation */
329TRACE_EVENT(gfs2_bmap,
330
331 TP_PROTO(const struct gfs2_inode *ip, const struct buffer_head *bh,
332 sector_t lblock, int create, int errno),
333
334 TP_ARGS(ip, bh, lblock, create, errno),
335
336 TP_STRUCT__entry(
337 __field( dev_t, dev )
338 __field( sector_t, lblock )
339 __field( sector_t, pblock )
340 __field( u64, inum )
341 __field( unsigned long, state )
342 __field( u32, len )
343 __field( int, create )
344 __field( int, errno )
345 ),
346
347 TP_fast_assign(
348 __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev;
349 __entry->lblock = lblock;
350 __entry->pblock = buffer_mapped(bh) ? bh->b_blocknr : 0;
351 __entry->inum = ip->i_no_addr;
352 __entry->state = bh->b_state;
353 __entry->len = bh->b_size;
354 __entry->create = create;
355 __entry->errno = errno;
356 ),
357
358 TP_printk("%u,%u bmap %llu map %llu/%lu to %llu flags:%08lx %s %d",
359 MAJOR(__entry->dev), MINOR(__entry->dev),
360 (unsigned long long)__entry->inum,
361 (unsigned long long)__entry->lblock,
362 (unsigned long)__entry->len,
363 (unsigned long long)__entry->pblock,
364 __entry->state, __entry->create ? "create " : "nocreate",
365 __entry->errno)
366);
367
368/* Keep track of blocks as they are allocated/freed */
369TRACE_EVENT(gfs2_block_alloc,
370
371 TP_PROTO(const struct gfs2_inode *ip, u64 block, unsigned len,
372 u8 block_state),
373
374 TP_ARGS(ip, block, len, block_state),
375
376 TP_STRUCT__entry(
377 __field( dev_t, dev )
378 __field( u64, start )
379 __field( u64, inum )
380 __field( u32, len )
381 __field( u8, block_state )
382 ),
383
384 TP_fast_assign(
385 __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev;
386 __entry->start = block;
387 __entry->inum = ip->i_no_addr;
388 __entry->len = len;
389 __entry->block_state = block_state;
390 ),
391
392 TP_printk("%u,%u bmap %llu alloc %llu/%lu %s",
393 MAJOR(__entry->dev), MINOR(__entry->dev),
394 (unsigned long long)__entry->inum,
395 (unsigned long long)__entry->start,
396 (unsigned long)__entry->len,
397 block_state_name(__entry->block_state))
398);
399
400#endif /* _TRACE_GFS2_H */
401
402/* This part must be outside protection */
403#undef TRACE_INCLUDE_PATH
404#define TRACE_INCLUDE_PATH .
405#define TRACE_INCLUDE_FILE trace_gfs2
406#include <trace/define_trace.h>
407
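Once this header is wired into the build, the events appear under the standard
ftrace tree. A sketch of enabling the whole gfs2 event group from userspace,
assuming debugfs is mounted at /sys/kernel/debug:

#include <stdio.h>

int main(void)
{
	/* per-event enables also exist, e.g.
	   .../events/gfs2/gfs2_glock_state_change/enable */
	FILE *f = fopen("/sys/kernel/debug/tracing/events/gfs2/enable", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fputs("1\n", f);	/* write "0" to disable again */
	fclose(f);
	return 0;
}
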
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 053752d4b27f..4ef0e9fa3549 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -33,6 +33,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 	BUG_ON(current->journal_info);
 	BUG_ON(blocks == 0 && revokes == 0);
 
+	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+		return -EROFS;
+
 	tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
 	if (!tr)
 		return -ENOMEM;
@@ -54,12 +57,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 	if (error)
 		goto fail_holder_uninit;
 
-	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-		tr->tr_t_gh.gh_flags |= GL_NOCACHE;
-		error = -EROFS;
-		goto fail_gunlock;
-	}
-
 	error = gfs2_log_reserve(sdp, tr->tr_reserved);
 	if (error)
 		goto fail_gunlock;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index a36bb749926d..f7fcbe49da72 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -19,6 +19,7 @@
 #include <linux/nls.h>
 #include <linux/parser.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 
 #include "hfs_fs.h"
@@ -49,11 +50,23 @@ MODULE_LICENSE("GPL");
  */
 static void hfs_write_super(struct super_block *sb)
 {
+	lock_super(sb);
 	sb->s_dirt = 0;
-	if (sb->s_flags & MS_RDONLY)
-		return;
+
 	/* sync everything to the buffers */
+	if (!(sb->s_flags & MS_RDONLY))
+		hfs_mdb_commit(sb);
+	unlock_super(sb);
+}
+
+static int hfs_sync_fs(struct super_block *sb, int wait)
+{
+	lock_super(sb);
 	hfs_mdb_commit(sb);
+	sb->s_dirt = 0;
+	unlock_super(sb);
+
+	return 0;
 }
 
 /*
@@ -65,9 +78,15 @@ static void hfs_write_super(struct super_block *sb)
  */
 static void hfs_put_super(struct super_block *sb)
 {
+	lock_kernel();
+
+	if (sb->s_dirt)
+		hfs_write_super(sb);
 	hfs_mdb_close(sb);
 	/* release the MDB's resources */
 	hfs_mdb_put(sb);
+
+	unlock_kernel();
 }
 
 /*
@@ -164,6 +183,7 @@ static const struct super_operations hfs_super_operations = {
 	.clear_inode = hfs_clear_inode,
 	.put_super = hfs_put_super,
 	.write_super = hfs_write_super,
+	.sync_fs = hfs_sync_fs,
 	.statfs = hfs_statfs,
 	.remount_fs = hfs_remount,
 	.show_options = hfs_show_options,
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index f2a64020f42e..c0759fe0855b 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -12,6 +12,7 @@
 #include <linux/pagemap.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/nls.h>
 
@@ -152,15 +153,14 @@ static void hfsplus_clear_inode(struct inode *inode)
 	}
 }
 
-static void hfsplus_write_super(struct super_block *sb)
+static int hfsplus_sync_fs(struct super_block *sb, int wait)
 {
 	struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
 
 	dprint(DBG_SUPER, "hfsplus_write_super\n");
+
+	lock_super(sb);
 	sb->s_dirt = 0;
-	if (sb->s_flags & MS_RDONLY)
-		/* warn? */
-		return;
 
 	vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
 	vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
@@ -192,6 +192,16 @@ static void hfsplus_write_super(struct super_block *sb)
 		}
 		HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
 	}
+	unlock_super(sb);
+	return 0;
+}
+
+static void hfsplus_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		hfsplus_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 static void hfsplus_put_super(struct super_block *sb)
@@ -199,6 +209,11 @@ static void hfsplus_put_super(struct super_block *sb)
 	dprint(DBG_SUPER, "hfsplus_put_super\n");
 	if (!sb->s_fs_info)
 		return;
+
+	lock_kernel();
+
+	if (sb->s_dirt)
+		hfsplus_write_super(sb);
 	if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
 		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
 
@@ -218,6 +233,8 @@ static void hfsplus_put_super(struct super_block *sb)
 	unload_nls(HFSPLUS_SB(sb).nls);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -279,6 +296,7 @@ static const struct super_operations hfsplus_sops = {
 	.clear_inode = hfsplus_clear_inode,
 	.put_super = hfsplus_put_super,
 	.write_super = hfsplus_write_super,
+	.sync_fs = hfsplus_sync_fs,
 	.statfs = hfsplus_statfs,
 	.remount_fs = hfsplus_remount,
 	.show_options = hfsplus_show_options,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index fe02ad4740e7..032604e5ef2c 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -972,6 +972,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
 	sb->s_blocksize_bits = 10;
 	sb->s_magic = HOSTFS_SUPER_MAGIC;
 	sb->s_op = &hostfs_sbops;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
 
 	/* NULL is printed as <NULL> by sprintf: avoid that. */
 	if (req_root == NULL)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 6916c41d7017..8865c94f55f6 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -6,6 +6,7 @@
  * directory VFS functions
  */
 
+#include <linux/smp_lock.h>
 #include "hpfs_fn.h"
 
 static int hpfs_dir_release(struct inode *inode, struct file *filp)
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 64ab52259204..3efabff00367 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -6,6 +6,7 @@
  * file VFS functions
  */
 
+#include <linux/smp_lock.h>
 #include "hpfs_fn.h"
 
 #define BLOCKS(size) (((size) + 511) >> 9)
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index c2ea31bae313..701ca54c0867 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -13,7 +13,6 @@
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 
 #include "hpfs.h"
 
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 39a1bfbea312..fe703ae46bc7 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -6,6 +6,7 @@
  * inode VFS functions
  */
 
+#include <linux/smp_lock.h>
 #include "hpfs_fn.h"
 
 void hpfs_init_inode(struct inode *i)
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index b649232dde97..82b9c4ba9ed0 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -6,6 +6,7 @@
  * adding & removing files & directories
  */
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include "hpfs_fn.h"
 
 static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index fc77965be841..f2feaa06bf26 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -13,6 +13,7 @@
 #include <linux/statfs.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
@@ -99,11 +100,16 @@ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2,
 static void hpfs_put_super(struct super_block *s)
 {
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
+
+	lock_kernel();
+
 	kfree(sbi->sb_cp_table);
 	kfree(sbi->sb_bmp_dir);
 	unmark_dirty(s);
 	s->s_fs_info = NULL;
 	kfree(sbi);
+
+	unlock_kernel();
 }
 
 unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
@@ -393,6 +399,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 
 	*flags |= MS_NOATIME;
 
+	lock_kernel();
+	lock_super(s);
 	uid = sbi->sb_uid; gid = sbi->sb_gid;
 	umask = 0777 & ~sbi->sb_mode;
 	lowercase = sbi->sb_lowercase; conv = sbi->sb_conv;
@@ -425,9 +433,13 @@
 
 	replace_mount_options(s, new_opts);
 
+	unlock_super(s);
+	unlock_kernel();
 	return 0;
 
 out_err:
+	unlock_super(s);
+	unlock_kernel();
 	kfree(new_opts);
 	return -EINVAL;
 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c1462d43e721..cb88dac8ccaa 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -30,6 +30,7 @@
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
 #include <linux/security.h>
+#include <linux/ima.h>
 
 #include <asm/uaccess.h>
 
@@ -934,26 +935,28 @@ static int can_do_hugetlb_shm(void)
 	return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
 }
 
-struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
+struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
+				struct user_struct **user)
 {
 	int error = -ENOMEM;
-	int unlock_shm = 0;
 	struct file *file;
 	struct inode *inode;
 	struct dentry *dentry, *root;
 	struct qstr quick_string;
-	struct user_struct *user = current_user();
 
+	*user = NULL;
 	if (!hugetlbfs_vfsmount)
 		return ERR_PTR(-ENOENT);
 
 	if (!can_do_hugetlb_shm()) {
-		if (user_shm_lock(size, user)) {
-			unlock_shm = 1;
+		*user = current_user();
+		if (user_shm_lock(size, *user)) {
 			WARN_ONCE(1,
				"Using mlock ulimits for SHM_HUGETLB deprecated\n");
-		} else
+		} else {
+			*user = NULL;
 			return ERR_PTR(-EPERM);
+		}
 	}
 
 	root = hugetlbfs_vfsmount->mnt_root;
@@ -986,6 +989,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 			&hugetlbfs_file_operations);
 	if (!file)
 		goto out_dentry; /* inode is already attached */
+	ima_counts_get(file);
 
 	return file;
 
@@ -994,8 +998,10 @@ out_inode:
 out_dentry:
 	dput(dentry);
 out_shm_unlock:
-	if (unlock_shm)
-		user_shm_unlock(size, user);
+	if (*user) {
+		user_shm_unlock(size, *user);
+		*user = NULL;
+	}
 	return ERR_PTR(error);
 }
 
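The reworked hugetlb_file_setup() hands the accounted user_struct back to the
caller, which now owns the user_shm_unlock() on teardown (on failure the
function has already dropped the accounting and reset *user). A hypothetical
caller sketch of that contract:

static int example_attach(size_t size)
{
	struct user_struct *user = NULL;
	struct file *file = hugetlb_file_setup("example", size, 0, &user);

	if (IS_ERR(file))
		return PTR_ERR(file);	/* accounting already unwound */

	/* ... use the file; the matching teardown is fput() followed by
	   user_shm_unlock() when user is non-NULL. */
	fput(file);
	if (user)
		user_shm_unlock(size, user);
	return 0;
}
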
diff --git a/fs/inode.c b/fs/inode.c
index bca0c618fdb3..ae7b67e48661 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,8 +22,10 @@
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/fsnotify.h>
 #include <linux/mount.h>
 #include <linux/async.h>
+#include <linux/posix_acl.h>
 
 /*
  * This is needed for the following functions:
@@ -118,12 +120,11 @@ static void wake_up_inode(struct inode *inode)
  * These are initializations that need to be done on every inode
 * allocation as the fields are not initialised by slab allocation.
  */
-struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
+int inode_init_always(struct super_block *sb, struct inode *inode)
 {
 	static const struct address_space_operations empty_aops;
 	static struct inode_operations empty_iops;
 	static const struct file_operations empty_fops;
-
 	struct address_space *const mapping = &inode->i_data;
 
 	inode->i_sb = sb;
@@ -150,7 +151,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->dirtied_when = 0;
 
 	if (security_inode_alloc(inode))
-		goto out_free_inode;
+		goto out;
 
 	/* allocate and initialize an i_integrity */
 	if (ima_inode_alloc(inode))
@@ -188,17 +189,20 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	}
 	inode->i_private = NULL;
 	inode->i_mapping = mapping;
+#ifdef CONFIG_FS_POSIX_ACL
+	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
+#endif
 
-	return inode;
+#ifdef CONFIG_FSNOTIFY
+	inode->i_fsnotify_mask = 0;
+#endif
+
+	return 0;
 
 out_free_security:
 	security_inode_free(inode);
-out_free_inode:
-	if (inode->i_sb->s_op->destroy_inode)
-		inode->i_sb->s_op->destroy_inode(inode);
-	else
-		kmem_cache_free(inode_cachep, (inode));
-	return NULL;
+out:
+	return -ENOMEM;
 }
 EXPORT_SYMBOL(inode_init_always);
 
@@ -211,23 +215,43 @@ static struct inode *alloc_inode(struct super_block *sb)
 	else
 		inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
 
-	if (inode)
-		return inode_init_always(sb, inode);
-	return NULL;
+	if (!inode)
+		return NULL;
+
+	if (unlikely(inode_init_always(sb, inode))) {
+		if (inode->i_sb->s_op->destroy_inode)
+			inode->i_sb->s_op->destroy_inode(inode);
+		else
+			kmem_cache_free(inode_cachep, inode);
+		return NULL;
+	}
+
+	return inode;
 }
 
-void destroy_inode(struct inode *inode)
+void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
 	ima_inode_free(inode);
 	security_inode_free(inode);
+	fsnotify_inode_delete(inode);
+#ifdef CONFIG_FS_POSIX_ACL
+	if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED)
+		posix_acl_release(inode->i_acl);
+	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
+		posix_acl_release(inode->i_default_acl);
+#endif
+}
+EXPORT_SYMBOL(__destroy_inode);
+
+void destroy_inode(struct inode *inode)
+{
+	__destroy_inode(inode);
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
 	else
 		kmem_cache_free(inode_cachep, (inode));
 }
-EXPORT_SYMBOL(destroy_inode);
-
 
 /*
  * These are initializations that only need to be done
@@ -252,6 +276,9 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->inotify_watches);
 	mutex_init(&inode->inotify_mutex);
 #endif
+#ifdef CONFIG_FSNOTIFY
+	INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries);
+#endif
 }
 EXPORT_SYMBOL(inode_init_once);
 
@@ -398,6 +425,7 @@ int invalidate_inodes(struct super_block *sb)
 	mutex_lock(&iprune_mutex);
 	spin_lock(&inode_lock);
 	inotify_unmount_inodes(&sb->s_inodes);
+	fsnotify_unmount_inodes(&sb->s_inodes);
 	busy = invalidate_list(&sb->s_inodes, &throw_away);
 	spin_unlock(&inode_lock);
403 431
@@ -655,12 +683,17 @@ void unlock_new_inode(struct inode *inode)
655 if (inode->i_mode & S_IFDIR) { 683 if (inode->i_mode & S_IFDIR) {
656 struct file_system_type *type = inode->i_sb->s_type; 684 struct file_system_type *type = inode->i_sb->s_type;
657 685
658 /* 686 /* Set new key only if filesystem hasn't already changed it */
659 * ensure nobody is actually holding i_mutex 687 if (!lockdep_match_class(&inode->i_mutex,
660 */ 688 &type->i_mutex_key)) {
661 mutex_destroy(&inode->i_mutex); 689 /*
662 mutex_init(&inode->i_mutex); 690 * ensure nobody is actually holding i_mutex
663 lockdep_set_class(&inode->i_mutex, &type->i_mutex_dir_key); 691 */
692 mutex_destroy(&inode->i_mutex);
693 mutex_init(&inode->i_mutex);
694 lockdep_set_class(&inode->i_mutex,
695 &type->i_mutex_dir_key);
696 }
664 } 697 }
665#endif 698#endif
666 /* 699 /*
@@ -1398,7 +1431,7 @@ EXPORT_SYMBOL(touch_atime);
1398 * for writeback. Note that this function is meant exclusively for 1431 * for writeback. Note that this function is meant exclusively for
1399 * usage in the file write path of filesystems, and filesystems may 1432 * usage in the file write path of filesystems, and filesystems may
1400 * choose to explicitly ignore update via this function with the 1433 * choose to explicitly ignore update via this function with the
1401 * S_NOCTIME inode flag, e.g. for network filesystem where these 1434 * S_NOCMTIME inode flag, e.g. for network filesystem where these
1402 * timestamps are handled by the server. 1435 * timestamps are handled by the server.
1403 */ 1436 */
1404 1437
@@ -1412,7 +1445,7 @@ void file_update_time(struct file *file)
1412 if (IS_NOCMTIME(inode)) 1445 if (IS_NOCMTIME(inode))
1413 return; 1446 return;
1414 1447
1415 err = mnt_want_write(file->f_path.mnt); 1448 err = mnt_want_write_file(file);
1416 if (err) 1449 if (err)
1417 return; 1450 return;
1418 1451
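[Editor's note] The fs/inode.c changes above split responsibilities: inode_init_always() now only initializes and returns an int, alloc_inode() owns cleanup when that fails, and destroy_inode() is separated into an exported __destroy_inode() (state teardown) plus the allocator-specific free. A compact userspace sketch of that init/teardown layering, with hypothetical names (obj, alloc_obj, __destroy_obj):

    #include <stdlib.h>

    struct obj { void *sec; };

    /* init-only: returns 0 or an error, never frees the object itself */
    static int obj_init_always(struct obj *o)
    {
        o->sec = malloc(16);
        return o->sec ? 0 : -1;
    }

    static struct obj *alloc_obj(void)
    {
        struct obj *o = malloc(sizeof(*o));
        if (!o)
            return NULL;
        if (obj_init_always(o)) {   /* the allocator undoes its own work */
            free(o);
            return NULL;
        }
        return o;
    }

    /* teardown common to every allocator... */
    static void __destroy_obj(struct obj *o) { free(o->sec); }

    /* ...then the default free; callers with their own allocator can call
     * __destroy_obj() and release the memory themselves */
    static void destroy_obj(struct obj *o)
    {
        __destroy_obj(o);
        free(o);
    }

    int main(void)
    {
        struct obj *o = alloc_obj();
        if (o)
            destroy_obj(o);
        return 0;
    }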
diff --git a/fs/internal.h b/fs/internal.h
index b4dac4fb6b61..d55ef562f0bb 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -25,6 +25,8 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
25 return sb == blockdev_superblock; 25 return sb == blockdev_superblock;
26} 26}
27 27
28extern int __sync_blockdev(struct block_device *bdev, int wait);
29
28#else 30#else
29static inline void bdev_cache_init(void) 31static inline void bdev_cache_init(void)
30{ 32{
@@ -34,6 +36,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
34{ 36{
35 return 0; 37 return 0;
36} 38}
39
40static inline int __sync_blockdev(struct block_device *bdev, int wait)
41{
42 return 0;
43}
37#endif 44#endif
38 45
39/* 46/*
@@ -66,3 +73,13 @@ extern void __init mnt_init(void);
66 * fs_struct.c 73 * fs_struct.c
67 */ 74 */
68extern void chroot_fs_refs(struct path *, struct path *); 75extern void chroot_fs_refs(struct path *, struct path *);
76
77/*
78 * file_table.c
79 */
80extern void mark_files_ro(struct super_block *);
81
82/*
83 * super.c
84 */
85extern int do_remount_sb(struct super_block *, int, void *, int);
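[Editor's note] The fs/internal.h hunk uses the usual kernel header idiom: a real extern declaration under CONFIG_BLOCK and a static inline no-op stub otherwise, so callers need no #ifdefs of their own. A trivial standalone illustration of the idiom; the names are demo stand-ins, not the kernel symbols:

    #include <stdio.h>

    #define HAVE_BLOCK 1                /* flip to 0 to exercise the stub */

    #if HAVE_BLOCK
    int sync_blockdev_demo(int wait) { return wait ? 1 : 0; }
    #else
    static inline int sync_blockdev_demo(int wait) { (void)wait; return 0; }
    #endif

    int main(void) { printf("%d\n", sync_blockdev_demo(1)); return 0; }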
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 82d9c42b8bac..5612880fcbe7 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -15,6 +15,7 @@
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/writeback.h> 16#include <linux/writeback.h>
17#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
18#include <linux/falloc.h>
18 19
19#include <asm/ioctls.h> 20#include <asm/ioctls.h>
20 21
@@ -70,9 +71,7 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
70 res = get_user(block, p); 71 res = get_user(block, p);
71 if (res) 72 if (res)
72 return res; 73 return res;
73 lock_kernel();
74 res = mapping->a_ops->bmap(mapping, block); 74 res = mapping->a_ops->bmap(mapping, block);
75 unlock_kernel();
76 return put_user(res, p); 75 return put_user(res, p);
77} 76}
78 77
@@ -405,6 +404,37 @@ EXPORT_SYMBOL(generic_block_fiemap);
405 404
406#endif /* CONFIG_BLOCK */ 405#endif /* CONFIG_BLOCK */
407 406
407/*
408 * This provides compatibility with legacy XFS pre-allocation ioctls
409 * which predate the fallocate syscall.
410 *
411 * Only the l_start, l_len and l_whence fields of the 'struct space_resv'
412 * are used here, rest are ignored.
413 */
414int ioctl_preallocate(struct file *filp, void __user *argp)
415{
416 struct inode *inode = filp->f_path.dentry->d_inode;
417 struct space_resv sr;
418
419 if (copy_from_user(&sr, argp, sizeof(sr)))
420 return -EFAULT;
421
422 switch (sr.l_whence) {
423 case SEEK_SET:
424 break;
425 case SEEK_CUR:
426 sr.l_start += filp->f_pos;
427 break;
428 case SEEK_END:
429 sr.l_start += i_size_read(inode);
430 break;
431 default:
432 return -EINVAL;
433 }
434
435 return do_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len);
436}
437
408static int file_ioctl(struct file *filp, unsigned int cmd, 438static int file_ioctl(struct file *filp, unsigned int cmd,
409 unsigned long arg) 439 unsigned long arg)
410{ 440{
@@ -414,12 +444,11 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
414 switch (cmd) { 444 switch (cmd) {
415 case FIBMAP: 445 case FIBMAP:
416 return ioctl_fibmap(filp, p); 446 return ioctl_fibmap(filp, p);
417 case FS_IOC_FIEMAP:
418 return ioctl_fiemap(filp, arg);
419 case FIGETBSZ:
420 return put_user(inode->i_sb->s_blocksize, p);
421 case FIONREAD: 447 case FIONREAD:
422 return put_user(i_size_read(inode) - filp->f_pos, p); 448 return put_user(i_size_read(inode) - filp->f_pos, p);
449 case FS_IOC_RESVSP:
450 case FS_IOC_RESVSP64:
451 return ioctl_preallocate(filp, p);
423 } 452 }
424 453
425 return vfs_ioctl(filp, cmd, arg); 454 return vfs_ioctl(filp, cmd, arg);
@@ -557,6 +586,16 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
557 error = ioctl_fsthaw(filp); 586 error = ioctl_fsthaw(filp);
558 break; 587 break;
559 588
589 case FS_IOC_FIEMAP:
590 return ioctl_fiemap(filp, arg);
591
592 case FIGETBSZ:
593 {
594 struct inode *inode = filp->f_path.dentry->d_inode;
595 int __user *p = (int __user *)arg;
596 return put_user(inode->i_sb->s_blocksize, p);
597 }
598
560 default: 599 default:
561 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) 600 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
562 error = file_ioctl(filp, cmd, arg); 601 error = file_ioctl(filp, cmd, arg);
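[Editor's note] ioctl_preallocate() above routes the legacy XFS FS_IOC_RESVSP/FS_IOC_RESVSP64 ioctls to do_fallocate() with FALLOC_FL_KEEP_SIZE. A hedged userspace sketch of invoking it on a kernel carrying this patch; the struct layout and ioctl number are mirrored by hand from the patch's <linux/falloc.h>, since installed uapi headers may not expose them — verify against your own headers before relying on this:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    /* mirrored from the patch's <linux/falloc.h> */
    struct space_resv {
        int16_t  l_type;
        int16_t  l_whence;
        int64_t  l_start;
        int64_t  l_len;                 /* reservation length in bytes */
        int32_t  l_sysid;
        uint32_t l_pid;
        int32_t  l_pad[4];
    };
    #define FS_IOC_RESVSP _IOW('X', 40, struct space_resv)

    int main(void)
    {
        int fd = open("testfile", O_RDWR | O_CREAT, 0644);
        struct space_resv sr = { .l_whence = SEEK_SET, .l_start = 0,
                                 .l_len = 1 << 20 };
        /* fails with EOPNOTSUPP if the filesystem lacks ->fallocate */
        if (fd < 0 || ioctl(fd, FS_IOC_RESVSP, &sr) < 0)
            perror("FS_IOC_RESVSP");
        if (fd >= 0)
            close(fd);
        return 0;
    }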
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 2f0dc5a14633..8ba5441063be 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -195,9 +195,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
195 * Do not report hidden files if so instructed, or associated 195 * Do not report hidden files if so instructed, or associated
196 * files unless instructed to do so 196 * files unless instructed to do so
197 */ 197 */
198 if ((sbi->s_hide == 'y' && 198 if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) ||
199 (de->flags[-sbi->s_high_sierra] & 1)) || 199 (!sbi->s_showassoc &&
200 (sbi->s_showassoc =='n' &&
201 (de->flags[-sbi->s_high_sierra] & 4))) { 200 (de->flags[-sbi->s_high_sierra] & 4))) {
202 filp->f_pos += de_len; 201 filp->f_pos += de_len;
203 continue; 202 continue;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index b4cbe9603c7d..85f96bc651c7 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -42,11 +42,16 @@ static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qst
42static void isofs_put_super(struct super_block *sb) 42static void isofs_put_super(struct super_block *sb)
43{ 43{
44 struct isofs_sb_info *sbi = ISOFS_SB(sb); 44 struct isofs_sb_info *sbi = ISOFS_SB(sb);
45
45#ifdef CONFIG_JOLIET 46#ifdef CONFIG_JOLIET
47 lock_kernel();
48
46 if (sbi->s_nls_iocharset) { 49 if (sbi->s_nls_iocharset) {
47 unload_nls(sbi->s_nls_iocharset); 50 unload_nls(sbi->s_nls_iocharset);
48 sbi->s_nls_iocharset = NULL; 51 sbi->s_nls_iocharset = NULL;
49 } 52 }
53
54 unlock_kernel();
50#endif 55#endif
51 56
52 kfree(sbi); 57 kfree(sbi);
@@ -136,13 +141,17 @@ static const struct dentry_operations isofs_dentry_ops[] = {
136}; 141};
137 142
138struct iso9660_options{ 143struct iso9660_options{
139 char map; 144 unsigned int rock:1;
140 char rock; 145 unsigned int joliet:1;
141 char joliet; 146 unsigned int cruft:1;
142 char cruft; 147 unsigned int hide:1;
143 char hide; 148 unsigned int showassoc:1;
144 char showassoc; 149 unsigned int nocompress:1;
145 char nocompress; 150 unsigned int overriderockperm:1;
151 unsigned int uid_set:1;
152 unsigned int gid_set:1;
153 unsigned int utf8:1;
154 unsigned char map;
146 unsigned char check; 155 unsigned char check;
147 unsigned int blocksize; 156 unsigned int blocksize;
148 mode_t fmode; 157 mode_t fmode;
@@ -150,7 +159,6 @@ struct iso9660_options{
150 gid_t gid; 159 gid_t gid;
151 uid_t uid; 160 uid_t uid;
152 char *iocharset; 161 char *iocharset;
153 unsigned char utf8;
154 /* LVE */ 162 /* LVE */
155 s32 session; 163 s32 session;
156 s32 sbsector; 164 s32 sbsector;
@@ -307,7 +315,7 @@ enum {
307 Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore, 315 Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore,
308 Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet, 316 Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet,
309 Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err, 317 Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err,
310 Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, 318 Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm,
311}; 319};
312 320
313static const match_table_t tokens = { 321static const match_table_t tokens = {
@@ -335,6 +343,7 @@ static const match_table_t tokens = {
335 {Opt_gid, "gid=%u"}, 343 {Opt_gid, "gid=%u"},
336 {Opt_mode, "mode=%u"}, 344 {Opt_mode, "mode=%u"},
337 {Opt_dmode, "dmode=%u"}, 345 {Opt_dmode, "dmode=%u"},
346 {Opt_overriderockperm, "overriderockperm"},
338 {Opt_block, "block=%u"}, 347 {Opt_block, "block=%u"},
339 {Opt_ignore, "conv=binary"}, 348 {Opt_ignore, "conv=binary"},
340 {Opt_ignore, "conv=b"}, 349 {Opt_ignore, "conv=b"},
@@ -354,24 +363,22 @@ static int parse_options(char *options, struct iso9660_options *popt)
354 int option; 363 int option;
355 364
356 popt->map = 'n'; 365 popt->map = 'n';
357 popt->rock = 'y'; 366 popt->rock = 1;
358 popt->joliet = 'y'; 367 popt->joliet = 1;
359 popt->cruft = 'n'; 368 popt->cruft = 0;
360 popt->hide = 'n'; 369 popt->hide = 0;
361 popt->showassoc = 'n'; 370 popt->showassoc = 0;
362 popt->check = 'u'; /* unset */ 371 popt->check = 'u'; /* unset */
363 popt->nocompress = 0; 372 popt->nocompress = 0;
364 popt->blocksize = 1024; 373 popt->blocksize = 1024;
365 popt->fmode = popt->dmode = S_IRUGO | S_IXUGO; /* 374 popt->fmode = popt->dmode = ISOFS_INVALID_MODE;
366 * r-x for all. The disc could 375 popt->uid_set = 0;
367 * be shared with DOS machines so 376 popt->gid_set = 0;
368 * virtually anything could be
369 * a valid executable.
370 */
371 popt->gid = 0; 377 popt->gid = 0;
372 popt->uid = 0; 378 popt->uid = 0;
373 popt->iocharset = NULL; 379 popt->iocharset = NULL;
374 popt->utf8 = 0; 380 popt->utf8 = 0;
381 popt->overriderockperm = 0;
375 popt->session=-1; 382 popt->session=-1;
376 popt->sbsector=-1; 383 popt->sbsector=-1;
377 if (!options) 384 if (!options)
@@ -388,20 +395,20 @@ static int parse_options(char *options, struct iso9660_options *popt)
388 token = match_token(p, tokens, args); 395 token = match_token(p, tokens, args);
389 switch (token) { 396 switch (token) {
390 case Opt_norock: 397 case Opt_norock:
391 popt->rock = 'n'; 398 popt->rock = 0;
392 break; 399 break;
393 case Opt_nojoliet: 400 case Opt_nojoliet:
394 popt->joliet = 'n'; 401 popt->joliet = 0;
395 break; 402 break;
396 case Opt_hide: 403 case Opt_hide:
397 popt->hide = 'y'; 404 popt->hide = 1;
398 break; 405 break;
399 case Opt_unhide: 406 case Opt_unhide:
400 case Opt_showassoc: 407 case Opt_showassoc:
401 popt->showassoc = 'y'; 408 popt->showassoc = 1;
402 break; 409 break;
403 case Opt_cruft: 410 case Opt_cruft:
404 popt->cruft = 'y'; 411 popt->cruft = 1;
405 break; 412 break;
406 case Opt_utf8: 413 case Opt_utf8:
407 popt->utf8 = 1; 414 popt->utf8 = 1;
@@ -445,11 +452,13 @@ static int parse_options(char *options, struct iso9660_options *popt)
445 if (match_int(&args[0], &option)) 452 if (match_int(&args[0], &option))
446 return 0; 453 return 0;
447 popt->uid = option; 454 popt->uid = option;
455 popt->uid_set = 1;
448 break; 456 break;
449 case Opt_gid: 457 case Opt_gid:
450 if (match_int(&args[0], &option)) 458 if (match_int(&args[0], &option))
451 return 0; 459 return 0;
452 popt->gid = option; 460 popt->gid = option;
461 popt->gid_set = 1;
453 break; 462 break;
454 case Opt_mode: 463 case Opt_mode:
455 if (match_int(&args[0], &option)) 464 if (match_int(&args[0], &option))
@@ -461,6 +470,9 @@ static int parse_options(char *options, struct iso9660_options *popt)
461 return 0; 470 return 0;
462 popt->dmode = option; 471 popt->dmode = option;
463 break; 472 break;
473 case Opt_overriderockperm:
474 popt->overriderockperm = 1;
475 break;
464 case Opt_block: 476 case Opt_block:
465 if (match_int(&args[0], &option)) 477 if (match_int(&args[0], &option))
466 return 0; 478 return 0;
@@ -620,7 +632,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
620 else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) { 632 else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) {
621 sec = (struct iso_supplementary_descriptor *)vdp; 633 sec = (struct iso_supplementary_descriptor *)vdp;
622 if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) { 634 if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) {
623 if (opt.joliet == 'y') { 635 if (opt.joliet) {
624 if (sec->escape[2] == 0x40) 636 if (sec->escape[2] == 0x40)
625 joliet_level = 1; 637 joliet_level = 1;
626 else if (sec->escape[2] == 0x43) 638 else if (sec->escape[2] == 0x43)
@@ -645,7 +657,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
645 goto out_freebh; 657 goto out_freebh;
646 658
647 sbi->s_high_sierra = 1; 659 sbi->s_high_sierra = 1;
648 opt.rock = 'n'; 660 opt.rock = 0;
649 h_pri = (struct hs_primary_descriptor *)vdp; 661 h_pri = (struct hs_primary_descriptor *)vdp;
650 goto root_found; 662 goto root_found;
651 } 663 }
@@ -668,7 +680,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
668 680
669root_found: 681root_found:
670 682
671 if (joliet_level && (pri == NULL || opt.rock == 'n')) { 683 if (joliet_level && (pri == NULL || !opt.rock)) {
672 /* This is the case of Joliet with the norock mount flag. 684 /* This is the case of Joliet with the norock mount flag.
673 * A disc with both Joliet and Rock Ridge is handled later 685 * A disc with both Joliet and Rock Ridge is handled later
674 */ 686 */
@@ -797,22 +809,31 @@ root_found:
797 s->s_op = &isofs_sops; 809 s->s_op = &isofs_sops;
798 s->s_export_op = &isofs_export_ops; 810 s->s_export_op = &isofs_export_ops;
799 sbi->s_mapping = opt.map; 811 sbi->s_mapping = opt.map;
800 sbi->s_rock = (opt.rock == 'y' ? 2 : 0); 812 sbi->s_rock = (opt.rock ? 2 : 0);
801 sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/ 813 sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/
802 sbi->s_cruft = opt.cruft; 814 sbi->s_cruft = opt.cruft;
803 sbi->s_hide = opt.hide; 815 sbi->s_hide = opt.hide;
804 sbi->s_showassoc = opt.showassoc; 816 sbi->s_showassoc = opt.showassoc;
805 sbi->s_uid = opt.uid; 817 sbi->s_uid = opt.uid;
806 sbi->s_gid = opt.gid; 818 sbi->s_gid = opt.gid;
819 sbi->s_uid_set = opt.uid_set;
820 sbi->s_gid_set = opt.gid_set;
807 sbi->s_utf8 = opt.utf8; 821 sbi->s_utf8 = opt.utf8;
808 sbi->s_nocompress = opt.nocompress; 822 sbi->s_nocompress = opt.nocompress;
823 sbi->s_overriderockperm = opt.overriderockperm;
809 /* 824 /*
810 * It would be incredibly stupid to allow people to mark every file 825 * It would be incredibly stupid to allow people to mark every file
811 * on the disk as suid, so we merely allow them to set the default 826 * on the disk as suid, so we merely allow them to set the default
812 * permissions. 827 * permissions.
813 */ 828 */
814 sbi->s_fmode = opt.fmode & 0777; 829 if (opt.fmode != ISOFS_INVALID_MODE)
815 sbi->s_dmode = opt.dmode & 0777; 830 sbi->s_fmode = opt.fmode & 0777;
831 else
832 sbi->s_fmode = ISOFS_INVALID_MODE;
833 if (opt.dmode != ISOFS_INVALID_MODE)
834 sbi->s_dmode = opt.dmode & 0777;
835 else
836 sbi->s_dmode = ISOFS_INVALID_MODE;
816 837
817 /* 838 /*
818 * Read the root inode, which _may_ result in changing 839 * Read the root inode, which _may_ result in changing
@@ -1090,18 +1111,6 @@ static const struct address_space_operations isofs_aops = {
1090 .bmap = _isofs_bmap 1111 .bmap = _isofs_bmap
1091}; 1112};
1092 1113
1093static inline void test_and_set_uid(uid_t *p, uid_t value)
1094{
1095 if (value)
1096 *p = value;
1097}
1098
1099static inline void test_and_set_gid(gid_t *p, gid_t value)
1100{
1101 if (value)
1102 *p = value;
1103}
1104
1105static int isofs_read_level3_size(struct inode *inode) 1114static int isofs_read_level3_size(struct inode *inode)
1106{ 1115{
1107 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 1116 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
@@ -1256,7 +1265,10 @@ static int isofs_read_inode(struct inode *inode)
1256 ei->i_file_format = isofs_file_normal; 1265 ei->i_file_format = isofs_file_normal;
1257 1266
1258 if (de->flags[-high_sierra] & 2) { 1267 if (de->flags[-high_sierra] & 2) {
1259 inode->i_mode = sbi->s_dmode | S_IFDIR; 1268 if (sbi->s_dmode != ISOFS_INVALID_MODE)
1269 inode->i_mode = S_IFDIR | sbi->s_dmode;
1270 else
1271 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
1260 inode->i_nlink = 1; /* 1272 inode->i_nlink = 1; /*
1261 * Set to 1. We know there are 2, but 1273 * Set to 1. We know there are 2, but
1262 * the find utility tries to optimize 1274 * the find utility tries to optimize
@@ -1265,8 +1277,16 @@ static int isofs_read_inode(struct inode *inode)
1265 * do it the hard way. 1277 * do it the hard way.
1266 */ 1278 */
1267 } else { 1279 } else {
1268 /* Everybody gets to read the file. */ 1280 if (sbi->s_fmode != ISOFS_INVALID_MODE) {
1269 inode->i_mode = sbi->s_fmode | S_IFREG; 1281 inode->i_mode = S_IFREG | sbi->s_fmode;
1282 } else {
1283 /*
1284 * Set default permissions: r-x for all. The disc
1285 * could be shared with DOS machines so virtually
1286 * anything could be a valid executable.
1287 */
1288 inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO;
1289 }
1270 inode->i_nlink = 1; 1290 inode->i_nlink = 1;
1271 } 1291 }
1272 inode->i_uid = sbi->s_uid; 1292 inode->i_uid = sbi->s_uid;
@@ -1295,7 +1315,7 @@ static int isofs_read_inode(struct inode *inode)
1295 * this CDROM was mounted with the cruft option. 1315 * this CDROM was mounted with the cruft option.
1296 */ 1316 */
1297 1317
1298 if (sbi->s_cruft == 'y') 1318 if (sbi->s_cruft)
1299 inode->i_size &= 0x00ffffff; 1319 inode->i_size &= 0x00ffffff;
1300 1320
1301 if (de->interleave[0]) { 1321 if (de->interleave[0]) {
@@ -1341,9 +1361,18 @@ static int isofs_read_inode(struct inode *inode)
1341 if (!high_sierra) { 1361 if (!high_sierra) {
1342 parse_rock_ridge_inode(de, inode); 1362 parse_rock_ridge_inode(de, inode);
1343 /* if we want uid/gid set, override the rock ridge setting */ 1363 /* if we want uid/gid set, override the rock ridge setting */
1344 test_and_set_uid(&inode->i_uid, sbi->s_uid); 1364 if (sbi->s_uid_set)
1345 test_and_set_gid(&inode->i_gid, sbi->s_gid); 1365 inode->i_uid = sbi->s_uid;
1366 if (sbi->s_gid_set)
1367 inode->i_gid = sbi->s_gid;
1346 } 1368 }
1369 /* Now set final access rights if overriding rock ridge setting */
1370 if (S_ISDIR(inode->i_mode) && sbi->s_overriderockperm &&
1371 sbi->s_dmode != ISOFS_INVALID_MODE)
1372 inode->i_mode = S_IFDIR | sbi->s_dmode;
1373 if (S_ISREG(inode->i_mode) && sbi->s_overriderockperm &&
1374 sbi->s_fmode != ISOFS_INVALID_MODE)
1375 inode->i_mode = S_IFREG | sbi->s_fmode;
1347 1376
1348 /* Install the inode operations vector */ 1377 /* Install the inode operations vector */
1349 if (S_ISREG(inode->i_mode)) { 1378 if (S_ISREG(inode->i_mode)) {
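[Editor's note] The fmode=/dmode= handling above now uses the ISOFS_INVALID_MODE sentinel to distinguish "option not given" from "option explicitly given as 0", rather than unconditionally defaulting every file to r-x. A small standalone model of the sentinel logic (0555 stands in for S_IRUGO|S_IXUGO):

    #include <stdio.h>
    #include <sys/types.h>

    #define INVALID_MODE ((mode_t)-1)   /* stand-in for ISOFS_INVALID_MODE */

    /* If the user passed fmode=, honor it (masked to 0777); otherwise fall
     * back to r-x for all, the safe default for discs shared with DOS. */
    static mode_t effective_fmode(mode_t opt)
    {
        return (opt != INVALID_MODE) ? (opt & 0777) : (mode_t)0555;
    }

    int main(void)
    {
        printf("%o %o %o\n",
               (unsigned)effective_fmode(INVALID_MODE), /* 555: unset */
               (unsigned)effective_fmode(0640),         /* 640: explicit */
               (unsigned)effective_fmode(0));           /* 0: explicit zero */
        return 0;
    }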
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index ccbf72faf27a..7d33de84f52a 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -35,21 +35,20 @@ struct isofs_sb_info {
35 unsigned long s_log_zone_size; 35 unsigned long s_log_zone_size;
36 unsigned long s_max_size; 36 unsigned long s_max_size;
37 37
38 unsigned char s_high_sierra; /* A simple flag */
39 unsigned char s_mapping;
40 int s_rock_offset; /* offset of SUSP fields within SU area */ 38 int s_rock_offset; /* offset of SUSP fields within SU area */
41 unsigned char s_rock;
42 unsigned char s_joliet_level; 39 unsigned char s_joliet_level;
43 unsigned char s_utf8; 40 unsigned char s_mapping;
44 unsigned char s_cruft; /* Broken disks with high 41 unsigned int s_high_sierra:1;
45 byte of length containing 42 unsigned int s_rock:2;
46 junk */ 43 unsigned int s_utf8:1;
47 unsigned char s_unhide; 44 unsigned int s_cruft:1; /* Broken disks with high byte of length
48 unsigned char s_nosuid; 45 * containing junk */
49 unsigned char s_nodev; 46 unsigned int s_nocompress:1;
50 unsigned char s_nocompress; 47 unsigned int s_hide:1;
51 unsigned char s_hide; 48 unsigned int s_showassoc:1;
52 unsigned char s_showassoc; 49 unsigned int s_overriderockperm:1;
50 unsigned int s_uid_set:1;
51 unsigned int s_gid_set:1;
53 52
54 mode_t s_fmode; 53 mode_t s_fmode;
55 mode_t s_dmode; 54 mode_t s_dmode;
@@ -58,6 +57,8 @@ struct isofs_sb_info {
58 struct nls_table *s_nls_iocharset; /* Native language support table */ 57 struct nls_table *s_nls_iocharset; /* Native language support table */
59}; 58};
60 59
60#define ISOFS_INVALID_MODE ((mode_t) -1)
61
61static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb) 62static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb)
62{ 63{
63 return sb->s_fs_info; 64 return sb->s_fs_info;
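[Editor's note] Both iso9660_options and isofs_sb_info above trade one unsigned char per boolean flag for single-bit bitfields, which also makes the flags honest booleans instead of 'y'/'n' characters. A quick demonstration of the packing effect; exact sizes are ABI-dependent:

    #include <stdio.h>

    struct flags_chars {                /* old style: one byte per flag */
        unsigned char rock, joliet, cruft, hide, showassoc, nocompress;
    };

    struct flags_bits {                 /* new style: one bit per flag */
        unsigned int rock:1, joliet:1, cruft:1,
                     hide:1, showassoc:1, nocompress:1;
    };

    int main(void)
    {
        /* typically 6 vs 4 bytes on common ABIs */
        printf("%zu %zu\n", sizeof(struct flags_chars),
               sizeof(struct flags_bits));
        return 0;
    }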
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index 92c14b850e9c..a048de81c093 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -37,37 +37,6 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls)
37 return (op - ascii); 37 return (op - ascii);
38} 38}
39 39
40/* Convert big endian wide character string to utf8 */
41static int
42wcsntombs_be(__u8 *s, const __u8 *pwcs, int inlen, int maxlen)
43{
44 const __u8 *ip;
45 __u8 *op;
46 int size;
47 __u16 c;
48
49 op = s;
50 ip = pwcs;
51 while ((*ip || ip[1]) && (maxlen > 0) && (inlen > 0)) {
52 c = (*ip << 8) | ip[1];
53 if (c > 0x7f) {
54 size = utf8_wctomb(op, c, maxlen);
55 if (size == -1) {
56 /* Ignore character and move on */
57 maxlen--;
58 } else {
59 op += size;
60 maxlen -= size;
61 }
62 } else {
63 *op++ = (__u8) c;
64 }
65 ip += 2;
66 inlen--;
67 }
68 return (op - s);
69}
70
71int 40int
72get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode) 41get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode)
73{ 42{
@@ -79,8 +48,9 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st
79 nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset; 48 nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset;
80 49
81 if (utf8) { 50 if (utf8) {
82 len = wcsntombs_be(outname, de->name, 51 len = utf16s_to_utf8s((const wchar_t *) de->name,
83 de->name_len[0] >> 1, PAGE_SIZE); 52 de->name_len[0] >> 1, UTF16_BIG_ENDIAN,
53 outname, PAGE_SIZE);
84 } else { 54 } else {
85 len = uni16_to_x8(outname, (__be16 *) de->name, 55 len = uni16_to_x8(outname, (__be16 *) de->name,
86 de->name_len[0] >> 1, nls); 56 de->name_len[0] >> 1, nls);
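[Editor's note] The hand-rolled wcsntombs_be() is dropped above in favor of the shared NLS helper utf16s_to_utf8s(), which, unlike the removed loop, can also combine surrogate pairs. For reference, a self-contained userspace converter doing roughly what the removed helper did — BMP-only UTF-16BE to UTF-8; this is an illustrative reimplementation, not kernel code:

    #include <stdint.h>
    #include <stdio.h>

    /* BMP-only UTF-16BE -> UTF-8; surrogate pairs are not combined,
     * mirroring the limitation of the removed wcsntombs_be() */
    static int utf16be_to_utf8(uint8_t *out, const uint8_t *in,
                               int inlen, int maxlen)
    {
        uint8_t *op = out;
        while (inlen-- > 0 && maxlen > 0) {
            uint16_t c = (uint16_t)((in[0] << 8) | in[1]);
            in += 2;
            if (c < 0x80) {
                *op++ = (uint8_t)c;                 maxlen -= 1;
            } else if (c < 0x800 && maxlen >= 2) {
                *op++ = (uint8_t)(0xc0 | (c >> 6));
                *op++ = (uint8_t)(0x80 | (c & 0x3f)); maxlen -= 2;
            } else if (maxlen >= 3) {
                *op++ = (uint8_t)(0xe0 | (c >> 12));
                *op++ = (uint8_t)(0x80 | ((c >> 6) & 0x3f));
                *op++ = (uint8_t)(0x80 | (c & 0x3f)); maxlen -= 3;
            } else {
                break;                              /* out of room */
            }
        }
        return (int)(op - out);
    }

    int main(void)
    {
        const uint8_t name[] = { 0x00, 'A', 0x00, 0xe9 }; /* "Aé" in UTF-16BE */
        uint8_t buf[16];
        int n = utf16be_to_utf8(buf, name, 2, sizeof(buf));
        fwrite(buf, 1, (size_t)n, stdout);
        putchar('\n');
        return 0;
    }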
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 8299889a835e..eaa831311c9c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -142,9 +142,9 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
142 */ 142 */
143 match = 0; 143 match = 0;
144 if (dlen > 0 && 144 if (dlen > 0 &&
145 (sbi->s_hide =='n' || 145 (!sbi->s_hide ||
146 (!(de->flags[-sbi->s_high_sierra] & 1))) && 146 (!(de->flags[-sbi->s_high_sierra] & 1))) &&
147 (sbi->s_showassoc =='y' || 147 (sbi->s_showassoc ||
148 (!(de->flags[-sbi->s_high_sierra] & 4)))) { 148 (!(de->flags[-sbi->s_high_sierra] & 4)))) {
149 match = (isofs_cmp(dentry, dpnt, dlen) == 0); 149 match = (isofs_cmp(dentry, dpnt, dlen) == 0);
150 } 150 }
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 737f7246a4b5..f96f85092d1c 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -287,6 +287,7 @@ int journal_write_metadata_buffer(transaction_t *transaction,
287 struct page *new_page; 287 struct page *new_page;
288 unsigned int new_offset; 288 unsigned int new_offset;
289 struct buffer_head *bh_in = jh2bh(jh_in); 289 struct buffer_head *bh_in = jh2bh(jh_in);
290 journal_t *journal = transaction->t_journal;
290 291
291 /* 292 /*
292 * The buffer really shouldn't be locked: only the current committing 293 * The buffer really shouldn't be locked: only the current committing
@@ -300,6 +301,11 @@ int journal_write_metadata_buffer(transaction_t *transaction,
300 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); 301 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
301 302
302 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); 303 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
304 /* keep subsequent assertions sane */
305 new_bh->b_state = 0;
306 init_buffer(new_bh, NULL, NULL);
307 atomic_set(&new_bh->b_count, 1);
308 new_jh = journal_add_journal_head(new_bh); /* This sleeps */
303 309
304 /* 310 /*
305 * If a new transaction has already done a buffer copy-out, then 311 * If a new transaction has already done a buffer copy-out, then
@@ -361,14 +367,6 @@ repeat:
361 kunmap_atomic(mapped_data, KM_USER0); 367 kunmap_atomic(mapped_data, KM_USER0);
362 } 368 }
363 369
364 /* keep subsequent assertions sane */
365 new_bh->b_state = 0;
366 init_buffer(new_bh, NULL, NULL);
367 atomic_set(&new_bh->b_count, 1);
368 jbd_unlock_bh_state(bh_in);
369
370 new_jh = journal_add_journal_head(new_bh); /* This sleeps */
371
372 set_bh_page(new_bh, new_page, new_offset); 370 set_bh_page(new_bh, new_page, new_offset);
373 new_jh->b_transaction = NULL; 371 new_jh->b_transaction = NULL;
374 new_bh->b_size = jh2bh(jh_in)->b_size; 372 new_bh->b_size = jh2bh(jh_in)->b_size;
@@ -385,7 +383,11 @@ repeat:
385 * copying is moved to the transaction's shadow queue. 383 * copying is moved to the transaction's shadow queue.
386 */ 384 */
387 JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); 385 JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
388 journal_file_buffer(jh_in, transaction, BJ_Shadow); 386 spin_lock(&journal->j_list_lock);
387 __journal_file_buffer(jh_in, transaction, BJ_Shadow);
388 spin_unlock(&journal->j_list_lock);
389 jbd_unlock_bh_state(bh_in);
390
389 JBUFFER_TRACE(new_jh, "file as BJ_IO"); 391 JBUFFER_TRACE(new_jh, "file as BJ_IO");
390 journal_file_buffer(new_jh, transaction, BJ_IO); 392 journal_file_buffer(new_jh, transaction, BJ_IO);
391 393
@@ -848,6 +850,12 @@ static int journal_reset(journal_t *journal)
848 850
849 first = be32_to_cpu(sb->s_first); 851 first = be32_to_cpu(sb->s_first);
850 last = be32_to_cpu(sb->s_maxlen); 852 last = be32_to_cpu(sb->s_maxlen);
853 if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
854 printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n",
855 first, last);
856 journal_fail_superblock(journal);
857 return -EINVAL;
858 }
851 859
852 journal->j_first = first; 860 journal->j_first = first;
853 journal->j_last = last; 861 journal->j_last = last;
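[Editor's note] journal_reset() above now rejects a journal whose on-disk geometry leaves fewer than JFS_MIN_JOURNAL_BLOCKS usable blocks instead of misbehaving later on a corrupt superblock. The usable block count is last - first + 1, so first + JFS_MIN_JOURNAL_BLOCKS > last + 1 is exactly "usable < minimum", written so it also behaves sanely in unsigned arithmetic when corruption makes first > last. A tiny standalone version of the check, assuming the constant's usual value of 1024:

    #include <stdio.h>

    #define MIN_JOURNAL_BLOCKS 1024UL   /* JFS_MIN_JOURNAL_BLOCKS in jbd */

    static int journal_geometry_ok(unsigned long first, unsigned long last)
    {
        /* usable = last - first + 1; compare without the subtraction so a
         * corrupt sb with first > last can't wrap the unsigned math */
        return !(first + MIN_JOURNAL_BLOCKS > last + 1);
    }

    int main(void)
    {
        printf("%d %d\n",
               journal_geometry_ok(1, 1024),    /* 1024 blocks: ok */
               journal_geometry_ok(1, 1023));   /* 1023 blocks: too short */
        return 0;
    }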
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index ed886e6db399..c03ac11f74be 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -489,34 +489,15 @@ void journal_unlock_updates (journal_t *journal)
489 wake_up(&journal->j_wait_transaction_locked); 489 wake_up(&journal->j_wait_transaction_locked);
490} 490}
491 491
492/* 492static void warn_dirty_buffer(struct buffer_head *bh)
493 * Report any unexpected dirty buffers which turn up. Normally those
494 * indicate an error, but they can occur if the user is running (say)
495 * tune2fs to modify the live filesystem, so we need the option of
496 * continuing as gracefully as possible. #
497 *
498 * The caller should already hold the journal lock and
499 * j_list_lock spinlock: most callers will need those anyway
500 * in order to probe the buffer's journaling state safely.
501 */
502static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
503{ 493{
504 int jlist; 494 char b[BDEVNAME_SIZE];
505
506 /* If this buffer is one which might reasonably be dirty
507 * --- ie. data, or not part of this journal --- then
508 * we're OK to leave it alone, but otherwise we need to
509 * move the dirty bit to the journal's own internal
510 * JBDDirty bit. */
511 jlist = jh->b_jlist;
512
513 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
514 jlist == BJ_Shadow || jlist == BJ_Forget) {
515 struct buffer_head *bh = jh2bh(jh);
516 495
517 if (test_clear_buffer_dirty(bh)) 496 printk(KERN_WARNING
518 set_buffer_jbddirty(bh); 497 "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
519 } 498 "There's a risk of filesystem corruption in case of system "
499 "crash.\n",
500 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
520} 501}
521 502
522/* 503/*
@@ -583,14 +564,16 @@ repeat:
583 if (jh->b_next_transaction) 564 if (jh->b_next_transaction)
584 J_ASSERT_JH(jh, jh->b_next_transaction == 565 J_ASSERT_JH(jh, jh->b_next_transaction ==
585 transaction); 566 transaction);
567 warn_dirty_buffer(bh);
586 } 568 }
587 /* 569 /*
588 * In any case we need to clean the dirty flag and we must 570 * In any case we need to clean the dirty flag and we must
589 * do it under the buffer lock to be sure we don't race 571 * do it under the buffer lock to be sure we don't race
590 * with running write-out. 572 * with running write-out.
591 */ 573 */
592 JBUFFER_TRACE(jh, "Unexpected dirty buffer"); 574 JBUFFER_TRACE(jh, "Journalling dirty buffer");
593 jbd_unexpected_dirty_buffer(jh); 575 clear_buffer_dirty(bh);
576 set_buffer_jbddirty(bh);
594 } 577 }
595 578
596 unlock_buffer(bh); 579 unlock_buffer(bh);
@@ -826,6 +809,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
826 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); 809 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
827 810
828 if (jh->b_transaction == NULL) { 811 if (jh->b_transaction == NULL) {
812 /*
813 * Previous journal_forget() could have left the buffer
814 * with jbddirty bit set because it was being committed. When
815 * the commit finished, we've filed the buffer for
816 * checkpointing and marked it dirty. Now we are reallocating
817 * the buffer so the transaction freeing it must have
818 * committed and so it's safe to clear the dirty bit.
819 */
820 clear_buffer_dirty(jh2bh(jh));
829 jh->b_transaction = transaction; 821 jh->b_transaction = transaction;
830 822
831 /* first access by this transaction */ 823 /* first access by this transaction */
@@ -1686,35 +1678,6 @@ out:
1686 return; 1678 return;
1687} 1679}
1688 1680
1689/*
1690 * journal_try_to_free_buffers() could race with journal_commit_transaction()
1691 * The latter might still hold the a count on buffers when inspecting
1692 * them on t_syncdata_list or t_locked_list.
1693 *
1694 * journal_try_to_free_buffers() will call this function to
1695 * wait for the current transaction to finish syncing data buffers, before
1696 * tryinf to free that buffer.
1697 *
1698 * Called with journal->j_state_lock held.
1699 */
1700static void journal_wait_for_transaction_sync_data(journal_t *journal)
1701{
1702 transaction_t *transaction = NULL;
1703 tid_t tid;
1704
1705 spin_lock(&journal->j_state_lock);
1706 transaction = journal->j_committing_transaction;
1707
1708 if (!transaction) {
1709 spin_unlock(&journal->j_state_lock);
1710 return;
1711 }
1712
1713 tid = transaction->t_tid;
1714 spin_unlock(&journal->j_state_lock);
1715 log_wait_commit(journal, tid);
1716}
1717
1718/** 1681/**
1719 * int journal_try_to_free_buffers() - try to free page buffers. 1682 * int journal_try_to_free_buffers() - try to free page buffers.
1720 * @journal: journal for operation 1683 * @journal: journal for operation
@@ -1786,25 +1749,6 @@ int journal_try_to_free_buffers(journal_t *journal,
1786 1749
1787 ret = try_to_free_buffers(page); 1750 ret = try_to_free_buffers(page);
1788 1751
1789 /*
1790 * There are a number of places where journal_try_to_free_buffers()
1791 * could race with journal_commit_transaction(), the later still
1792 * holds the reference to the buffers to free while processing them.
1793 * try_to_free_buffers() failed to free those buffers. Some of the
1794 * caller of releasepage() request page buffers to be dropped, otherwise
1795 * treat the fail-to-free as errors (such as generic_file_direct_IO())
1796 *
1797 * So, if the caller of try_to_release_page() wants the synchronous
1798 * behaviour(i.e make sure buffers are dropped upon return),
1799 * let's wait for the current transaction to finish flush of
1800 * dirty data buffers, then try to free those buffers again,
1801 * with the journal locked.
1802 */
1803 if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
1804 journal_wait_for_transaction_sync_data(journal);
1805 ret = try_to_free_buffers(page);
1806 }
1807
1808busy: 1752busy:
1809 return ret; 1753 return ret;
1810} 1754}
@@ -1830,8 +1774,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1830 1774
1831 if (jh->b_cp_transaction) { 1775 if (jh->b_cp_transaction) {
1832 JBUFFER_TRACE(jh, "on running+cp transaction"); 1776 JBUFFER_TRACE(jh, "on running+cp transaction");
1777 /*
1778 * We don't want to write the buffer anymore, clear the
1779 * bit so that we don't confuse checks in
1780 * __journal_file_buffer
1781 */
1782 clear_buffer_dirty(bh);
1833 __journal_file_buffer(jh, transaction, BJ_Forget); 1783 __journal_file_buffer(jh, transaction, BJ_Forget);
1834 clear_buffer_jbddirty(bh);
1835 may_free = 0; 1784 may_free = 0;
1836 } else { 1785 } else {
1837 JBUFFER_TRACE(jh, "on running transaction"); 1786 JBUFFER_TRACE(jh, "on running transaction");
@@ -2089,12 +2038,17 @@ void __journal_file_buffer(struct journal_head *jh,
2089 if (jh->b_transaction && jh->b_jlist == jlist) 2038 if (jh->b_transaction && jh->b_jlist == jlist)
2090 return; 2039 return;
2091 2040
2092 /* The following list of buffer states needs to be consistent
2093 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
2094 * state. */
2095
2096 if (jlist == BJ_Metadata || jlist == BJ_Reserved || 2041 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
2097 jlist == BJ_Shadow || jlist == BJ_Forget) { 2042 jlist == BJ_Shadow || jlist == BJ_Forget) {
2043 /*
2044 * For metadata buffers, we track dirty bit in buffer_jbddirty
2045 * instead of buffer_dirty. We should not see a dirty bit set
2046 * here because we clear it in do_get_write_access but e.g.
2047 * tune2fs can modify the sb and set the dirty bit at any time
2048 * so we try to gracefully handle that.
2049 */
2050 if (buffer_dirty(bh))
2051 warn_dirty_buffer(bh);
2098 if (test_clear_buffer_dirty(bh) || 2052 if (test_clear_buffer_dirty(bh) ||
2099 test_clear_buffer_jbddirty(bh)) 2053 test_clear_buffer_jbddirty(bh))
2100 was_dirty = 1; 2054 was_dirty = 1;
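[Editor's note] The jbd changes above (mirrored for jbd2 further down) enforce one invariant: a buffer filed on a metadata-type journal list carries its dirtiness in the JBDDirty bit, never in the plain dirty bit, and a plain dirty bit seen there is worth a warning (e.g. tune2fs writing to a mounted filesystem). A toy model of the bit transfer; the flag names and function are hypothetical, not buffer_head APIs:

    #include <stdio.h>

    #define BH_DIRTY     (1u << 0)      /* the VM's notion of dirty */
    #define BH_JBDDIRTY  (1u << 1)      /* the journal's notion of dirty */

    /* file a metadata buffer on a journal list: move any plain dirty bit
     * over to jbddirty, warning because it should not have been set */
    static unsigned int file_metadata(unsigned int state)
    {
        if (state & BH_DIRTY) {
            fprintf(stderr, "warning: dirty metadata buffer\n");
            state = (state & ~BH_DIRTY) | BH_JBDDIRTY;
        }
        return state;
    }

    int main(void)
    {
        printf("%#x\n", file_metadata(BH_DIRTY));   /* prints 0x2 */
        return 0;
    }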
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 17159cacbd9e..5d70b3e6d49b 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -20,9 +20,9 @@
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/jbd2.h> 22#include <linux/jbd2.h>
23#include <linux/marker.h>
24#include <linux/errno.h> 23#include <linux/errno.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
25#include <trace/events/jbd2.h>
26 26
27/* 27/*
28 * Unlink a buffer from a transaction checkpoint list. 28 * Unlink a buffer from a transaction checkpoint list.
@@ -358,8 +358,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
358 * journal straight away. 358 * journal straight away.
359 */ 359 */
360 result = jbd2_cleanup_journal_tail(journal); 360 result = jbd2_cleanup_journal_tail(journal);
361 trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d", 361 trace_jbd2_checkpoint(journal, result);
362 journal->j_devname, result);
363 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 362 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
364 if (result <= 0) 363 if (result <= 0)
365 return result; 364 return result;
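[Editor's note] The jbd2 hunks in this area replace the old trace_mark() markers with real tracepoints (trace_jbd2_checkpoint() and friends). Their definitions live in include/trace/events/jbd2.h, which is outside this diff; the following is a sketch of the likely shape of one such TRACE_EVENT, kernel-only and not standalone-compilable — consult the actual header for the authoritative definition:

    /* sketch only: the real definition is in include/trace/events/jbd2.h */
    TRACE_EVENT(jbd2_checkpoint,
        TP_PROTO(journal_t *journal, int result),
        TP_ARGS(journal, result),
        TP_STRUCT__entry(
            __field(dev_t, dev)
            __field(int,   result)
        ),
        TP_fast_assign(
            __entry->dev    = journal->j_fs_dev->bd_dev;
            __entry->result = result;
        ),
        /* jbd2_dev_to_name() is the helper added in fs/jbd2/journal.c below */
        TP_printk("dev %s result %d",
                  jbd2_dev_to_name(__entry->dev), __entry->result)
    );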
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0b7d3b8226fd..7b4088b2364d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -16,7 +16,6 @@
16#include <linux/time.h> 16#include <linux/time.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/jbd2.h> 18#include <linux/jbd2.h>
19#include <linux/marker.h>
20#include <linux/errno.h> 19#include <linux/errno.h>
21#include <linux/slab.h> 20#include <linux/slab.h>
22#include <linux/mm.h> 21#include <linux/mm.h>
@@ -26,6 +25,7 @@
26#include <linux/writeback.h> 25#include <linux/writeback.h>
27#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
28#include <linux/bio.h> 27#include <linux/bio.h>
28#include <trace/events/jbd2.h>
29 29
30/* 30/*
31 * Default IO end handler for temporary BJ_IO buffer_heads. 31 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -253,6 +253,7 @@ static int journal_submit_data_buffers(journal_t *journal,
253 * block allocation with delalloc. We need to write 253 * block allocation with delalloc. We need to write
254 * only allocated blocks here. 254 * only allocated blocks here.
255 */ 255 */
256 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
256 err = journal_submit_inode_data_buffers(mapping); 257 err = journal_submit_inode_data_buffers(mapping);
257 if (!ret) 258 if (!ret)
258 ret = err; 259 ret = err;
@@ -394,8 +395,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
394 commit_transaction = journal->j_running_transaction; 395 commit_transaction = journal->j_running_transaction;
395 J_ASSERT(commit_transaction->t_state == T_RUNNING); 396 J_ASSERT(commit_transaction->t_state == T_RUNNING);
396 397
397 trace_mark(jbd2_start_commit, "dev %s transaction %d", 398 trace_jbd2_start_commit(journal, commit_transaction);
398 journal->j_devname, commit_transaction->t_tid);
399 jbd_debug(1, "JBD: starting commit of transaction %d\n", 399 jbd_debug(1, "JBD: starting commit of transaction %d\n",
400 commit_transaction->t_tid); 400 commit_transaction->t_tid);
401 401
@@ -409,6 +409,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
409 */ 409 */
410 if (commit_transaction->t_synchronous_commit) 410 if (commit_transaction->t_synchronous_commit)
411 write_op = WRITE_SYNC_PLUG; 411 write_op = WRITE_SYNC_PLUG;
412 trace_jbd2_commit_locking(journal, commit_transaction);
412 stats.u.run.rs_wait = commit_transaction->t_max_wait; 413 stats.u.run.rs_wait = commit_transaction->t_max_wait;
413 stats.u.run.rs_locked = jiffies; 414 stats.u.run.rs_locked = jiffies;
414 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 415 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -484,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
484 */ 485 */
485 jbd2_journal_switch_revoke_table(journal); 486 jbd2_journal_switch_revoke_table(journal);
486 487
488 trace_jbd2_commit_flushing(journal, commit_transaction);
487 stats.u.run.rs_flushing = jiffies; 489 stats.u.run.rs_flushing = jiffies;
488 stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, 490 stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
489 stats.u.run.rs_flushing); 491 stats.u.run.rs_flushing);
@@ -520,6 +522,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
520 commit_transaction->t_state = T_COMMIT; 522 commit_transaction->t_state = T_COMMIT;
521 spin_unlock(&journal->j_state_lock); 523 spin_unlock(&journal->j_state_lock);
522 524
525 trace_jbd2_commit_logging(journal, commit_transaction);
523 stats.u.run.rs_logging = jiffies; 526 stats.u.run.rs_logging = jiffies;
524 stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, 527 stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
525 stats.u.run.rs_logging); 528 stats.u.run.rs_logging);
@@ -1054,9 +1057,7 @@ restart_loop:
1054 if (journal->j_commit_callback) 1057 if (journal->j_commit_callback)
1055 journal->j_commit_callback(journal, commit_transaction); 1058 journal->j_commit_callback(journal, commit_transaction);
1056 1059
1057 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", 1060 trace_jbd2_end_commit(journal, commit_transaction);
1058 journal->j_devname, commit_transaction->t_tid,
1059 journal->j_tail_sequence);
1060 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1061 jbd_debug(1, "JBD: commit %d complete, head %d\n",
1061 journal->j_commit_sequence, journal->j_tail_sequence); 1062 journal->j_commit_sequence, journal->j_tail_sequence);
1062 if (to_free) 1063 if (to_free)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 58144102bf25..e378cb383979 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -38,6 +38,10 @@
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/seq_file.h> 39#include <linux/seq_file.h>
40#include <linux/math64.h> 40#include <linux/math64.h>
41#include <linux/hash.h>
42
43#define CREATE_TRACE_POINTS
44#include <trace/events/jbd2.h>
41 45
42#include <asm/uaccess.h> 46#include <asm/uaccess.h>
43#include <asm/page.h> 47#include <asm/page.h>
@@ -293,6 +297,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
293 unsigned int new_offset; 297 unsigned int new_offset;
294 struct buffer_head *bh_in = jh2bh(jh_in); 298 struct buffer_head *bh_in = jh2bh(jh_in);
295 struct jbd2_buffer_trigger_type *triggers; 299 struct jbd2_buffer_trigger_type *triggers;
300 journal_t *journal = transaction->t_journal;
296 301
297 /* 302 /*
298 * The buffer really shouldn't be locked: only the current committing 303 * The buffer really shouldn't be locked: only the current committing
@@ -306,6 +311,11 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
306 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); 311 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
307 312
308 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); 313 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
314 /* keep subsequent assertions sane */
315 new_bh->b_state = 0;
316 init_buffer(new_bh, NULL, NULL);
317 atomic_set(&new_bh->b_count, 1);
318 new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
309 319
310 /* 320 /*
311 * If a new transaction has already done a buffer copy-out, then 321 * If a new transaction has already done a buffer copy-out, then
@@ -384,14 +394,6 @@ repeat:
384 kunmap_atomic(mapped_data, KM_USER0); 394 kunmap_atomic(mapped_data, KM_USER0);
385 } 395 }
386 396
387 /* keep subsequent assertions sane */
388 new_bh->b_state = 0;
389 init_buffer(new_bh, NULL, NULL);
390 atomic_set(&new_bh->b_count, 1);
391 jbd_unlock_bh_state(bh_in);
392
393 new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
394
395 set_bh_page(new_bh, new_page, new_offset); 397 set_bh_page(new_bh, new_page, new_offset);
396 new_jh->b_transaction = NULL; 398 new_jh->b_transaction = NULL;
397 new_bh->b_size = jh2bh(jh_in)->b_size; 399 new_bh->b_size = jh2bh(jh_in)->b_size;
@@ -408,7 +410,11 @@ repeat:
408 * copying is moved to the transaction's shadow queue. 410 * copying is moved to the transaction's shadow queue.
409 */ 411 */
410 JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); 412 JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
411 jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); 413 spin_lock(&journal->j_list_lock);
414 __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
415 spin_unlock(&journal->j_list_lock);
416 jbd_unlock_bh_state(bh_in);
417
412 JBUFFER_TRACE(new_jh, "file as BJ_IO"); 418 JBUFFER_TRACE(new_jh, "file as BJ_IO");
413 jbd2_journal_file_buffer(new_jh, transaction, BJ_IO); 419 jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
414 420
@@ -1781,7 +1787,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1781 * Journal abort has very specific semantics, which we describe 1787 * Journal abort has very specific semantics, which we describe
1782 * for journal abort. 1788 * for journal abort.
1783 * 1789 *
1784 * Two internal function, which provide abort to te jbd layer 1790 * Two internal functions, which provide abort to the jbd layer
1785 * itself are here. 1791 * itself are here.
1786 */ 1792 */
1787 1793
@@ -1879,7 +1885,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)
1879 * int jbd2_journal_errno () - returns the journal's error state. 1885 * int jbd2_journal_errno () - returns the journal's error state.
1880 * @journal: journal to examine. 1886 * @journal: journal to examine.
1881 * 1887 *
1882 * This is the errno numbet set with jbd2_journal_abort(), the last 1888 * This is the errno number set with jbd2_journal_abort(), the last
1883 * time the journal was mounted - if the journal was stopped 1889 * time the journal was mounted - if the journal was stopped
1884 * without calling abort this will be 0. 1890 * without calling abort this will be 0.
1885 * 1891 *
@@ -1903,7 +1909,7 @@ int jbd2_journal_errno(journal_t *journal)
1903 * int jbd2_journal_clear_err () - clears the journal's error state 1909 * int jbd2_journal_clear_err () - clears the journal's error state
1904 * @journal: journal to act on. 1910 * @journal: journal to act on.
1905 * 1911 *
1906 * An error must be cleared or Acked to take a FS out of readonly 1912 * An error must be cleared or acked to take a FS out of readonly
1907 * mode. 1913 * mode.
1908 */ 1914 */
1909int jbd2_journal_clear_err(journal_t *journal) 1915int jbd2_journal_clear_err(journal_t *journal)
@@ -1923,7 +1929,7 @@ int jbd2_journal_clear_err(journal_t *journal)
1923 * void jbd2_journal_ack_err() - Ack journal err. 1929 * void jbd2_journal_ack_err() - Ack journal err.
1924 * @journal: journal to act on. 1930 * @journal: journal to act on.
1925 * 1931 *
1926 * An error must be cleared or Acked to take a FS out of readonly 1932 * An error must be cleared or acked to take a FS out of readonly
1927 * mode. 1933 * mode.
1928 */ 1934 */
1929void jbd2_journal_ack_err(journal_t *journal) 1935void jbd2_journal_ack_err(journal_t *journal)
@@ -2377,6 +2383,72 @@ static void __exit journal_exit(void)
2377 jbd2_journal_destroy_caches(); 2383 jbd2_journal_destroy_caches();
2378} 2384}
2379 2385
2386/*
2387 * jbd2_dev_to_name is a utility function used by the jbd2 and ext4
2388 * tracing infrastructure to map a dev_t to a device name.
2389 *
2390 * The caller should use rcu_read_lock() in order to make sure the
2391 * device name stays valid until its done with it. We use
2392 * rcu_read_lock() as well to make sure we're safe in case the caller
2393 * gets sloppy, and because rcu_read_lock() is cheap and can be safely
2394 * nested.
2395 */
2396struct devname_cache {
2397 struct rcu_head rcu;
2398 dev_t device;
2399 char devname[BDEVNAME_SIZE];
2400};
2401#define CACHE_SIZE_BITS 6
2402static struct devname_cache *devcache[1 << CACHE_SIZE_BITS];
2403static DEFINE_SPINLOCK(devname_cache_lock);
2404
2405static void free_devcache(struct rcu_head *rcu)
2406{
2407 kfree(rcu);
2408}
2409
2410const char *jbd2_dev_to_name(dev_t device)
2411{
2412 int i = hash_32(device, CACHE_SIZE_BITS);
2413 char *ret;
2414 struct block_device *bd;
2415 static struct devname_cache *new_dev;
2416
2417 rcu_read_lock();
2418 if (devcache[i] && devcache[i]->device == device) {
2419 ret = devcache[i]->devname;
2420 rcu_read_unlock();
2421 return ret;
2422 }
2423 rcu_read_unlock();
2424
2425 new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
2426 if (!new_dev)
2427 return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
2428 spin_lock(&devname_cache_lock);
2429 if (devcache[i]) {
2430 if (devcache[i]->device == device) {
2431 kfree(new_dev);
2432 ret = devcache[i]->devname;
2433 spin_unlock(&devname_cache_lock);
2434 return ret;
2435 }
2436 call_rcu(&devcache[i]->rcu, free_devcache);
2437 }
2438 devcache[i] = new_dev;
2439 devcache[i]->device = device;
2440 bd = bdget(device);
2441 if (bd) {
2442 bdevname(bd, devcache[i]->devname);
2443 bdput(bd);
2444 } else
2445 __bdevname(device, devcache[i]->devname);
2446 ret = devcache[i]->devname;
2447 spin_unlock(&devname_cache_lock);
2448 return ret;
2449}
2450EXPORT_SYMBOL(jbd2_dev_to_name);
2451
2380MODULE_LICENSE("GPL"); 2452MODULE_LICENSE("GPL");
2381module_init(journal_init); 2453module_init(journal_init);
2382module_exit(journal_exit); 2454module_exit(journal_exit);
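[Editor's note] jbd2_dev_to_name() above returns a pointer into an RCU-managed cache slot, and its own comment requires the caller to hold rcu_read_lock() for as long as the string is used. A minimal kernel-side usage sketch (not standalone-compilable; the helper function name is hypothetical):

    /* sketch: consuming jbd2_dev_to_name() safely from kernel code */
    static void report_journal_dev(journal_t *journal)
    {
        rcu_read_lock();        /* keeps the cached name from being freed */
        printk(KERN_INFO "journal on %s\n",
               jbd2_dev_to_name(journal->j_fs_dev->bd_dev));
        rcu_read_unlock();
    }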
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 996ffda06bf3..6213ac728f30 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -499,34 +499,15 @@ void jbd2_journal_unlock_updates (journal_t *journal)
499 wake_up(&journal->j_wait_transaction_locked); 499 wake_up(&journal->j_wait_transaction_locked);
500} 500}
501 501
502/* 502static void warn_dirty_buffer(struct buffer_head *bh)
503 * Report any unexpected dirty buffers which turn up. Normally those
504 * indicate an error, but they can occur if the user is running (say)
505 * tune2fs to modify the live filesystem, so we need the option of
506 * continuing as gracefully as possible. #
507 *
508 * The caller should already hold the journal lock and
509 * j_list_lock spinlock: most callers will need those anyway
510 * in order to probe the buffer's journaling state safely.
511 */
512static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
513{ 503{
514 int jlist; 504 char b[BDEVNAME_SIZE];
515
516 /* If this buffer is one which might reasonably be dirty
517 * --- ie. data, or not part of this journal --- then
518 * we're OK to leave it alone, but otherwise we need to
-	 * move the dirty bit to the journal's own internal
-	 * JBDDirty bit. */
-	jlist = jh->b_jlist;
-
-	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
-	    jlist == BJ_Shadow || jlist == BJ_Forget) {
-		struct buffer_head *bh = jh2bh(jh);
-
-		if (test_clear_buffer_dirty(bh))
-			set_buffer_jbddirty(bh);
-	}
+
+	printk(KERN_WARNING
+	       "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
+	       "There's a risk of filesystem corruption in case of system "
+	       "crash.\n",
+	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 }
 
 /*
@@ -593,14 +574,16 @@ repeat:
 			if (jh->b_next_transaction)
 				J_ASSERT_JH(jh, jh->b_next_transaction ==
 							transaction);
+			warn_dirty_buffer(bh);
 		}
 		/*
 		 * In any case we need to clean the dirty flag and we must
 		 * do it under the buffer lock to be sure we don't race
 		 * with running write-out.
 		 */
-		JBUFFER_TRACE(jh, "Unexpected dirty buffer");
-		jbd_unexpected_dirty_buffer(jh);
+		JBUFFER_TRACE(jh, "Journalling dirty buffer");
+		clear_buffer_dirty(bh);
+		set_buffer_jbddirty(bh);
 	}
 
 	unlock_buffer(bh);
@@ -843,6 +826,15 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
 
 	if (jh->b_transaction == NULL) {
+		/*
+		 * Previous jbd2_journal_forget() could have left the buffer
+		 * with jbddirty bit set because it was being committed. When
+		 * the commit finished, we've filed the buffer for
+		 * checkpointing and marked it dirty. Now we are reallocating
+		 * the buffer so the transaction freeing it must have
+		 * committed and so it's safe to clear the dirty bit.
+		 */
+		clear_buffer_dirty(jh2bh(jh));
 		jh->b_transaction = transaction;
 
 		/* first access by this transaction */
@@ -1547,36 +1539,6 @@ out:
 	return;
 }
 
-/*
- * jbd2_journal_try_to_free_buffers() could race with
- * jbd2_journal_commit_transaction(). The later might still hold the
- * reference count to the buffers when inspecting them on
- * t_syncdata_list or t_locked_list.
- *
- * jbd2_journal_try_to_free_buffers() will call this function to
- * wait for the current transaction to finish syncing data buffers, before
- * try to free that buffer.
- *
- * Called with journal->j_state_lock hold.
- */
-static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
-{
-	transaction_t *transaction;
-	tid_t tid;
-
-	spin_lock(&journal->j_state_lock);
-	transaction = journal->j_committing_transaction;
-
-	if (!transaction) {
-		spin_unlock(&journal->j_state_lock);
-		return;
-	}
-
-	tid = transaction->t_tid;
-	spin_unlock(&journal->j_state_lock);
-	jbd2_log_wait_commit(journal, tid);
-}
-
 /**
  * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
  * @journal: journal for operation
@@ -1649,25 +1611,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
 
 	ret = try_to_free_buffers(page);
 
-	/*
-	 * There are a number of places where jbd2_journal_try_to_free_buffers()
-	 * could race with jbd2_journal_commit_transaction(), the later still
-	 * holds the reference to the buffers to free while processing them.
-	 * try_to_free_buffers() failed to free those buffers. Some of the
-	 * caller of releasepage() request page buffers to be dropped, otherwise
-	 * treat the fail-to-free as errors (such as generic_file_direct_IO())
-	 *
-	 * So, if the caller of try_to_release_page() wants the synchronous
-	 * behaviour(i.e make sure buffers are dropped upon return),
-	 * let's wait for the current transaction to finish flush of
-	 * dirty data buffers, then try to free those buffers again,
-	 * with the journal locked.
-	 */
-	if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
-		jbd2_journal_wait_for_transaction_sync_data(journal);
-		ret = try_to_free_buffers(page);
-	}
-
 busy:
 	return ret;
 }
@@ -1693,8 +1636,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
 
 	if (jh->b_cp_transaction) {
 		JBUFFER_TRACE(jh, "on running+cp transaction");
+		/*
+		 * We don't want to write the buffer anymore, clear the
+		 * bit so that we don't confuse checks in
+		 * __journal_file_buffer
+		 */
+		clear_buffer_dirty(bh);
 		__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
-		clear_buffer_jbddirty(bh);
 		may_free = 0;
 	} else {
 		JBUFFER_TRACE(jh, "on running transaction");
@@ -1945,12 +1893,17 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
 	if (jh->b_transaction && jh->b_jlist == jlist)
 		return;
 
-	/* The following list of buffer states needs to be consistent
-	 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
-	 * state. */
-
 	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
 	    jlist == BJ_Shadow || jlist == BJ_Forget) {
+		/*
+		 * For metadata buffers, we track dirty bit in buffer_jbddirty
+		 * instead of buffer_dirty. We should not see a dirty bit set
+		 * here because we clear it in do_get_write_access but e.g.
+		 * tune2fs can modify the sb and set the dirty bit at any time
+		 * so we try to gracefully handle that.
+		 */
+		if (buffer_dirty(bh))
+			warn_dirty_buffer(bh);
 		if (test_clear_buffer_dirty(bh) ||
 		    test_clear_buffer_jbddirty(bh))
 			was_dirty = 1;
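The jbd2 hunks above all enforce one invariant: once a buffer is filed as journalled metadata, its writeback is driven by the journal-private JBDDirty bit rather than the VFS dirty bit, and an unexpected VFS dirty bit now triggers a warning instead of being silently absorbed. A minimal user-space model of that handoff — the struct and function names below are invented for illustration; real jbd2 uses atomic bitops on bh->b_state under the buffer lock:

#include <stdbool.h>
#include <stdio.h>

struct buf {
	bool dirty;		/* stands in for buffer_dirty() */
	bool jbddirty;		/* stands in for buffer_jbddirty() */
};

/* models the metadata branch of __jbd2_journal_file_buffer(): warn
 * about an unexpected VFS dirty bit (e.g. tune2fs wrote the sb),
 * then move it to the journal-private bit */
static void file_as_metadata(struct buf *b)
{
	if (b->dirty) {
		fprintf(stderr, "warning: spotted dirty metadata buffer\n");
		b->dirty = false;	/* test_clear_buffer_dirty() */
		b->jbddirty = true;	/* set_buffer_jbddirty() */
	}
}

int main(void)
{
	struct buf b = { .dirty = true };

	file_as_metadata(&b);
	printf("dirty=%d jbddirty=%d\n", b.dirty, b.jbddirty);
	return 0;
}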
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 043740dde20c..8fcb6239218e 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -156,48 +156,25 @@ static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size)
 	return ERR_PTR(-EINVAL);
 }
 
-static struct posix_acl *jffs2_iget_acl(struct inode *inode, struct posix_acl **i_acl)
-{
-	struct posix_acl *acl = JFFS2_ACL_NOT_CACHED;
-
-	spin_lock(&inode->i_lock);
-	if (*i_acl != JFFS2_ACL_NOT_CACHED)
-		acl = posix_acl_dup(*i_acl);
-	spin_unlock(&inode->i_lock);
-	return acl;
-}
-
-static void jffs2_iset_acl(struct inode *inode, struct posix_acl **i_acl, struct posix_acl *acl)
-{
-	spin_lock(&inode->i_lock);
-	if (*i_acl != JFFS2_ACL_NOT_CACHED)
-		posix_acl_release(*i_acl);
-	*i_acl = posix_acl_dup(acl);
-	spin_unlock(&inode->i_lock);
-}
-
 static struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
 {
-	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	struct posix_acl *acl;
 	char *value = NULL;
 	int rc, xprefix;
 
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		acl = jffs2_iget_acl(inode, &f->i_acl_access);
-		if (acl != JFFS2_ACL_NOT_CACHED)
-			return acl;
 		xprefix = JFFS2_XPREFIX_ACL_ACCESS;
 		break;
 	case ACL_TYPE_DEFAULT:
-		acl = jffs2_iget_acl(inode, &f->i_acl_default);
-		if (acl != JFFS2_ACL_NOT_CACHED)
-			return acl;
 		xprefix = JFFS2_XPREFIX_ACL_DEFAULT;
 		break;
 	default:
-		return ERR_PTR(-EINVAL);
+		BUG();
 	}
 	rc = do_jffs2_getxattr(inode, xprefix, "", NULL, 0);
 	if (rc > 0) {
@@ -215,16 +192,8 @@ static struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
 	}
 	if (value)
 		kfree(value);
-	if (!IS_ERR(acl)) {
-		switch (type) {
-		case ACL_TYPE_ACCESS:
-			jffs2_iset_acl(inode, &f->i_acl_access, acl);
-			break;
-		case ACL_TYPE_DEFAULT:
-			jffs2_iset_acl(inode, &f->i_acl_default, acl);
-			break;
-		}
-	}
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 	return acl;
 }
 
@@ -249,7 +218,6 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a
 
 static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 {
-	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	int rc, xprefix;
 
 	if (S_ISLNK(inode->i_mode))
@@ -285,16 +253,8 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		return -EINVAL;
 	}
 	rc = __jffs2_set_acl(inode, xprefix, acl);
-	if (!rc) {
-		switch(type) {
-		case ACL_TYPE_ACCESS:
-			jffs2_iset_acl(inode, &f->i_acl_access, acl);
-			break;
-		case ACL_TYPE_DEFAULT:
-			jffs2_iset_acl(inode, &f->i_acl_default, acl);
-			break;
-		}
-	}
+	if (!rc)
+		set_cached_acl(inode, type, acl);
 	return rc;
 }
 
@@ -321,12 +281,10 @@ int jffs2_permission(struct inode *inode, int mask)
 
 int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 {
-	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	struct posix_acl *acl, *clone;
 	int rc;
 
-	f->i_acl_default = NULL;
-	f->i_acl_access = NULL;
+	cache_no_acl(inode);
 
 	if (S_ISLNK(*i_mode))
 		return 0;	/* Symlink always has no-ACL */
@@ -339,7 +297,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 		*i_mode &= ~current_umask();
 	} else {
 		if (S_ISDIR(*i_mode))
-			jffs2_iset_acl(inode, &f->i_acl_default, acl);
+			set_cached_acl(inode, ACL_TYPE_DEFAULT, acl);
 
 		clone = posix_acl_clone(acl, GFP_KERNEL);
 		if (!clone)
@@ -350,7 +308,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 			return rc;
 	}
 	if (rc > 0)
-		jffs2_iset_acl(inode, &f->i_acl_access, clone);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, clone);
 
 	posix_acl_release(clone);
 	}
@@ -359,17 +317,16 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 
 int jffs2_init_acl_post(struct inode *inode)
 {
-	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	int rc;
 
-	if (f->i_acl_default) {
-		rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_DEFAULT, f->i_acl_default);
+	if (inode->i_default_acl) {
+		rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_DEFAULT, inode->i_default_acl);
 		if (rc)
 			return rc;
 	}
 
-	if (f->i_acl_access) {
-		rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_ACCESS, f->i_acl_access);
+	if (inode->i_acl) {
+		rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_ACCESS, inode->i_acl);
 		if (rc)
 			return rc;
 	}
@@ -377,18 +334,6 @@ int jffs2_init_acl_post(struct inode *inode)
 	return 0;
 }
 
-void jffs2_clear_acl(struct jffs2_inode_info *f)
-{
-	if (f->i_acl_access && f->i_acl_access != JFFS2_ACL_NOT_CACHED) {
-		posix_acl_release(f->i_acl_access);
-		f->i_acl_access = JFFS2_ACL_NOT_CACHED;
-	}
-	if (f->i_acl_default && f->i_acl_default != JFFS2_ACL_NOT_CACHED) {
-		posix_acl_release(f->i_acl_default);
-		f->i_acl_default = JFFS2_ACL_NOT_CACHED;
-	}
-}
-
 int jffs2_acl_chmod(struct inode *inode)
 {
 	struct posix_acl *acl, *clone;
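This file (like fs/jfs/acl.c further down) is being converted to the generic VFS ACL cache: inode->i_acl and inode->i_default_acl replace per-filesystem copies, managed through get_cached_acl(), set_cached_acl(), forget_cached_acl() and cache_no_acl(). A single-slot sketch of the caching discipline — the sentinel value matches the VFS, but the helper bodies here are simplified illustrations (the kernel versions take inode->i_lock and reference-count the ACL):

#include <stddef.h>

struct posix_acl { int refcount; };

/* "not looked up yet" sentinel, as in the VFS cache */
#define ACL_NOT_CACHED ((struct posix_acl *)-1)

struct inode_model {
	struct posix_acl *i_acl;	/* ACL_TYPE_ACCESS slot */
};

static struct posix_acl *get_cached(struct inode_model *inode)
{
	return inode->i_acl;	/* NULL, the sentinel, or an ACL */
}

static void set_cached(struct inode_model *inode, struct posix_acl *acl)
{
	inode->i_acl = acl;	/* NULL caches "no ACL" positively */
}

static struct posix_acl *lookup_acl(struct inode_model *inode)
{
	struct posix_acl *acl = get_cached(inode);

	if (acl != ACL_NOT_CACHED)
		return acl;	/* hit, even when cached as NULL */
	acl = NULL;		/* ... read the ACL from the xattr here ... */
	set_cached(inode, acl);
	return acl;
}

Caching NULL positively is the point of the sentinel: "this inode has no ACL" is itself a cacheable answer, so only the first lookup pays for the xattr read.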
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 8ca058aed384..fc929f2a14f6 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,13 +26,10 @@ struct jffs2_acl_header {
 
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
 
-#define JFFS2_ACL_NOT_CACHED ((void *)-1)
-
 extern int jffs2_permission(struct inode *, int);
 extern int jffs2_acl_chmod(struct inode *);
 extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
 extern int jffs2_init_acl_post(struct inode *);
-extern void jffs2_clear_acl(struct jffs2_inode_info *);
 
 extern struct xattr_handler jffs2_acl_access_xattr_handler;
 extern struct xattr_handler jffs2_acl_default_xattr_handler;
@@ -43,6 +40,5 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler;
 #define jffs2_acl_chmod(inode)			(0)
 #define jffs2_init_acl_pre(dir_i,inode,mode)	(0)
 #define jffs2_init_acl_post(inode)		(0)
-#define jffs2_clear_acl(f)
 
 #endif	/* CONFIG_JFFS2_FS_POSIX_ACL */
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index a0244740b75a..b47679be118a 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -270,19 +270,21 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
 	D2({
 		int i=0;
 		struct jffs2_raw_node_ref *this;
-		printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n" KERN_DEBUG);
+		printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n");
 
 		this = ic->nodes;
 
+		printk(KERN_DEBUG);
 		while(this) {
-			printk( "0x%08x(%d)->", ref_offset(this), ref_flags(this));
+			printk(KERN_CONT "0x%08x(%d)->",
+			       ref_offset(this), ref_flags(this));
 			if (++i == 5) {
-				printk("\n" KERN_DEBUG);
+				printk(KERN_DEBUG);
 				i=0;
 			}
 			this = this->next_in_ino;
 		}
-		printk("\n");
+		printk(KERN_CONT "\n");
 	});
 
 	switch (ic->class) {
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5edc2bf20581..23c947539864 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -99,7 +99,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
 	kunmap(pg);
 
 	D2(printk(KERN_DEBUG "readpage finished\n"));
-	return 0;
+	return ret;
 }
 
 int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg)
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 249305d65d5b..3451a81b2142 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -20,6 +20,7 @@
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
 #include <linux/crc32.h>
+#include <linux/smp_lock.h>
 #include "nodelist.h"
 
 static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -387,6 +388,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 	   This also catches the case where it was stopped and this
 	   is just a remount to restart it.
 	   Flush the writebuffer, if neccecary, else we loose it */
+	lock_kernel();
 	if (!(sb->s_flags & MS_RDONLY)) {
 		jffs2_stop_garbage_collect_thread(c);
 		mutex_lock(&c->alloc_sem);
@@ -399,24 +401,10 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 
 	*flags |= MS_NOATIME;
 
+	unlock_kernel();
 	return 0;
 }
 
-void jffs2_write_super (struct super_block *sb)
-{
-	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
-	sb->s_dirt = 0;
-
-	if (sb->s_flags & MS_RDONLY)
-		return;
-
-	D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
-	jffs2_garbage_collect_trigger(c);
-	jffs2_erase_pending_blocks(c, 0);
-	jffs2_flush_wbuf_gc(c, 0);
-}
-
-
 /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash,
    fill in the raw_inode while you're at it. */
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri)
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 4c41db91eaa4..c6923da98263 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -50,10 +50,6 @@ struct jffs2_inode_info {
 	uint16_t flags;
 	uint8_t usercompr;
 	struct inode vfs_inode;
-#ifdef CONFIG_JFFS2_FS_POSIX_ACL
-	struct posix_acl *i_acl_access;
-	struct posix_acl *i_acl_default;
-#endif
 };
 
 #endif /* _JFFS2_FS_I */
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 5e194a5c8e29..a7f03b7ebcb3 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -56,10 +56,6 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 	f->target = NULL;
 	f->flags = 0;
 	f->usercompr = 0;
-#ifdef CONFIG_JFFS2_FS_POSIX_ACL
-	f->i_acl_access = JFFS2_ACL_NOT_CACHED;
-	f->i_acl_default = JFFS2_ACL_NOT_CACHED;
-#endif
 }
 
 
@@ -181,7 +177,6 @@ void jffs2_dirty_inode(struct inode *inode);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
 			       struct jffs2_raw_inode *ri);
 int jffs2_statfs (struct dentry *, struct kstatfs *);
-void jffs2_write_super (struct super_block *);
 int jffs2_remount_fs (struct super_block *, int *, char *);
 int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
 void jffs2_gc_release_inode(struct jffs2_sb_info *c,
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 1fc1e92356ee..1a80301004b8 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1424,7 +1424,6 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f)
 	struct jffs2_full_dirent *fd, *fds;
 	int deleted;
 
-	jffs2_clear_acl(f);
 	jffs2_xattr_delete_inode(c, f->inocache);
 	mutex_lock(&f->sem);
 	deleted = f->inocache && !f->inocache->pino_nlink;
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 1d437de1e9a8..696686cc206e 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -130,9 +130,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 	if (jffs2_sum_active()) {
 		s = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL);
 		if (!s) {
-			kfree(flashbuf);
 			JFFS2_WARNING("Can't allocate memory for summary\n");
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto out;
 		}
 	}
 
@@ -196,7 +196,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 		if (c->nextblock) {
 			ret = file_dirty(c, c->nextblock);
 			if (ret)
-				return ret;
+				goto out;
 			/* deleting summary information of the old nextblock */
 			jffs2_sum_reset_collected(c->summary);
 		}
@@ -207,7 +207,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 		} else {
 			ret = file_dirty(c, jeb);
 			if (ret)
-				return ret;
+				goto out;
 		}
 		break;
 
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 4c4e18c54a51..0035c021395a 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/fs.h>
@@ -53,10 +54,29 @@ static void jffs2_i_init_once(void *foo)
 	inode_init_once(&f->vfs_inode);
 }
 
+static void jffs2_write_super(struct super_block *sb)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+
+	lock_super(sb);
+	sb->s_dirt = 0;
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
+		jffs2_garbage_collect_trigger(c);
+		jffs2_erase_pending_blocks(c, 0);
+		jffs2_flush_wbuf_gc(c, 0);
+	}
+
+	unlock_super(sb);
+}
+
 static int jffs2_sync_fs(struct super_block *sb, int wait)
 {
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
 
+	jffs2_write_super(sb);
+
 	mutex_lock(&c->alloc_sem);
 	jffs2_flush_wbuf_pad(c);
 	mutex_unlock(&c->alloc_sem);
@@ -174,6 +194,11 @@ static void jffs2_put_super (struct super_block *sb)
 
 	D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
 
+	lock_kernel();
+
+	if (sb->s_dirt)
+		jffs2_write_super(sb);
+
 	mutex_lock(&c->alloc_sem);
 	jffs2_flush_wbuf_pad(c);
 	mutex_unlock(&c->alloc_sem);
@@ -192,6 +217,8 @@ static void jffs2_put_super (struct super_block *sb)
 	if (c->mtd->sync)
 		c->mtd->sync(c->mtd);
 
+	unlock_kernel();
+
 	D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
 }
 
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 06ca1b8d2054..a29c7c3e3fb8 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -31,27 +31,24 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
 {
 	struct posix_acl *acl;
 	char *ea_name;
-	struct jfs_inode_info *ji = JFS_IP(inode);
-	struct posix_acl **p_acl;
 	int size;
 	char *value = NULL;
 
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
 	switch(type) {
 	case ACL_TYPE_ACCESS:
 		ea_name = POSIX_ACL_XATTR_ACCESS;
-		p_acl = &ji->i_acl;
 		break;
 	case ACL_TYPE_DEFAULT:
 		ea_name = POSIX_ACL_XATTR_DEFAULT;
-		p_acl = &ji->i_default_acl;
 		break;
 	default:
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (*p_acl != JFS_ACL_NOT_CACHED)
-		return posix_acl_dup(*p_acl);
-
 	size = __jfs_getxattr(inode, ea_name, NULL, 0);
 
 	if (size > 0) {
@@ -62,17 +59,16 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
 	}
 
 	if (size < 0) {
-		if (size == -ENODATA) {
-			*p_acl = NULL;
+		if (size == -ENODATA)
 			acl = NULL;
-		} else
+		else
 			acl = ERR_PTR(size);
 	} else {
 		acl = posix_acl_from_xattr(value, size);
-		if (!IS_ERR(acl))
-			*p_acl = posix_acl_dup(acl);
 	}
 	kfree(value);
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 	return acl;
 }
 
@@ -80,8 +76,6 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
 		       struct posix_acl *acl)
 {
 	char *ea_name;
-	struct jfs_inode_info *ji = JFS_IP(inode);
-	struct posix_acl **p_acl;
 	int rc;
 	int size = 0;
 	char *value = NULL;
@@ -92,11 +86,9 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
 	switch(type) {
 	case ACL_TYPE_ACCESS:
 		ea_name = POSIX_ACL_XATTR_ACCESS;
-		p_acl = &ji->i_acl;
 		break;
 	case ACL_TYPE_DEFAULT:
 		ea_name = POSIX_ACL_XATTR_DEFAULT;
-		p_acl = &ji->i_default_acl;
 		if (!S_ISDIR(inode->i_mode))
 			return acl ? -EACCES : 0;
 		break;
@@ -116,27 +108,24 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
 out:
 	kfree(value);
 
-	if (!rc) {
-		if (*p_acl && (*p_acl != JFS_ACL_NOT_CACHED))
-			posix_acl_release(*p_acl);
-		*p_acl = posix_acl_dup(acl);
-	}
+	if (!rc)
+		set_cached_acl(inode, type, acl);
+
 	return rc;
 }
 
 static int jfs_check_acl(struct inode *inode, int mask)
 {
-	struct jfs_inode_info *ji = JFS_IP(inode);
+	struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
 
-	if (ji->i_acl == JFS_ACL_NOT_CACHED) {
-		struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
-		if (IS_ERR(acl))
-			return PTR_ERR(acl);
-		posix_acl_release(acl);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		int error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return error;
 	}
 
-	if (ji->i_acl)
-		return posix_acl_permission(inode, ji->i_acl, mask);
 	return -EAGAIN;
 }
 
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index bbbd5f202e37..41d6045dbeb0 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -391,6 +391,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
 	}
 	XADaddress(xp, xaddr);
 	XADlength(xp, xlen);
+	XADoffset(xp, prev);
 	/*
 	 * only preserve the abnr flag within the xad flags
 	 * of the returned hint.
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 346057218edc..0fc30407f039 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -2571,6 +2571,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
 
 		txAbort(tid, 0);
 		txEnd(tid);
+		mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
 
 		/* release the inode map lock */
 		IWRITE_UNLOCK(ipimap);
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 439901d205fe..1439f119ec83 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -74,10 +74,6 @@ struct jfs_inode_info {
 	/* xattr_sem allows us to access the xattrs without taking i_mutex */
 	struct rw_semaphore xattr_sem;
 	lid_t	xtlid;		/* lid of xtree lock on directory */
-#ifdef CONFIG_JFS_POSIX_ACL
-	struct posix_acl *i_acl;
-	struct posix_acl *i_default_acl;
-#endif
 	union {
 		struct {
 			xtpage_t _xtroot;	/* 288: xtree root */
@@ -107,8 +103,6 @@ struct jfs_inode_info {
 #define i_inline u.link._inline
 #define i_inline_ea u.link._inline_ea
 
-#define JFS_ACL_NOT_CACHED ((void *)-1)
-
 #define IREAD_LOCK(ip, subclass) \
 	down_read_nested(&JFS_IP(ip)->rdwrlock, subclass)
 #define IREAD_UNLOCK(ip) up_read(&JFS_IP(ip)->rdwrlock)
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 6f21adf9479a..37e6dcda8fc8 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -32,6 +32,7 @@
 #include <linux/crc32.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -127,18 +128,6 @@ static void jfs_destroy_inode(struct inode *inode)
 		ji->active_ag = -1;
 	}
 	spin_unlock_irq(&ji->ag_lock);
-
-#ifdef CONFIG_JFS_POSIX_ACL
-	if (ji->i_acl != JFS_ACL_NOT_CACHED) {
-		posix_acl_release(ji->i_acl);
-		ji->i_acl = JFS_ACL_NOT_CACHED;
-	}
-	if (ji->i_default_acl != JFS_ACL_NOT_CACHED) {
-		posix_acl_release(ji->i_default_acl);
-		ji->i_default_acl = JFS_ACL_NOT_CACHED;
-	}
-#endif
-
 	kmem_cache_free(jfs_inode_cachep, ji);
 }
 
@@ -183,6 +172,9 @@ static void jfs_put_super(struct super_block *sb)
 	int rc;
 
 	jfs_info("In jfs_put_super");
+
+	lock_kernel();
+
 	rc = jfs_umount(sb);
 	if (rc)
 		jfs_err("jfs_umount failed with return code %d", rc);
@@ -195,6 +187,8 @@ static void jfs_put_super(struct super_block *sb)
 	sbi->direct_inode = NULL;
 
 	kfree(sbi);
+
+	unlock_kernel();
 }
 
 enum {
@@ -370,19 +364,24 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 	s64 newLVSize = 0;
 	int rc = 0;
 	int flag = JFS_SBI(sb)->flag;
+	int ret;
 
 	if (!parse_options(data, sb, &newLVSize, &flag)) {
 		return -EINVAL;
 	}
+	lock_kernel();
 	if (newLVSize) {
 		if (sb->s_flags & MS_RDONLY) {
 			printk(KERN_ERR
 		  "JFS: resize requires volume to be mounted read-write\n");
+			unlock_kernel();
 			return -EROFS;
 		}
 		rc = jfs_extendfs(sb, newLVSize, 0);
-		if (rc)
+		if (rc) {
+			unlock_kernel();
 			return rc;
+		}
 	}
 
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -393,23 +392,31 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 		truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0);
 
 		JFS_SBI(sb)->flag = flag;
-		return jfs_mount_rw(sb, 1);
+		ret = jfs_mount_rw(sb, 1);
+		unlock_kernel();
+		return ret;
 	}
 	if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
 		rc = jfs_umount_rw(sb);
 		JFS_SBI(sb)->flag = flag;
+		unlock_kernel();
 		return rc;
 	}
 	if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
 		if (!(sb->s_flags & MS_RDONLY)) {
 			rc = jfs_umount_rw(sb);
-			if (rc)
+			if (rc) {
+				unlock_kernel();
 				return rc;
+			}
 			JFS_SBI(sb)->flag = flag;
-			return jfs_mount_rw(sb, 1);
+			ret = jfs_mount_rw(sb, 1);
+			unlock_kernel();
+			return ret;
 		}
 	JFS_SBI(sb)->flag = flag;
 
+	unlock_kernel();
 	return 0;
 }
 
@@ -720,8 +727,10 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
 		blk++;
 	}
 out:
-	if (len == towrite)
+	if (len == towrite) {
+		mutex_unlock(&inode->i_mutex);
 		return err;
+	}
 	if (inode->i_size < off+len-towrite)
 		i_size_write(inode, off+len-towrite);
 	inode->i_version++;
@@ -777,10 +786,6 @@ static void init_once(void *foo)
 	init_rwsem(&jfs_ip->xattr_sem);
 	spin_lock_init(&jfs_ip->ag_lock);
 	jfs_ip->active_ag = -1;
-#ifdef CONFIG_JFS_POSIX_ACL
-	jfs_ip->i_acl = JFS_ACL_NOT_CACHED;
-	jfs_ip->i_default_acl = JFS_ACL_NOT_CACHED;
-#endif
 	inode_init_once(&jfs_ip->vfs_inode);
 }
 
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 61dfa8173ebc..fad364548bc9 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -727,10 +727,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
 		/*
 		 * We're changing the ACL.  Get rid of the cached one
 		 */
-		acl =JFS_IP(inode)->i_acl;
-		if (acl != JFS_ACL_NOT_CACHED)
-			posix_acl_release(acl);
-		JFS_IP(inode)->i_acl = JFS_ACL_NOT_CACHED;
+		forget_cached_acl(inode, ACL_TYPE_ACCESS);
 
 		return 0;
 	} else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) {
@@ -746,10 +743,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
 		/*
 		 * We're changing the default ACL.  Get rid of the cached one
 		 */
-		acl =JFS_IP(inode)->i_default_acl;
-		if (acl && (acl != JFS_ACL_NOT_CACHED))
-			posix_acl_release(acl);
-		JFS_IP(inode)->i_default_acl = JFS_ACL_NOT_CACHED;
+		forget_cached_acl(inode, ACL_TYPE_DEFAULT);
 
 		return 0;
 	}
diff --git a/fs/libfs.c b/fs/libfs.c
index 80046ddf5063..dcec3d3ea64f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -9,6 +9,8 @@
 #include <linux/vfs.h>
 #include <linux/mutex.h>
 #include <linux/exportfs.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
 
 #include <asm/uaccess.h>
 
@@ -215,7 +217,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
 		return PTR_ERR(s);
 
 	s->s_flags = MS_NOUSER;
-	s->s_maxbytes = ~0ULL;
+	s->s_maxbytes = MAX_LFS_FILESIZE;
 	s->s_blocksize = PAGE_SIZE;
 	s->s_blocksize_bits = PAGE_SHIFT;
 	s->s_magic = magic;
@@ -807,6 +809,29 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_parent);
 
+int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = 0, /* metadata-only; caller takes care of data */
+	};
+	struct inode *inode = dentry->d_inode;
+	int err;
+	int ret;
+
+	ret = sync_mapping_buffers(inode->i_mapping);
+	if (!(inode->i_state & I_DIRTY))
+		return ret;
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		return ret;
+
+	err = sync_inode(inode, &wbc);
+	if (ret == 0)
+		ret = err;
+	return ret;
+}
+EXPORT_SYMBOL(simple_fsync);
+
 EXPORT_SYMBOL(dcache_dir_close);
 EXPORT_SYMBOL(dcache_dir_lseek);
 EXPORT_SYMBOL(dcache_dir_open);
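simple_fsync() gives block filesystems with trivial metadata a stock ->fsync: flush the inode's dirty buffers, then write the inode itself only as strictly as the datasync flag requires. The minix conversion further down is the first user; a hypothetical filesystem would wire it up like this (the filesystem name is invented, the generic helpers are real 2.6.31-era symbols):

const struct file_operations examplefs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.write		= do_sync_write,
	.aio_write	= generic_file_aio_write,
	.mmap		= generic_file_mmap,
	.fsync		= simple_fsync,	/* replaces a hand-rolled sync */
};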
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index dd7957064a8c..4336adba952a 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/smp_lock.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -126,7 +127,6 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
 	struct nlm_lock	*lock = &argp->lock;
 
 	nlmclnt_next_cookie(&argp->cookie);
-	argp->state = nsm_local_state;
 	memcpy(&lock->fh, NFS_FH(fl->fl_file->f_path.dentry->d_inode), sizeof(struct nfs_fh));
 	lock->caller = utsname()->nodename;
 	lock->oh.data = req->a_owner;
@@ -165,6 +165,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
 	/* Set up the argument struct */
 	nlmclnt_setlockargs(call, fl);
 
+	lock_kernel();
 	if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
 		if (fl->fl_type != F_UNLCK) {
 			call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
@@ -178,6 +179,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
 
 	fl->fl_ops->fl_release_private(fl);
 	fl->fl_ops = NULL;
+	unlock_kernel();
 
 	dprintk("lockd: clnt proc returns %d\n", status);
 	return status;
@@ -519,6 +521,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 
 	if (nsm_monitor(host) < 0)
 		goto out;
+	req->a_args.state = nsm_local_state;
 
 	fl->fl_flags |= FL_ACCESS;
 	status = do_vfs_lock(fl);
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 6d5d4a4169e5..7fce1b525849 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -53,7 +53,7 @@ static DEFINE_SPINLOCK(nsm_lock);
 /*
  * Local NSM state
  */
-int	__read_mostly	nsm_local_state;
+u32	__read_mostly	nsm_local_state;
 int	__read_mostly	nsm_use_hostnames;
 
 static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
@@ -112,6 +112,7 @@ static struct rpc_clnt *nsm_create(void)
 		.program		= &nsm_program,
 		.version		= NSM_VERSION,
 		.authflavor		= RPC_AUTH_NULL,
+		.flags			= RPC_CLNT_CREATE_NOPING,
 	};
 
 	return rpc_create(&args);
@@ -184,13 +185,19 @@ int nsm_monitor(const struct nlm_host *host)
 	nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
 
 	status = nsm_mon_unmon(nsm, NSMPROC_MON, &res);
-	if (res.status != 0)
+	if (unlikely(res.status != 0))
 		status = -EIO;
-	if (status < 0)
+	if (unlikely(status < 0)) {
 		printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name);
-	else
-		nsm->sm_monitored = 1;
-	return status;
+		return status;
+	}
+
+	nsm->sm_monitored = 1;
+	if (unlikely(nsm_local_state != res.state)) {
+		nsm_local_state = res.state;
+		dprintk("lockd: NSM state changed to %d\n", nsm_local_state);
+	}
+	return 0;
 }
 
 /**
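These two lockd changes cooperate: nsm_local_state is now a u32 refreshed from every SM_MON reply, and the clntproc.c hunk above defers copying it into the lock arguments until nsm_monitor() has succeeded, so the first NLM request no longer carries a stale (typically zero) state. The resulting ordering, reduced to its core (simplified from nlmclnt_lock(), not a verbatim quote):

	if (nsm_monitor(host) < 0)
		goto out;			/* statd has answered first ... */
	req->a_args.state = nsm_local_state;	/* ... so this value is fresh */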
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 1725037374c5..bd173a6ca3b1 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -10,6 +10,7 @@
 #include <linux/types.h>
 #include <linux/time.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/in.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 83ee34203bd7..e577a78d7bac 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -326,6 +326,8 @@ static void nlmsvc_freegrantargs(struct nlm_rqst *call)
 {
 	if (call->a_args.lock.oh.data != call->a_owner)
 		kfree(call->a_args.lock.oh.data);
+
+	locks_release_private(&call->a_args.lock.fl);
 }
 
 /*
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 3688e55901fc..e1d28ddd2169 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -10,6 +10,7 @@
 #include <linux/types.h>
 #include <linux/time.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/in.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/locks.c b/fs/locks.c
index ec3deea29e37..b6440f52178f 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -151,7 +151,7 @@ static struct file_lock *locks_alloc_lock(void)
 	return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
 }
 
-static void locks_release_private(struct file_lock *fl)
+void locks_release_private(struct file_lock *fl)
 {
 	if (fl->fl_ops) {
 		if (fl->fl_ops->fl_release_private)
@@ -165,6 +165,7 @@ static void locks_release_private(struct file_lock *fl)
 	}
 
 }
+EXPORT_SYMBOL_GPL(locks_release_private);
 
 /* Free a lock which is not in use. */
 static void locks_free_lock(struct file_lock *fl)
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 3aebe322271a..6ac693faae49 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -12,13 +12,14 @@
 /* bitmap.c contains the code that handles the inode and block bitmaps */
 
 #include "minix.h"
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
 #include <linux/sched.h>
 
 static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 };
 
+static DEFINE_SPINLOCK(bitmap_lock);
+
 static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits)
 {
 	unsigned i, j, sum = 0;
@@ -69,11 +70,11 @@ void minix_free_block(struct inode *inode, unsigned long block)
 		return;
 	}
 	bh = sbi->s_zmap[zone];
-	lock_kernel();
+	spin_lock(&bitmap_lock);
 	if (!minix_test_and_clear_bit(bit, bh->b_data))
 		printk("minix_free_block (%s:%lu): bit already cleared\n",
 		       sb->s_id, block);
-	unlock_kernel();
+	spin_unlock(&bitmap_lock);
 	mark_buffer_dirty(bh);
 	return;
 }
@@ -88,18 +89,18 @@ int minix_new_block(struct inode * inode)
 		struct buffer_head *bh = sbi->s_zmap[i];
 		int j;
 
-		lock_kernel();
+		spin_lock(&bitmap_lock);
 		j = minix_find_first_zero_bit(bh->b_data, bits_per_zone);
 		if (j < bits_per_zone) {
 			minix_set_bit(j, bh->b_data);
-			unlock_kernel();
+			spin_unlock(&bitmap_lock);
 			mark_buffer_dirty(bh);
 			j += i * bits_per_zone + sbi->s_firstdatazone-1;
 			if (j < sbi->s_firstdatazone || j >= sbi->s_nzones)
 				break;
 			return j;
 		}
-		unlock_kernel();
+		spin_unlock(&bitmap_lock);
 	}
 	return 0;
 }
@@ -211,10 +212,10 @@ void minix_free_inode(struct inode * inode)
 	minix_clear_inode(inode);	/* clear on-disk copy */
 
 	bh = sbi->s_imap[ino];
-	lock_kernel();
+	spin_lock(&bitmap_lock);
 	if (!minix_test_and_clear_bit(bit, bh->b_data))
 		printk("minix_free_inode: bit %lu already cleared\n", bit);
-	unlock_kernel();
+	spin_unlock(&bitmap_lock);
 	mark_buffer_dirty(bh);
  out:
 	clear_inode(inode);		/* clear in-memory copy */
@@ -237,7 +238,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
 	j = bits_per_zone;
 	bh = NULL;
 	*error = -ENOSPC;
-	lock_kernel();
+	spin_lock(&bitmap_lock);
 	for (i = 0; i < sbi->s_imap_blocks; i++) {
 		bh = sbi->s_imap[i];
 		j = minix_find_first_zero_bit(bh->b_data, bits_per_zone);
@@ -245,17 +246,17 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
 			break;
 	}
 	if (!bh || j >= bits_per_zone) {
-		unlock_kernel();
+		spin_unlock(&bitmap_lock);
 		iput(inode);
 		return NULL;
 	}
 	if (minix_test_and_set_bit(j, bh->b_data)) {	/* shouldn't happen */
-		unlock_kernel();
+		spin_unlock(&bitmap_lock);
 		printk("minix_new_inode: bit already set\n");
 		iput(inode);
 		return NULL;
 	}
-	unlock_kernel();
+	spin_unlock(&bitmap_lock);
 	mark_buffer_dirty(bh);
 	j += i * bits_per_zone;
 	if (!j || j > sbi->s_ninodes) {
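The bitmap.c conversion swaps the big kernel lock for one file-local spinlock, which suffices because each critical section is a short, non-sleeping test-and-modify on an in-memory bitmap. The pattern, as a hypothetical helper (the helper name is invented; DEFINE_SPINLOCK, find_first_zero_bit and __set_bit are the real kernel primitives):

#include <linux/bitops.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(bitmap_lock);

/* claim the first clear bit, or return -1 if the map is full */
static int claim_first_free_bit(unsigned long *map, unsigned long nbits)
{
	unsigned long bit;

	spin_lock(&bitmap_lock);
	bit = find_first_zero_bit(map, nbits);
	if (bit < nbits)
		__set_bit(bit, map);	/* non-atomic is fine under the lock */
	spin_unlock(&bitmap_lock);

	return bit < nbits ? (int)bit : -1;
}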
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index d4946c4c90e2..d407e7a0b6fe 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -11,7 +11,6 @@
11#include "minix.h" 11#include "minix.h"
12#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
13#include <linux/highmem.h> 13#include <linux/highmem.h>
14#include <linux/smp_lock.h>
15#include <linux/swap.h> 14#include <linux/swap.h>
16 15
17typedef struct minix_dir_entry minix_dirent; 16typedef struct minix_dir_entry minix_dirent;
@@ -20,9 +19,10 @@ typedef struct minix3_dir_entry minix3_dirent;
20static int minix_readdir(struct file *, void *, filldir_t); 19static int minix_readdir(struct file *, void *, filldir_t);
21 20
22const struct file_operations minix_dir_operations = { 21const struct file_operations minix_dir_operations = {
22 .llseek = generic_file_llseek,
23 .read = generic_read_dir, 23 .read = generic_read_dir,
24 .readdir = minix_readdir, 24 .readdir = minix_readdir,
25 .fsync = minix_sync_file, 25 .fsync = simple_fsync,
26}; 26};
27 27
28static inline void dir_put_page(struct page *page) 28static inline void dir_put_page(struct page *page)
@@ -102,8 +102,6 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
102 char *name; 102 char *name;
103 __u32 inumber; 103 __u32 inumber;
104 104
105 lock_kernel();
106
107 pos = (pos + chunk_size-1) & ~(chunk_size-1); 105 pos = (pos + chunk_size-1) & ~(chunk_size-1);
108 if (pos >= inode->i_size) 106 if (pos >= inode->i_size)
109 goto done; 107 goto done;
@@ -146,7 +144,6 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
146 144
147done: 145done:
148 filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset; 146 filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset;
149 unlock_kernel();
150 return 0; 147 return 0;
151} 148}
152 149
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 17765f697e50..3eec3e607a87 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -6,15 +6,12 @@
  * minix regular file handling primitives
  */
 
-#include <linux/buffer_head.h>		/* for fsync_inode_buffers() */
 #include "minix.h"
 
 /*
  * We have mostly NULLs here: the current defaults are OK for
  * the minix filesystem.
  */
-int minix_sync_file(struct file *, struct dentry *, int);
-
 const struct file_operations minix_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -22,7 +19,7 @@ const struct file_operations minix_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= minix_sync_file,
+	.fsync		= simple_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
@@ -30,18 +27,3 @@ const struct inode_operations minix_file_inode_operations = {
 	.truncate	= minix_truncate,
 	.getattr	= minix_getattr,
 };
-
-int minix_sync_file(struct file * file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-
-	err = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return err;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return err;
-
-	err |= minix_sync_inode(inode);
-	return err ? -EIO : 0;
-}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index daad3c2740db..74ea82d72164 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -48,8 +48,6 @@ static void minix_put_super(struct super_block *sb)
 	kfree(sbi->s_imap);
 	sb->s_fs_info = NULL;
 	kfree(sbi);
-
-	return;
 }
 
 static struct kmem_cache * minix_inode_cachep;
@@ -554,38 +552,25 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
 	return bh;
 }
 
-static struct buffer_head *minix_update_inode(struct inode *inode)
-{
-	if (INODE_VERSION(inode) == MINIX_V1)
-		return V1_minix_update_inode(inode);
-	else
-		return V2_minix_update_inode(inode);
-}
-
-static int minix_write_inode(struct inode * inode, int wait)
-{
-	brelse(minix_update_inode(inode));
-	return 0;
-}
-
-int minix_sync_inode(struct inode * inode)
+static int minix_write_inode(struct inode *inode, int wait)
 {
 	int err = 0;
 	struct buffer_head *bh;
 
-	bh = minix_update_inode(inode);
-	if (bh && buffer_dirty(bh))
-	{
+	if (INODE_VERSION(inode) == MINIX_V1)
+		bh = V1_minix_update_inode(inode);
+	else
+		bh = V2_minix_update_inode(inode);
+	if (!bh)
+		return -EIO;
+	if (wait && buffer_dirty(bh)) {
 		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh))
-		{
+		if (buffer_req(bh) && !buffer_uptodate(bh)) {
 			printk("IO error syncing minix inode [%s:%08lx]\n",
 				inode->i_sb->s_id, inode->i_ino);
-			err = -1;
+			err = -EIO;
 		}
 	}
-	else if (!bh)
-		err = -1;
 	brelse (bh);
 	return err;
 }
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index e6a0b193bea4..9dcf95b42116 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -1,3 +1,6 @@
+#ifndef FS_MINIX_H
+#define FS_MINIX_H
+
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/minix_fs.h>
@@ -57,7 +60,6 @@ extern int __minix_write_begin(struct file *file, struct address_space *mapping,
 extern void V1_minix_truncate(struct inode *);
 extern void V2_minix_truncate(struct inode *);
 extern void minix_truncate(struct inode *);
-extern int minix_sync_inode(struct inode *);
 extern void minix_set_inode(struct inode *, dev_t);
 extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int);
 extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int);
@@ -72,7 +74,6 @@ extern int minix_empty_dir(struct inode*);
 extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*);
 extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**);
 extern ino_t minix_inode_by_name(struct dentry*);
-extern int minix_sync_file(struct file *, struct dentry *, int);
 
 extern const struct inode_operations minix_file_inode_operations;
 extern const struct inode_operations minix_dir_inode_operations;
@@ -88,3 +89,5 @@ static inline struct minix_inode_info *minix_i(struct inode *inode)
 {
 	return list_entry(inode, struct minix_inode_info, vfs_inode);
 }
+
+#endif /* FS_MINIX_H */
diff --git a/fs/mpage.c b/fs/mpage.c
index 680ba60863ff..42381bd6543b 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -379,7 +379,8 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
 
-	clear_buffer_mapped(&map_bh);
+	map_bh.b_state = 0;
+	map_bh.b_size = 0;
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		struct page *page = list_entry(pages->prev, struct page, lru);
 
@@ -412,7 +413,8 @@ int mpage_readpage(struct page *page, get_block_t get_block)
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
 
-	clear_buffer_mapped(&map_bh);
+	map_bh.b_state = 0;
+	map_bh.b_size = 0;
 	bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
 			&map_bh, &first_logical_block, get_block);
 	if (bio)
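The mpage change matters because map_bh lives on the stack: clear_buffer_mapped() clears only the BH_Mapped bit, leaving the remaining b_state bits and b_size as whatever garbage the stack held, which do_mpage_readpage() later consults. Zeroing both fields gives it a fully defined starting state. Reduced to a model (field names mirror buffer_head, the rest is illustrative):

struct bh_model {
	unsigned long b_state;	/* bit flags, one of which plays BH_Mapped */
	unsigned long b_size;
};

static void reader_setup(struct bh_model *map_bh)
{
	/* the old pattern, map_bh->b_state &= ~MAPPED_BIT, cleared one
	 * bit and left the rest of b_state and all of b_size undefined;
	 * instead: */
	map_bh->b_state = 0;	/* every state bit known-clear */
	map_bh->b_size = 0;	/* no stale mapping length */
}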
diff --git a/fs/namei.c b/fs/namei.c
index 967c3db92724..f3c5b278895a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -552,6 +552,17 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd
552 return result; 552 return result;
553} 553}
554 554
555static __always_inline void set_root(struct nameidata *nd)
556{
557 if (!nd->root.mnt) {
558 struct fs_struct *fs = current->fs;
559 read_lock(&fs->lock);
560 nd->root = fs->root;
561 path_get(&nd->root);
562 read_unlock(&fs->lock);
563 }
564}
565
555static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 566static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
556{ 567{
557 int res = 0; 568 int res = 0;
@@ -560,14 +571,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
560 goto fail; 571 goto fail;
561 572
562 if (*link == '/') { 573 if (*link == '/') {
563 struct fs_struct *fs = current->fs; 574 set_root(nd);
564
565 path_put(&nd->path); 575 path_put(&nd->path);
566 576 nd->path = nd->root;
567 read_lock(&fs->lock); 577 path_get(&nd->root);
568 nd->path = fs->root;
569 path_get(&fs->root);
570 read_unlock(&fs->lock);
571 } 578 }
572 579
573 res = link_path_walk(link, nd); 580 res = link_path_walk(link, nd);
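
set_root() caches current->fs->root into the nameidata exactly once per walk, taking fs->lock and a path reference only on first use; every later absolute symlink reuses nd->root without touching fs->lock again. A rough sketch of the lifecycle this creates (assembled from the hunks, not a verbatim excerpt):

    /* Sketch: the cached root is populated lazily and dropped once. */
    static void cached_root_lifecycle(struct nameidata *nd)
    {
            nd->root.mnt = NULL;    /* path_init(): nothing cached yet */

            set_root(nd);           /* first absolute symlink: lock,
                                     * copy fs->root, take a reference */
            set_root(nd);           /* no-op: nd->root.mnt already set */

            if (nd->root.mnt) {     /* end of the walk */
                    path_put(&nd->root);
                    nd->root.mnt = NULL;
            }
    }
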
@@ -668,23 +675,23 @@ loop:
668 return err; 675 return err;
669} 676}
670 677
671int follow_up(struct vfsmount **mnt, struct dentry **dentry) 678int follow_up(struct path *path)
672{ 679{
673 struct vfsmount *parent; 680 struct vfsmount *parent;
674 struct dentry *mountpoint; 681 struct dentry *mountpoint;
675 spin_lock(&vfsmount_lock); 682 spin_lock(&vfsmount_lock);
676 parent=(*mnt)->mnt_parent; 683 parent = path->mnt->mnt_parent;
677 if (parent == *mnt) { 684 if (parent == path->mnt) {
678 spin_unlock(&vfsmount_lock); 685 spin_unlock(&vfsmount_lock);
679 return 0; 686 return 0;
680 } 687 }
681 mntget(parent); 688 mntget(parent);
682 mountpoint=dget((*mnt)->mnt_mountpoint); 689 mountpoint = dget(path->mnt->mnt_mountpoint);
683 spin_unlock(&vfsmount_lock); 690 spin_unlock(&vfsmount_lock);
684 dput(*dentry); 691 dput(path->dentry);
685 *dentry = mountpoint; 692 path->dentry = mountpoint;
686 mntput(*mnt); 693 mntput(path->mnt);
687 *mnt = parent; 694 path->mnt = parent;
688 return 1; 695 return 1;
689} 696}
690 697
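
follow_up() (and, below, follow_mount() and follow_down()) now takes a struct path instead of a vfsmount/dentry pointer pair, so the two halves are updated together and callers can no longer let them drift apart. Caller-side, the conversion looks roughly like this (locals are illustrative):

    /* Before: follow_up(&mnt, &dentry) mutated two loose pointers.
     * After: the pair travels as one object. */
    static void climb_to_topmost(struct path *path)
    {
            while (follow_up(path))
                    ;       /* each success ascends one mount level */
    }
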
@@ -695,7 +702,7 @@ static int __follow_mount(struct path *path)
695{ 702{
696 int res = 0; 703 int res = 0;
697 while (d_mountpoint(path->dentry)) { 704 while (d_mountpoint(path->dentry)) {
698 struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry); 705 struct vfsmount *mounted = lookup_mnt(path);
699 if (!mounted) 706 if (!mounted)
700 break; 707 break;
701 dput(path->dentry); 708 dput(path->dentry);
@@ -708,32 +715,32 @@ static int __follow_mount(struct path *path)
708 return res; 715 return res;
709} 716}
710 717
711static void follow_mount(struct vfsmount **mnt, struct dentry **dentry) 718static void follow_mount(struct path *path)
712{ 719{
713 while (d_mountpoint(*dentry)) { 720 while (d_mountpoint(path->dentry)) {
714 struct vfsmount *mounted = lookup_mnt(*mnt, *dentry); 721 struct vfsmount *mounted = lookup_mnt(path);
715 if (!mounted) 722 if (!mounted)
716 break; 723 break;
717 dput(*dentry); 724 dput(path->dentry);
718 mntput(*mnt); 725 mntput(path->mnt);
719 *mnt = mounted; 726 path->mnt = mounted;
720 *dentry = dget(mounted->mnt_root); 727 path->dentry = dget(mounted->mnt_root);
721 } 728 }
722} 729}
723 730
724/* no need for dcache_lock, as serialization is taken care of in 731/* no need for dcache_lock, as serialization is taken care of in

725 * namespace.c 732 * namespace.c
726 */ 733 */
727int follow_down(struct vfsmount **mnt, struct dentry **dentry) 734int follow_down(struct path *path)
728{ 735{
729 struct vfsmount *mounted; 736 struct vfsmount *mounted;
730 737
731 mounted = lookup_mnt(*mnt, *dentry); 738 mounted = lookup_mnt(path);
732 if (mounted) { 739 if (mounted) {
733 dput(*dentry); 740 dput(path->dentry);
734 mntput(*mnt); 741 mntput(path->mnt);
735 *mnt = mounted; 742 path->mnt = mounted;
736 *dentry = dget(mounted->mnt_root); 743 path->dentry = dget(mounted->mnt_root);
737 return 1; 744 return 1;
738 } 745 }
739 return 0; 746 return 0;
@@ -741,19 +748,16 @@ int follow_down(struct vfsmount **mnt, struct dentry **dentry)
741 748
742static __always_inline void follow_dotdot(struct nameidata *nd) 749static __always_inline void follow_dotdot(struct nameidata *nd)
743{ 750{
744 struct fs_struct *fs = current->fs; 751 set_root(nd);
745 752
746 while(1) { 753 while(1) {
747 struct vfsmount *parent; 754 struct vfsmount *parent;
748 struct dentry *old = nd->path.dentry; 755 struct dentry *old = nd->path.dentry;
749 756
750 read_lock(&fs->lock); 757 if (nd->path.dentry == nd->root.dentry &&
751 if (nd->path.dentry == fs->root.dentry && 758 nd->path.mnt == nd->root.mnt) {
752 nd->path.mnt == fs->root.mnt) {
753 read_unlock(&fs->lock);
754 break; 759 break;
755 } 760 }
756 read_unlock(&fs->lock);
757 spin_lock(&dcache_lock); 761 spin_lock(&dcache_lock);
758 if (nd->path.dentry != nd->path.mnt->mnt_root) { 762 if (nd->path.dentry != nd->path.mnt->mnt_root) {
759 nd->path.dentry = dget(nd->path.dentry->d_parent); 763 nd->path.dentry = dget(nd->path.dentry->d_parent);
@@ -775,7 +779,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
775 mntput(nd->path.mnt); 779 mntput(nd->path.mnt);
776 nd->path.mnt = parent; 780 nd->path.mnt = parent;
777 } 781 }
778 follow_mount(&nd->path.mnt, &nd->path.dentry); 782 follow_mount(&nd->path);
779} 783}
780 784
781/* 785/*
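
With the root cached in the nameidata, follow_dotdot() compares against nd->root directly instead of taking fs->lock around every loop iteration; the cached path holds its own reference, so it cannot change for the duration of the walk. The termination test reduces to a plain comparison (sketch):

    /* ".." stops at the walk's root; no fs->lock needed because
     * nd->root is private to this lookup. */
    static bool at_walk_root(const struct nameidata *nd)
    {
            return nd->path.dentry == nd->root.dentry &&
                   nd->path.mnt == nd->root.mnt;
    }
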
@@ -853,7 +857,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
853 err = inode_permission(nd->path.dentry->d_inode, 857 err = inode_permission(nd->path.dentry->d_inode,
854 MAY_EXEC); 858 MAY_EXEC);
855 if (!err) 859 if (!err)
856 err = ima_path_check(&nd->path, MAY_EXEC); 860 err = ima_path_check(&nd->path, MAY_EXEC,
861 IMA_COUNT_UPDATE);
857 if (err) 862 if (err)
858 break; 863 break;
859 864
@@ -1016,25 +1021,23 @@ static int path_walk(const char *name, struct nameidata *nd)
1016 return link_path_walk(name, nd); 1021 return link_path_walk(name, nd);
1017} 1022}
1018 1023
1019/* Returns 0 and nd will be valid on success; returns an error otherwise. */ 1024static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1020static int do_path_lookup(int dfd, const char *name,
1021 unsigned int flags, struct nameidata *nd)
1022{ 1025{
1023 int retval = 0; 1026 int retval = 0;
1024 int fput_needed; 1027 int fput_needed;
1025 struct file *file; 1028 struct file *file;
1026 struct fs_struct *fs = current->fs;
1027 1029
1028 nd->last_type = LAST_ROOT; /* if there are only slashes... */ 1030 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1029 nd->flags = flags; 1031 nd->flags = flags;
1030 nd->depth = 0; 1032 nd->depth = 0;
1033 nd->root.mnt = NULL;
1031 1034
1032 if (*name=='/') { 1035 if (*name=='/') {
1033 read_lock(&fs->lock); 1036 set_root(nd);
1034 nd->path = fs->root; 1037 nd->path = nd->root;
1035 path_get(&fs->root); 1038 path_get(&nd->root);
1036 read_unlock(&fs->lock);
1037 } else if (dfd == AT_FDCWD) { 1039 } else if (dfd == AT_FDCWD) {
1040 struct fs_struct *fs = current->fs;
1038 read_lock(&fs->lock); 1041 read_lock(&fs->lock);
1039 nd->path = fs->pwd; 1042 nd->path = fs->pwd;
1040 path_get(&fs->pwd); 1043 path_get(&fs->pwd);
@@ -1062,17 +1065,29 @@ static int do_path_lookup(int dfd, const char *name,
1062 1065
1063 fput_light(file, fput_needed); 1066 fput_light(file, fput_needed);
1064 } 1067 }
1068 return 0;
1065 1069
1066 retval = path_walk(name, nd); 1070fput_fail:
1071 fput_light(file, fput_needed);
1072out_fail:
1073 return retval;
1074}
1075
1076/* Returns 0 and nd will be valid on success; returns an error otherwise. */
1077static int do_path_lookup(int dfd, const char *name,
1078 unsigned int flags, struct nameidata *nd)
1079{
1080 int retval = path_init(dfd, name, flags, nd);
1081 if (!retval)
1082 retval = path_walk(name, nd);
1067 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1083 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1068 nd->path.dentry->d_inode)) 1084 nd->path.dentry->d_inode))
1069 audit_inode(name, nd->path.dentry); 1085 audit_inode(name, nd->path.dentry);
1070out_fail: 1086 if (nd->root.mnt) {
1087 path_put(&nd->root);
1088 nd->root.mnt = NULL;
1089 }
1071 return retval; 1090 return retval;
1072
1073fput_fail:
1074 fput_light(file, fput_needed);
1075 goto out_fail;
1076} 1091}
1077 1092
1078int path_lookup(const char *name, unsigned int flags, 1093int path_lookup(const char *name, unsigned int flags,
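
Splitting do_path_lookup() into path_init() plus path_walk() lets do_filp_open() (further down) pick a starting directory once and then drive the walk itself. Note that nd->root is dropped only after path_walk() returns, since the walk may populate it via set_root() when it meets an absolute symlink. The composition, condensed (not verbatim):

    static int lookup_shape(int dfd, const char *name,
                            unsigned int flags, struct nameidata *nd)
    {
            int retval = path_init(dfd, name, flags, nd);   /* pick start dir */

            if (!retval)
                    retval = path_walk(name, nd);   /* may set nd->root */
            if (nd->root.mnt) {                     /* drop the cached root */
                    path_put(&nd->root);
                    nd->root.mnt = NULL;
            }
            return retval;
    }
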
@@ -1112,14 +1127,18 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1112 nd->path.dentry = dentry; 1127 nd->path.dentry = dentry;
1113 nd->path.mnt = mnt; 1128 nd->path.mnt = mnt;
1114 path_get(&nd->path); 1129 path_get(&nd->path);
1130 nd->root = nd->path;
1131 path_get(&nd->root);
1115 1132
1116 retval = path_walk(name, nd); 1133 retval = path_walk(name, nd);
1117 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1134 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1118 nd->path.dentry->d_inode)) 1135 nd->path.dentry->d_inode))
1119 audit_inode(name, nd->path.dentry); 1136 audit_inode(name, nd->path.dentry);
1120 1137
1121 return retval; 1138 path_put(&nd->root);
1139 nd->root.mnt = NULL;
1122 1140
1141 return retval;
1123} 1142}
1124 1143
1125/** 1144/**
@@ -1515,7 +1534,8 @@ int may_open(struct path *path, int acc_mode, int flag)
1515 return error; 1534 return error;
1516 1535
1517 error = ima_path_check(path, 1536 error = ima_path_check(path,
1518 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); 1537 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC),
1538 IMA_COUNT_UPDATE);
1519 if (error) 1539 if (error)
1520 return error; 1540 return error;
1521 /* 1541 /*
@@ -1674,9 +1694,17 @@ struct file *do_filp_open(int dfd, const char *pathname,
1674 /* 1694 /*
1675 * Create - we need to know the parent. 1695 * Create - we need to know the parent.
1676 */ 1696 */
1677 error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); 1697 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
1678 if (error) 1698 if (error)
1679 return ERR_PTR(error); 1699 return ERR_PTR(error);
1700 error = path_walk(pathname, &nd);
1701 if (error) {
1702 if (nd.root.mnt)
1703 path_put(&nd.root);
1704 return ERR_PTR(error);
1705 }
1706 if (unlikely(!audit_dummy_context()))
1707 audit_inode(pathname, nd.path.dentry);
1680 1708
1681 /* 1709 /*
1682 * We have the parent and last component. First of all, check 1710 * We have the parent and last component. First of all, check
@@ -1733,7 +1761,13 @@ do_last:
1733 goto exit; 1761 goto exit;
1734 } 1762 }
1735 filp = nameidata_to_filp(&nd, open_flag); 1763 filp = nameidata_to_filp(&nd, open_flag);
1764 if (IS_ERR(filp))
1765 ima_counts_put(&nd.path,
1766 acc_mode & (MAY_READ | MAY_WRITE |
1767 MAY_EXEC));
1736 mnt_drop_write(nd.path.mnt); 1768 mnt_drop_write(nd.path.mnt);
1769 if (nd.root.mnt)
1770 path_put(&nd.root);
1737 return filp; 1771 return filp;
1738 } 1772 }
1739 1773
@@ -1787,6 +1821,9 @@ ok:
1787 goto exit; 1821 goto exit;
1788 } 1822 }
1789 filp = nameidata_to_filp(&nd, open_flag); 1823 filp = nameidata_to_filp(&nd, open_flag);
1824 if (IS_ERR(filp))
1825 ima_counts_put(&nd.path,
1826 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
1790 /* 1827 /*
1791 * It is now safe to drop the mnt write 1828 * It is now safe to drop the mnt write
1792 * because the filp has had a write taken 1829 * because the filp has had a write taken
@@ -1794,6 +1831,8 @@ ok:
1794 */ 1831 */
1795 if (will_write) 1832 if (will_write)
1796 mnt_drop_write(nd.path.mnt); 1833 mnt_drop_write(nd.path.mnt);
1834 if (nd.root.mnt)
1835 path_put(&nd.root);
1797 return filp; 1836 return filp;
1798 1837
1799exit_mutex_unlock: 1838exit_mutex_unlock:
@@ -1804,6 +1843,8 @@ exit:
1804 if (!IS_ERR(nd.intent.open.file)) 1843 if (!IS_ERR(nd.intent.open.file))
1805 release_open_intent(&nd); 1844 release_open_intent(&nd);
1806exit_parent: 1845exit_parent:
1846 if (nd.root.mnt)
1847 path_put(&nd.root);
1807 path_put(&nd.path); 1848 path_put(&nd.path);
1808 return ERR_PTR(error); 1849 return ERR_PTR(error);
1809 1850
@@ -1832,6 +1873,8 @@ do_link:
1832 * with "intent.open". 1873 * with "intent.open".
1833 */ 1874 */
1834 release_open_intent(&nd); 1875 release_open_intent(&nd);
1876 if (nd.root.mnt)
1877 path_put(&nd.root);
1835 return ERR_PTR(error); 1878 return ERR_PTR(error);
1836 } 1879 }
1837 nd.flags &= ~LOOKUP_PARENT; 1880 nd.flags &= ~LOOKUP_PARENT;
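
The do_filp_open() hunks enforce two symmetries on every exit path: nd.root, if the walk cached it, is released before returning, and a failed nameidata_to_filp() is balanced with ima_counts_put() so the counters taken by ima_path_check() in may_open() do not leak. A sketch of the pairing (condensed, helper name ours):

    /* Whatever may_open()'s ima_path_check() counted must be returned
     * if no struct file materializes. */
    static struct file *finish_open_sketch(struct nameidata *nd,
                                           int open_flag, int acc_mode)
    {
            struct file *filp = nameidata_to_filp(nd, open_flag);

            if (IS_ERR(filp))
                    ima_counts_put(&nd->path,
                            acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
            if (nd->root.mnt)       /* always drop the cached root */
                    path_put(&nd->root);
            return filp;
    }
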
diff --git a/fs/namespace.c b/fs/namespace.c
index 134d494158d9..7230787d18b0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -22,6 +22,7 @@
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/mnt_namespace.h> 23#include <linux/mnt_namespace.h>
24#include <linux/namei.h> 24#include <linux/namei.h>
25#include <linux/nsproxy.h>
25#include <linux/security.h> 26#include <linux/security.h>
26#include <linux/mount.h> 27#include <linux/mount.h>
27#include <linux/ramfs.h> 28#include <linux/ramfs.h>
@@ -42,6 +43,8 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
42static int event; 43static int event;
43static DEFINE_IDA(mnt_id_ida); 44static DEFINE_IDA(mnt_id_ida);
44static DEFINE_IDA(mnt_group_ida); 45static DEFINE_IDA(mnt_group_ida);
46static int mnt_id_start = 0;
47static int mnt_group_start = 1;
45 48
46static struct list_head *mount_hashtable __read_mostly; 49static struct list_head *mount_hashtable __read_mostly;
47static struct kmem_cache *mnt_cache __read_mostly; 50static struct kmem_cache *mnt_cache __read_mostly;
@@ -69,7 +72,9 @@ static int mnt_alloc_id(struct vfsmount *mnt)
69retry: 72retry:
70 ida_pre_get(&mnt_id_ida, GFP_KERNEL); 73 ida_pre_get(&mnt_id_ida, GFP_KERNEL);
71 spin_lock(&vfsmount_lock); 74 spin_lock(&vfsmount_lock);
72 res = ida_get_new(&mnt_id_ida, &mnt->mnt_id); 75 res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
76 if (!res)
77 mnt_id_start = mnt->mnt_id + 1;
73 spin_unlock(&vfsmount_lock); 78 spin_unlock(&vfsmount_lock);
74 if (res == -EAGAIN) 79 if (res == -EAGAIN)
75 goto retry; 80 goto retry;
@@ -79,8 +84,11 @@ retry:
79 84
80static void mnt_free_id(struct vfsmount *mnt) 85static void mnt_free_id(struct vfsmount *mnt)
81{ 86{
87 int id = mnt->mnt_id;
82 spin_lock(&vfsmount_lock); 88 spin_lock(&vfsmount_lock);
83 ida_remove(&mnt_id_ida, mnt->mnt_id); 89 ida_remove(&mnt_id_ida, id);
90 if (mnt_id_start > id)
91 mnt_id_start = id;
84 spin_unlock(&vfsmount_lock); 92 spin_unlock(&vfsmount_lock);
85} 93}
86 94
@@ -91,10 +99,18 @@ static void mnt_free_id(struct vfsmount *mnt)
91 */ 99 */
92static int mnt_alloc_group_id(struct vfsmount *mnt) 100static int mnt_alloc_group_id(struct vfsmount *mnt)
93{ 101{
102 int res;
103
94 if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL)) 104 if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
95 return -ENOMEM; 105 return -ENOMEM;
96 106
97 return ida_get_new_above(&mnt_group_ida, 1, &mnt->mnt_group_id); 107 res = ida_get_new_above(&mnt_group_ida,
108 mnt_group_start,
109 &mnt->mnt_group_id);
110 if (!res)
111 mnt_group_start = mnt->mnt_group_id + 1;
112
113 return res;
98} 114}
99 115
100/* 116/*
@@ -102,7 +118,10 @@ static int mnt_alloc_group_id(struct vfsmount *mnt)
102 */ 118 */
103void mnt_release_group_id(struct vfsmount *mnt) 119void mnt_release_group_id(struct vfsmount *mnt)
104{ 120{
105 ida_remove(&mnt_group_ida, mnt->mnt_group_id); 121 int id = mnt->mnt_group_id;
122 ida_remove(&mnt_group_ida, id);
123 if (mnt_group_start > id)
124 mnt_group_start = id;
106 mnt->mnt_group_id = 0; 125 mnt->mnt_group_id = 0;
107} 126}
108 127
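
Both allocators now feed a moving start hint into ida_get_new_above(): a successful allocation advances the hint past the ID just handed out, and freeing a lower ID rewinds it, so mount IDs stay dense and small rather than growing monotonically. The pattern, extracted into hypothetical helpers (caller provides the locking, as the hunks do with vfsmount_lock):

    static int hinted_ida_get(struct ida *ida, int *hint, int *out)
    {
            int res = ida_get_new_above(ida, *hint, out);

            if (!res)
                    *hint = *out + 1;       /* next search starts past us */
            return res;
    }

    static void hinted_ida_put(struct ida *ida, int *hint, int id)
    {
            ida_remove(ida, id);
            if (*hint > id)
                    *hint = id;             /* reuse freed low IDs first */
    }
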
@@ -131,10 +150,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
131 INIT_LIST_HEAD(&mnt->mnt_share); 150 INIT_LIST_HEAD(&mnt->mnt_share);
132 INIT_LIST_HEAD(&mnt->mnt_slave_list); 151 INIT_LIST_HEAD(&mnt->mnt_slave_list);
133 INIT_LIST_HEAD(&mnt->mnt_slave); 152 INIT_LIST_HEAD(&mnt->mnt_slave);
134 atomic_set(&mnt->__mnt_writers, 0); 153#ifdef CONFIG_SMP
154 mnt->mnt_writers = alloc_percpu(int);
155 if (!mnt->mnt_writers)
156 goto out_free_devname;
157#else
158 mnt->mnt_writers = 0;
159#endif
135 } 160 }
136 return mnt; 161 return mnt;
137 162
163#ifdef CONFIG_SMP
164out_free_devname:
165 kfree(mnt->mnt_devname);
166#endif
138out_free_id: 167out_free_id:
139 mnt_free_id(mnt); 168 mnt_free_id(mnt);
140out_free_cache: 169out_free_cache:
@@ -171,65 +200,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
171} 200}
172EXPORT_SYMBOL_GPL(__mnt_is_readonly); 201EXPORT_SYMBOL_GPL(__mnt_is_readonly);
173 202
174struct mnt_writer { 203static inline void inc_mnt_writers(struct vfsmount *mnt)
175 /* 204{
176 * If holding multiple instances of this lock, they 205#ifdef CONFIG_SMP
177 * must be ordered by cpu number. 206 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
178 */ 207#else
179 spinlock_t lock; 208 mnt->mnt_writers++;
180 struct lock_class_key lock_class; /* compiles out with !lockdep */ 209#endif
181 unsigned long count; 210}
182 struct vfsmount *mnt;
183} ____cacheline_aligned_in_smp;
184static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
185 211
186static int __init init_mnt_writers(void) 212static inline void dec_mnt_writers(struct vfsmount *mnt)
187{ 213{
188 int cpu; 214#ifdef CONFIG_SMP
189 for_each_possible_cpu(cpu) { 215 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
190 struct mnt_writer *writer = &per_cpu(mnt_writers, cpu); 216#else
191 spin_lock_init(&writer->lock); 217 mnt->mnt_writers--;
192 lockdep_set_class(&writer->lock, &writer->lock_class); 218#endif
193 writer->count = 0;
194 }
195 return 0;
196} 219}
197fs_initcall(init_mnt_writers);
198 220
199static void unlock_mnt_writers(void) 221static unsigned int count_mnt_writers(struct vfsmount *mnt)
200{ 222{
223#ifdef CONFIG_SMP
224 unsigned int count = 0;
201 int cpu; 225 int cpu;
202 struct mnt_writer *cpu_writer;
203 226
204 for_each_possible_cpu(cpu) { 227 for_each_possible_cpu(cpu) {
205 cpu_writer = &per_cpu(mnt_writers, cpu); 228 count += *per_cpu_ptr(mnt->mnt_writers, cpu);
206 spin_unlock(&cpu_writer->lock);
207 } 229 }
208}
209 230
210static inline void __clear_mnt_count(struct mnt_writer *cpu_writer) 231 return count;
211{ 232#else
212 if (!cpu_writer->mnt) 233 return mnt->mnt_writers;
213 return; 234#endif
214 /*
215 * This is in case anyone ever leaves an invalid,
216 * old ->mnt and a count of 0.
217 */
218 if (!cpu_writer->count)
219 return;
220 atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
221 cpu_writer->count = 0;
222}
223 /*
224 * must hold cpu_writer->lock
225 */
226static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
227 struct vfsmount *mnt)
228{
229 if (cpu_writer->mnt == mnt)
230 return;
231 __clear_mnt_count(cpu_writer);
232 cpu_writer->mnt = mnt;
233} 235}
234 236
235/* 237/*
@@ -253,74 +255,74 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
253int mnt_want_write(struct vfsmount *mnt) 255int mnt_want_write(struct vfsmount *mnt)
254{ 256{
255 int ret = 0; 257 int ret = 0;
256 struct mnt_writer *cpu_writer;
257 258
258 cpu_writer = &get_cpu_var(mnt_writers); 259 preempt_disable();
259 spin_lock(&cpu_writer->lock); 260 inc_mnt_writers(mnt);
261 /*
262 * The store from inc_mnt_writers must be visible before we enter the
263 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
264 * incremented count after it has set MNT_WRITE_HOLD.
265 */
266 smp_mb();
267 while (mnt->mnt_flags & MNT_WRITE_HOLD)
268 cpu_relax();
269 /*
270 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
271 * be set to match its requirements. So we must not load that until
272 * MNT_WRITE_HOLD is cleared.
273 */
274 smp_rmb();
260 if (__mnt_is_readonly(mnt)) { 275 if (__mnt_is_readonly(mnt)) {
276 dec_mnt_writers(mnt);
261 ret = -EROFS; 277 ret = -EROFS;
262 goto out; 278 goto out;
263 } 279 }
264 use_cpu_writer_for_mount(cpu_writer, mnt);
265 cpu_writer->count++;
266out: 280out:
267 spin_unlock(&cpu_writer->lock); 281 preempt_enable();
268 put_cpu_var(mnt_writers);
269 return ret; 282 return ret;
270} 283}
271EXPORT_SYMBOL_GPL(mnt_want_write); 284EXPORT_SYMBOL_GPL(mnt_want_write);
272 285
273static void lock_mnt_writers(void) 286/**
274{ 287 * mnt_clone_write - get write access to a mount
275 int cpu; 288 * @mnt: the mount on which to take a write
276 struct mnt_writer *cpu_writer; 289 *
277 290 * This is effectively like mnt_want_write, except
278 for_each_possible_cpu(cpu) { 291 * it must only be used to take an extra write reference
279 cpu_writer = &per_cpu(mnt_writers, cpu); 292 * on a mountpoint that we already know has a write reference
280 spin_lock(&cpu_writer->lock); 293 * on it. This allows some optimisation.
281 __clear_mnt_count(cpu_writer); 294 *
282 cpu_writer->mnt = NULL; 295 * After finished, mnt_drop_write must be called as usual to
283 } 296 * drop the reference.
297 */
298int mnt_clone_write(struct vfsmount *mnt)
299{
300 /* superblock may be r/o */
301 if (__mnt_is_readonly(mnt))
302 return -EROFS;
303 preempt_disable();
304 inc_mnt_writers(mnt);
305 preempt_enable();
306 return 0;
284} 307}
308EXPORT_SYMBOL_GPL(mnt_clone_write);
285 309
286/* 310/**
287 * These per-cpu write counts are not guaranteed to have 311 * mnt_want_write_file - get write access to a file's mount
288 * matched increments and decrements on any given cpu. 312 * @file: the file whose mount to take a write on
289 * A file open()ed for write on one cpu and close()d on 313 *
290 * another cpu will imbalance this count. Make sure it 314 * This is like mnt_want_write, but it takes a file and can
291 * does not get too far out of whack. 315 * do some optimisations if the file is open for write already
292 */ 316 */
293static void handle_write_count_underflow(struct vfsmount *mnt) 317int mnt_want_write_file(struct file *file)
294{ 318{
295 if (atomic_read(&mnt->__mnt_writers) >= 319 struct inode *inode = file->f_dentry->d_inode;
296 MNT_WRITER_UNDERFLOW_LIMIT) 320 if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
297 return; 321 return mnt_want_write(file->f_path.mnt);
298 /* 322 else
299 * It isn't necessary to hold all of the locks 323 return mnt_clone_write(file->f_path.mnt);
300 * at the same time, but doing it this way makes
301 * us share a lot more code.
302 */
303 lock_mnt_writers();
304 /*
305 * vfsmount_lock is for mnt_flags.
306 */
307 spin_lock(&vfsmount_lock);
308 /*
309 * If coalescing the per-cpu writer counts did not
310 * get us back to a positive writer count, we have
311 * a bug.
312 */
313 if ((atomic_read(&mnt->__mnt_writers) < 0) &&
314 !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
315 WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
316 "count: %d\n",
317 mnt, atomic_read(&mnt->__mnt_writers));
318 /* use the flag to keep the dmesg spam down */
319 mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
320 }
321 spin_unlock(&vfsmount_lock);
322 unlock_mnt_writers();
323} 324}
325EXPORT_SYMBOL_GPL(mnt_want_write_file);
324 326
325/** 327/**
326 * mnt_drop_write - give up write access to a mount 328 * mnt_drop_write - give up write access to a mount
@@ -332,37 +334,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
332 */ 334 */
333void mnt_drop_write(struct vfsmount *mnt) 335void mnt_drop_write(struct vfsmount *mnt)
334{ 336{
335 int must_check_underflow = 0; 337 preempt_disable();
336 struct mnt_writer *cpu_writer; 338 dec_mnt_writers(mnt);
337 339 preempt_enable();
338 cpu_writer = &get_cpu_var(mnt_writers);
339 spin_lock(&cpu_writer->lock);
340
341 use_cpu_writer_for_mount(cpu_writer, mnt);
342 if (cpu_writer->count > 0) {
343 cpu_writer->count--;
344 } else {
345 must_check_underflow = 1;
346 atomic_dec(&mnt->__mnt_writers);
347 }
348
349 spin_unlock(&cpu_writer->lock);
350 /*
351 * Logically, we could call this each time,
352 * but the __mnt_writers cacheline tends to
353 * be cold, and makes this expensive.
354 */
355 if (must_check_underflow)
356 handle_write_count_underflow(mnt);
357 /*
358 * This could be done right after the spinlock
359 * is taken because the spinlock keeps us on
360 * the cpu, and disables preemption. However,
361 * putting it here bounds the amount that
362 * __mnt_writers can underflow. Without it,
363 * we could theoretically wrap __mnt_writers.
364 */
365 put_cpu_var(mnt_writers);
366} 340}
367EXPORT_SYMBOL_GPL(mnt_drop_write); 341EXPORT_SYMBOL_GPL(mnt_drop_write);
368 342
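
The fast path above is lock-free, so its correctness rests entirely on barrier pairing with mnt_make_readonly() (next hunk): the writer publishes its per-CPU increment before testing MNT_WRITE_HOLD, and must not read MNT_READONLY until the hold bit is clear. A restatement of mnt_want_write(), annotated only to show which barrier pairs with which:

    static int want_write_annotated(struct vfsmount *mnt)
    {
            int ret = 0;

            preempt_disable();
            inc_mnt_writers(mnt);           /* publish intent...          */
            smp_mb();                       /* ...pairs with the slowpath
                                             * smp_mb() after it sets
                                             * MNT_WRITE_HOLD             */
            while (mnt->mnt_flags & MNT_WRITE_HOLD)
                    cpu_relax();            /* slowpath is summing counts */
            smp_rmb();                      /* pairs with the slowpath
                                             * smp_wmb() before it clears
                                             * MNT_WRITE_HOLD             */
            if (__mnt_is_readonly(mnt)) {
                    dec_mnt_writers(mnt);   /* lost the race: back out    */
                    ret = -EROFS;
            }
            preempt_enable();
            return ret;
    }
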
@@ -370,24 +344,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
370{ 344{
371 int ret = 0; 345 int ret = 0;
372 346
373 lock_mnt_writers(); 347 spin_lock(&vfsmount_lock);
348 mnt->mnt_flags |= MNT_WRITE_HOLD;
374 /* 349 /*
375 * With all the locks held, this value is stable 350 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
351 * should be visible before we do.
376 */ 352 */
377 if (atomic_read(&mnt->__mnt_writers) > 0) { 353 smp_mb();
378 ret = -EBUSY; 354
379 goto out;
380 }
381 /* 355 /*
382 * nobody can do a successful mnt_want_write() with all 356 * With writers on hold, if this value is zero, then there are
383 * of the counts in MNT_DENIED_WRITE and the locks held. 357 * definitely no active writers (although held writers may subsequently
358 * increment the count, they'll have to wait, and decrement it after
359 * seeing MNT_READONLY).
360 *
361 * It is OK to have counter incremented on one CPU and decremented on
362 * another: the sum will add up correctly. The danger lies in the
363 * summation itself: if we read one CPU's counter before its increment
364 * lands, but read another CPU's counter after the matching decrement
365 * has already landed there, we would see more decrements than we should.
366 * MNT_WRITE_HOLD protects against this scenario, because
367 * mnt_want_write first increments count, then smp_mb, then spins on
368 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
369 * we're counting up here.
384 */ 370 */
385 spin_lock(&vfsmount_lock); 371 if (count_mnt_writers(mnt) > 0)
386 if (!ret) 372 ret = -EBUSY;
373 else
387 mnt->mnt_flags |= MNT_READONLY; 374 mnt->mnt_flags |= MNT_READONLY;
375 /*
376 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
377 * that become unheld will see MNT_READONLY.
378 */
379 smp_wmb();
380 mnt->mnt_flags &= ~MNT_WRITE_HOLD;
388 spin_unlock(&vfsmount_lock); 381 spin_unlock(&vfsmount_lock);
389out:
390 unlock_mnt_writers();
391 return ret; 382 return ret;
392} 383}
393 384
@@ -410,6 +401,9 @@ void free_vfsmnt(struct vfsmount *mnt)
410{ 401{
411 kfree(mnt->mnt_devname); 402 kfree(mnt->mnt_devname);
412 mnt_free_id(mnt); 403 mnt_free_id(mnt);
404#ifdef CONFIG_SMP
405 free_percpu(mnt->mnt_writers);
406#endif
413 kmem_cache_free(mnt_cache, mnt); 407 kmem_cache_free(mnt_cache, mnt);
414} 408}
415 409
@@ -442,11 +436,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
442 * lookup_mnt increments the ref count before returning 436 * lookup_mnt increments the ref count before returning
443 * the vfsmount struct. 437 * the vfsmount struct.
444 */ 438 */
445struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) 439struct vfsmount *lookup_mnt(struct path *path)
446{ 440{
447 struct vfsmount *child_mnt; 441 struct vfsmount *child_mnt;
448 spin_lock(&vfsmount_lock); 442 spin_lock(&vfsmount_lock);
449 if ((child_mnt = __lookup_mnt(mnt, dentry, 1))) 443 if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
450 mntget(child_mnt); 444 mntget(child_mnt);
451 spin_unlock(&vfsmount_lock); 445 spin_unlock(&vfsmount_lock);
452 return child_mnt; 446 return child_mnt;
@@ -604,38 +598,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
604 598
605static inline void __mntput(struct vfsmount *mnt) 599static inline void __mntput(struct vfsmount *mnt)
606{ 600{
607 int cpu;
608 struct super_block *sb = mnt->mnt_sb; 601 struct super_block *sb = mnt->mnt_sb;
609 /* 602 /*
610 * We don't have to hold all of the locks at the
611 * same time here because we know that we're the
612 * last reference to mnt and that no new writers
613 * can come in.
614 */
615 for_each_possible_cpu(cpu) {
616 struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
617 spin_lock(&cpu_writer->lock);
618 if (cpu_writer->mnt != mnt) {
619 spin_unlock(&cpu_writer->lock);
620 continue;
621 }
622 atomic_add(cpu_writer->count, &mnt->__mnt_writers);
623 cpu_writer->count = 0;
624 /*
625 * Might as well do this so that no one
626 * ever sees the pointer and expects
627 * it to be valid.
628 */
629 cpu_writer->mnt = NULL;
630 spin_unlock(&cpu_writer->lock);
631 }
632 /*
633 * This probably indicates that somebody messed 603 * This probably indicates that somebody messed
634 * up a mnt_want/drop_write() pair. If this 604 * up a mnt_want/drop_write() pair. If this
635 * happens, the filesystem was probably unable 605 * happens, the filesystem was probably unable
636 * to make r/w->r/o transitions. 606 * to make r/w->r/o transitions.
637 */ 607 */
638 WARN_ON(atomic_read(&mnt->__mnt_writers)); 608 /*
609 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
610 * provides barriers, so count_mnt_writers() below is safe. AV
611 */
612 WARN_ON(count_mnt_writers(mnt));
639 dput(mnt->mnt_root); 613 dput(mnt->mnt_root);
640 free_vfsmnt(mnt); 614 free_vfsmnt(mnt);
641 deactivate_super(sb); 615 deactivate_super(sb);
@@ -1106,11 +1080,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
1106 * we just try to remount it readonly. 1080 * we just try to remount it readonly.
1107 */ 1081 */
1108 down_write(&sb->s_umount); 1082 down_write(&sb->s_umount);
1109 if (!(sb->s_flags & MS_RDONLY)) { 1083 if (!(sb->s_flags & MS_RDONLY))
1110 lock_kernel();
1111 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); 1084 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
1112 unlock_kernel();
1113 }
1114 up_write(&sb->s_umount); 1085 up_write(&sb->s_umount);
1115 return retval; 1086 return retval;
1116 } 1087 }
@@ -1253,11 +1224,11 @@ Enomem:
1253 return NULL; 1224 return NULL;
1254} 1225}
1255 1226
1256struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry) 1227struct vfsmount *collect_mounts(struct path *path)
1257{ 1228{
1258 struct vfsmount *tree; 1229 struct vfsmount *tree;
1259 down_write(&namespace_sem); 1230 down_write(&namespace_sem);
1260 tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE); 1231 tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE);
1261 up_write(&namespace_sem); 1232 up_write(&namespace_sem);
1262 return tree; 1233 return tree;
1263} 1234}
@@ -1430,7 +1401,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
1430 goto out_unlock; 1401 goto out_unlock;
1431 1402
1432 err = -ENOENT; 1403 err = -ENOENT;
1433 if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry)) 1404 if (!d_unlinked(path->dentry))
1434 err = attach_recursive_mnt(mnt, path, NULL); 1405 err = attach_recursive_mnt(mnt, path, NULL);
1435out_unlock: 1406out_unlock:
1436 mutex_unlock(&path->dentry->d_inode->i_mutex); 1407 mutex_unlock(&path->dentry->d_inode->i_mutex);
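
Three call sites in this file (here, do_move_mount(), and pivot_root() below) converge on a single predicate, d_unlinked(), rather than open-coding the d_unhashed()/IS_ROOT() combination with inconsistent polarity at each site. For reference, the helper amounts to (from the companion dcache change in this series):

    static inline int d_unlinked(struct dentry *dentry)
    {
            return d_unhashed(dentry) && !IS_ROOT(dentry);
    }
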
@@ -1601,7 +1572,7 @@ static int do_move_mount(struct path *path, char *old_name)
1601 1572
1602 down_write(&namespace_sem); 1573 down_write(&namespace_sem);
1603 while (d_mountpoint(path->dentry) && 1574 while (d_mountpoint(path->dentry) &&
1604 follow_down(&path->mnt, &path->dentry)) 1575 follow_down(path))
1605 ; 1576 ;
1606 err = -EINVAL; 1577 err = -EINVAL;
1607 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1578 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
@@ -1612,7 +1583,7 @@ static int do_move_mount(struct path *path, char *old_name)
1612 if (IS_DEADDIR(path->dentry->d_inode)) 1583 if (IS_DEADDIR(path->dentry->d_inode))
1613 goto out1; 1584 goto out1;
1614 1585
1615 if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry)) 1586 if (d_unlinked(path->dentry))
1616 goto out1; 1587 goto out1;
1617 1588
1618 err = -EINVAL; 1589 err = -EINVAL;
@@ -1676,7 +1647,9 @@ static int do_new_mount(struct path *path, char *type, int flags,
1676 if (!capable(CAP_SYS_ADMIN)) 1647 if (!capable(CAP_SYS_ADMIN))
1677 return -EPERM; 1648 return -EPERM;
1678 1649
1650 lock_kernel();
1679 mnt = do_kern_mount(type, flags, name, data); 1651 mnt = do_kern_mount(type, flags, name, data);
1652 unlock_kernel();
1680 if (IS_ERR(mnt)) 1653 if (IS_ERR(mnt))
1681 return PTR_ERR(mnt); 1654 return PTR_ERR(mnt);
1682 1655
@@ -1695,10 +1668,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1695 down_write(&namespace_sem); 1668 down_write(&namespace_sem);
1696 /* Something was mounted here while we slept */ 1669 /* Something was mounted here while we slept */
1697 while (d_mountpoint(path->dentry) && 1670 while (d_mountpoint(path->dentry) &&
1698 follow_down(&path->mnt, &path->dentry)) 1671 follow_down(path))
1699 ; 1672 ;
1700 err = -EINVAL; 1673 err = -EINVAL;
1701 if (!check_mnt(path->mnt)) 1674 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
1702 goto unlock; 1675 goto unlock;
1703 1676
1704 /* Refuse the same filesystem on the same mount point */ 1677 /* Refuse the same filesystem on the same mount point */
@@ -1984,6 +1957,21 @@ dput_out:
1984 return retval; 1957 return retval;
1985} 1958}
1986 1959
1960static struct mnt_namespace *alloc_mnt_ns(void)
1961{
1962 struct mnt_namespace *new_ns;
1963
1964 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
1965 if (!new_ns)
1966 return ERR_PTR(-ENOMEM);
1967 atomic_set(&new_ns->count, 1);
1968 new_ns->root = NULL;
1969 INIT_LIST_HEAD(&new_ns->list);
1970 init_waitqueue_head(&new_ns->poll);
1971 new_ns->event = 0;
1972 return new_ns;
1973}
1974
1987/* 1975/*
1988 * Allocate a new namespace structure and populate it with contents 1976 * Allocate a new namespace structure and populate it with contents
1989 * copied from the namespace of the passed in task structure. 1977 * copied from the namespace of the passed in task structure.
@@ -1995,14 +1983,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
1995 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; 1983 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
1996 struct vfsmount *p, *q; 1984 struct vfsmount *p, *q;
1997 1985
1998 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); 1986 new_ns = alloc_mnt_ns();
1999 if (!new_ns) 1987 if (IS_ERR(new_ns))
2000 return ERR_PTR(-ENOMEM); 1988 return new_ns;
2001
2002 atomic_set(&new_ns->count, 1);
2003 INIT_LIST_HEAD(&new_ns->list);
2004 init_waitqueue_head(&new_ns->poll);
2005 new_ns->event = 0;
2006 1989
2007 down_write(&namespace_sem); 1990 down_write(&namespace_sem);
2008 /* First pass: copy the tree topology */ 1991 /* First pass: copy the tree topology */
@@ -2066,6 +2049,24 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2066 return new_ns; 2049 return new_ns;
2067} 2050}
2068 2051
2052/**
2053 * create_mnt_ns - creates a private namespace and adds a root filesystem
2054 * @mnt: pointer to the new root filesystem mountpoint
2055 */
2056struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
2057{
2058 struct mnt_namespace *new_ns;
2059
2060 new_ns = alloc_mnt_ns();
2061 if (!IS_ERR(new_ns)) {
2062 mnt->mnt_ns = new_ns;
2063 new_ns->root = mnt;
2064 list_add(&new_ns->list, &new_ns->root->mnt_list);
2065 }
2066 return new_ns;
2067}
2068EXPORT_SYMBOL(create_mnt_ns);
2069
2069SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, 2070SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2070 char __user *, type, unsigned long, flags, void __user *, data) 2071 char __user *, type, unsigned long, flags, void __user *, data)
2071{ 2072{
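
alloc_mnt_ns() factors the bare allocation out of dup_mnt_ns(), and create_mnt_ns() builds on it to wrap a single mount into a fresh, private namespace; it is exported because kernel-internal mounters need it. A hypothetical caller, sketched:

    /* Illustrative only: mount a filesystem and give it its own
     * namespace in one step. */
    static struct mnt_namespace *make_private_ns(const char *fstype)
    {
            struct vfsmount *mnt = do_kern_mount(fstype, 0, fstype, NULL);

            if (IS_ERR(mnt))
                    return ERR_CAST(mnt);
            return create_mnt_ns(mnt);      /* the ns now owns the mount */
    }
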
@@ -2092,10 +2093,8 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2092 if (retval < 0) 2093 if (retval < 0)
2093 goto out3; 2094 goto out3;
2094 2095
2095 lock_kernel();
2096 retval = do_mount((char *)dev_page, dir_page, (char *)type_page, 2096 retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
2097 flags, (void *)data_page); 2097 flags, (void *)data_page);
2098 unlock_kernel();
2099 free_page(data_page); 2098 free_page(data_page);
2100 2099
2101out3: 2100out3:
@@ -2175,9 +2174,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2175 error = -ENOENT; 2174 error = -ENOENT;
2176 if (IS_DEADDIR(new.dentry->d_inode)) 2175 if (IS_DEADDIR(new.dentry->d_inode))
2177 goto out2; 2176 goto out2;
2178 if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry)) 2177 if (d_unlinked(new.dentry))
2179 goto out2; 2178 goto out2;
2180 if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry)) 2179 if (d_unlinked(old.dentry))
2181 goto out2; 2180 goto out2;
2182 error = -EBUSY; 2181 error = -EBUSY;
2183 if (new.mnt == root.mnt || 2182 if (new.mnt == root.mnt ||
@@ -2243,16 +2242,9 @@ static void __init init_mount_tree(void)
2243 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); 2242 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
2244 if (IS_ERR(mnt)) 2243 if (IS_ERR(mnt))
2245 panic("Can't create rootfs"); 2244 panic("Can't create rootfs");
2246 ns = kmalloc(sizeof(*ns), GFP_KERNEL); 2245 ns = create_mnt_ns(mnt);
2247 if (!ns) 2246 if (IS_ERR(ns))
2248 panic("Can't allocate initial namespace"); 2247 panic("Can't allocate initial namespace");
2249 atomic_set(&ns->count, 1);
2250 INIT_LIST_HEAD(&ns->list);
2251 init_waitqueue_head(&ns->poll);
2252 ns->event = 0;
2253 list_add(&mnt->mnt_list, &ns->list);
2254 ns->root = mnt;
2255 mnt->mnt_ns = ns;
2256 2248
2257 init_task.nsproxy->mnt_ns = ns; 2249 init_task.nsproxy->mnt_ns = ns;
2258 get_mnt_ns(ns); 2250 get_mnt_ns(ns);
@@ -2295,10 +2287,14 @@ void __init mnt_init(void)
2295 init_mount_tree(); 2287 init_mount_tree();
2296} 2288}
2297 2289
2298void __put_mnt_ns(struct mnt_namespace *ns) 2290void put_mnt_ns(struct mnt_namespace *ns)
2299{ 2291{
2300 struct vfsmount *root = ns->root; 2292 struct vfsmount *root;
2301 LIST_HEAD(umount_list); 2293 LIST_HEAD(umount_list);
2294
2295 if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock))
2296 return;
2297 root = ns->root;
2302 ns->root = NULL; 2298 ns->root = NULL;
2303 spin_unlock(&vfsmount_lock); 2299 spin_unlock(&vfsmount_lock);
2304 down_write(&namespace_sem); 2300 down_write(&namespace_sem);
@@ -2309,3 +2305,4 @@ void __put_mnt_ns(struct mnt_namespace *ns)
2309 release_mounts(&umount_list); 2305 release_mounts(&umount_list);
2310 kfree(ns); 2306 kfree(ns);
2311} 2307}
2308EXPORT_SYMBOL(put_mnt_ns);
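
put_mnt_ns() absorbs the old __put_mnt_ns() entry point by using atomic_dec_and_lock(): the spinlock is acquired only on the final 1 -> 0 transition, atomically with the decrement, so a concurrent lookup under vfsmount_lock can never revive a namespace that teardown has already claimed. The general shape of the idiom (types hypothetical):

    struct some_obj {                       /* stand-in for mnt_namespace */
            atomic_t count;
            void *published;
    };

    static void put_ref(struct some_obj *o, spinlock_t *lock)
    {
            if (!atomic_dec_and_lock(&o->count, lock))
                    return;                 /* not the last reference */
            /* count is zero and 'lock' is held: unpublish safely */
            o->published = NULL;
            spin_unlock(lock);
            kfree(o);
    }
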
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d642f0e5b365..b99ce205b1bd 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -736,6 +736,8 @@ static void ncp_put_super(struct super_block *sb)
736{ 736{
737 struct ncp_server *server = NCP_SBP(sb); 737 struct ncp_server *server = NCP_SBP(sb);
738 738
739 lock_kernel();
740
739 ncp_lock_server(server); 741 ncp_lock_server(server);
740 ncp_disconnect(server); 742 ncp_disconnect(server);
741 ncp_unlock_server(server); 743 ncp_unlock_server(server);
@@ -769,6 +771,8 @@ static void ncp_put_super(struct super_block *sb)
769 vfree(server->packet); 771 vfree(server->packet);
770 sb->s_fs_info = NULL; 772 sb->s_fs_info = NULL;
771 kfree(server); 773 kfree(server);
774
775 unlock_kernel();
772} 776}
773 777
774static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) 778static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 97645f112114..0ec6237a5970 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -1113,11 +1113,13 @@ ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen,
1113 1113
1114 if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { 1114 if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
1115 int k; 1115 int k;
1116 unicode_t u;
1116 1117
1117 k = utf8_mbtowc(&ec, iname, iname_end - iname); 1118 k = utf8_to_utf32(iname, iname_end - iname, &u);
1118 if (k < 0) 1119 if (k < 0 || u > MAX_WCHAR_T)
1119 return -EINVAL; 1120 return -EINVAL;
1120 iname += k; 1121 iname += k;
1122 ec = u;
1121 } else { 1123 } else {
1122 if (*iname == NCP_ESC) { 1124 if (*iname == NCP_ESC) {
1123 int k; 1125 int k;
@@ -1214,7 +1216,7 @@ ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen,
1214 if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { 1216 if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
1215 int k; 1217 int k;
1216 1218
1217 k = utf8_wctomb(iname, ec, iname_end - iname); 1219 k = utf32_to_utf8(ec, iname, iname_end - iname);
1218 if (k < 0) { 1220 if (k < 0) {
1219 err = -ENAMETOOLONG; 1221 err = -ENAMETOOLONG;
1220 goto quit; 1222 goto quit;
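
The old utf8_mbtowc()/utf8_wctomb() pair decoded into a 16-bit wchar_t and so could not represent code points beyond the BMP; utf8_to_utf32()/utf32_to_utf8() decode a full unicode_t and push the range check to the caller, which is exactly the u > MAX_WCHAR_T test added above. The caller-side contract, sketched with an illustrative helper:

    /* Decode one UTF-8 sequence into a 16-bit unit, rejecting code
     * points that do not fit. */
    static int decode_one_wchar(const unsigned char *s, int len, __u16 *out)
    {
            unicode_t u;
            int k = utf8_to_utf32(s, len, &u);

            if (k < 0 || u > MAX_WCHAR_T)
                    return -EINVAL; /* malformed, or beyond the BMP */
            *out = u;               /* safe: fits in 16 bits */
            return k;               /* bytes consumed */
    }
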
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index e67f3ec07736..2a77bc25d5af 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -1,6 +1,6 @@
1config NFS_FS 1config NFS_FS
2 tristate "NFS client support" 2 tristate "NFS client support"
3 depends on INET 3 depends on INET && FILE_LOCKING
4 select LOCKD 4 select LOCKD
5 select SUNRPC 5 select SUNRPC
6 select NFS_ACL_SUPPORT if NFS_V3_ACL 6 select NFS_ACL_SUPPORT if NFS_V3_ACL
@@ -74,6 +74,15 @@ config NFS_V4
74 74
75 If unsure, say N. 75 If unsure, say N.
76 76
77config NFS_V4_1
78 bool "NFS client support for NFSv4.1 (DEVELOPER ONLY)"
79 depends on NFS_V4 && EXPERIMENTAL
80 help
81 This option enables support for minor version 1 of the NFSv4 protocol
82 (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
83
84 Unless you're an NFS developer, say N.
85
77config ROOT_NFS 86config ROOT_NFS
78 bool "Root file system on NFS" 87 bool "Root file system on NFS"
79 depends on NFS_FS=y && IP_PNP 88 depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index a886e692ddd0..7f604c7941fb 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -17,6 +17,9 @@
17#include <linux/freezer.h> 17#include <linux/freezer.h>
18#include <linux/kthread.h> 18#include <linux/kthread.h>
19#include <linux/sunrpc/svcauth_gss.h> 19#include <linux/sunrpc/svcauth_gss.h>
20#if defined(CONFIG_NFS_V4_1)
21#include <linux/sunrpc/bc_xprt.h>
22#endif
20 23
21#include <net/inet_sock.h> 24#include <net/inet_sock.h>
22 25
@@ -28,11 +31,12 @@
28 31
29struct nfs_callback_data { 32struct nfs_callback_data {
30 unsigned int users; 33 unsigned int users;
34 struct svc_serv *serv;
31 struct svc_rqst *rqst; 35 struct svc_rqst *rqst;
32 struct task_struct *task; 36 struct task_struct *task;
33}; 37};
34 38
35static struct nfs_callback_data nfs_callback_info; 39static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
36static DEFINE_MUTEX(nfs_callback_mutex); 40static DEFINE_MUTEX(nfs_callback_mutex);
37static struct svc_program nfs4_callback_program; 41static struct svc_program nfs4_callback_program;
38 42
@@ -56,10 +60,10 @@ module_param_call(callback_tcpport, param_set_port, param_get_int,
56 &nfs_callback_set_tcpport, 0644); 60 &nfs_callback_set_tcpport, 0644);
57 61
58/* 62/*
59 * This is the callback kernel thread. 63 * This is the NFSv4 callback kernel thread.
60 */ 64 */
61static int 65static int
62nfs_callback_svc(void *vrqstp) 66nfs4_callback_svc(void *vrqstp)
63{ 67{
64 int err, preverr = 0; 68 int err, preverr = 0;
65 struct svc_rqst *rqstp = vrqstp; 69 struct svc_rqst *rqstp = vrqstp;
@@ -97,20 +101,12 @@ nfs_callback_svc(void *vrqstp)
97} 101}
98 102
99/* 103/*
100 * Bring up the callback thread if it is not already up. 104 * Prepare to bring up the NFSv4 callback service
101 */ 105 */
102int nfs_callback_up(void) 106struct svc_rqst *
107nfs4_callback_up(struct svc_serv *serv)
103{ 108{
104 struct svc_serv *serv = NULL; 109 int ret;
105 int ret = 0;
106
107 mutex_lock(&nfs_callback_mutex);
108 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
109 goto out;
110 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
111 ret = -ENOMEM;
112 if (!serv)
113 goto out_err;
114 110
115 ret = svc_create_xprt(serv, "tcp", PF_INET, 111 ret = svc_create_xprt(serv, "tcp", PF_INET,
116 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 112 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
@@ -127,27 +123,174 @@ int nfs_callback_up(void)
127 nfs_callback_tcpport6 = ret; 123 nfs_callback_tcpport6 = ret;
128 dprintk("NFS: Callback listener port = %u (af %u)\n", 124 dprintk("NFS: Callback listener port = %u (af %u)\n",
129 nfs_callback_tcpport6, PF_INET6); 125 nfs_callback_tcpport6, PF_INET6);
130 } else if (ret != -EAFNOSUPPORT) 126 } else if (ret == -EAFNOSUPPORT)
127 ret = 0;
128 else
131 goto out_err; 129 goto out_err;
132#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ 130#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
133 131
134 nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); 132 return svc_prepare_thread(serv, &serv->sv_pools[0]);
135 if (IS_ERR(nfs_callback_info.rqst)) { 133
136 ret = PTR_ERR(nfs_callback_info.rqst); 134out_err:
137 nfs_callback_info.rqst = NULL; 135 if (ret == 0)
136 ret = -ENOMEM;
137 return ERR_PTR(ret);
138}
139
140#if defined(CONFIG_NFS_V4_1)
141/*
142 * The callback service for NFSv4.1 callbacks
143 */
144static int
145nfs41_callback_svc(void *vrqstp)
146{
147 struct svc_rqst *rqstp = vrqstp;
148 struct svc_serv *serv = rqstp->rq_server;
149 struct rpc_rqst *req;
150 int error;
151 DEFINE_WAIT(wq);
152
153 set_freezable();
154
155 /*
156 * FIXME: do we really need to run this under the BKL? If so, please
157 * add a comment about what it's intended to protect.
158 */
159 lock_kernel();
160 while (!kthread_should_stop()) {
161 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
162 spin_lock_bh(&serv->sv_cb_lock);
163 if (!list_empty(&serv->sv_cb_list)) {
164 req = list_first_entry(&serv->sv_cb_list,
165 struct rpc_rqst, rq_bc_list);
166 list_del(&req->rq_bc_list);
167 spin_unlock_bh(&serv->sv_cb_lock);
168 dprintk("Invoking bc_svc_process()\n");
169 error = bc_svc_process(serv, req, rqstp);
170 dprintk("bc_svc_process() returned w/ error code= %d\n",
171 error);
172 } else {
173 spin_unlock_bh(&serv->sv_cb_lock);
174 schedule();
175 }
176 finish_wait(&serv->sv_cb_waitq, &wq);
177 }
178 unlock_kernel();
179 return 0;
180}
181
182/*
183 * Bring up the NFSv4.1 callback service
184 */
185struct svc_rqst *
186nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
187{
188 struct svc_xprt *bc_xprt;
189 struct svc_rqst *rqstp = ERR_PTR(-ENOMEM);
190
191 dprintk("--> %s\n", __func__);
192 /* Create a svc_sock for the service */
193 bc_xprt = svc_sock_create(serv, xprt->prot);
194 if (!bc_xprt)
195 goto out;
196
197 /*
198 * Save the svc_serv in the transport so that it can
199 * be referenced when the session backchannel is initialized
200 */
201 serv->bc_xprt = bc_xprt;
202 xprt->bc_serv = serv;
203
204 INIT_LIST_HEAD(&serv->sv_cb_list);
205 spin_lock_init(&serv->sv_cb_lock);
206 init_waitqueue_head(&serv->sv_cb_waitq);
207 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
208 if (IS_ERR(rqstp))
209 svc_sock_destroy(bc_xprt);
210out:
211 dprintk("--> %s return %p\n", __func__, rqstp);
212 return rqstp;
213}
214
215static inline int nfs_minorversion_callback_svc_setup(u32 minorversion,
216 struct svc_serv *serv, struct rpc_xprt *xprt,
217 struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
218{
219 if (minorversion) {
220 *rqstpp = nfs41_callback_up(serv, xprt);
221 *callback_svc = nfs41_callback_svc;
222 }
223 return minorversion;
224}
225
226static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
227 struct nfs_callback_data *cb_info)
228{
229 if (minorversion)
230 xprt->bc_serv = cb_info->serv;
231}
232#else
233static inline int nfs_minorversion_callback_svc_setup(u32 minorversion,
234 struct svc_serv *serv, struct rpc_xprt *xprt,
235 struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
236{
237 return 0;
238}
239
240static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
241 struct nfs_callback_data *cb_info)
242{
243}
244#endif /* CONFIG_NFS_V4_1 */
245
246/*
247 * Bring up the callback thread if it is not already up.
248 */
249int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
250{
251 struct svc_serv *serv = NULL;
252 struct svc_rqst *rqstp;
253 int (*callback_svc)(void *vrqstp);
254 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
255 char svc_name[12];
256 int ret = 0;
257 int minorversion_setup;
258
259 mutex_lock(&nfs_callback_mutex);
260 if (cb_info->users++ || cb_info->task != NULL) {
261 nfs_callback_bc_serv(minorversion, xprt, cb_info);
262 goto out;
263 }
264 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
265 if (!serv) {
266 ret = -ENOMEM;
267 goto out_err;
268 }
269
270 minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion,
271 serv, xprt, &rqstp, &callback_svc);
272 if (!minorversion_setup) {
273 /* v4.0 callback setup */
274 rqstp = nfs4_callback_up(serv);
275 callback_svc = nfs4_callback_svc;
276 }
277
278 if (IS_ERR(rqstp)) {
279 ret = PTR_ERR(rqstp);
138 goto out_err; 280 goto out_err;
139 } 281 }
140 282
141 svc_sock_update_bufs(serv); 283 svc_sock_update_bufs(serv);
142 284
143 nfs_callback_info.task = kthread_run(nfs_callback_svc, 285 sprintf(svc_name, "nfsv4.%u-svc", minorversion);
144 nfs_callback_info.rqst, 286 cb_info->serv = serv;
145 "nfsv4-svc"); 287 cb_info->rqst = rqstp;
146 if (IS_ERR(nfs_callback_info.task)) { 288 cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name);
147 ret = PTR_ERR(nfs_callback_info.task); 289 if (IS_ERR(cb_info->task)) {
148 svc_exit_thread(nfs_callback_info.rqst); 290 ret = PTR_ERR(cb_info->task);
149 nfs_callback_info.rqst = NULL; 291 svc_exit_thread(cb_info->rqst);
150 nfs_callback_info.task = NULL; 292 cb_info->rqst = NULL;
293 cb_info->task = NULL;
151 goto out_err; 294 goto out_err;
152 } 295 }
153out: 296out:
@@ -164,22 +307,25 @@ out:
164out_err: 307out_err:
165 dprintk("NFS: Couldn't create callback socket or server thread; " 308 dprintk("NFS: Couldn't create callback socket or server thread; "
166 "err = %d\n", ret); 309 "err = %d\n", ret);
167 nfs_callback_info.users--; 310 cb_info->users--;
168 goto out; 311 goto out;
169} 312}
170 313
171/* 314/*
172 * Kill the callback thread if it's no longer being used. 315 * Kill the callback thread if it's no longer being used.
173 */ 316 */
174void nfs_callback_down(void) 317void nfs_callback_down(int minorversion)
175{ 318{
319 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
320
176 mutex_lock(&nfs_callback_mutex); 321 mutex_lock(&nfs_callback_mutex);
177 nfs_callback_info.users--; 322 cb_info->users--;
178 if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) { 323 if (cb_info->users == 0 && cb_info->task != NULL) {
179 kthread_stop(nfs_callback_info.task); 324 kthread_stop(cb_info->task);
180 svc_exit_thread(nfs_callback_info.rqst); 325 svc_exit_thread(cb_info->rqst);
181 nfs_callback_info.rqst = NULL; 326 cb_info->serv = NULL;
182 nfs_callback_info.task = NULL; 327 cb_info->rqst = NULL;
328 cb_info->task = NULL;
183 } 329 }
184 mutex_unlock(&nfs_callback_mutex); 330 mutex_unlock(&nfs_callback_mutex);
185} 331}
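
nfs_callback_up() now keys everything off the minor version: one nfs_callback_data slot per minor version, refcounted under nfs_callback_mutex, with the thread function and transport setup chosen by nfs_minorversion_callback_svc_setup() (a stub returning 0 when CONFIG_NFS_V4_1 is off). The selection reduces to roughly this (condensed; helper name ours):

    static struct svc_rqst *callback_up_for(u32 minorversion,
                                            struct svc_serv *serv,
                                            struct rpc_xprt *xprt,
                                            int (**svc_fn)(void *))
    {
            if (minorversion) {             /* v4.1: shared backchannel */
                    *svc_fn = nfs41_callback_svc;
                    return nfs41_callback_up(serv, xprt);
            }
            *svc_fn = nfs4_callback_svc;    /* v4.0: own TCP listener */
            return nfs4_callback_up(serv);
    }
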
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index e110e286a262..07baa8254ca1 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -20,13 +20,24 @@ enum nfs4_callback_procnum {
20enum nfs4_callback_opnum { 20enum nfs4_callback_opnum {
21 OP_CB_GETATTR = 3, 21 OP_CB_GETATTR = 3,
22 OP_CB_RECALL = 4, 22 OP_CB_RECALL = 4,
23/* Callback operations new to NFSv4.1 */
24 OP_CB_LAYOUTRECALL = 5,
25 OP_CB_NOTIFY = 6,
26 OP_CB_PUSH_DELEG = 7,
27 OP_CB_RECALL_ANY = 8,
28 OP_CB_RECALLABLE_OBJ_AVAIL = 9,
29 OP_CB_RECALL_SLOT = 10,
30 OP_CB_SEQUENCE = 11,
31 OP_CB_WANTS_CANCELLED = 12,
32 OP_CB_NOTIFY_LOCK = 13,
33 OP_CB_NOTIFY_DEVICEID = 14,
23 OP_CB_ILLEGAL = 10044, 34 OP_CB_ILLEGAL = 10044,
24}; 35};
25 36
26struct cb_compound_hdr_arg { 37struct cb_compound_hdr_arg {
27 unsigned int taglen; 38 unsigned int taglen;
28 const char *tag; 39 const char *tag;
29 unsigned int callback_ident; 40 unsigned int minorversion;
30 unsigned nops; 41 unsigned nops;
31}; 42};
32 43
@@ -59,16 +70,59 @@ struct cb_recallargs {
59 uint32_t truncate; 70 uint32_t truncate;
60}; 71};
61 72
73#if defined(CONFIG_NFS_V4_1)
74
75struct referring_call {
76 uint32_t rc_sequenceid;
77 uint32_t rc_slotid;
78};
79
80struct referring_call_list {
81 struct nfs4_sessionid rcl_sessionid;
82 uint32_t rcl_nrefcalls;
83 struct referring_call *rcl_refcalls;
84};
85
86struct cb_sequenceargs {
87 struct sockaddr *csa_addr;
88 struct nfs4_sessionid csa_sessionid;
89 uint32_t csa_sequenceid;
90 uint32_t csa_slotid;
91 uint32_t csa_highestslotid;
92 uint32_t csa_cachethis;
93 uint32_t csa_nrclists;
94 struct referring_call_list *csa_rclists;
95};
96
97struct cb_sequenceres {
98 __be32 csr_status;
99 struct nfs4_sessionid csr_sessionid;
100 uint32_t csr_sequenceid;
101 uint32_t csr_slotid;
102 uint32_t csr_highestslotid;
103 uint32_t csr_target_highestslotid;
104};
105
106extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
107 struct cb_sequenceres *res);
108
109#endif /* CONFIG_NFS_V4_1 */
110
62extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); 111extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
63extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy); 112extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
64 113
65#ifdef CONFIG_NFS_V4 114#ifdef CONFIG_NFS_V4
66extern int nfs_callback_up(void); 115extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
67extern void nfs_callback_down(void); 116extern void nfs_callback_down(int minorversion);
68#else 117#endif /* CONFIG_NFS_V4 */
69#define nfs_callback_up() (0) 118
70#define nfs_callback_down() do {} while(0) 119/*
71#endif 120 * nfs41: Callbacks are expected to not cause substantial latency,
121 * so we limit their concurrency to 1 by setting up the maximum number
122 * of slots for the backchannel.
123 */
124#define NFS41_BC_MIN_CALLBACKS 1
125#define NFS41_BC_MAX_CALLBACKS 1
72 126
73extern unsigned int nfs_callback_set_tcpport; 127extern unsigned int nfs_callback_set_tcpport;
74extern unsigned short nfs_callback_tcpport; 128extern unsigned short nfs_callback_tcpport;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index f7e83e23cf9f..b7da1f54da68 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -101,3 +101,130 @@ out:
101 dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); 101 dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
102 return res; 102 return res;
103} 103}
104
105#if defined(CONFIG_NFS_V4_1)
106
107/*
108 * Validate the sequenceID sent by the server.
109 * Return success if the sequenceID is one more than what we last saw on
110 * this slot, accounting for wraparound. Increments the slot's sequence.
111 *
112 * We don't yet implement a duplicate request cache, so at this time
113 * we will log replays, and process them as if we had not seen them before,
114 * but we don't bump the sequence in the slot. Not too worried about it,
115 * since we only currently implement idempotent callbacks anyway.
116 *
117 * We have a single slot backchannel at this time, so we don't bother
118 * checking the used_slots bit array on the table. The lower layer guarantees
119 * a single outstanding callback request at a time.
120 */
121static int
122validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid)
123{
124 struct nfs4_slot *slot;
125
126 dprintk("%s enter. slotid %d seqid %d\n",
127 __func__, slotid, seqid);
128
129 if (slotid > NFS41_BC_MAX_CALLBACKS)
130 return htonl(NFS4ERR_BADSLOT);
131
132 slot = tbl->slots + slotid;
133 dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr);
134
135 /* Normal */
136 if (likely(seqid == slot->seq_nr + 1)) {
137 slot->seq_nr++;
138 return htonl(NFS4_OK);
139 }
140
141 /* Replay */
142 if (seqid == slot->seq_nr) {
143 dprintk("%s seqid %d is a replay - no DRC available\n",
144 __func__, seqid);
145 return htonl(NFS4_OK);
146 }
147
148 /* Wraparound */
149 if (seqid == 1 && (slot->seq_nr + 1) == 0) {
150 slot->seq_nr = 1;
151 return htonl(NFS4_OK);
152 }
153
154 /* Misordered request */
155 return htonl(NFS4ERR_SEQ_MISORDERED);
156}
157
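
Concretely, with slot->seq_nr holding the last sequence ID accepted on the slot, validate_seqid() above resolves as follows (worked examples, not exhaustive):

    /*
     * slot->seq_nr = 4,          seqid = 5 -> NFS4_OK, slot becomes 5
     * slot->seq_nr = 5,          seqid = 5 -> NFS4_OK (replay; no DRC,
     *                                         slot unchanged)
     * slot->seq_nr = 0xffffffff, seqid = 1 -> NFS4_OK (wraparound,
     *                                         slot becomes 1)
     * slot->seq_nr = 5,          seqid = 7 -> NFS4ERR_SEQ_MISORDERED
     */
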
158/*
159 * Returns a pointer to a held 'struct nfs_client' that matches the server's
160 * address, major version number, and session ID. It is the caller's
161 * responsibility to release the returned reference.
162 *
163 * Returns NULL if there are no connections with sessions, or if no session
164 * matches the one of interest.
165 */
166static struct nfs_client *find_client_with_session(
167 const struct sockaddr *addr, u32 nfsversion,
168 struct nfs4_sessionid *sessionid)
169{
170 struct nfs_client *clp;
171
172 clp = nfs_find_client(addr, 4);
173 if (clp == NULL)
174 return NULL;
175
176 do {
177 struct nfs_client *prev = clp;
178
179 if (clp->cl_session != NULL) {
180 if (memcmp(clp->cl_session->sess_id.data,
181 sessionid->data,
182 NFS4_MAX_SESSIONID_LEN) == 0) {
183 /* Returns a held reference to clp */
184 return clp;
185 }
186 }
187 clp = nfs_find_client_next(prev);
188 nfs_put_client(prev);
189 } while (clp != NULL);
190
191 return NULL;
192}
193
194/* FIXME: referring calls should be processed */
195unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
196 struct cb_sequenceres *res)
197{
198 struct nfs_client *clp;
199 int i, status;
200
201 for (i = 0; i < args->csa_nrclists; i++)
202 kfree(args->csa_rclists[i].rcl_refcalls);
203 kfree(args->csa_rclists);
204
205 status = htonl(NFS4ERR_BADSESSION);
206 clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid);
207 if (clp == NULL)
208 goto out;
209
210 status = validate_seqid(&clp->cl_session->bc_slot_table,
211 args->csa_slotid, args->csa_sequenceid);
212 if (status)
213 goto out_putclient;
214
215 memcpy(&res->csr_sessionid, &args->csa_sessionid,
216 sizeof(res->csr_sessionid));
217 res->csr_sequenceid = args->csa_sequenceid;
218 res->csr_slotid = args->csa_slotid;
219 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
220 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
221
222out_putclient:
223 nfs_put_client(clp);
224out:
225 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
226 res->csr_status = status;
227 return res->csr_status;
228}
229
230#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index dd0ef34b5845..e5a2dac5f715 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -20,6 +20,11 @@
20 2 + 2 + 3 + 3) 20 2 + 2 + 3 + 3)
21#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 21#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
22 22
23#if defined(CONFIG_NFS_V4_1)
24#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
25 4 + 1 + 3)
26#endif /* CONFIG_NFS_V4_1 */
27
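Reply sizes here are counted in 4-byte XDR words: 4 words for the 16-byte session ID plus one word each for sequenceid, slotid, highestslotid and target_highestslotid. A small check of that arithmetic (the op-header word count below is an assumption for the demo, not the actual value of CB_OP_HDR_RES_MAXSZ):

    #include <stdio.h>

    #define XDR_UNIT        4
    #define OP_HDR_WORDS    2   /* assumed opcode + status header */
    #define SEQ_RES_WORDS   (OP_HDR_WORDS + 4 + 1 + 3)

    int main(void)
    {
        printf("CB_SEQUENCE reply max: %d words = %d bytes\n",
               SEQ_RES_WORDS, SEQ_RES_WORDS * XDR_UNIT);
        return 0;
    }
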
23#define NFSDBG_FACILITY NFSDBG_CALLBACK 28#define NFSDBG_FACILITY NFSDBG_CALLBACK
24 29
25typedef __be32 (*callback_process_op_t)(void *, void *); 30typedef __be32 (*callback_process_op_t)(void *, void *);
@@ -132,7 +137,6 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
132static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr) 137static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
133{ 138{
134 __be32 *p; 139 __be32 *p;
135 unsigned int minor_version;
136 __be32 status; 140 __be32 status;
137 141
138 status = decode_string(xdr, &hdr->taglen, &hdr->tag); 142 status = decode_string(xdr, &hdr->taglen, &hdr->tag);
@@ -147,15 +151,19 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
147 p = read_buf(xdr, 12); 151 p = read_buf(xdr, 12);
148 if (unlikely(p == NULL)) 152 if (unlikely(p == NULL))
149 return htonl(NFS4ERR_RESOURCE); 153 return htonl(NFS4ERR_RESOURCE);
150 minor_version = ntohl(*p++); 154 hdr->minorversion = ntohl(*p++);
151 /* Check minor version is zero. */ 155 /* Check minor version is zero or one. */
152 if (minor_version != 0) { 156 if (hdr->minorversion <= 1) {
153 printk(KERN_WARNING "%s: NFSv4 server callback with illegal minor version %u!\n", 157 p++; /* skip callback_ident */
154 __func__, minor_version); 158 } else {
159 printk(KERN_WARNING "%s: NFSv4 server callback with "
160 "illegal minor version %u!\n",
161 __func__, hdr->minorversion);
155 return htonl(NFS4ERR_MINOR_VERS_MISMATCH); 162 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
156 } 163 }
157 hdr->callback_ident = ntohl(*p++);
158 hdr->nops = ntohl(*p); 164 hdr->nops = ntohl(*p);
165 dprintk("%s: minorversion %d nops %d\n", __func__,
166 hdr->minorversion, hdr->nops);
159 return 0; 167 return 0;
160} 168}
161 169
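The 12 bytes read by decode_compound_hdr_arg() carry three big-endian words: minorversion, callback_ident and nops. The change above keeps the callback_ident word in the stream for both minor versions but only v4.0 semantics ever use it. A simplified stand-alone model of that decode:

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>

    struct hdr { uint32_t minorversion, nops; };

    /* Decode minorversion / callback_ident / nops from three big-endian
     * words, skipping callback_ident as the patched kernel does. */
    static int decode_hdr(const uint32_t wire[3], struct hdr *h)
    {
        h->minorversion = ntohl(wire[0]);
        if (h->minorversion > 1)
            return -1;          /* NFS4ERR_MINOR_VERS_MISMATCH */
        /* wire[1] is callback_ident: meaningful to v4.0, skipped here */
        h->nops = ntohl(wire[2]);
        return 0;
    }

    int main(void)
    {
        uint32_t wire[3] = { htonl(1), htonl(0xdead), htonl(2) };
        struct hdr h;

        if (decode_hdr(wire, &h) == 0)
            printf("minorversion %u nops %u\n", h.minorversion, h.nops);
        return 0;
    }
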
@@ -204,6 +212,122 @@ out:
204 return status; 212 return status;
205} 213}
206 214
215#if defined(CONFIG_NFS_V4_1)
216
217static unsigned decode_sessionid(struct xdr_stream *xdr,
218 struct nfs4_sessionid *sid)
219{
220 uint32_t *p;
221 int len = NFS4_MAX_SESSIONID_LEN;
222
223 p = read_buf(xdr, len);
224 if (unlikely(p == NULL))
225 return htonl(NFS4ERR_RESOURCE);
226
227 memcpy(sid->data, p, len);
228 return 0;
229}
230
231static unsigned decode_rc_list(struct xdr_stream *xdr,
232 struct referring_call_list *rc_list)
233{
234 uint32_t *p;
235 int i;
236 unsigned status;
237
238 status = decode_sessionid(xdr, &rc_list->rcl_sessionid);
239 if (status)
240 goto out;
241
242 status = htonl(NFS4ERR_RESOURCE);
243 p = read_buf(xdr, sizeof(uint32_t));
244 if (unlikely(p == NULL))
245 goto out;
246
247 rc_list->rcl_nrefcalls = ntohl(*p++);
248 if (rc_list->rcl_nrefcalls) {
249 p = read_buf(xdr,
250 rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t));
251 if (unlikely(p == NULL))
252 goto out;
253 rc_list->rcl_refcalls = kmalloc(rc_list->rcl_nrefcalls *
254 sizeof(*rc_list->rcl_refcalls),
255 GFP_KERNEL);
256 if (unlikely(rc_list->rcl_refcalls == NULL))
257 goto out;
258 for (i = 0; i < rc_list->rcl_nrefcalls; i++) {
259 rc_list->rcl_refcalls[i].rc_sequenceid = ntohl(*p++);
260 rc_list->rcl_refcalls[i].rc_slotid = ntohl(*p++);
261 }
262 }
263 status = 0;
264
265out:
266 return status;
267}
268
269static unsigned decode_cb_sequence_args(struct svc_rqst *rqstp,
270 struct xdr_stream *xdr,
271 struct cb_sequenceargs *args)
272{
273 uint32_t *p;
274 int i;
275 unsigned status;
276
277 status = decode_sessionid(xdr, &args->csa_sessionid);
278 if (status)
279 goto out;
280
281 status = htonl(NFS4ERR_RESOURCE);
282 p = read_buf(xdr, 5 * sizeof(uint32_t));
283 if (unlikely(p == NULL))
284 goto out;
285
286 args->csa_addr = svc_addr(rqstp);
287 args->csa_sequenceid = ntohl(*p++);
288 args->csa_slotid = ntohl(*p++);
289 args->csa_highestslotid = ntohl(*p++);
290 args->csa_cachethis = ntohl(*p++);
291 args->csa_nrclists = ntohl(*p++);
292 args->csa_rclists = NULL;
293 if (args->csa_nrclists) {
294 args->csa_rclists = kmalloc(args->csa_nrclists *
295 sizeof(*args->csa_rclists),
296 GFP_KERNEL);
297 if (unlikely(args->csa_rclists == NULL))
298 goto out;
299
300 for (i = 0; i < args->csa_nrclists; i++) {
301 status = decode_rc_list(xdr, &args->csa_rclists[i]);
302 if (status)
303 goto out_free;
304 }
305 }
306 status = 0;
307
308 dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u slotid %u "
309 "highestslotid %u cachethis %d nrclists %u\n",
310 __func__,
311 ((u32 *)&args->csa_sessionid)[0],
312 ((u32 *)&args->csa_sessionid)[1],
313 ((u32 *)&args->csa_sessionid)[2],
314 ((u32 *)&args->csa_sessionid)[3],
315 args->csa_sequenceid, args->csa_slotid,
316 args->csa_highestslotid, args->csa_cachethis,
317 args->csa_nrclists);
318out:
319 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
320 return status;
321
322out_free:
323 for (i = 0; i < args->csa_nrclists; i++)
324 kfree(args->csa_rclists[i].rcl_refcalls);
325 kfree(args->csa_rclists);
326 goto out;
327}
328
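decode_cb_sequence_args() shows the usual unwind pattern for decoding an array of variable-length records: a mid-loop failure must free every sublist built so far plus the outer array. A compact user-space model (all names invented; the kernel variant above walks the full csa_nrclists range instead):

    #include <stdlib.h>

    struct list { int *items; };

    /* Stands in for decode_rc_list(); nonzero on failure. */
    static int fill_one(struct list *l)
    {
        l->items = malloc(4 * sizeof(int));
        return l->items == NULL;
    }

    static struct list *decode_lists(int n)
    {
        struct list *v = malloc(n * sizeof(*v));
        int i;

        if (v == NULL)
            return NULL;
        for (i = 0; i < n; i++) {
            if (fill_one(&v[i]))
                goto out_free;
        }
        return v;

    out_free:
        while (i-- > 0)             /* free only the sublists already built */
            free(v[i].items);
        free(v);
        return NULL;
    }

    int main(void)
    {
        struct list *v = decode_lists(3);
        int i;

        if (v != NULL) {
            for (i = 0; i < 3; i++)
                free(v[i].items);
            free(v);
        }
        return 0;
    }
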
329#endif /* CONFIG_NFS_V4_1 */
330
207static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 331static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
208{ 332{
209 __be32 *p; 333 __be32 *p;
@@ -353,31 +477,134 @@ out:
353 return status; 477 return status;
354} 478}
355 479
356static __be32 process_op(struct svc_rqst *rqstp, 480#if defined(CONFIG_NFS_V4_1)
481
482static unsigned encode_sessionid(struct xdr_stream *xdr,
483 const struct nfs4_sessionid *sid)
484{
485 uint32_t *p;
486 int len = NFS4_MAX_SESSIONID_LEN;
487
488 p = xdr_reserve_space(xdr, len);
489 if (unlikely(p == NULL))
490 return htonl(NFS4ERR_RESOURCE);
491
492 memcpy(p, sid, len);
493 return 0;
494}
495
496static unsigned encode_cb_sequence_res(struct svc_rqst *rqstp,
497 struct xdr_stream *xdr,
498 const struct cb_sequenceres *res)
499{
500 uint32_t *p;
501 unsigned status = res->csr_status;
502
503 if (unlikely(status != 0))
504 goto out;
505
506 encode_sessionid(xdr, &res->csr_sessionid);
507
508 p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t));
509 if (unlikely(p == NULL))
510 return htonl(NFS4ERR_RESOURCE);
511
512 *p++ = htonl(res->csr_sequenceid);
513 *p++ = htonl(res->csr_slotid);
514 *p++ = htonl(res->csr_highestslotid);
515 *p++ = htonl(res->csr_target_highestslotid);
516out:
517 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
518 return status;
519}
520
521static __be32
522preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
523{
524 if (op_nr == OP_CB_SEQUENCE) {
525 if (nop != 0)
526 return htonl(NFS4ERR_SEQUENCE_POS);
527 } else {
528 if (nop == 0)
529 return htonl(NFS4ERR_OP_NOT_IN_SESSION);
530 }
531
532 switch (op_nr) {
533 case OP_CB_GETATTR:
534 case OP_CB_RECALL:
535 case OP_CB_SEQUENCE:
536 *op = &callback_ops[op_nr];
537 break;
538
539 case OP_CB_LAYOUTRECALL:
540 case OP_CB_NOTIFY_DEVICEID:
541 case OP_CB_NOTIFY:
542 case OP_CB_PUSH_DELEG:
543 case OP_CB_RECALL_ANY:
544 case OP_CB_RECALLABLE_OBJ_AVAIL:
545 case OP_CB_RECALL_SLOT:
546 case OP_CB_WANTS_CANCELLED:
547 case OP_CB_NOTIFY_LOCK:
548 return htonl(NFS4ERR_NOTSUPP);
549
550 default:
551 return htonl(NFS4ERR_OP_ILLEGAL);
552 }
553
554 return htonl(NFS_OK);
555}
556
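The position rule enforced above comes from the v4.1 sessions design: CB_SEQUENCE must be the first operation of a compound and may not appear later. Reduced to a pure predicate (a sketch, not the kernel's return-code plumbing):

    #include <assert.h>

    /* Returns nonzero when (position, opcode) violates the sessions rule. */
    static int bad_op_position(int position, int is_sequence)
    {
        if (is_sequence)
            return position != 0;   /* NFS4ERR_SEQUENCE_POS when not first */
        return position == 0;       /* NFS4ERR_OP_NOT_IN_SESSION when first */
    }

    int main(void)
    {
        assert(!bad_op_position(0, 1)); /* CB_SEQUENCE leading the compound: ok */
        assert(bad_op_position(2, 1));  /* CB_SEQUENCE in the middle: error */
        assert(bad_op_position(0, 0));  /* any other op first: error */
        return 0;
    }
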
557#else /* CONFIG_NFS_V4_1 */
558
559static __be32
560preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
561{
562 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
563}
564
565#endif /* CONFIG_NFS_V4_1 */
566
567static __be32
568preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
569{
570 switch (op_nr) {
571 case OP_CB_GETATTR:
572 case OP_CB_RECALL:
573 *op = &callback_ops[op_nr];
574 break;
575 default:
576 return htonl(NFS4ERR_OP_ILLEGAL);
577 }
578
579 return htonl(NFS_OK);
580}
581
582static __be32 process_op(uint32_t minorversion, int nop,
583 struct svc_rqst *rqstp,
357 struct xdr_stream *xdr_in, void *argp, 584 struct xdr_stream *xdr_in, void *argp,
358 struct xdr_stream *xdr_out, void *resp) 585 struct xdr_stream *xdr_out, void *resp)
359{ 586{
360 struct callback_op *op = &callback_ops[0]; 587 struct callback_op *op = &callback_ops[0];
361 unsigned int op_nr = OP_CB_ILLEGAL; 588 unsigned int op_nr = OP_CB_ILLEGAL;
362 __be32 status = 0; 589 __be32 status;
363 long maxlen; 590 long maxlen;
364 __be32 res; 591 __be32 res;
365 592
366 dprintk("%s: start\n", __func__); 593 dprintk("%s: start\n", __func__);
367 status = decode_op_hdr(xdr_in, &op_nr); 594 status = decode_op_hdr(xdr_in, &op_nr);
368 if (likely(status == 0)) { 595 if (unlikely(status)) {
369 switch (op_nr) { 596 status = htonl(NFS4ERR_OP_ILLEGAL);
370 case OP_CB_GETATTR: 597 goto out;
371 case OP_CB_RECALL:
372 op = &callback_ops[op_nr];
373 break;
374 default:
375 op_nr = OP_CB_ILLEGAL;
376 op = &callback_ops[0];
377 status = htonl(NFS4ERR_OP_ILLEGAL);
378 }
379 } 598 }
380 599
600 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
601 __func__, minorversion, nop, op_nr);
602
603 status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) :
604 preprocess_nfs4_op(op_nr, &op);
605 if (status == htonl(NFS4ERR_OP_ILLEGAL))
606 op_nr = OP_CB_ILLEGAL;
607out:
381 maxlen = xdr_out->end - xdr_out->p; 608 maxlen = xdr_out->end - xdr_out->p;
382 if (maxlen > 0 && maxlen < PAGE_SIZE) { 609 if (maxlen > 0 && maxlen < PAGE_SIZE) {
383 if (likely(status == 0 && op->decode_args != NULL)) 610 if (likely(status == 0 && op->decode_args != NULL))
@@ -425,7 +652,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
425 return rpc_system_err; 652 return rpc_system_err;
426 653
427 while (status == 0 && nops != hdr_arg.nops) { 654 while (status == 0 && nops != hdr_arg.nops) {
428 status = process_op(rqstp, &xdr_in, argp, &xdr_out, resp); 655 status = process_op(hdr_arg.minorversion, nops,
656 rqstp, &xdr_in, argp, &xdr_out, resp);
429 nops++; 657 nops++;
430 } 658 }
431 659
@@ -452,7 +680,15 @@ static struct callback_op callback_ops[] = {
452 .process_op = (callback_process_op_t)nfs4_callback_recall, 680 .process_op = (callback_process_op_t)nfs4_callback_recall,
453 .decode_args = (callback_decode_arg_t)decode_recall_args, 681 .decode_args = (callback_decode_arg_t)decode_recall_args,
454 .res_maxsize = CB_OP_RECALL_RES_MAXSZ, 682 .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
455 } 683 },
684#if defined(CONFIG_NFS_V4_1)
685 [OP_CB_SEQUENCE] = {
686 .process_op = (callback_process_op_t)nfs4_callback_sequence,
687 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
688 .encode_res = (callback_encode_res_t)encode_cb_sequence_res,
689 .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ,
690 },
691#endif /* CONFIG_NFS_V4_1 */
456}; 692};
457 693
458/* 694/*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 75c9cd2aa119..8d25ccb2d51d 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -37,6 +37,7 @@
37#include <linux/in6.h> 37#include <linux/in6.h>
38#include <net/ipv6.h> 38#include <net/ipv6.h>
39#include <linux/nfs_xdr.h> 39#include <linux/nfs_xdr.h>
40#include <linux/sunrpc/bc_xprt.h>
40 41
41#include <asm/system.h> 42#include <asm/system.h>
42 43
@@ -102,6 +103,7 @@ struct nfs_client_initdata {
102 size_t addrlen; 103 size_t addrlen;
103 const struct nfs_rpc_ops *rpc_ops; 104 const struct nfs_rpc_ops *rpc_ops;
104 int proto; 105 int proto;
106 u32 minorversion;
105}; 107};
106 108
107/* 109/*
@@ -114,18 +116,13 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
114{ 116{
115 struct nfs_client *clp; 117 struct nfs_client *clp;
116 struct rpc_cred *cred; 118 struct rpc_cred *cred;
119 int err = -ENOMEM;
117 120
118 if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) 121 if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
119 goto error_0; 122 goto error_0;
120 123
121 clp->rpc_ops = cl_init->rpc_ops; 124 clp->rpc_ops = cl_init->rpc_ops;
122 125
123 if (cl_init->rpc_ops->version == 4) {
124 if (nfs_callback_up() < 0)
125 goto error_2;
126 __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
127 }
128
129 atomic_set(&clp->cl_count, 1); 126 atomic_set(&clp->cl_count, 1);
130 clp->cl_cons_state = NFS_CS_INITING; 127 clp->cl_cons_state = NFS_CS_INITING;
131 128
@@ -133,9 +130,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
133 clp->cl_addrlen = cl_init->addrlen; 130 clp->cl_addrlen = cl_init->addrlen;
134 131
135 if (cl_init->hostname) { 132 if (cl_init->hostname) {
133 err = -ENOMEM;
136 clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL); 134 clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL);
137 if (!clp->cl_hostname) 135 if (!clp->cl_hostname)
138 goto error_3; 136 goto error_cleanup;
139 } 137 }
140 138
141 INIT_LIST_HEAD(&clp->cl_superblocks); 139 INIT_LIST_HEAD(&clp->cl_superblocks);
@@ -150,6 +148,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
150 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); 148 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
151 clp->cl_boot_time = CURRENT_TIME; 149 clp->cl_boot_time = CURRENT_TIME;
152 clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; 150 clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
151 clp->cl_minorversion = cl_init->minorversion;
153#endif 152#endif
154 cred = rpc_lookup_machine_cred(); 153 cred = rpc_lookup_machine_cred();
155 if (!IS_ERR(cred)) 154 if (!IS_ERR(cred))
@@ -159,13 +158,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
159 158
160 return clp; 159 return clp;
161 160
162error_3: 161error_cleanup:
163 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
164 nfs_callback_down();
165error_2:
166 kfree(clp); 162 kfree(clp);
167error_0: 163error_0:
168 return NULL; 164 return ERR_PTR(err);
169} 165}
170 166
171static void nfs4_shutdown_client(struct nfs_client *clp) 167static void nfs4_shutdown_client(struct nfs_client *clp)
@@ -182,12 +178,42 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
182} 178}
183 179
184/* 180/*
181 * Destroy the NFS4 callback service
182 */
183static void nfs4_destroy_callback(struct nfs_client *clp)
184{
185#ifdef CONFIG_NFS_V4
186 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
187 nfs_callback_down(clp->cl_minorversion);
188#endif /* CONFIG_NFS_V4 */
189}
190
191/*
192 * Clears/puts all minor version specific parts from an nfs_client struct
193 * reverting it to minorversion 0.
194 */
195static void nfs4_clear_client_minor_version(struct nfs_client *clp)
196{
197#ifdef CONFIG_NFS_V4_1
198 if (nfs4_has_session(clp)) {
199 nfs4_destroy_session(clp->cl_session);
200 clp->cl_session = NULL;
201 }
202
203 clp->cl_call_sync = _nfs4_call_sync;
204#endif /* CONFIG_NFS_V4_1 */
205
206 nfs4_destroy_callback(clp);
207}
208
209/*
185 * Destroy a shared client record 210 * Destroy a shared client record
186 */ 211 */
187static void nfs_free_client(struct nfs_client *clp) 212static void nfs_free_client(struct nfs_client *clp)
188{ 213{
189 dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); 214 dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
190 215
216 nfs4_clear_client_minor_version(clp);
191 nfs4_shutdown_client(clp); 217 nfs4_shutdown_client(clp);
192 218
193 nfs_fscache_release_client_cookie(clp); 219 nfs_fscache_release_client_cookie(clp);
@@ -196,9 +222,6 @@ static void nfs_free_client(struct nfs_client *clp)
196 if (!IS_ERR(clp->cl_rpcclient)) 222 if (!IS_ERR(clp->cl_rpcclient))
197 rpc_shutdown_client(clp->cl_rpcclient); 223 rpc_shutdown_client(clp->cl_rpcclient);
198 224
199 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
200 nfs_callback_down();
201
202 if (clp->cl_machine_cred != NULL) 225 if (clp->cl_machine_cred != NULL)
203 put_rpccred(clp->cl_machine_cred); 226 put_rpccred(clp->cl_machine_cred);
204 227
@@ -347,7 +370,8 @@ struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
347 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; 370 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
348 371
349 /* Don't match clients that failed to initialise properly */ 372 /* Don't match clients that failed to initialise properly */
350 if (clp->cl_cons_state != NFS_CS_READY) 373 if (!(clp->cl_cons_state == NFS_CS_READY ||
374 clp->cl_cons_state == NFS_CS_SESSION_INITING))
351 continue; 375 continue;
352 376
353 /* Different NFS versions cannot share the same nfs_client */ 377 /* Different NFS versions cannot share the same nfs_client */
@@ -420,7 +444,9 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
420 444
421 if (clp->cl_proto != data->proto) 445 if (clp->cl_proto != data->proto)
422 continue; 446 continue;
423 447 /* Match nfsv4 minorversion */
448 if (clp->cl_minorversion != data->minorversion)
449 continue;
424 /* Match the full socket address */ 450 /* Match the full socket address */
425 if (!nfs_sockaddr_cmp(sap, clap)) 451 if (!nfs_sockaddr_cmp(sap, clap))
426 continue; 452 continue;
@@ -456,9 +482,10 @@ static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_in
456 spin_unlock(&nfs_client_lock); 482 spin_unlock(&nfs_client_lock);
457 483
458 new = nfs_alloc_client(cl_init); 484 new = nfs_alloc_client(cl_init);
459 } while (new); 485 } while (!IS_ERR(new));
460 486
461 return ERR_PTR(-ENOMEM); 487 dprintk("--> nfs_get_client() = %ld [failed]\n", PTR_ERR(new));
488 return new;
462 489
463 /* install a new client and return with it unready */ 490 /* install a new client and return with it unready */
464install_client: 491install_client:
@@ -478,7 +505,7 @@ found_client:
478 nfs_free_client(new); 505 nfs_free_client(new);
479 506
480 error = wait_event_killable(nfs_client_active_wq, 507 error = wait_event_killable(nfs_client_active_wq,
481 clp->cl_cons_state != NFS_CS_INITING); 508 clp->cl_cons_state < NFS_CS_INITING);
482 if (error < 0) { 509 if (error < 0) {
483 nfs_put_client(clp); 510 nfs_put_client(clp);
484 return ERR_PTR(-ERESTARTSYS); 511 return ERR_PTR(-ERESTARTSYS);
@@ -499,13 +526,29 @@ found_client:
499/* 526/*
500 * Mark a server as ready or failed 527 * Mark a server as ready or failed
501 */ 528 */
502static void nfs_mark_client_ready(struct nfs_client *clp, int state) 529void nfs_mark_client_ready(struct nfs_client *clp, int state)
503{ 530{
504 clp->cl_cons_state = state; 531 clp->cl_cons_state = state;
505 wake_up_all(&nfs_client_active_wq); 532 wake_up_all(&nfs_client_active_wq);
506} 533}
507 534
508/* 535/*
536 * With sessions, the client is not marked ready until after a
537 * successful EXCHANGE_ID and CREATE_SESSION.
538 *
539 * Map cl_cons_state errors to EPROTONOSUPPORT to indicate
540 * other versions of NFS can be tried.
541 */
542int nfs4_check_client_ready(struct nfs_client *clp)
543{
544 if (!nfs4_has_session(clp))
545 return 0;
546 if (clp->cl_cons_state < NFS_CS_READY)
547 return -EPROTONOSUPPORT;
548 return 0;
549}
550
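A stand-alone model of nfs4_check_client_ready(), assuming the cl_cons_state convention visible in this patch: NFS_CS_READY is zero, in-progress states are positive, and a failed initialisation leaves a negative errno in cl_cons_state:

    #include <errno.h>
    #include <stdio.h>

    #define CS_READY 0  /* mirrors NFS_CS_READY; failures are negative errnos */

    static int check_client_ready(int has_session, int cons_state)
    {
        if (!has_session)
            return 0;               /* v4.0: no session to wait for */
        if (cons_state < CS_READY)  /* EXCHANGE_ID/CREATE_SESSION failed */
            return -EPROTONOSUPPORT;
        return 0;
    }

    int main(void)
    {
        printf("%d\n", check_client_ready(1, -EIO));    /* setup failed */
        printf("%d\n", check_client_ready(1, CS_READY));
        return 0;
    }
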
551/*
509 * Initialise the timeout values for a connection 552 * Initialise the timeout values for a connection
510 */ 553 */
511static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, 554static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
@@ -1050,6 +1093,61 @@ error:
1050 1093
1051#ifdef CONFIG_NFS_V4 1094#ifdef CONFIG_NFS_V4
1052/* 1095/*
1096 * Initialize the NFS4 callback service
1097 */
1098static int nfs4_init_callback(struct nfs_client *clp)
1099{
1100 int error;
1101
1102 if (clp->rpc_ops->version == 4) {
1103 if (nfs4_has_session(clp)) {
1104 error = xprt_setup_backchannel(
1105 clp->cl_rpcclient->cl_xprt,
1106 NFS41_BC_MIN_CALLBACKS);
1107 if (error < 0)
1108 return error;
1109 }
1110
1111 error = nfs_callback_up(clp->cl_minorversion,
1112 clp->cl_rpcclient->cl_xprt);
1113 if (error < 0) {
1114 dprintk("%s: failed to start callback. Error = %d\n",
1115 __func__, error);
1116 return error;
1117 }
1118 __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
1119 }
1120 return 0;
1121}
1122
1123/*
1124 * Initialize the minor version specific parts of an NFS4 client record
1125 */
1126static int nfs4_init_client_minor_version(struct nfs_client *clp)
1127{
1128 clp->cl_call_sync = _nfs4_call_sync;
1129
1130#if defined(CONFIG_NFS_V4_1)
1131 if (clp->cl_minorversion) {
1132 struct nfs4_session *session = NULL;
1133 /*
1134 * Create the session and mark it expired.
1135 * When a SEQUENCE operation encounters the expired session
1136 * it will do session recovery to initialize it.
1137 */
1138 session = nfs4_alloc_session(clp);
1139 if (!session)
1140 return -ENOMEM;
1141
1142 clp->cl_session = session;
1143 clp->cl_call_sync = _nfs4_call_sync_session;
1144 }
1145#endif /* CONFIG_NFS_V4_1 */
1146
1147 return nfs4_init_callback(clp);
1148}
1149
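Allocating the session "expired" defers the real CREATE_SESSION exchange to the first SEQUENCE call, which notices the unestablished session and drives recovery. A toy model of that lazy handshake (not the kernel API):

    #include <stdio.h>

    struct session { int established; };

    /* First use finds the session unestablished and performs the
     * CREATE_SESSION step; later calls go straight through. */
    static int do_sequence(struct session *s)
    {
        if (!s->established) {
            printf("recovering: CREATE_SESSION\n");
            s->established = 1;
        }
        printf("SEQUENCE ok\n");
        return 0;
    }

    int main(void)
    {
        struct session s = { 0 };   /* allocated "expired", as in the patch */
        do_sequence(&s);            /* first SEQUENCE drives recovery */
        do_sequence(&s);            /* normal path from then on */
        return 0;
    }
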
1150/*
1053 * Initialise an NFS4 client record 1151 * Initialise an NFS4 client record
1054 */ 1152 */
1055static int nfs4_init_client(struct nfs_client *clp, 1153static int nfs4_init_client(struct nfs_client *clp,
@@ -1083,7 +1181,12 @@ static int nfs4_init_client(struct nfs_client *clp,
1083 } 1181 }
1084 __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); 1182 __set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
1085 1183
1086 nfs_mark_client_ready(clp, NFS_CS_READY); 1184 error = nfs4_init_client_minor_version(clp);
1185 if (error < 0)
1186 goto error;
1187
1188 if (!nfs4_has_session(clp))
1189 nfs_mark_client_ready(clp, NFS_CS_READY);
1087 return 0; 1190 return 0;
1088 1191
1089error: 1192error:
@@ -1101,7 +1204,8 @@ static int nfs4_set_client(struct nfs_server *server,
1101 const size_t addrlen, 1204 const size_t addrlen,
1102 const char *ip_addr, 1205 const char *ip_addr,
1103 rpc_authflavor_t authflavour, 1206 rpc_authflavor_t authflavour,
1104 int proto, const struct rpc_timeout *timeparms) 1207 int proto, const struct rpc_timeout *timeparms,
1208 u32 minorversion)
1105{ 1209{
1106 struct nfs_client_initdata cl_init = { 1210 struct nfs_client_initdata cl_init = {
1107 .hostname = hostname, 1211 .hostname = hostname,
@@ -1109,6 +1213,7 @@ static int nfs4_set_client(struct nfs_server *server,
1109 .addrlen = addrlen, 1213 .addrlen = addrlen,
1110 .rpc_ops = &nfs_v4_clientops, 1214 .rpc_ops = &nfs_v4_clientops,
1111 .proto = proto, 1215 .proto = proto,
1216 .minorversion = minorversion,
1112 }; 1217 };
1113 struct nfs_client *clp; 1218 struct nfs_client *clp;
1114 int error; 1219 int error;
@@ -1137,6 +1242,22 @@ error:
1137 return error; 1242 return error;
1138} 1243}
1139 1244
1245
1246/*
1247 * Session has been established, and the client marked ready.
1248 * Set the mount rsize and wsize with negotiated fore channel
1249 * attributes, which will be bounds-checked in nfs_server_set_fsinfo.
1250 */
1251static void nfs4_session_set_rwsize(struct nfs_server *server)
1252{
1253#ifdef CONFIG_NFS_V4_1
1254 if (!nfs4_has_session(server->nfs_client))
1255 return;
1256 server->rsize = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
1257 server->wsize = server->nfs_client->cl_session->fc_attrs.max_rqst_sz;
1258#endif /* CONFIG_NFS_V4_1 */
1259}
1260
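With sessions, transfer sizes follow the fore channel: reads are bounded by the largest reply the channel carries (max_resp_sz), writes by the largest request (max_rqst_sz). A sketch of the later clamp against what the fsinfo probe reports (the figures below are made up for illustration):

    #include <stdio.h>

    static unsigned int min_u(unsigned int a, unsigned int b)
    {
        return a < b ? a : b;
    }

    int main(void)
    {
        unsigned int max_resp_sz = 1049620;   /* sample fore channel attrs */
        unsigned int max_rqst_sz = 1049620;
        unsigned int fsinfo_max  = 1048576;   /* e.g. rtmax/wtmax from the probe */

        /* rsize rides in replies, wsize in requests; nfs_server_set_fsinfo
         * later clamps both against the filesystem's own limits. */
        unsigned int rsize = min_u(max_resp_sz, fsinfo_max);
        unsigned int wsize = min_u(max_rqst_sz, fsinfo_max);

        printf("rsize=%u wsize=%u\n", rsize, wsize);
        return 0;
    }
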
1140/* 1261/*
1141 * Create a version 4 volume record 1262 * Create a version 4 volume record
1142 */ 1263 */
@@ -1164,7 +1285,8 @@ static int nfs4_init_server(struct nfs_server *server,
1164 data->client_address, 1285 data->client_address,
1165 data->auth_flavors[0], 1286 data->auth_flavors[0],
1166 data->nfs_server.protocol, 1287 data->nfs_server.protocol,
1167 &timeparms); 1288 &timeparms,
1289 data->minorversion);
1168 if (error < 0) 1290 if (error < 0)
1169 goto error; 1291 goto error;
1170 1292
@@ -1214,6 +1336,10 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1214 BUG_ON(!server->nfs_client->rpc_ops); 1336 BUG_ON(!server->nfs_client->rpc_ops);
1215 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1337 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1216 1338
1339 error = nfs4_init_session(server);
1340 if (error < 0)
1341 goto error;
1342
1217 /* Probe the root fh to retrieve its FSID */ 1343 /* Probe the root fh to retrieve its FSID */
1218 error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); 1344 error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
1219 if (error < 0) 1345 if (error < 0)
@@ -1224,6 +1350,8 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1224 (unsigned long long) server->fsid.minor); 1350 (unsigned long long) server->fsid.minor);
1225 dprintk("Mount FH: %d\n", mntfh->size); 1351 dprintk("Mount FH: %d\n", mntfh->size);
1226 1352
1353 nfs4_session_set_rwsize(server);
1354
1227 error = nfs_probe_fsinfo(server, mntfh, &fattr); 1355 error = nfs_probe_fsinfo(server, mntfh, &fattr);
1228 if (error < 0) 1356 if (error < 0)
1229 goto error; 1357 goto error;
@@ -1282,7 +1410,8 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1282 parent_client->cl_ipaddr, 1410 parent_client->cl_ipaddr,
1283 data->authflavor, 1411 data->authflavor,
1284 parent_server->client->cl_xprt->prot, 1412 parent_server->client->cl_xprt->prot,
1285 parent_server->client->cl_timeout); 1413 parent_server->client->cl_timeout,
1414 parent_client->cl_minorversion);
1286 if (error < 0) 1415 if (error < 0)
1287 goto error; 1416 goto error;
1288 1417
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 968225a88015..6dd48a4405b4 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -10,6 +10,7 @@
10#include <linux/kthread.h> 10#include <linux/kthread.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/smp_lock.h>
13#include <linux/spinlock.h> 14#include <linux/spinlock.h>
14 15
15#include <linux/nfs4.h> 16#include <linux/nfs4.h>
@@ -68,29 +69,26 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
68{ 69{
69 struct inode *inode = state->inode; 70 struct inode *inode = state->inode;
70 struct file_lock *fl; 71 struct file_lock *fl;
71 int status; 72 int status = 0;
73
74 if (inode->i_flock == NULL)
75 goto out;
72 76
77 /* Protect inode->i_flock using the BKL */
78 lock_kernel();
73 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 79 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
74 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 80 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
75 continue; 81 continue;
76 if (nfs_file_open_context(fl->fl_file) != ctx) 82 if (nfs_file_open_context(fl->fl_file) != ctx)
77 continue; 83 continue;
84 unlock_kernel();
78 status = nfs4_lock_delegation_recall(state, fl); 85 status = nfs4_lock_delegation_recall(state, fl);
79 if (status >= 0) 86 if (status < 0)
80 continue; 87 goto out;
81 switch (status) { 88 lock_kernel();
82 default:
83 printk(KERN_ERR "%s: unhandled error %d.\n",
84 __func__, status);
85 case -NFS4ERR_EXPIRED:
86 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
87 case -NFS4ERR_STALE_CLIENTID:
88 nfs4_schedule_state_recovery(NFS_SERVER(inode)->nfs_client);
89 goto out_err;
90 }
91 } 89 }
92 return 0; 90 unlock_kernel();
93out_err: 91out:
94 return status; 92 return status;
95} 93}
96 94
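The rewritten loop is a lock-juggling pattern: the BKL covers the i_flock list walk but must be dropped across nfs4_lock_delegation_recall(), which can sleep, then re-taken to continue the walk. The same shape in user-space terms, with a pthread mutex standing in for the BKL and a stub for the blocking call:

    #include <pthread.h>

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    struct item { struct item *next; int wants_work; };

    /* Stands in for the RPC-issuing call; it may sleep, so the lock
     * must not be held across it. */
    static int blocking_work(struct item *it) { (void)it; return 0; }

    static int process_all(struct item *head)
    {
        struct item *it;
        int status = 0;

        pthread_mutex_lock(&list_lock);
        for (it = head; it != NULL; it = it->next) {
            if (!it->wants_work)
                continue;
            pthread_mutex_unlock(&list_lock);
            status = blocking_work(it);
            if (status < 0)
                return status;  /* error path exits with the lock dropped */
            /* like the kernel loop, this trusts 'it' to stay linked
             * while the lock was released */
            pthread_mutex_lock(&list_lock);
        }
        pthread_mutex_unlock(&list_lock);
        return status;
    }

    int main(void)
    {
        struct item b = { .next = 0, .wants_work = 1 };
        struct item a = { .next = &b, .wants_work = 0 };

        return process_all(&a);
    }
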
@@ -268,7 +266,10 @@ static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegat
268 struct nfs_inode *nfsi = NFS_I(inode); 266 struct nfs_inode *nfsi = NFS_I(inode);
269 267
270 nfs_msync_inode(inode); 268 nfs_msync_inode(inode);
271 /* Guard against new delegated open calls */ 269 /*
270 * Guard against new delegated open/lock/unlock calls and against
271 * state recovery
272 */
272 down_write(&nfsi->rwsem); 273 down_write(&nfsi->rwsem);
273 nfs_delegation_claim_opens(inode, &delegation->stateid); 274 nfs_delegation_claim_opens(inode, &delegation->stateid);
274 up_write(&nfsi->rwsem); 275 up_write(&nfsi->rwsem);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 89f98e9a024b..32062c33c859 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -29,7 +29,6 @@
29#include <linux/nfs_fs.h> 29#include <linux/nfs_fs.h>
30#include <linux/nfs_mount.h> 30#include <linux/nfs_mount.h>
31#include <linux/pagemap.h> 31#include <linux/pagemap.h>
32#include <linux/smp_lock.h>
33#include <linux/pagevec.h> 32#include <linux/pagevec.h>
34#include <linux/namei.h> 33#include <linux/namei.h>
35#include <linux/mount.h> 34#include <linux/mount.h>
@@ -1026,12 +1025,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1026 res = NULL; 1025 res = NULL;
1027 goto out; 1026 goto out;
1028 /* This turned out not to be a regular file */ 1027 /* This turned out not to be a regular file */
1029 case -EISDIR:
1030 case -ENOTDIR: 1028 case -ENOTDIR:
1031 goto no_open; 1029 goto no_open;
1032 case -ELOOP: 1030 case -ELOOP:
1033 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1031 if (!(nd->intent.open.flags & O_NOFOLLOW))
1034 goto no_open; 1032 goto no_open;
1033 /* case -EISDIR: */
1035 /* case -EINVAL: */ 1034 /* case -EINVAL: */
1036 default: 1035 default:
1037 goto out; 1036 goto out;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 08f6b040d289..e4e089a8f294 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -255,10 +255,13 @@ static void nfs_direct_read_release(void *calldata)
255 255
256 if (put_dreq(dreq)) 256 if (put_dreq(dreq))
257 nfs_direct_complete(dreq); 257 nfs_direct_complete(dreq);
258 nfs_readdata_release(calldata); 258 nfs_readdata_free(data);
259} 259}
260 260
261static const struct rpc_call_ops nfs_read_direct_ops = { 261static const struct rpc_call_ops nfs_read_direct_ops = {
262#if defined(CONFIG_NFS_V4_1)
263 .rpc_call_prepare = nfs_read_prepare,
264#endif /* CONFIG_NFS_V4_1 */
262 .rpc_call_done = nfs_direct_read_result, 265 .rpc_call_done = nfs_direct_read_result,
263 .rpc_release = nfs_direct_read_release, 266 .rpc_release = nfs_direct_read_release,
264}; 267};
@@ -311,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
311 data->npages, 1, 0, data->pagevec, NULL); 314 data->npages, 1, 0, data->pagevec, NULL);
312 up_read(&current->mm->mmap_sem); 315 up_read(&current->mm->mmap_sem);
313 if (result < 0) { 316 if (result < 0) {
314 nfs_readdata_release(data); 317 nfs_readdata_free(data);
315 break; 318 break;
316 } 319 }
317 if ((unsigned)result < data->npages) { 320 if ((unsigned)result < data->npages) {
318 bytes = result * PAGE_SIZE; 321 bytes = result * PAGE_SIZE;
319 if (bytes <= pgbase) { 322 if (bytes <= pgbase) {
320 nfs_direct_release_pages(data->pagevec, result); 323 nfs_direct_release_pages(data->pagevec, result);
321 nfs_readdata_release(data); 324 nfs_readdata_free(data);
322 break; 325 break;
323 } 326 }
324 bytes -= pgbase; 327 bytes -= pgbase;
@@ -331,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
331 data->inode = inode; 334 data->inode = inode;
332 data->cred = msg.rpc_cred; 335 data->cred = msg.rpc_cred;
333 data->args.fh = NFS_FH(inode); 336 data->args.fh = NFS_FH(inode);
334 data->args.context = get_nfs_open_context(ctx); 337 data->args.context = ctx;
335 data->args.offset = pos; 338 data->args.offset = pos;
336 data->args.pgbase = pgbase; 339 data->args.pgbase = pgbase;
337 data->args.pages = data->pagevec; 340 data->args.pages = data->pagevec;
@@ -438,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
438 struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); 441 struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
439 list_del(&data->pages); 442 list_del(&data->pages);
440 nfs_direct_release_pages(data->pagevec, data->npages); 443 nfs_direct_release_pages(data->pagevec, data->npages);
441 nfs_writedata_release(data); 444 nfs_writedata_free(data);
442 } 445 }
443} 446}
444 447
@@ -531,10 +534,13 @@ static void nfs_direct_commit_release(void *calldata)
531 534
532 dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); 535 dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
533 nfs_direct_write_complete(dreq, data->inode); 536 nfs_direct_write_complete(dreq, data->inode);
534 nfs_commitdata_release(calldata); 537 nfs_commit_free(data);
535} 538}
536 539
537static const struct rpc_call_ops nfs_commit_direct_ops = { 540static const struct rpc_call_ops nfs_commit_direct_ops = {
541#if defined(CONFIG_NFS_V4_1)
542 .rpc_call_prepare = nfs_write_prepare,
543#endif /* CONFIG_NFS_V4_1 */
538 .rpc_call_done = nfs_direct_commit_result, 544 .rpc_call_done = nfs_direct_commit_result,
539 .rpc_release = nfs_direct_commit_release, 545 .rpc_release = nfs_direct_commit_release,
540}; 546};
@@ -564,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
564 data->args.fh = NFS_FH(data->inode); 570 data->args.fh = NFS_FH(data->inode);
565 data->args.offset = 0; 571 data->args.offset = 0;
566 data->args.count = 0; 572 data->args.count = 0;
567 data->args.context = get_nfs_open_context(dreq->ctx); 573 data->args.context = dreq->ctx;
568 data->res.count = 0; 574 data->res.count = 0;
569 data->res.fattr = &data->fattr; 575 data->res.fattr = &data->fattr;
570 data->res.verf = &data->verf; 576 data->res.verf = &data->verf;
@@ -673,6 +679,9 @@ out_unlock:
673} 679}
674 680
675static const struct rpc_call_ops nfs_write_direct_ops = { 681static const struct rpc_call_ops nfs_write_direct_ops = {
682#if defined(CONFIG_NFS_V4_1)
683 .rpc_call_prepare = nfs_write_prepare,
684#endif /* CONFIG_NFS_V4_1 */
676 .rpc_call_done = nfs_direct_write_result, 685 .rpc_call_done = nfs_direct_write_result,
677 .rpc_release = nfs_direct_write_release, 686 .rpc_release = nfs_direct_write_release,
678}; 687};
@@ -725,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
725 data->npages, 0, 0, data->pagevec, NULL); 734 data->npages, 0, 0, data->pagevec, NULL);
726 up_read(&current->mm->mmap_sem); 735 up_read(&current->mm->mmap_sem);
727 if (result < 0) { 736 if (result < 0) {
728 nfs_writedata_release(data); 737 nfs_writedata_free(data);
729 break; 738 break;
730 } 739 }
731 if ((unsigned)result < data->npages) { 740 if ((unsigned)result < data->npages) {
732 bytes = result * PAGE_SIZE; 741 bytes = result * PAGE_SIZE;
733 if (bytes <= pgbase) { 742 if (bytes <= pgbase) {
734 nfs_direct_release_pages(data->pagevec, result); 743 nfs_direct_release_pages(data->pagevec, result);
735 nfs_writedata_release(data); 744 nfs_writedata_free(data);
736 break; 745 break;
737 } 746 }
738 bytes -= pgbase; 747 bytes -= pgbase;
@@ -747,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
747 data->inode = inode; 756 data->inode = inode;
748 data->cred = msg.rpc_cred; 757 data->cred = msg.rpc_cred;
749 data->args.fh = NFS_FH(inode); 758 data->args.fh = NFS_FH(inode);
750 data->args.context = get_nfs_open_context(ctx); 759 data->args.context = ctx;
751 data->args.offset = pos; 760 data->args.offset = pos;
752 data->args.pgbase = pgbase; 761 data->args.pgbase = pgbase;
753 data->args.pages = data->pagevec; 762 data->args.pages = data->pagevec;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ec7e27d00bc6..05062329b678 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -26,7 +26,6 @@
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/pagemap.h> 28#include <linux/pagemap.h>
29#include <linux/smp_lock.h>
30#include <linux/aio.h> 29#include <linux/aio.h>
31 30
32#include <asm/uaccess.h> 31#include <asm/uaccess.h>
@@ -48,6 +47,9 @@ static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos,
48 size_t count, unsigned int flags); 47 size_t count, unsigned int flags);
49static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, 48static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
50 unsigned long nr_segs, loff_t pos); 49 unsigned long nr_segs, loff_t pos);
50static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
51 struct file *filp, loff_t *ppos,
52 size_t count, unsigned int flags);
51static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, 53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
52 unsigned long nr_segs, loff_t pos); 54 unsigned long nr_segs, loff_t pos);
53static int nfs_file_flush(struct file *, fl_owner_t id); 55static int nfs_file_flush(struct file *, fl_owner_t id);
@@ -73,6 +75,7 @@ const struct file_operations nfs_file_operations = {
73 .lock = nfs_lock, 75 .lock = nfs_lock,
74 .flock = nfs_flock, 76 .flock = nfs_flock,
75 .splice_read = nfs_file_splice_read, 77 .splice_read = nfs_file_splice_read,
78 .splice_write = nfs_file_splice_write,
76 .check_flags = nfs_check_flags, 79 .check_flags = nfs_check_flags,
77 .setlease = nfs_setlease, 80 .setlease = nfs_setlease,
78}; 81};
@@ -587,12 +590,38 @@ out_swapfile:
587 goto out; 590 goto out;
588} 591}
589 592
593static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
594 struct file *filp, loff_t *ppos,
595 size_t count, unsigned int flags)
596{
597 struct dentry *dentry = filp->f_path.dentry;
598 struct inode *inode = dentry->d_inode;
599 ssize_t ret;
600
601 dprintk("NFS splice_write(%s/%s, %lu@%llu)\n",
602 dentry->d_parent->d_name.name, dentry->d_name.name,
603 (unsigned long) count, (unsigned long long) *ppos);
604
605 /*
606 * The combination of splice and an O_APPEND destination is disallowed.
607 */
608
609 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
610
611 ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
612 if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
613 int err = nfs_do_fsync(nfs_file_open_context(filp), inode);
614 if (err < 0)
615 ret = err;
616 }
617 return ret;
618}
619
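The splice path mirrors nfs_file_write(): let the generic helper move the data, then push it to the server when the open demands synchronous semantics. The user-space analogue of that tail, keyed off O_SYNC only (the kernel check in nfs_need_sync_write() covers more cases):

    #include <fcntl.h>
    #include <unistd.h>

    /* Write, then flush when the descriptor was opened O_SYNC -- the same
     * shape as the splice_write tail above, minus the NFS bookkeeping. */
    static ssize_t write_maybe_sync(int fd, const void *buf, size_t len)
    {
        ssize_t ret = write(fd, buf, len);

        if (ret >= 0 && (fcntl(fd, F_GETFL) & O_SYNC)) {
            if (fsync(fd) < 0)
                return -1;
        }
        return ret;
    }

    int main(void)
    {
        return write_maybe_sync(1, "hi\n", 3) == 3 ? 0 : 1;
    }
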
590static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) 620static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
591{ 621{
592 struct inode *inode = filp->f_mapping->host; 622 struct inode *inode = filp->f_mapping->host;
593 int status = 0; 623 int status = 0;
594 624
595 lock_kernel();
596 /* Try local locking first */ 625 /* Try local locking first */
597 posix_test_lock(filp, fl); 626 posix_test_lock(filp, fl);
598 if (fl->fl_type != F_UNLCK) { 627 if (fl->fl_type != F_UNLCK) {
@@ -608,7 +637,6 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
608 637
609 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 638 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
610out: 639out:
611 unlock_kernel();
612 return status; 640 return status;
613out_noconflict: 641out_noconflict:
614 fl->fl_type = F_UNLCK; 642 fl->fl_type = F_UNLCK;
@@ -650,13 +678,11 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
650 * If we're signalled while cleaning up locks on process exit, we 678 * If we're signalled while cleaning up locks on process exit, we
651 * still need to complete the unlock. 679 * still need to complete the unlock.
652 */ 680 */
653 lock_kernel();
654 /* Use local locking if mounted with "-onolock" */ 681 /* Use local locking if mounted with "-onolock" */
655 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) 682 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
656 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 683 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
657 else 684 else
658 status = do_vfs_lock(filp, fl); 685 status = do_vfs_lock(filp, fl);
659 unlock_kernel();
660 return status; 686 return status;
661} 687}
662 688
@@ -673,13 +699,11 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
673 if (status != 0) 699 if (status != 0)
674 goto out; 700 goto out;
675 701
676 lock_kernel();
677 /* Use local locking if mounted with "-onolock" */ 702 /* Use local locking if mounted with "-onolock" */
678 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) 703 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
679 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 704 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
680 else 705 else
681 status = do_vfs_lock(filp, fl); 706 status = do_vfs_lock(filp, fl);
682 unlock_kernel();
683 if (status < 0) 707 if (status < 0)
684 goto out; 708 goto out;
685 /* 709 /*
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 46177cb87064..b35d2a616066 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -30,7 +30,6 @@
30#include <linux/nfs_idmap.h> 30#include <linux/nfs_idmap.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/namei.h> 32#include <linux/namei.h>
33#include <linux/mnt_namespace.h>
34#include <linux/security.h> 33#include <linux/security.h>
35 34
36#include <asm/system.h> 35#include <asm/system.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 64f87194d390..bd7938eda6a8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -30,7 +30,6 @@
30#include <linux/nfs_mount.h> 30#include <linux/nfs_mount.h>
31#include <linux/nfs4_mount.h> 31#include <linux/nfs4_mount.h>
32#include <linux/lockd/bind.h> 32#include <linux/lockd/bind.h>
33#include <linux/smp_lock.h>
34#include <linux/seq_file.h> 33#include <linux/seq_file.h>
35#include <linux/mount.h> 34#include <linux/mount.h>
36#include <linux/nfs_idmap.h> 35#include <linux/nfs_idmap.h>
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e4d6a8348adf..7dd90a6769d0 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -2,6 +2,7 @@
2 * NFS internal definitions 2 * NFS internal definitions
3 */ 3 */
4 4
5#include "nfs4_fs.h"
5#include <linux/mount.h> 6#include <linux/mount.h>
6#include <linux/security.h> 7#include <linux/security.h>
7 8
@@ -17,6 +18,18 @@ struct nfs_string;
17 */ 18 */
18#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) 19#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1)
19 20
21/*
22 * Determine if sessions are in use.
23 */
24static inline int nfs4_has_session(const struct nfs_client *clp)
25{
26#ifdef CONFIG_NFS_V4_1
27 if (clp->cl_session)
28 return 1;
29#endif /* CONFIG_NFS_V4_1 */
30 return 0;
31}
32
20struct nfs_clone_mount { 33struct nfs_clone_mount {
21 const struct super_block *sb; 34 const struct super_block *sb;
22 const struct dentry *dentry; 35 const struct dentry *dentry;
@@ -30,6 +43,12 @@ struct nfs_clone_mount {
30}; 43};
31 44
32/* 45/*
46 * Note: RFC 1813 doesn't limit the number of auth flavors that
47 * a server can return, so make something up.
48 */
49#define NFS_MAX_SECFLAVORS (12)
50
51/*
33 * In-kernel mount arguments 52 * In-kernel mount arguments
34 */ 53 */
35struct nfs_parsed_mount_data { 54struct nfs_parsed_mount_data {
@@ -44,6 +63,7 @@ struct nfs_parsed_mount_data {
44 unsigned int auth_flavor_len; 63 unsigned int auth_flavor_len;
45 rpc_authflavor_t auth_flavors[1]; 64 rpc_authflavor_t auth_flavors[1];
46 char *client_address; 65 char *client_address;
66 unsigned int minorversion;
47 char *fscache_uniq; 67 char *fscache_uniq;
48 68
49 struct { 69 struct {
@@ -77,6 +97,8 @@ struct nfs_mount_request {
77 unsigned short protocol; 97 unsigned short protocol;
78 struct nfs_fh *fh; 98 struct nfs_fh *fh;
79 int noresvport; 99 int noresvport;
100 unsigned int *auth_flav_len;
101 rpc_authflavor_t *auth_flavs;
80}; 102};
81 103
82extern int nfs_mount(struct nfs_mount_request *info); 104extern int nfs_mount(struct nfs_mount_request *info);
@@ -99,6 +121,8 @@ extern void nfs_free_server(struct nfs_server *server);
99extern struct nfs_server *nfs_clone_server(struct nfs_server *, 121extern struct nfs_server *nfs_clone_server(struct nfs_server *,
100 struct nfs_fh *, 122 struct nfs_fh *,
101 struct nfs_fattr *); 123 struct nfs_fattr *);
124extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
125extern int nfs4_check_client_ready(struct nfs_client *clp);
102#ifdef CONFIG_PROC_FS 126#ifdef CONFIG_PROC_FS
103extern int __init nfs_fs_proc_init(void); 127extern int __init nfs_fs_proc_init(void);
104extern void nfs_fs_proc_exit(void); 128extern void nfs_fs_proc_exit(void);
@@ -146,6 +170,20 @@ extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int);
146extern struct rpc_procinfo nfs3_procedures[]; 170extern struct rpc_procinfo nfs3_procedures[];
147extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int); 171extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int);
148 172
173/* nfs4proc.c */
174static inline void nfs4_restart_rpc(struct rpc_task *task,
175 const struct nfs_client *clp)
176{
177#ifdef CONFIG_NFS_V4_1
178 if (nfs4_has_session(clp) &&
179 test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) {
180 rpc_restart_call_prepare(task);
181 return;
182 }
183#endif /* CONFIG_NFS_V4_1 */
184 rpc_restart_call(task);
185}
186
149/* nfs4xdr.c */ 187/* nfs4xdr.c */
150#ifdef CONFIG_NFS_V4 188#ifdef CONFIG_NFS_V4
151extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); 189extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
@@ -205,6 +243,38 @@ extern int nfs4_path_walk(struct nfs_server *server,
205 const char *path); 243 const char *path);
206#endif 244#endif
207 245
246/* read.c */
247extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
248
249/* write.c */
250extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
251
252/* nfs4proc.c */
253extern int _nfs4_call_sync(struct nfs_server *server,
254 struct rpc_message *msg,
255 struct nfs4_sequence_args *args,
256 struct nfs4_sequence_res *res,
257 int cache_reply);
258extern int _nfs4_call_sync_session(struct nfs_server *server,
259 struct rpc_message *msg,
260 struct nfs4_sequence_args *args,
261 struct nfs4_sequence_res *res,
262 int cache_reply);
263
264#ifdef CONFIG_NFS_V4_1
265extern void nfs41_sequence_free_slot(const struct nfs_client *,
266 struct nfs4_sequence_res *res);
267#endif /* CONFIG_NFS_V4_1 */
268
269static inline void nfs4_sequence_free_slot(const struct nfs_client *clp,
270 struct nfs4_sequence_res *res)
271{
272#ifdef CONFIG_NFS_V4_1
273 if (nfs4_has_session(clp))
274 nfs41_sequence_free_slot(clp, res);
275#endif /* CONFIG_NFS_V4_1 */
276}
277
208/* 278/*
209 * Determine the device name as a string 279 * Determine the device name as a string
210 */ 280 */
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index a2ab2529b5ca..ceda50aad73c 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -31,7 +31,7 @@ static inline void nfs_inc_server_stats(const struct nfs_server *server,
31 cpu = get_cpu(); 31 cpu = get_cpu();
32 iostats = per_cpu_ptr(server->io_stats, cpu); 32 iostats = per_cpu_ptr(server->io_stats, cpu);
33 iostats->events[stat]++; 33 iostats->events[stat]++;
34 put_cpu_no_resched(); 34 put_cpu();
35} 35}
36 36
37static inline void nfs_inc_stats(const struct inode *inode, 37static inline void nfs_inc_stats(const struct inode *inode,
@@ -50,7 +50,7 @@ static inline void nfs_add_server_stats(const struct nfs_server *server,
50 cpu = get_cpu(); 50 cpu = get_cpu();
51 iostats = per_cpu_ptr(server->io_stats, cpu); 51 iostats = per_cpu_ptr(server->io_stats, cpu);
52 iostats->bytes[stat] += addend; 52 iostats->bytes[stat] += addend;
53 put_cpu_no_resched(); 53 put_cpu();
54} 54}
55 55
56static inline void nfs_add_stats(const struct inode *inode, 56static inline void nfs_add_stats(const struct inode *inode,
@@ -71,7 +71,7 @@ static inline void nfs_add_fscache_stats(struct inode *inode,
71 cpu = get_cpu(); 71 cpu = get_cpu();
72 iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu); 72 iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
73 iostats->fscache[stat] += addend; 73 iostats->fscache[stat] += addend;
74 put_cpu_no_resched(); 74 put_cpu();
75} 75}
76#endif 76#endif
77 77
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index ca905a5bb1ba..38ef9eaec407 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -20,8 +20,116 @@
20# define NFSDBG_FACILITY NFSDBG_MOUNT 20# define NFSDBG_FACILITY NFSDBG_MOUNT
21#endif 21#endif
22 22
23/*
24 * Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4
25 */
26#define MNTPATHLEN (1024)
27
28/*
29 * XDR data type sizes
30 */
31#define encode_dirpath_sz (1 + XDR_QUADLEN(MNTPATHLEN))
32#define MNT_status_sz (1)
33#define MNT_fhs_status_sz (1)
34#define MNT_fhandle_sz XDR_QUADLEN(NFS2_FHSIZE)
35#define MNT_fhandle3_sz (1 + XDR_QUADLEN(NFS3_FHSIZE))
36#define MNT_authflav3_sz (1 + NFS_MAX_SECFLAVORS)
37
38/*
39 * XDR argument and result sizes
40 */
41#define MNT_enc_dirpath_sz encode_dirpath_sz
42#define MNT_dec_mountres_sz (MNT_status_sz + MNT_fhandle_sz)
43#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandle_sz + \
44 MNT_authflav3_sz)
45
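These macros count 4-byte XDR words: the dirpath is one length word plus up to 256 words of data, and an NFSv2 handle is a fixed 32 bytes, i.e. 8 words. The arithmetic spelled out (byte totals derived from the definitions above):

    #include <stdio.h>

    #define XDR_QUADLEN(n)  (((n) + 3) >> 2)   /* bytes -> 4-byte words */
    #define MNTPATHLEN      1024
    #define NFS2_FHSIZE     32

    int main(void)
    {
        int dirpath  = 1 + XDR_QUADLEN(MNTPATHLEN);  /* length word + 256 = 257 */
        int mountres = 1 + XDR_QUADLEN(NFS2_FHSIZE); /* status word + 8 = 9 */

        printf("MNT_enc_dirpath_sz  = %d words (%d bytes)\n", dirpath, dirpath * 4);
        printf("MNT_dec_mountres_sz = %d words (%d bytes)\n", mountres, mountres * 4);
        return 0;
    }
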
46/*
47 * Defined by RFC 1094, section A.5
48 */
49enum {
50 MOUNTPROC_NULL = 0,
51 MOUNTPROC_MNT = 1,
52 MOUNTPROC_DUMP = 2,
53 MOUNTPROC_UMNT = 3,
54 MOUNTPROC_UMNTALL = 4,
55 MOUNTPROC_EXPORT = 5,
56};
57
58/*
59 * Defined by RFC 1813, section 5.2
60 */
61enum {
62 MOUNTPROC3_NULL = 0,
63 MOUNTPROC3_MNT = 1,
64 MOUNTPROC3_DUMP = 2,
65 MOUNTPROC3_UMNT = 3,
66 MOUNTPROC3_UMNTALL = 4,
67 MOUNTPROC3_EXPORT = 5,
68};
69
23static struct rpc_program mnt_program; 70static struct rpc_program mnt_program;
24 71
72/*
73 * Defined by OpenGroup XNFS Version 3W, chapter 8
74 */
75enum mountstat {
76 MNT_OK = 0,
77 MNT_EPERM = 1,
78 MNT_ENOENT = 2,
79 MNT_EACCES = 13,
80 MNT_EINVAL = 22,
81};
82
83static struct {
84 u32 status;
85 int errno;
86} mnt_errtbl[] = {
87 { .status = MNT_OK, .errno = 0, },
88 { .status = MNT_EPERM, .errno = -EPERM, },
89 { .status = MNT_ENOENT, .errno = -ENOENT, },
90 { .status = MNT_EACCES, .errno = -EACCES, },
91 { .status = MNT_EINVAL, .errno = -EINVAL, },
92};
93
94/*
95 * Defined by RFC 1813, section 5.1.5
96 */
97enum mountstat3 {
98 MNT3_OK = 0, /* no error */
99 MNT3ERR_PERM = 1, /* Not owner */
100 MNT3ERR_NOENT = 2, /* No such file or directory */
101 MNT3ERR_IO = 5, /* I/O error */
102 MNT3ERR_ACCES = 13, /* Permission denied */
103 MNT3ERR_NOTDIR = 20, /* Not a directory */
104 MNT3ERR_INVAL = 22, /* Invalid argument */
105 MNT3ERR_NAMETOOLONG = 63, /* Filename too long */
106 MNT3ERR_NOTSUPP = 10004, /* Operation not supported */
107 MNT3ERR_SERVERFAULT = 10006, /* A failure on the server */
108};
109
110static struct {
111 u32 status;
112 int errno;
113} mnt3_errtbl[] = {
114 { .status = MNT3_OK, .errno = 0, },
115 { .status = MNT3ERR_PERM, .errno = -EPERM, },
116 { .status = MNT3ERR_NOENT, .errno = -ENOENT, },
117 { .status = MNT3ERR_IO, .errno = -EIO, },
118 { .status = MNT3ERR_ACCES, .errno = -EACCES, },
119 { .status = MNT3ERR_NOTDIR, .errno = -ENOTDIR, },
120 { .status = MNT3ERR_INVAL, .errno = -EINVAL, },
121 { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, },
122 { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, },
123 { .status = MNT3ERR_SERVERFAULT, .errno = -ESERVERFAULT, },
124};
125
126struct mountres {
127 int errno;
128 struct nfs_fh *fh;
129 unsigned int *auth_count;
130 rpc_authflavor_t *auth_flavors;
131};
132
25struct mnt_fhstatus { 133struct mnt_fhstatus {
26 u32 status; 134 u32 status;
27 struct nfs_fh *fh; 135 struct nfs_fh *fh;
@@ -35,8 +143,10 @@ struct mnt_fhstatus {
35 */ 143 */
36int nfs_mount(struct nfs_mount_request *info) 144int nfs_mount(struct nfs_mount_request *info)
37{ 145{
38 struct mnt_fhstatus result = { 146 struct mountres result = {
39 .fh = info->fh 147 .fh = info->fh,
148 .auth_count = info->auth_flav_len,
149 .auth_flavors = info->auth_flavs,
40 }; 150 };
41 struct rpc_message msg = { 151 struct rpc_message msg = {
42 .rpc_argp = info->dirpath, 152 .rpc_argp = info->dirpath,
@@ -68,14 +178,14 @@ int nfs_mount(struct nfs_mount_request *info)
68 if (info->version == NFS_MNT3_VERSION) 178 if (info->version == NFS_MNT3_VERSION)
69 msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT]; 179 msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT];
70 else 180 else
71 msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT]; 181 msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT];
72 182
73 status = rpc_call_sync(mnt_clnt, &msg, 0); 183 status = rpc_call_sync(mnt_clnt, &msg, 0);
74 rpc_shutdown_client(mnt_clnt); 184 rpc_shutdown_client(mnt_clnt);
75 185
76 if (status < 0) 186 if (status < 0)
77 goto out_call_err; 187 goto out_call_err;
78 if (result.status != 0) 188 if (result.errno != 0)
79 goto out_mnt_err; 189 goto out_mnt_err;
80 190
81 dprintk("NFS: MNT request succeeded\n"); 191 dprintk("NFS: MNT request succeeded\n");
@@ -86,72 +196,215 @@ out:
86 196
87out_clnt_err: 197out_clnt_err:
88 status = PTR_ERR(mnt_clnt); 198 status = PTR_ERR(mnt_clnt);
89 dprintk("NFS: failed to create RPC client, status=%d\n", status); 199 dprintk("NFS: failed to create MNT RPC client, status=%d\n", status);
90 goto out; 200 goto out;
91 201
92out_call_err: 202out_call_err:
93 dprintk("NFS: failed to start MNT request, status=%d\n", status); 203 dprintk("NFS: MNT request failed, status=%d\n", status);
94 goto out; 204 goto out;
95 205
96out_mnt_err: 206out_mnt_err:
97 dprintk("NFS: MNT server returned result %d\n", result.status); 207 dprintk("NFS: MNT server returned result %d\n", result.errno);
98 status = nfs_stat_to_errno(result.status); 208 status = result.errno;
99 goto out; 209 goto out;
100} 210}
101 211
102/* 212/*
103 * XDR encode/decode functions for MOUNT 213 * XDR encode/decode functions for MOUNT
104 */ 214 */
105static int xdr_encode_dirpath(struct rpc_rqst *req, __be32 *p, 215
106 const char *path) 216static int encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
217{
218 const u32 pathname_len = strlen(pathname);
219 __be32 *p;
220
221 if (unlikely(pathname_len > MNTPATHLEN))
222 return -EIO;
223
224 p = xdr_reserve_space(xdr, sizeof(u32) + pathname_len);
225 if (unlikely(p == NULL))
226 return -EIO;
227 xdr_encode_opaque(p, pathname, pathname_len);
228
229 return 0;
230}
231
232static int mnt_enc_dirpath(struct rpc_rqst *req, __be32 *p,
233 const char *dirpath)
234{
235 struct xdr_stream xdr;
236
237 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
238 return encode_mntdirpath(&xdr, dirpath);
239}
240
241/*
242 * RFC 1094: "A non-zero status indicates some sort of error. In this
243 * case, the status is a UNIX error number." This can be problematic
244 * if the server and client use different errno values for the same
245 * error.
246 *
247 * However, the OpenGroup XNFS spec provides a simple mapping that is
248 * independent of local errno values on the server and the client.
249 */
250static int decode_status(struct xdr_stream *xdr, struct mountres *res)
107{ 251{
108 p = xdr_encode_string(p, path); 252 unsigned int i;
253 u32 status;
254 __be32 *p;
255
256 p = xdr_inline_decode(xdr, sizeof(status));
257 if (unlikely(p == NULL))
258 return -EIO;
259 status = ntohl(*p);
109 260
110 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 261 for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
262 if (mnt_errtbl[i].status == status) {
263 res->errno = mnt_errtbl[i].errno;
264 return 0;
265 }
266 }
267
268 dprintk("NFS: unrecognized MNT status code: %u\n", status);
269 res->errno = -EACCES;
111 return 0; 270 return 0;
112} 271}
113 272
114static int xdr_decode_fhstatus(struct rpc_rqst *req, __be32 *p, 273static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)
115 struct mnt_fhstatus *res)
116{ 274{
117 struct nfs_fh *fh = res->fh; 275 struct nfs_fh *fh = res->fh;
276 __be32 *p;
277
278 p = xdr_inline_decode(xdr, NFS2_FHSIZE);
279 if (unlikely(p == NULL))
280 return -EIO;
281
282 fh->size = NFS2_FHSIZE;
283 memcpy(fh->data, p, NFS2_FHSIZE);
284 return 0;
285}
286
287static int mnt_dec_mountres(struct rpc_rqst *req, __be32 *p,
288 struct mountres *res)
289{
290 struct xdr_stream xdr;
291 int status;
292
293 xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
294
295 status = decode_status(&xdr, res);
296 if (unlikely(status != 0 || res->errno != 0))
297 return status;
298 return decode_fhandle(&xdr, res);
299}
300
301static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
302{
303 unsigned int i;
304 u32 status;
305 __be32 *p;
118 306
119 if ((res->status = ntohl(*p++)) == 0) { 307 p = xdr_inline_decode(xdr, sizeof(status));
120 fh->size = NFS2_FHSIZE; 308 if (unlikely(p == NULL))
121 memcpy(fh->data, p, NFS2_FHSIZE); 309 return -EIO;
310 status = ntohl(*p);
311
 312 for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
313 if (mnt3_errtbl[i].status == status) {
314 res->errno = mnt3_errtbl[i].errno;
315 return 0;
316 }
122 } 317 }
318
319 dprintk("NFS: unrecognized MNT3 status code: %u\n", status);
320 res->errno = -EACCES;
123 return 0; 321 return 0;
124} 322}
125 323
126static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, 324static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)
127 struct mnt_fhstatus *res)
128{ 325{
129 struct nfs_fh *fh = res->fh; 326 struct nfs_fh *fh = res->fh;
130 unsigned size; 327 u32 size;
131 328 __be32 *p;
132 if ((res->status = ntohl(*p++)) == 0) { 329
133 size = ntohl(*p++); 330 p = xdr_inline_decode(xdr, sizeof(size));
134 if (size <= NFS3_FHSIZE && size != 0) { 331 if (unlikely(p == NULL))
135 fh->size = size; 332 return -EIO;
136 memcpy(fh->data, p, size); 333
137 } else 334 size = ntohl(*p++);
138 res->status = -EBADHANDLE; 335 if (size > NFS3_FHSIZE || size == 0)
336 return -EIO;
337
338 p = xdr_inline_decode(xdr, size);
339 if (unlikely(p == NULL))
340 return -EIO;
341
342 fh->size = size;
343 memcpy(fh->data, p, size);
344 return 0;
345}
346
347static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
348{
349 rpc_authflavor_t *flavors = res->auth_flavors;
350 unsigned int *count = res->auth_count;
351 u32 entries, i;
352 __be32 *p;
353
354 if (*count == 0)
355 return 0;
356
357 p = xdr_inline_decode(xdr, sizeof(entries));
358 if (unlikely(p == NULL))
359 return -EIO;
360 entries = ntohl(*p);
361 dprintk("NFS: received %u auth flavors\n", entries);
362 if (entries > NFS_MAX_SECFLAVORS)
363 entries = NFS_MAX_SECFLAVORS;
364
365 p = xdr_inline_decode(xdr, sizeof(u32) * entries);
366 if (unlikely(p == NULL))
367 return -EIO;
368
369 if (entries > *count)
370 entries = *count;
371
372 for (i = 0; i < entries; i++) {
373 flavors[i] = ntohl(*p++);
374 dprintk("NFS:\tflavor %u: %d\n", i, flavors[i]);
139 } 375 }
376 *count = i;
377
140 return 0; 378 return 0;
141} 379}
142 380
143#define MNT_dirpath_sz (1 + 256) 381static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p,
144#define MNT_fhstatus_sz (1 + 8) 382 struct mountres *res)
145#define MNT_fhstatus3_sz (1 + 16) 383{
384 struct xdr_stream xdr;
385 int status;
386
387 xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
388
389 status = decode_fhs_status(&xdr, res);
390 if (unlikely(status != 0 || res->errno != 0))
391 return status;
392 status = decode_fhandle3(&xdr, res);
393 if (unlikely(status != 0)) {
394 res->errno = -EBADHANDLE;
395 return 0;
396 }
397 return decode_auth_flavors(&xdr, res);
398}
146 399
147static struct rpc_procinfo mnt_procedures[] = { 400static struct rpc_procinfo mnt_procedures[] = {
148 [MNTPROC_MNT] = { 401 [MOUNTPROC_MNT] = {
149 .p_proc = MNTPROC_MNT, 402 .p_proc = MOUNTPROC_MNT,
150 .p_encode = (kxdrproc_t) xdr_encode_dirpath, 403 .p_encode = (kxdrproc_t)mnt_enc_dirpath,
151 .p_decode = (kxdrproc_t) xdr_decode_fhstatus, 404 .p_decode = (kxdrproc_t)mnt_dec_mountres,
152 .p_arglen = MNT_dirpath_sz, 405 .p_arglen = MNT_enc_dirpath_sz,
153 .p_replen = MNT_fhstatus_sz, 406 .p_replen = MNT_dec_mountres_sz,
154 .p_statidx = MNTPROC_MNT, 407 .p_statidx = MOUNTPROC_MNT,
155 .p_name = "MOUNT", 408 .p_name = "MOUNT",
156 }, 409 },
157}; 410};
@@ -159,10 +412,10 @@ static struct rpc_procinfo mnt_procedures[] = {
159static struct rpc_procinfo mnt3_procedures[] = { 412static struct rpc_procinfo mnt3_procedures[] = {
160 [MOUNTPROC3_MNT] = { 413 [MOUNTPROC3_MNT] = {
161 .p_proc = MOUNTPROC3_MNT, 414 .p_proc = MOUNTPROC3_MNT,
162 .p_encode = (kxdrproc_t) xdr_encode_dirpath, 415 .p_encode = (kxdrproc_t)mnt_enc_dirpath,
163 .p_decode = (kxdrproc_t) xdr_decode_fhstatus3, 416 .p_decode = (kxdrproc_t)mnt_dec_mountres3,
164 .p_arglen = MNT_dirpath_sz, 417 .p_arglen = MNT_enc_dirpath_sz,
165 .p_replen = MNT_fhstatus3_sz, 418 .p_replen = MNT_dec_mountres3_sz,
166 .p_statidx = MOUNTPROC3_MNT, 419 .p_statidx = MOUNTPROC3_MNT,
167 .p_name = "MOUNT", 420 .p_name = "MOUNT",
168 }, 421 },
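
For context, the mnt_errtbl/mnt3_errtbl tables consulted by decode_status() and decode_fhs_status() above are defined earlier in fs/nfs/mount_clnt.c and are not part of this hunk. Their shape can be inferred from the lookups (mnt_errtbl[i].status, mnt_errtbl[i].errno); a minimal sketch, with hypothetical entries:

	/*
	 * Sketch only: field names inferred from the lookups above;
	 * the entry values shown here are illustrative, not the
	 * actual table contents.
	 */
	static struct {
		u32 status;	/* status code on the wire */
		int errno;	/* local errno to map it to */
	} mnt_errtbl[] = {
		{ .status = 0,	.errno = 0,		},	/* MNT_OK */
		{ .status = 13,	.errno = -EACCES,	},	/* access denied */
		/* ... one entry per XNFS-defined status value ... */
	};

This is what makes the decode independent of server-side errno values, as the RFC 1094 comment above explains.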
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 64a288ee046d..40c766782891 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -65,6 +65,11 @@ char *nfs_path(const char *base,
65 dentry = dentry->d_parent; 65 dentry = dentry->d_parent;
66 } 66 }
67 spin_unlock(&dcache_lock); 67 spin_unlock(&dcache_lock);
68 if (*end != '/') {
69 if (--buflen < 0)
70 goto Elong;
71 *--end = '/';
72 }
68 namelen = strlen(base); 73 namelen = strlen(base);
69 /* Strip off excess slashes in base string */ 74 /* Strip off excess slashes in base string */
70 while (namelen > 0 && base[namelen - 1] == '/') 75 while (namelen > 0 && base[namelen - 1] == '/')
@@ -154,7 +159,7 @@ out_err:
154 goto out; 159 goto out;
155out_follow: 160out_follow:
156 while (d_mountpoint(nd->path.dentry) && 161 while (d_mountpoint(nd->path.dentry) &&
157 follow_down(&nd->path.mnt, &nd->path.dentry)) 162 follow_down(&nd->path))
158 ; 163 ;
159 err = 0; 164 err = 0;
160 goto out; 165 goto out;
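
The nfs_path() change above guarantees a '/' separator between the export base and the dentry-derived tail. nfs_path() builds its result backwards from the end of a caller-supplied buffer; a standalone sketch of that technique follows (a hypothetical helper, not the kernel function, and it assumes buflen is already known to be large enough where the kernel version jumps to Elong):

	/* Sketch: join base and leaf backwards into the tail of buf. */
	static char *join_path_tail(char *buf, size_t buflen,
				    const char *base, const char *leaf)
	{
		char *end = buf + buflen;
		size_t n = strlen(leaf);

		*--end = '\0';
		end -= n;
		memcpy(end, leaf, n);
		if (*end != '/')	/* the guard this patch adds */
			*--end = '/';
		n = strlen(base);
		while (n > 0 && base[n - 1] == '/')	/* strip excess slashes */
			n--;
		end -= n;
		memcpy(end, base, n);
		return end;	/* points into buf, not at buf[0] */
	}

Building backwards means the returned pointer is somewhere inside the buffer, which is why the real function returns end rather than buffer.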
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 6bbf0e6daad2..bac60515a4b3 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -207,8 +207,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
207 status = nfs_revalidate_inode(server, inode); 207 status = nfs_revalidate_inode(server, inode);
208 if (status < 0) 208 if (status < 0)
209 return ERR_PTR(status); 209 return ERR_PTR(status);
210 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
211 nfs_zap_acl_cache(inode);
212 acl = nfs3_get_cached_acl(inode, type); 210 acl = nfs3_get_cached_acl(inode, type);
213 if (acl != ERR_PTR(-EAGAIN)) 211 if (acl != ERR_PTR(-EAGAIN))
214 return acl; 212 return acl;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 84345deab26f..6ea07a3c75d4 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
44 NFS4CLNT_RECLAIM_REBOOT, 44 NFS4CLNT_RECLAIM_REBOOT,
45 NFS4CLNT_RECLAIM_NOGRACE, 45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_SESSION_SETUP,
47}; 48};
48 49
49/* 50/*
@@ -177,6 +178,14 @@ struct nfs4_state_recovery_ops {
177 int state_flag_bit; 178 int state_flag_bit;
178 int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *); 179 int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
179 int (*recover_lock)(struct nfs4_state *, struct file_lock *); 180 int (*recover_lock)(struct nfs4_state *, struct file_lock *);
181 int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
182 struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
183};
184
185struct nfs4_state_maintenance_ops {
186 int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *);
187 struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);
188 int (*renew_lease)(struct nfs_client *, struct rpc_cred *);
180}; 189};
181 190
182extern const struct dentry_operations nfs4_dentry_operations; 191extern const struct dentry_operations nfs4_dentry_operations;
@@ -193,6 +202,7 @@ extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struc
193extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); 202extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
194extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); 203extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
195extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); 204extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
205extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
196extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait); 206extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait);
197extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); 207extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
198extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); 208extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
@@ -200,8 +210,32 @@ extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fh
200extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 210extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
201 struct nfs4_fs_locations *fs_locations, struct page *page); 211 struct nfs4_fs_locations *fs_locations, struct page *page);
202 212
203extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; 213extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[];
204extern struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops; 214extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[];
215#if defined(CONFIG_NFS_V4_1)
216extern int nfs4_setup_sequence(struct nfs_client *clp,
217 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
218 int cache_reply, struct rpc_task *task);
219extern void nfs4_destroy_session(struct nfs4_session *session);
220extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
221extern int nfs4_proc_create_session(struct nfs_client *, int reset);
222extern int nfs4_proc_destroy_session(struct nfs4_session *);
223extern int nfs4_init_session(struct nfs_server *server);
 224#else /* CONFIG_NFS_V4_1 */
225static inline int nfs4_setup_sequence(struct nfs_client *clp,
226 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
227 int cache_reply, struct rpc_task *task)
228{
229 return 0;
230}
231
232static inline int nfs4_init_session(struct nfs_server *server)
233{
234 return 0;
235}
236#endif /* CONFIG_NFS_V4_1 */
237
238extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[];
205 239
206extern const u32 nfs4_fattr_bitmap[2]; 240extern const u32 nfs4_fattr_bitmap[2];
207extern const u32 nfs4_statfs_bitmap[2]; 241extern const u32 nfs4_statfs_bitmap[2];
@@ -216,7 +250,12 @@ extern void nfs4_kill_renewd(struct nfs_client *);
216extern void nfs4_renew_state(struct work_struct *); 250extern void nfs4_renew_state(struct work_struct *);
217 251
218/* nfs4state.c */ 252/* nfs4state.c */
253struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
219struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); 254struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
255#if defined(CONFIG_NFS_V4_1)
256struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
257struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
258#endif /* CONFIG_NFS_V4_1 */
220 259
221extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 260extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
222extern void nfs4_put_state_owner(struct nfs4_state_owner *); 261extern void nfs4_put_state_owner(struct nfs4_state_owner *);
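
With this change nfs4_reboot_recovery_ops and nfs4_nograce_recovery_ops become arrays of pointers rather than single structs, one entry per minor version. The selection logic is outside this hunk, but consumers presumably index by the client's minor version, along these lines:

	/* Sketch: using cl_minorversion as the index is an assumption
	 * here; the actual lookup is not part of this hunk. */
	struct nfs4_state_recovery_ops *ops =
		nfs4_reboot_recovery_ops[clp->cl_minorversion];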
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4674f8092da8..6917311f201c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -45,14 +45,16 @@
45#include <linux/nfs4.h> 45#include <linux/nfs4.h>
46#include <linux/nfs_fs.h> 46#include <linux/nfs_fs.h>
47#include <linux/nfs_page.h> 47#include <linux/nfs_page.h>
48#include <linux/smp_lock.h>
49#include <linux/namei.h> 48#include <linux/namei.h>
50#include <linux/mount.h> 49#include <linux/mount.h>
50#include <linux/module.h>
51#include <linux/sunrpc/bc_xprt.h>
51 52
52#include "nfs4_fs.h" 53#include "nfs4_fs.h"
53#include "delegation.h" 54#include "delegation.h"
54#include "internal.h" 55#include "internal.h"
55#include "iostat.h" 56#include "iostat.h"
57#include "callback.h"
56 58
57#define NFSDBG_FACILITY NFSDBG_PROC 59#define NFSDBG_FACILITY NFSDBG_PROC
58 60
@@ -247,7 +249,25 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
247 ret = nfs4_wait_clnt_recover(clp); 249 ret = nfs4_wait_clnt_recover(clp);
248 if (ret == 0) 250 if (ret == 0)
249 exception->retry = 1; 251 exception->retry = 1;
252#if !defined(CONFIG_NFS_V4_1)
250 break; 253 break;
254#else /* !defined(CONFIG_NFS_V4_1) */
255 if (!nfs4_has_session(server->nfs_client))
256 break;
257 /* FALLTHROUGH */
258 case -NFS4ERR_BADSESSION:
259 case -NFS4ERR_BADSLOT:
260 case -NFS4ERR_BAD_HIGH_SLOT:
261 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
262 case -NFS4ERR_DEADSESSION:
263 case -NFS4ERR_SEQ_FALSE_RETRY:
264 case -NFS4ERR_SEQ_MISORDERED:
265 dprintk("%s ERROR: %d Reset session\n", __func__,
266 errorcode);
267 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
268 exception->retry = 1;
269 /* FALLTHROUGH */
270#endif /* !defined(CONFIG_NFS_V4_1) */
251 case -NFS4ERR_FILE_OPEN: 271 case -NFS4ERR_FILE_OPEN:
252 case -NFS4ERR_GRACE: 272 case -NFS4ERR_GRACE:
253 case -NFS4ERR_DELAY: 273 case -NFS4ERR_DELAY:
@@ -271,6 +291,353 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
271 spin_unlock(&clp->cl_lock); 291 spin_unlock(&clp->cl_lock);
272} 292}
273 293
294#if defined(CONFIG_NFS_V4_1)
295
296/*
297 * nfs4_free_slot - free a slot and efficiently update slot table.
298 *
299 * freeing a slot is trivially done by clearing its respective bit
300 * in the bitmap.
301 * If the freed slotid equals highest_used_slotid we want to update it
302 * so that the server would be able to size down the slot table if needed,
303 * otherwise we know that the highest_used_slotid is still in use.
304 * When updating highest_used_slotid there may be "holes" in the bitmap
305 * so we need to scan down from highest_used_slotid to 0 looking for the now
306 * highest slotid in use.
307 * If none found, highest_used_slotid is set to -1.
308 */
309static void
310nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
311{
312 int slotid = free_slotid;
313
314 spin_lock(&tbl->slot_tbl_lock);
315 /* clear used bit in bitmap */
316 __clear_bit(slotid, tbl->used_slots);
317
318 /* update highest_used_slotid when it is freed */
319 if (slotid == tbl->highest_used_slotid) {
320 slotid = find_last_bit(tbl->used_slots, tbl->max_slots);
321 if (slotid >= 0 && slotid < tbl->max_slots)
322 tbl->highest_used_slotid = slotid;
323 else
324 tbl->highest_used_slotid = -1;
325 }
326 rpc_wake_up_next(&tbl->slot_tbl_waitq);
327 spin_unlock(&tbl->slot_tbl_lock);
328 dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__,
329 free_slotid, tbl->highest_used_slotid);
330}
331
332void nfs41_sequence_free_slot(const struct nfs_client *clp,
333 struct nfs4_sequence_res *res)
334{
335 struct nfs4_slot_table *tbl;
336
337 if (!nfs4_has_session(clp)) {
338 dprintk("%s: No session\n", __func__);
339 return;
340 }
341 tbl = &clp->cl_session->fc_slot_table;
342 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) {
343 dprintk("%s: No slot\n", __func__);
344 /* just wake up the next guy waiting since
 345 * we may not have consumed a slot after all */
346 rpc_wake_up_next(&tbl->slot_tbl_waitq);
347 return;
348 }
349 nfs4_free_slot(tbl, res->sr_slotid);
350 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
351}
352
353static void nfs41_sequence_done(struct nfs_client *clp,
354 struct nfs4_sequence_res *res,
355 int rpc_status)
356{
357 unsigned long timestamp;
358 struct nfs4_slot_table *tbl;
359 struct nfs4_slot *slot;
360
361 /*
362 * sr_status remains 1 if an RPC level error occurred. The server
 363 * may or may not have processed the sequence operation.
364 * Proceed as if the server received and processed the sequence
365 * operation.
366 */
367 if (res->sr_status == 1)
368 res->sr_status = NFS_OK;
369
370 /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */
371 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE)
372 goto out;
373
374 tbl = &clp->cl_session->fc_slot_table;
375 slot = tbl->slots + res->sr_slotid;
376
377 if (res->sr_status == 0) {
378 /* Update the slot's sequence and clientid lease timer */
379 ++slot->seq_nr;
380 timestamp = res->sr_renewal_time;
381 spin_lock(&clp->cl_lock);
382 if (time_before(clp->cl_last_renewal, timestamp))
383 clp->cl_last_renewal = timestamp;
384 spin_unlock(&clp->cl_lock);
385 return;
386 }
387out:
388 /* The session may be reset by one of the error handlers. */
 389 dprintk("%s: Error %d, freeing the slot\n", __func__, res->sr_status);
390 nfs41_sequence_free_slot(clp, res);
391}
392
393/*
394 * nfs4_find_slot - efficiently look for a free slot
395 *
396 * nfs4_find_slot looks for an unset bit in the used_slots bitmap.
397 * If found, we mark the slot as used, update the highest_used_slotid,
398 * and respectively set up the sequence operation args.
399 * The slot number is returned if found, or NFS4_MAX_SLOT_TABLE otherwise.
400 *
 401 * Note: must be called while holding the slot_tbl_lock.
402 */
403static u8
404nfs4_find_slot(struct nfs4_slot_table *tbl, struct rpc_task *task)
405{
406 int slotid;
407 u8 ret_id = NFS4_MAX_SLOT_TABLE;
408 BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE);
409
410 dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n",
411 __func__, tbl->used_slots[0], tbl->highest_used_slotid,
412 tbl->max_slots);
413 slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots);
414 if (slotid >= tbl->max_slots)
415 goto out;
416 __set_bit(slotid, tbl->used_slots);
417 if (slotid > tbl->highest_used_slotid)
418 tbl->highest_used_slotid = slotid;
419 ret_id = slotid;
420out:
 421 dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d\n",
422 __func__, tbl->used_slots[0], tbl->highest_used_slotid, ret_id);
423 return ret_id;
424}
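
Taken together, nfs4_free_slot() and nfs4_find_slot() treat the slot table as a bitmap plus a high-water mark. A minimal sketch of the same allocate/free pair with the locking, wait queue, and per-slot sequence state stripped away (it assumes the kernel bitmap helpers used above):

	struct slot_tbl_sketch {
		unsigned long used[BITS_TO_LONGS(NFS4_MAX_SLOT_TABLE)];
		int max_slots;
		int highest_used;	/* -1 when no slot is in use */
	};

	static int sketch_alloc(struct slot_tbl_sketch *t)
	{
		int id = find_first_zero_bit(t->used, t->max_slots);

		if (id >= t->max_slots)
			return -1;	/* caller sleeps on the wait queue */
		__set_bit(id, t->used);
		if (id > t->highest_used)
			t->highest_used = id;
		return id;
	}

	static void sketch_free(struct slot_tbl_sketch *t, int id)
	{
		__clear_bit(id, t->used);
		if (id == t->highest_used) {
			/* find_last_bit() returns max_slots when empty */
			id = find_last_bit(t->used, t->max_slots);
			t->highest_used = id < t->max_slots ? id : -1;
		}
	}

Keeping highest_used accurate is what lets the server shrink the slot table, as the nfs4_free_slot() comment notes.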
425
426static int nfs4_recover_session(struct nfs4_session *session)
427{
428 struct nfs_client *clp = session->clp;
429 int ret;
430
431 for (;;) {
432 ret = nfs4_wait_clnt_recover(clp);
433 if (ret != 0)
434 return ret;
435 if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
436 break;
437 nfs4_schedule_state_manager(clp);
438 }
439 return 0;
440}
441
442static int nfs41_setup_sequence(struct nfs4_session *session,
443 struct nfs4_sequence_args *args,
444 struct nfs4_sequence_res *res,
445 int cache_reply,
446 struct rpc_task *task)
447{
448 struct nfs4_slot *slot;
449 struct nfs4_slot_table *tbl;
450 int status = 0;
451 u8 slotid;
452
453 dprintk("--> %s\n", __func__);
454 /* slot already allocated? */
455 if (res->sr_slotid != NFS4_MAX_SLOT_TABLE)
456 return 0;
457
458 memset(res, 0, sizeof(*res));
459 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
460 tbl = &session->fc_slot_table;
461
462 spin_lock(&tbl->slot_tbl_lock);
463 if (test_bit(NFS4CLNT_SESSION_SETUP, &session->clp->cl_state)) {
464 if (tbl->highest_used_slotid != -1) {
465 rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
466 spin_unlock(&tbl->slot_tbl_lock);
467 dprintk("<-- %s: Session reset: draining\n", __func__);
468 return -EAGAIN;
469 }
470
471 /* The slot table is empty; start the reset thread */
472 dprintk("%s Session Reset\n", __func__);
473 spin_unlock(&tbl->slot_tbl_lock);
474 status = nfs4_recover_session(session);
475 if (status)
476 return status;
477 spin_lock(&tbl->slot_tbl_lock);
478 }
479
480 slotid = nfs4_find_slot(tbl, task);
481 if (slotid == NFS4_MAX_SLOT_TABLE) {
482 rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
483 spin_unlock(&tbl->slot_tbl_lock);
484 dprintk("<-- %s: no free slots\n", __func__);
485 return -EAGAIN;
486 }
487 spin_unlock(&tbl->slot_tbl_lock);
488
489 slot = tbl->slots + slotid;
490 args->sa_session = session;
491 args->sa_slotid = slotid;
492 args->sa_cache_this = cache_reply;
493
494 dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
495
496 res->sr_session = session;
497 res->sr_slotid = slotid;
498 res->sr_renewal_time = jiffies;
499 /*
500 * sr_status is only set in decode_sequence, and so will remain
501 * set to 1 if an rpc level failure occurs.
502 */
503 res->sr_status = 1;
504 return 0;
505}
506
507int nfs4_setup_sequence(struct nfs_client *clp,
508 struct nfs4_sequence_args *args,
509 struct nfs4_sequence_res *res,
510 int cache_reply,
511 struct rpc_task *task)
512{
513 int ret = 0;
514
515 dprintk("--> %s clp %p session %p sr_slotid %d\n",
516 __func__, clp, clp->cl_session, res->sr_slotid);
517
518 if (!nfs4_has_session(clp))
519 goto out;
520 ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply,
521 task);
522 if (ret != -EAGAIN) {
523 /* terminate rpc task */
524 task->tk_status = ret;
525 task->tk_action = NULL;
526 }
527out:
528 dprintk("<-- %s status=%d\n", __func__, ret);
529 return ret;
530}
531
532struct nfs41_call_sync_data {
533 struct nfs_client *clp;
534 struct nfs4_sequence_args *seq_args;
535 struct nfs4_sequence_res *seq_res;
536 int cache_reply;
537};
538
539static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
540{
541 struct nfs41_call_sync_data *data = calldata;
542
543 dprintk("--> %s data->clp->cl_session %p\n", __func__,
544 data->clp->cl_session);
545 if (nfs4_setup_sequence(data->clp, data->seq_args,
546 data->seq_res, data->cache_reply, task))
547 return;
548 rpc_call_start(task);
549}
550
551static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
552{
553 struct nfs41_call_sync_data *data = calldata;
554
555 nfs41_sequence_done(data->clp, data->seq_res, task->tk_status);
556 nfs41_sequence_free_slot(data->clp, data->seq_res);
557}
558
559struct rpc_call_ops nfs41_call_sync_ops = {
560 .rpc_call_prepare = nfs41_call_sync_prepare,
561 .rpc_call_done = nfs41_call_sync_done,
562};
563
564static int nfs4_call_sync_sequence(struct nfs_client *clp,
565 struct rpc_clnt *clnt,
566 struct rpc_message *msg,
567 struct nfs4_sequence_args *args,
568 struct nfs4_sequence_res *res,
569 int cache_reply)
570{
571 int ret;
572 struct rpc_task *task;
573 struct nfs41_call_sync_data data = {
574 .clp = clp,
575 .seq_args = args,
576 .seq_res = res,
577 .cache_reply = cache_reply,
578 };
579 struct rpc_task_setup task_setup = {
580 .rpc_client = clnt,
581 .rpc_message = msg,
582 .callback_ops = &nfs41_call_sync_ops,
583 .callback_data = &data
584 };
585
586 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
587 task = rpc_run_task(&task_setup);
588 if (IS_ERR(task))
589 ret = PTR_ERR(task);
590 else {
591 ret = task->tk_status;
592 rpc_put_task(task);
593 }
594 return ret;
595}
596
597int _nfs4_call_sync_session(struct nfs_server *server,
598 struct rpc_message *msg,
599 struct nfs4_sequence_args *args,
600 struct nfs4_sequence_res *res,
601 int cache_reply)
602{
603 return nfs4_call_sync_sequence(server->nfs_client, server->client,
604 msg, args, res, cache_reply);
605}
606
607#endif /* CONFIG_NFS_V4_1 */
608
609int _nfs4_call_sync(struct nfs_server *server,
610 struct rpc_message *msg,
611 struct nfs4_sequence_args *args,
612 struct nfs4_sequence_res *res,
613 int cache_reply)
614{
615 args->sa_session = res->sr_session = NULL;
616 return rpc_call_sync(server->client, msg, 0);
617}
618
619#define nfs4_call_sync(server, msg, args, res, cache_reply) \
620 (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \
621 &(res)->seq_res, (cache_reply))
622
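The nfs4_call_sync() macro above dispatches through a per-client function pointer, so every converted call site below picks up the right behaviour automatically: sessionless v4.0 clients get _nfs4_call_sync(), v4.1 clients get _nfs4_call_sync_session(). The wiring happens at client-initialisation time, outside this hunk; presumably something like:

	/* Sketch only: the actual assignment lives in the client
	 * setup code, not in this patch. */
	clp->cl_call_sync = nfs4_has_session(clp) ?
				_nfs4_call_sync_session : _nfs4_call_sync;
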
623static void nfs4_sequence_done(const struct nfs_server *server,
624 struct nfs4_sequence_res *res, int rpc_status)
625{
626#ifdef CONFIG_NFS_V4_1
627 if (nfs4_has_session(server->nfs_client))
628 nfs41_sequence_done(server->nfs_client, res, rpc_status);
629#endif /* CONFIG_NFS_V4_1 */
630}
631
632/* no restart, therefore free slot here */
633static void nfs4_sequence_done_free_slot(const struct nfs_server *server,
634 struct nfs4_sequence_res *res,
635 int rpc_status)
636{
637 nfs4_sequence_done(server, res, rpc_status);
638 nfs4_sequence_free_slot(server->nfs_client, res);
639}
640
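The split between nfs4_sequence_done() and nfs4_sequence_free_slot() lets an operation that may be restarted keep its slot across the retry; only call sites that cannot restart use the combined helper above. The pairing at a typical async completion, in sketch form (mirroring nfs4_close_done() and nfs4_locku_done() below):

	/* Sketch: typical rpc_call_done pairing */
	nfs4_sequence_done(server, &res->seq_res, task->tk_status);
	if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
		nfs4_restart_rpc(task, server->nfs_client);
		return;	/* keep the slot for the retry */
	}
	nfs4_sequence_free_slot(server->nfs_client, &res->seq_res);
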
274static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) 641static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
275{ 642{
276 struct nfs_inode *nfsi = NFS_I(dir); 643 struct nfs_inode *nfsi = NFS_I(dir);
@@ -312,6 +679,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
312 p->o_res.server = p->o_arg.server; 679 p->o_res.server = p->o_arg.server;
313 nfs_fattr_init(&p->f_attr); 680 nfs_fattr_init(&p->f_attr);
314 nfs_fattr_init(&p->dir_attr); 681 nfs_fattr_init(&p->dir_attr);
682 p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
315} 683}
316 684
317static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 685static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
@@ -804,16 +1172,30 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
804 err = _nfs4_open_delegation_recall(ctx, state, stateid); 1172 err = _nfs4_open_delegation_recall(ctx, state, stateid);
805 switch (err) { 1173 switch (err) {
806 case 0: 1174 case 0:
807 return err; 1175 case -ENOENT:
1176 case -ESTALE:
1177 goto out;
808 case -NFS4ERR_STALE_CLIENTID: 1178 case -NFS4ERR_STALE_CLIENTID:
809 case -NFS4ERR_STALE_STATEID: 1179 case -NFS4ERR_STALE_STATEID:
810 case -NFS4ERR_EXPIRED: 1180 case -NFS4ERR_EXPIRED:
811 /* Don't recall a delegation if it was lost */ 1181 /* Don't recall a delegation if it was lost */
812 nfs4_schedule_state_recovery(server->nfs_client); 1182 nfs4_schedule_state_recovery(server->nfs_client);
813 return err; 1183 goto out;
1184 case -ERESTARTSYS:
1185 /*
1186 * The show must go on: exit, but mark the
1187 * stateid as needing recovery.
1188 */
1189 case -NFS4ERR_ADMIN_REVOKED:
1190 case -NFS4ERR_BAD_STATEID:
1191 nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
1192 case -ENOMEM:
1193 err = 0;
1194 goto out;
814 } 1195 }
815 err = nfs4_handle_exception(server, err, &exception); 1196 err = nfs4_handle_exception(server, err, &exception);
816 } while (exception.retry); 1197 } while (exception.retry);
1198out:
817 return err; 1199 return err;
818} 1200}
819 1201
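
nfs4_open_delegation_recall() above follows the standard retry idiom used throughout nfs4proc.c: terminal errors jump to out, everything else funnels through nfs4_handle_exception(), which may sleep, trigger recovery, and set the retry flag. In sketch form (the helper name is hypothetical):

	/* Sketch of the retry idiom wrapped around every _nfs4_* helper */
	struct nfs4_exception exception = { };
	int err;

	do {
		err = _nfs4_do_something(server, ...);
		/* terminal cases break out above; the rest come here */
		err = nfs4_handle_exception(server, err, &exception);
	} while (exception.retry);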
@@ -929,6 +1311,10 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
929 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); 1311 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
930 } 1312 }
931 data->timestamp = jiffies; 1313 data->timestamp = jiffies;
1314 if (nfs4_setup_sequence(data->o_arg.server->nfs_client,
1315 &data->o_arg.seq_args,
1316 &data->o_res.seq_res, 1, task))
1317 return;
932 rpc_call_start(task); 1318 rpc_call_start(task);
933 return; 1319 return;
934out_no_action: 1320out_no_action:
@@ -941,6 +1327,10 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
941 struct nfs4_opendata *data = calldata; 1327 struct nfs4_opendata *data = calldata;
942 1328
943 data->rpc_status = task->tk_status; 1329 data->rpc_status = task->tk_status;
1330
1331 nfs4_sequence_done_free_slot(data->o_arg.server, &data->o_res.seq_res,
1332 task->tk_status);
1333
944 if (RPC_ASSASSINATED(task)) 1334 if (RPC_ASSASSINATED(task))
945 return; 1335 return;
946 if (task->tk_status == 0) { 1336 if (task->tk_status == 0) {
@@ -1269,7 +1659,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
1269 } else 1659 } else
1270 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); 1660 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
1271 1661
1272 status = rpc_call_sync(server->client, &msg, 0); 1662 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
1273 if (status == 0 && state != NULL) 1663 if (status == 0 && state != NULL)
1274 renew_lease(server, timestamp); 1664 renew_lease(server, timestamp);
1275 return status; 1665 return status;
@@ -1318,6 +1708,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1318 struct nfs4_state *state = calldata->state; 1708 struct nfs4_state *state = calldata->state;
1319 struct nfs_server *server = NFS_SERVER(calldata->inode); 1709 struct nfs_server *server = NFS_SERVER(calldata->inode);
1320 1710
1711 nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status);
1321 if (RPC_ASSASSINATED(task)) 1712 if (RPC_ASSASSINATED(task))
1322 return; 1713 return;
1323 /* hmm. we are done with the inode, and in the process of freeing 1714 /* hmm. we are done with the inode, and in the process of freeing
@@ -1336,10 +1727,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1336 break; 1727 break;
1337 default: 1728 default:
1338 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { 1729 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
1339 rpc_restart_call(task); 1730 nfs4_restart_rpc(task, server->nfs_client);
1340 return; 1731 return;
1341 } 1732 }
1342 } 1733 }
1734 nfs4_sequence_free_slot(server->nfs_client, &calldata->res.seq_res);
1343 nfs_refresh_inode(calldata->inode, calldata->res.fattr); 1735 nfs_refresh_inode(calldata->inode, calldata->res.fattr);
1344} 1736}
1345 1737
@@ -1380,6 +1772,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1380 calldata->arg.fmode = FMODE_WRITE; 1772 calldata->arg.fmode = FMODE_WRITE;
1381 } 1773 }
1382 calldata->timestamp = jiffies; 1774 calldata->timestamp = jiffies;
1775 if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client,
1776 &calldata->arg.seq_args, &calldata->res.seq_res,
1777 1, task))
1778 return;
1383 rpc_call_start(task); 1779 rpc_call_start(task);
1384} 1780}
1385 1781
@@ -1419,13 +1815,15 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1419 }; 1815 };
1420 int status = -ENOMEM; 1816 int status = -ENOMEM;
1421 1817
1422 calldata = kmalloc(sizeof(*calldata), GFP_KERNEL); 1818 calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
1423 if (calldata == NULL) 1819 if (calldata == NULL)
1424 goto out; 1820 goto out;
1425 calldata->inode = state->inode; 1821 calldata->inode = state->inode;
1426 calldata->state = state; 1822 calldata->state = state;
1427 calldata->arg.fh = NFS_FH(state->inode); 1823 calldata->arg.fh = NFS_FH(state->inode);
1428 calldata->arg.stateid = &state->open_stateid; 1824 calldata->arg.stateid = &state->open_stateid;
1825 if (nfs4_has_session(server->nfs_client))
1826 memset(calldata->arg.stateid->data, 0, 4); /* clear seqid */
1429 /* Serialization for the sequence id */ 1827 /* Serialization for the sequence id */
1430 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); 1828 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
1431 if (calldata->arg.seqid == NULL) 1829 if (calldata->arg.seqid == NULL)
@@ -1435,6 +1833,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1435 calldata->res.fattr = &calldata->fattr; 1833 calldata->res.fattr = &calldata->fattr;
1436 calldata->res.seqid = calldata->arg.seqid; 1834 calldata->res.seqid = calldata->arg.seqid;
1437 calldata->res.server = server; 1835 calldata->res.server = server;
1836 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
1438 calldata->path.mnt = mntget(path->mnt); 1837 calldata->path.mnt = mntget(path->mnt);
1439 calldata->path.dentry = dget(path->dentry); 1838 calldata->path.dentry = dget(path->dentry);
1440 1839
@@ -1584,15 +1983,18 @@ void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
1584 1983
1585static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) 1984static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
1586{ 1985{
1986 struct nfs4_server_caps_arg args = {
1987 .fhandle = fhandle,
1988 };
1587 struct nfs4_server_caps_res res = {}; 1989 struct nfs4_server_caps_res res = {};
1588 struct rpc_message msg = { 1990 struct rpc_message msg = {
1589 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SERVER_CAPS], 1991 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SERVER_CAPS],
1590 .rpc_argp = fhandle, 1992 .rpc_argp = &args,
1591 .rpc_resp = &res, 1993 .rpc_resp = &res,
1592 }; 1994 };
1593 int status; 1995 int status;
1594 1996
1595 status = rpc_call_sync(server->client, &msg, 0); 1997 status = nfs4_call_sync(server, &msg, &args, &res, 0);
1596 if (status == 0) { 1998 if (status == 0) {
1597 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); 1999 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
1598 if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) 2000 if (res.attr_bitmask[0] & FATTR4_WORD0_ACL)
@@ -1606,6 +2008,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
1606 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; 2008 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
1607 server->acl_bitmask = res.acl_bitmask; 2009 server->acl_bitmask = res.acl_bitmask;
1608 } 2010 }
2011
1609 return status; 2012 return status;
1610} 2013}
1611 2014
@@ -1637,8 +2040,9 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
1637 .rpc_argp = &args, 2040 .rpc_argp = &args,
1638 .rpc_resp = &res, 2041 .rpc_resp = &res,
1639 }; 2042 };
2043
1640 nfs_fattr_init(info->fattr); 2044 nfs_fattr_init(info->fattr);
1641 return rpc_call_sync(server->client, &msg, 0); 2045 return nfs4_call_sync(server, &msg, &args, &res, 0);
1642} 2046}
1643 2047
1644static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, 2048static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -1728,7 +2132,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
1728 }; 2132 };
1729 2133
1730 nfs_fattr_init(fattr); 2134 nfs_fattr_init(fattr);
1731 return rpc_call_sync(server->client, &msg, 0); 2135 return nfs4_call_sync(server, &msg, &args, &res, 0);
1732} 2136}
1733 2137
1734static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2138static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
@@ -1812,7 +2216,7 @@ static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *d
1812 nfs_fattr_init(fattr); 2216 nfs_fattr_init(fattr);
1813 2217
1814 dprintk("NFS call lookupfh %s\n", name->name); 2218 dprintk("NFS call lookupfh %s\n", name->name);
1815 status = rpc_call_sync(server->client, &msg, 0); 2219 status = nfs4_call_sync(server, &msg, &args, &res, 0);
1816 dprintk("NFS reply lookupfh: %d\n", status); 2220 dprintk("NFS reply lookupfh: %d\n", status);
1817 return status; 2221 return status;
1818} 2222}
@@ -1898,7 +2302,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
1898 args.access |= NFS4_ACCESS_EXECUTE; 2302 args.access |= NFS4_ACCESS_EXECUTE;
1899 } 2303 }
1900 nfs_fattr_init(&fattr); 2304 nfs_fattr_init(&fattr);
1901 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 2305 status = nfs4_call_sync(server, &msg, &args, &res, 0);
1902 if (!status) { 2306 if (!status) {
1903 entry->mask = 0; 2307 entry->mask = 0;
1904 if (res.access & NFS4_ACCESS_READ) 2308 if (res.access & NFS4_ACCESS_READ)
@@ -1957,13 +2361,14 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page,
1957 .pglen = pglen, 2361 .pglen = pglen,
1958 .pages = &page, 2362 .pages = &page,
1959 }; 2363 };
2364 struct nfs4_readlink_res res;
1960 struct rpc_message msg = { 2365 struct rpc_message msg = {
1961 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READLINK], 2366 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READLINK],
1962 .rpc_argp = &args, 2367 .rpc_argp = &args,
1963 .rpc_resp = NULL, 2368 .rpc_resp = &res,
1964 }; 2369 };
1965 2370
1966 return rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 2371 return nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0);
1967} 2372}
1968 2373
1969static int nfs4_proc_readlink(struct inode *inode, struct page *page, 2374static int nfs4_proc_readlink(struct inode *inode, struct page *page,
@@ -2057,7 +2462,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
2057 int status; 2462 int status;
2058 2463
2059 nfs_fattr_init(&res.dir_attr); 2464 nfs_fattr_init(&res.dir_attr);
2060 status = rpc_call_sync(server->client, &msg, 0); 2465 status = nfs4_call_sync(server, &msg, &args, &res, 1);
2061 if (status == 0) { 2466 if (status == 0) {
2062 update_changeattr(dir, &res.cinfo); 2467 update_changeattr(dir, &res.cinfo);
2063 nfs_post_op_update_inode(dir, &res.dir_attr); 2468 nfs_post_op_update_inode(dir, &res.dir_attr);
@@ -2092,8 +2497,10 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
2092{ 2497{
2093 struct nfs_removeres *res = task->tk_msg.rpc_resp; 2498 struct nfs_removeres *res = task->tk_msg.rpc_resp;
2094 2499
2500 nfs4_sequence_done(res->server, &res->seq_res, task->tk_status);
2095 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 2501 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
2096 return 0; 2502 return 0;
2503 nfs4_sequence_free_slot(res->server->nfs_client, &res->seq_res);
2097 update_changeattr(dir, &res->cinfo); 2504 update_changeattr(dir, &res->cinfo);
2098 nfs_post_op_update_inode(dir, &res->dir_attr); 2505 nfs_post_op_update_inode(dir, &res->dir_attr);
2099 return 1; 2506 return 1;
@@ -2125,7 +2532,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
2125 2532
2126 nfs_fattr_init(res.old_fattr); 2533 nfs_fattr_init(res.old_fattr);
2127 nfs_fattr_init(res.new_fattr); 2534 nfs_fattr_init(res.new_fattr);
2128 status = rpc_call_sync(server->client, &msg, 0); 2535 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
2129 2536
2130 if (!status) { 2537 if (!status) {
2131 update_changeattr(old_dir, &res.old_cinfo); 2538 update_changeattr(old_dir, &res.old_cinfo);
@@ -2174,7 +2581,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
2174 2581
2175 nfs_fattr_init(res.fattr); 2582 nfs_fattr_init(res.fattr);
2176 nfs_fattr_init(res.dir_attr); 2583 nfs_fattr_init(res.dir_attr);
2177 status = rpc_call_sync(server->client, &msg, 0); 2584 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
2178 if (!status) { 2585 if (!status) {
2179 update_changeattr(dir, &res.cinfo); 2586 update_changeattr(dir, &res.cinfo);
2180 nfs_post_op_update_inode(dir, res.dir_attr); 2587 nfs_post_op_update_inode(dir, res.dir_attr);
@@ -2235,7 +2642,8 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
2235 2642
2236static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) 2643static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
2237{ 2644{
2238 int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); 2645 int status = nfs4_call_sync(NFS_SERVER(dir), &data->msg,
2646 &data->arg, &data->res, 1);
2239 if (status == 0) { 2647 if (status == 0) {
2240 update_changeattr(dir, &data->res.dir_cinfo); 2648 update_changeattr(dir, &data->res.dir_cinfo);
2241 nfs_post_op_update_inode(dir, data->res.dir_fattr); 2649 nfs_post_op_update_inode(dir, data->res.dir_fattr);
@@ -2344,7 +2752,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2344 (unsigned long long)cookie); 2752 (unsigned long long)cookie);
2345 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); 2753 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
2346 res.pgbase = args.pgbase; 2754 res.pgbase = args.pgbase;
2347 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 2755 status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0);
2348 if (status == 0) 2756 if (status == 0)
2349 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); 2757 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
2350 2758
@@ -2422,14 +2830,17 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
2422 .fh = fhandle, 2830 .fh = fhandle,
2423 .bitmask = server->attr_bitmask, 2831 .bitmask = server->attr_bitmask,
2424 }; 2832 };
2833 struct nfs4_statfs_res res = {
2834 .fsstat = fsstat,
2835 };
2425 struct rpc_message msg = { 2836 struct rpc_message msg = {
2426 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_STATFS], 2837 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_STATFS],
2427 .rpc_argp = &args, 2838 .rpc_argp = &args,
2428 .rpc_resp = fsstat, 2839 .rpc_resp = &res,
2429 }; 2840 };
2430 2841
2431 nfs_fattr_init(fsstat->fattr); 2842 nfs_fattr_init(fsstat->fattr);
2432 return rpc_call_sync(server->client, &msg, 0); 2843 return nfs4_call_sync(server, &msg, &args, &res, 0);
2433} 2844}
2434 2845
2435static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) 2846static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat)
@@ -2451,13 +2862,16 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
2451 .fh = fhandle, 2862 .fh = fhandle,
2452 .bitmask = server->attr_bitmask, 2863 .bitmask = server->attr_bitmask,
2453 }; 2864 };
2865 struct nfs4_fsinfo_res res = {
2866 .fsinfo = fsinfo,
2867 };
2454 struct rpc_message msg = { 2868 struct rpc_message msg = {
2455 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSINFO], 2869 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSINFO],
2456 .rpc_argp = &args, 2870 .rpc_argp = &args,
2457 .rpc_resp = fsinfo, 2871 .rpc_resp = &res,
2458 }; 2872 };
2459 2873
2460 return rpc_call_sync(server->client, &msg, 0); 2874 return nfs4_call_sync(server, &msg, &args, &res, 0);
2461} 2875}
2462 2876
2463static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) 2877static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
@@ -2486,10 +2900,13 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle
2486 .fh = fhandle, 2900 .fh = fhandle,
2487 .bitmask = server->attr_bitmask, 2901 .bitmask = server->attr_bitmask,
2488 }; 2902 };
2903 struct nfs4_pathconf_res res = {
2904 .pathconf = pathconf,
2905 };
2489 struct rpc_message msg = { 2906 struct rpc_message msg = {
2490 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PATHCONF], 2907 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PATHCONF],
2491 .rpc_argp = &args, 2908 .rpc_argp = &args,
2492 .rpc_resp = pathconf, 2909 .rpc_resp = &res,
2493 }; 2910 };
2494 2911
2495 /* None of the pathconf attributes are mandatory to implement */ 2912 /* None of the pathconf attributes are mandatory to implement */
@@ -2499,7 +2916,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle
2499 } 2916 }
2500 2917
2501 nfs_fattr_init(pathconf->fattr); 2918 nfs_fattr_init(pathconf->fattr);
2502 return rpc_call_sync(server->client, &msg, 0); 2919 return nfs4_call_sync(server, &msg, &args, &res, 0);
2503} 2920}
2504 2921
2505static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, 2922static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -2520,8 +2937,13 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
2520{ 2937{
2521 struct nfs_server *server = NFS_SERVER(data->inode); 2938 struct nfs_server *server = NFS_SERVER(data->inode);
2522 2939
2940 dprintk("--> %s\n", __func__);
2941
2942 /* nfs4_sequence_free_slot called in the read rpc_call_done */
2943 nfs4_sequence_done(server, &data->res.seq_res, task->tk_status);
2944
2523 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { 2945 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
2524 rpc_restart_call(task); 2946 nfs4_restart_rpc(task, server->nfs_client);
2525 return -EAGAIN; 2947 return -EAGAIN;
2526 } 2948 }
2527 2949
@@ -2541,8 +2963,12 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
2541{ 2963{
2542 struct inode *inode = data->inode; 2964 struct inode *inode = data->inode;
2543 2965
2966 /* slot is freed in nfs_writeback_done */
2967 nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
2968 task->tk_status);
2969
2544 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { 2970 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
2545 rpc_restart_call(task); 2971 nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
2546 return -EAGAIN; 2972 return -EAGAIN;
2547 } 2973 }
2548 if (task->tk_status >= 0) { 2974 if (task->tk_status >= 0) {
@@ -2567,10 +2993,14 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
2567{ 2993{
2568 struct inode *inode = data->inode; 2994 struct inode *inode = data->inode;
2569 2995
2996 nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
2997 task->tk_status);
2570 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { 2998 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
2571 rpc_restart_call(task); 2999 nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
2572 return -EAGAIN; 3000 return -EAGAIN;
2573 } 3001 }
3002 nfs4_sequence_free_slot(NFS_SERVER(inode)->nfs_client,
3003 &data->res.seq_res);
2574 nfs_refresh_inode(inode, data->res.fattr); 3004 nfs_refresh_inode(inode, data->res.fattr);
2575 return 0; 3005 return 0;
2576} 3006}
@@ -2603,6 +3033,9 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
2603 if (time_before(clp->cl_last_renewal,timestamp)) 3033 if (time_before(clp->cl_last_renewal,timestamp))
2604 clp->cl_last_renewal = timestamp; 3034 clp->cl_last_renewal = timestamp;
2605 spin_unlock(&clp->cl_lock); 3035 spin_unlock(&clp->cl_lock);
3036 dprintk("%s calling put_rpccred on rpc_cred %p\n", __func__,
3037 task->tk_msg.rpc_cred);
3038 put_rpccred(task->tk_msg.rpc_cred);
2606} 3039}
2607 3040
2608static const struct rpc_call_ops nfs4_renew_ops = { 3041static const struct rpc_call_ops nfs4_renew_ops = {
@@ -2742,12 +3175,14 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
2742 .acl_pages = pages, 3175 .acl_pages = pages,
2743 .acl_len = buflen, 3176 .acl_len = buflen,
2744 }; 3177 };
2745 size_t resp_len = buflen; 3178 struct nfs_getaclres res = {
3179 .acl_len = buflen,
3180 };
2746 void *resp_buf; 3181 void *resp_buf;
2747 struct rpc_message msg = { 3182 struct rpc_message msg = {
2748 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL], 3183 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL],
2749 .rpc_argp = &args, 3184 .rpc_argp = &args,
2750 .rpc_resp = &resp_len, 3185 .rpc_resp = &res,
2751 }; 3186 };
2752 struct page *localpage = NULL; 3187 struct page *localpage = NULL;
2753 int ret; 3188 int ret;
@@ -2761,26 +3196,26 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
2761 return -ENOMEM; 3196 return -ENOMEM;
2762 args.acl_pages[0] = localpage; 3197 args.acl_pages[0] = localpage;
2763 args.acl_pgbase = 0; 3198 args.acl_pgbase = 0;
2764 resp_len = args.acl_len = PAGE_SIZE; 3199 args.acl_len = PAGE_SIZE;
2765 } else { 3200 } else {
2766 resp_buf = buf; 3201 resp_buf = buf;
2767 buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); 3202 buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
2768 } 3203 }
2769 ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 3204 ret = nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0);
2770 if (ret) 3205 if (ret)
2771 goto out_free; 3206 goto out_free;
2772 if (resp_len > args.acl_len) 3207 if (res.acl_len > args.acl_len)
2773 nfs4_write_cached_acl(inode, NULL, resp_len); 3208 nfs4_write_cached_acl(inode, NULL, res.acl_len);
2774 else 3209 else
2775 nfs4_write_cached_acl(inode, resp_buf, resp_len); 3210 nfs4_write_cached_acl(inode, resp_buf, res.acl_len);
2776 if (buf) { 3211 if (buf) {
2777 ret = -ERANGE; 3212 ret = -ERANGE;
2778 if (resp_len > buflen) 3213 if (res.acl_len > buflen)
2779 goto out_free; 3214 goto out_free;
2780 if (localpage) 3215 if (localpage)
2781 memcpy(buf, resp_buf, resp_len); 3216 memcpy(buf, resp_buf, res.acl_len);
2782 } 3217 }
2783 ret = resp_len; 3218 ret = res.acl_len;
2784out_free: 3219out_free:
2785 if (localpage) 3220 if (localpage)
2786 __free_page(localpage); 3221 __free_page(localpage);
@@ -2810,8 +3245,6 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
2810 ret = nfs_revalidate_inode(server, inode); 3245 ret = nfs_revalidate_inode(server, inode);
2811 if (ret < 0) 3246 if (ret < 0)
2812 return ret; 3247 return ret;
2813 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
2814 nfs_zap_acl_cache(inode);
2815 ret = nfs4_read_cached_acl(inode, buf, buflen); 3248 ret = nfs4_read_cached_acl(inode, buf, buflen);
2816 if (ret != -ENOENT) 3249 if (ret != -ENOENT)
2817 return ret; 3250 return ret;
@@ -2827,10 +3260,11 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
2827 .acl_pages = pages, 3260 .acl_pages = pages,
2828 .acl_len = buflen, 3261 .acl_len = buflen,
2829 }; 3262 };
3263 struct nfs_setaclres res;
2830 struct rpc_message msg = { 3264 struct rpc_message msg = {
2831 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL], 3265 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL],
2832 .rpc_argp = &arg, 3266 .rpc_argp = &arg,
2833 .rpc_resp = NULL, 3267 .rpc_resp = &res,
2834 }; 3268 };
2835 int ret; 3269 int ret;
2836 3270
@@ -2838,7 +3272,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
2838 return -EOPNOTSUPP; 3272 return -EOPNOTSUPP;
2839 nfs_inode_return_delegation(inode); 3273 nfs_inode_return_delegation(inode);
2840 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 3274 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
2841 ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 3275 ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
2842 nfs_access_zap_cache(inode); 3276 nfs_access_zap_cache(inode);
2843 nfs_zap_acl_cache(inode); 3277 nfs_zap_acl_cache(inode);
2844 return ret; 3278 return ret;
@@ -2857,10 +3291,8 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
2857} 3291}
2858 3292
2859static int 3293static int
2860nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) 3294_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state)
2861{ 3295{
2862 struct nfs_client *clp = server->nfs_client;
2863
2864 if (!clp || task->tk_status >= 0) 3296 if (!clp || task->tk_status >= 0)
2865 return 0; 3297 return 0;
2866 switch(task->tk_status) { 3298 switch(task->tk_status) {
@@ -2879,8 +3311,23 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
2879 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); 3311 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
2880 task->tk_status = 0; 3312 task->tk_status = 0;
2881 return -EAGAIN; 3313 return -EAGAIN;
3314#if defined(CONFIG_NFS_V4_1)
3315 case -NFS4ERR_BADSESSION:
3316 case -NFS4ERR_BADSLOT:
3317 case -NFS4ERR_BAD_HIGH_SLOT:
3318 case -NFS4ERR_DEADSESSION:
3319 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
3320 case -NFS4ERR_SEQ_FALSE_RETRY:
3321 case -NFS4ERR_SEQ_MISORDERED:
3322 dprintk("%s ERROR %d, Reset session\n", __func__,
3323 task->tk_status);
3324 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
3325 task->tk_status = 0;
3326 return -EAGAIN;
3327#endif /* CONFIG_NFS_V4_1 */
2882 case -NFS4ERR_DELAY: 3328 case -NFS4ERR_DELAY:
2883 nfs_inc_server_stats(server, NFSIOS_DELAY); 3329 if (server)
3330 nfs_inc_server_stats(server, NFSIOS_DELAY);
2884 case -NFS4ERR_GRACE: 3331 case -NFS4ERR_GRACE:
2885 rpc_delay(task, NFS4_POLL_RETRY_MAX); 3332 rpc_delay(task, NFS4_POLL_RETRY_MAX);
2886 task->tk_status = 0; 3333 task->tk_status = 0;
@@ -2893,6 +3340,12 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
2893 return 0; 3340 return 0;
2894} 3341}
2895 3342
3343static int
3344nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
3345{
3346 return _nfs4_async_handle_error(task, server, server->nfs_client, state);
3347}
3348
2896int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) 3349int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
2897{ 3350{
2898 nfs4_verifier sc_verifier; 3351 nfs4_verifier sc_verifier;
@@ -3000,6 +3453,10 @@ struct nfs4_delegreturndata {
3000static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) 3453static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
3001{ 3454{
3002 struct nfs4_delegreturndata *data = calldata; 3455 struct nfs4_delegreturndata *data = calldata;
3456
3457 nfs4_sequence_done_free_slot(data->res.server, &data->res.seq_res,
3458 task->tk_status);
3459
3003 data->rpc_status = task->tk_status; 3460 data->rpc_status = task->tk_status;
3004 if (data->rpc_status == 0) 3461 if (data->rpc_status == 0)
3005 renew_lease(data->res.server, data->timestamp); 3462 renew_lease(data->res.server, data->timestamp);
@@ -3010,7 +3467,25 @@ static void nfs4_delegreturn_release(void *calldata)
3010 kfree(calldata); 3467 kfree(calldata);
3011} 3468}
3012 3469
3470#if defined(CONFIG_NFS_V4_1)
3471static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
3472{
3473 struct nfs4_delegreturndata *d_data;
3474
3475 d_data = (struct nfs4_delegreturndata *)data;
3476
3477 if (nfs4_setup_sequence(d_data->res.server->nfs_client,
3478 &d_data->args.seq_args,
3479 &d_data->res.seq_res, 1, task))
3480 return;
3481 rpc_call_start(task);
3482}
3483#endif /* CONFIG_NFS_V4_1 */
3484
3013static const struct rpc_call_ops nfs4_delegreturn_ops = { 3485static const struct rpc_call_ops nfs4_delegreturn_ops = {
3486#if defined(CONFIG_NFS_V4_1)
3487 .rpc_call_prepare = nfs4_delegreturn_prepare,
3488#endif /* CONFIG_NFS_V4_1 */
3014 .rpc_call_done = nfs4_delegreturn_done, 3489 .rpc_call_done = nfs4_delegreturn_done,
3015 .rpc_release = nfs4_delegreturn_release, 3490 .rpc_release = nfs4_delegreturn_release,
3016}; 3491};
@@ -3032,7 +3507,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3032 }; 3507 };
3033 int status = 0; 3508 int status = 0;
3034 3509
3035 data = kmalloc(sizeof(*data), GFP_KERNEL); 3510 data = kzalloc(sizeof(*data), GFP_KERNEL);
3036 if (data == NULL) 3511 if (data == NULL)
3037 return -ENOMEM; 3512 return -ENOMEM;
3038 data->args.fhandle = &data->fh; 3513 data->args.fhandle = &data->fh;
@@ -3042,6 +3517,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3042 memcpy(&data->stateid, stateid, sizeof(data->stateid)); 3517 memcpy(&data->stateid, stateid, sizeof(data->stateid));
3043 data->res.fattr = &data->fattr; 3518 data->res.fattr = &data->fattr;
3044 data->res.server = server; 3519 data->res.server = server;
3520 data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3045 nfs_fattr_init(data->res.fattr); 3521 nfs_fattr_init(data->res.fattr);
3046 data->timestamp = jiffies; 3522 data->timestamp = jiffies;
3047 data->rpc_status = 0; 3523 data->rpc_status = 0;
@@ -3127,7 +3603,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3127 goto out; 3603 goto out;
3128 lsp = request->fl_u.nfs4_fl.owner; 3604 lsp = request->fl_u.nfs4_fl.owner;
3129 arg.lock_owner.id = lsp->ls_id.id; 3605 arg.lock_owner.id = lsp->ls_id.id;
3130 status = rpc_call_sync(server->client, &msg, 0); 3606 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
3131 switch (status) { 3607 switch (status) {
3132 case 0: 3608 case 0:
3133 request->fl_type = F_UNLCK; 3609 request->fl_type = F_UNLCK;
@@ -3187,13 +3663,14 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
3187 struct nfs4_unlockdata *p; 3663 struct nfs4_unlockdata *p;
3188 struct inode *inode = lsp->ls_state->inode; 3664 struct inode *inode = lsp->ls_state->inode;
3189 3665
3190 p = kmalloc(sizeof(*p), GFP_KERNEL); 3666 p = kzalloc(sizeof(*p), GFP_KERNEL);
3191 if (p == NULL) 3667 if (p == NULL)
3192 return NULL; 3668 return NULL;
3193 p->arg.fh = NFS_FH(inode); 3669 p->arg.fh = NFS_FH(inode);
3194 p->arg.fl = &p->fl; 3670 p->arg.fl = &p->fl;
3195 p->arg.seqid = seqid; 3671 p->arg.seqid = seqid;
3196 p->res.seqid = seqid; 3672 p->res.seqid = seqid;
3673 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3197 p->arg.stateid = &lsp->ls_stateid; 3674 p->arg.stateid = &lsp->ls_stateid;
3198 p->lsp = lsp; 3675 p->lsp = lsp;
3199 atomic_inc(&lsp->ls_count); 3676 atomic_inc(&lsp->ls_count);
@@ -3217,6 +3694,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
3217{ 3694{
3218 struct nfs4_unlockdata *calldata = data; 3695 struct nfs4_unlockdata *calldata = data;
3219 3696
3697 nfs4_sequence_done(calldata->server, &calldata->res.seq_res,
3698 task->tk_status);
3220 if (RPC_ASSASSINATED(task)) 3699 if (RPC_ASSASSINATED(task))
3221 return; 3700 return;
3222 switch (task->tk_status) { 3701 switch (task->tk_status) {
@@ -3233,8 +3712,11 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
3233 break; 3712 break;
3234 default: 3713 default:
3235 if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) 3714 if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
3236 rpc_restart_call(task); 3715 nfs4_restart_rpc(task,
3716 calldata->server->nfs_client);
3237 } 3717 }
3718 nfs4_sequence_free_slot(calldata->server->nfs_client,
3719 &calldata->res.seq_res);
3238} 3720}
3239 3721
3240static void nfs4_locku_prepare(struct rpc_task *task, void *data) 3722static void nfs4_locku_prepare(struct rpc_task *task, void *data)
@@ -3249,6 +3731,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
3249 return; 3731 return;
3250 } 3732 }
3251 calldata->timestamp = jiffies; 3733 calldata->timestamp = jiffies;
3734 if (nfs4_setup_sequence(calldata->server->nfs_client,
3735 &calldata->arg.seq_args,
3736 &calldata->res.seq_res, 1, task))
3737 return;
3252 rpc_call_start(task); 3738 rpc_call_start(task);
3253} 3739}
3254 3740
@@ -3341,6 +3827,7 @@ struct nfs4_lockdata {
3341 unsigned long timestamp; 3827 unsigned long timestamp;
3342 int rpc_status; 3828 int rpc_status;
3343 int cancelled; 3829 int cancelled;
3830 struct nfs_server *server;
3344}; 3831};
3345 3832
3346static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, 3833static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
@@ -3366,7 +3853,9 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
3366 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 3853 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
3367 p->arg.lock_owner.id = lsp->ls_id.id; 3854 p->arg.lock_owner.id = lsp->ls_id.id;
3368 p->res.lock_seqid = p->arg.lock_seqid; 3855 p->res.lock_seqid = p->arg.lock_seqid;
3856 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3369 p->lsp = lsp; 3857 p->lsp = lsp;
3858 p->server = server;
3370 atomic_inc(&lsp->ls_count); 3859 atomic_inc(&lsp->ls_count);
3371 p->ctx = get_nfs_open_context(ctx); 3860 p->ctx = get_nfs_open_context(ctx);
3372 memcpy(&p->fl, fl, sizeof(p->fl)); 3861 memcpy(&p->fl, fl, sizeof(p->fl));
@@ -3396,6 +3885,9 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
3396 } else 3885 } else
3397 data->arg.new_lock_owner = 0; 3886 data->arg.new_lock_owner = 0;
3398 data->timestamp = jiffies; 3887 data->timestamp = jiffies;
3888 if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args,
3889 &data->res.seq_res, 1, task))
3890 return;
3399 rpc_call_start(task); 3891 rpc_call_start(task);
3400 dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); 3892 dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
3401} 3893}
@@ -3406,6 +3898,9 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
3406 3898
3407 dprintk("%s: begin!\n", __func__); 3899 dprintk("%s: begin!\n", __func__);
3408 3900
3901 nfs4_sequence_done_free_slot(data->server, &data->res.seq_res,
3902 task->tk_status);
3903
3409 data->rpc_status = task->tk_status; 3904 data->rpc_status = task->tk_status;
3410 if (RPC_ASSASSINATED(task)) 3905 if (RPC_ASSASSINATED(task))
3411 goto out; 3906 goto out;
@@ -3487,8 +3982,6 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
3487 ret = nfs4_wait_for_completion_rpc_task(task); 3982 ret = nfs4_wait_for_completion_rpc_task(task);
3488 if (ret == 0) { 3983 if (ret == 0) {
3489 ret = data->rpc_status; 3984 ret = data->rpc_status;
3490 if (ret == -NFS4ERR_DENIED)
3491 ret = -EAGAIN;
3492 } else 3985 } else
3493 data->cancelled = 1; 3986 data->cancelled = 1;
3494 rpc_put_task(task); 3987 rpc_put_task(task);
@@ -3576,9 +4069,11 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
3576 int err; 4069 int err;
3577 4070
3578 do { 4071 do {
4072 err = _nfs4_proc_setlk(state, cmd, request);
4073 if (err == -NFS4ERR_DENIED)
4074 err = -EAGAIN;
3579 err = nfs4_handle_exception(NFS_SERVER(state->inode), 4075 err = nfs4_handle_exception(NFS_SERVER(state->inode),
3580 _nfs4_proc_setlk(state, cmd, request), 4076 err, &exception);
3581 &exception);
3582 } while (exception.retry); 4077 } while (exception.retry);
3583 return err; 4078 return err;
3584} 4079}
@@ -3598,15 +4093,23 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
3598 if (request->fl_start < 0 || request->fl_end < 0) 4093 if (request->fl_start < 0 || request->fl_end < 0)
3599 return -EINVAL; 4094 return -EINVAL;
3600 4095
3601 if (IS_GETLK(cmd)) 4096 if (IS_GETLK(cmd)) {
3602 return nfs4_proc_getlk(state, F_GETLK, request); 4097 if (state != NULL)
4098 return nfs4_proc_getlk(state, F_GETLK, request);
4099 return 0;
4100 }
3603 4101
3604 if (!(IS_SETLK(cmd) || IS_SETLKW(cmd))) 4102 if (!(IS_SETLK(cmd) || IS_SETLKW(cmd)))
3605 return -EINVAL; 4103 return -EINVAL;
3606 4104
3607 if (request->fl_type == F_UNLCK) 4105 if (request->fl_type == F_UNLCK) {
3608 return nfs4_proc_unlck(state, cmd, request); 4106 if (state != NULL)
4107 return nfs4_proc_unlck(state, cmd, request);
4108 return 0;
4109 }
3609 4110
4111 if (state == NULL)
4112 return -ENOLCK;
3610 do { 4113 do {
3611 status = nfs4_proc_setlk(state, cmd, request); 4114 status = nfs4_proc_setlk(state, cmd, request);
3612 if ((status != -EAGAIN) || IS_SETLK(cmd)) 4115 if ((status != -EAGAIN) || IS_SETLK(cmd))
@@ -3630,8 +4133,37 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
3630 goto out; 4133 goto out;
3631 do { 4134 do {
3632 err = _nfs4_do_setlk(state, F_SETLK, fl, 0); 4135 err = _nfs4_do_setlk(state, F_SETLK, fl, 0);
3633 if (err != -NFS4ERR_DELAY) 4136 switch (err) {
3634 break; 4137 default:
4138 printk(KERN_ERR "%s: unhandled error %d.\n",
4139 __func__, err);
4140 case 0:
4141 case -ESTALE:
4142 goto out;
4143 case -NFS4ERR_EXPIRED:
4144 case -NFS4ERR_STALE_CLIENTID:
4145 case -NFS4ERR_STALE_STATEID:
4146 nfs4_schedule_state_recovery(server->nfs_client);
4147 goto out;
4148 case -ERESTARTSYS:
4149 /*
4150 * The show must go on: exit, but mark the
4151 * stateid as needing recovery.
4152 */
4153 case -NFS4ERR_ADMIN_REVOKED:
4154 case -NFS4ERR_BAD_STATEID:
4155 case -NFS4ERR_OPENMODE:
4156 nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
4157 err = 0;
4158 goto out;
4159 case -ENOMEM:
4160 case -NFS4ERR_DENIED:
4161 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
4162 err = 0;
4163 goto out;
4164 case -NFS4ERR_DELAY:
4165 break;
4166 }
3635 err = nfs4_handle_exception(server, err, &exception); 4167 err = nfs4_handle_exception(server, err, &exception);
3636 } while (exception.retry); 4168 } while (exception.retry);
3637out: 4169out:
@@ -3706,10 +4238,13 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
3706 .page = page, 4238 .page = page,
3707 .bitmask = bitmask, 4239 .bitmask = bitmask,
3708 }; 4240 };
4241 struct nfs4_fs_locations_res res = {
4242 .fs_locations = fs_locations,
4243 };
3709 struct rpc_message msg = { 4244 struct rpc_message msg = {
3710 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], 4245 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
3711 .rpc_argp = &args, 4246 .rpc_argp = &args,
3712 .rpc_resp = fs_locations, 4247 .rpc_resp = &res,
3713 }; 4248 };
3714 int status; 4249 int status;
3715 4250
@@ -3717,24 +4252,736 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
3717 nfs_fattr_init(&fs_locations->fattr); 4252 nfs_fattr_init(&fs_locations->fattr);
3718 fs_locations->server = server; 4253 fs_locations->server = server;
3719 fs_locations->nlocations = 0; 4254 fs_locations->nlocations = 0;
3720 status = rpc_call_sync(server->client, &msg, 0); 4255 status = nfs4_call_sync(server, &msg, &args, &res, 0);
3721 nfs_fixup_referral_attributes(&fs_locations->fattr); 4256 nfs_fixup_referral_attributes(&fs_locations->fattr);
3722 dprintk("%s: returned status = %d\n", __func__, status); 4257 dprintk("%s: returned status = %d\n", __func__, status);
3723 return status; 4258 return status;
3724} 4259}
3725 4260
3726struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = { 4261#ifdef CONFIG_NFS_V4_1
4262/*
4263 * nfs4_proc_exchange_id()
4264 *
4265 * Since the clientid has expired, all compounds using sessions
4266 * associated with the stale clientid will be returning
4267 * NFS4ERR_BADSESSION in the sequence operation, and will therefore
4268 * be in some phase of session reset.
4269 */
4270static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4271{
4272 nfs4_verifier verifier;
4273 struct nfs41_exchange_id_args args = {
4274 .client = clp,
4275 .flags = clp->cl_exchange_flags,
4276 };
4277 struct nfs41_exchange_id_res res = {
4278 .client = clp,
4279 };
4280 int status;
4281 struct rpc_message msg = {
4282 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID],
4283 .rpc_argp = &args,
4284 .rpc_resp = &res,
4285 .rpc_cred = cred,
4286 };
4287 __be32 *p;
4288
4289 dprintk("--> %s\n", __func__);
4290 BUG_ON(clp == NULL);
4291
4292 p = (u32 *)verifier.data;
4293 *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
4294 *p = htonl((u32)clp->cl_boot_time.tv_nsec);
4295 args.verifier = &verifier;
4296
4297 while (1) {
4298 args.id_len = scnprintf(args.id, sizeof(args.id),
4299 "%s/%s %u",
4300 clp->cl_ipaddr,
4301 rpc_peeraddr2str(clp->cl_rpcclient,
4302 RPC_DISPLAY_ADDR),
4303 clp->cl_id_uniquifier);
4304
4305 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4306
4307 if (status != NFS4ERR_CLID_INUSE)
4308 break;
4309
4310 if (signalled())
4311 break;
4312
4313 if (++clp->cl_id_uniquifier == 0)
4314 break;
4315 }
4316
4317 dprintk("<-- %s status= %d\n", __func__, status);
4318 return status;
4319}
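The verifier built above packs the client's boot time into two network-order 32-bit words, so a rebooted client presents a different verifier and the server can distinguish a restart from a retransmission. A self-contained sketch of that packing (the boot-time constants are placeholders, not values from the patch):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char verifier[8];	/* 8-byte NFSv4 verifier */
	uint32_t boot_sec = 1252900000;	/* stand-in for clp->cl_boot_time.tv_sec */
	uint32_t boot_nsec = 123456789;	/* stand-in for tv_nsec */
	uint32_t w;

	w = htonl(boot_sec);
	memcpy(verifier, &w, sizeof(w));
	w = htonl(boot_nsec);
	memcpy(verifier + 4, &w, sizeof(w));

	for (int i = 0; i < 8; i++)
		printf("%02x", verifier[i]);
	printf("\n");
	return 0;
}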
4320
4321struct nfs4_get_lease_time_data {
4322 struct nfs4_get_lease_time_args *args;
4323 struct nfs4_get_lease_time_res *res;
4324 struct nfs_client *clp;
4325};
4326
4327static void nfs4_get_lease_time_prepare(struct rpc_task *task,
4328 void *calldata)
4329{
4330 int ret;
4331 struct nfs4_get_lease_time_data *data =
4332 (struct nfs4_get_lease_time_data *)calldata;
4333
4334 dprintk("--> %s\n", __func__);
 4335 /* just set up the sequence; do not trigger session recovery
 4336 since we're invoked within one */
4337 ret = nfs41_setup_sequence(data->clp->cl_session,
4338 &data->args->la_seq_args,
4339 &data->res->lr_seq_res, 0, task);
4340
4341 BUG_ON(ret == -EAGAIN);
4342 rpc_call_start(task);
4343 dprintk("<-- %s\n", __func__);
4344}
4345
4346/*
4347 * Called from nfs4_state_manager thread for session setup, so don't recover
4348 * from sequence operation or clientid errors.
4349 */
4350static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
4351{
4352 struct nfs4_get_lease_time_data *data =
4353 (struct nfs4_get_lease_time_data *)calldata;
4354
4355 dprintk("--> %s\n", __func__);
4356 nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status);
4357 switch (task->tk_status) {
4358 case -NFS4ERR_DELAY:
4359 case -NFS4ERR_GRACE:
4360 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
4361 rpc_delay(task, NFS4_POLL_RETRY_MIN);
4362 task->tk_status = 0;
4363 nfs4_restart_rpc(task, data->clp);
4364 return;
4365 }
4366 nfs41_sequence_free_slot(data->clp, &data->res->lr_seq_res);
4367 dprintk("<-- %s\n", __func__);
4368}
4369
4370struct rpc_call_ops nfs4_get_lease_time_ops = {
4371 .rpc_call_prepare = nfs4_get_lease_time_prepare,
4372 .rpc_call_done = nfs4_get_lease_time_done,
4373};
4374
4375int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
4376{
4377 struct rpc_task *task;
4378 struct nfs4_get_lease_time_args args;
4379 struct nfs4_get_lease_time_res res = {
4380 .lr_fsinfo = fsinfo,
4381 };
4382 struct nfs4_get_lease_time_data data = {
4383 .args = &args,
4384 .res = &res,
4385 .clp = clp,
4386 };
4387 struct rpc_message msg = {
4388 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GET_LEASE_TIME],
4389 .rpc_argp = &args,
4390 .rpc_resp = &res,
4391 };
4392 struct rpc_task_setup task_setup = {
4393 .rpc_client = clp->cl_rpcclient,
4394 .rpc_message = &msg,
4395 .callback_ops = &nfs4_get_lease_time_ops,
4396 .callback_data = &data
4397 };
4398 int status;
4399
4400 res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
4401 dprintk("--> %s\n", __func__);
4402 task = rpc_run_task(&task_setup);
4403
4404 if (IS_ERR(task))
4405 status = PTR_ERR(task);
4406 else {
4407 status = task->tk_status;
4408 rpc_put_task(task);
4409 }
4410 dprintk("<-- %s return %d\n", __func__, status);
4411
4412 return status;
4413}
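When rpc_run_task() fails, the error comes back encoded in the pointer itself, which is why the function above branches on IS_ERR() and extracts the status with PTR_ERR(). A user-space model of that idiom (simplified; the 4095 errno ceiling mirrors the kernel's MAX_ERRNO):

#include <stdio.h>

#define MAX_ERRNO 4095L

static int is_err(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static long ptr_err(const void *p)
{
	return (long)p;
}

int main(void)
{
	void *task = (void *)-12L;	/* as if rpc_run_task() returned -ENOMEM */
	long status;

	if (is_err(task))
		status = ptr_err(task);
	else
		status = 0;	/* real code reads task->tk_status, then rpc_put_task() */
	printf("status=%ld\n", status);
	return 0;
}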
4414
4415/*
4416 * Reset a slot table
4417 */
4418static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, int max_slots,
4419 int old_max_slots, int ivalue)
4420{
4421 int i;
4422 int ret = 0;
4423
4424 dprintk("--> %s: max_reqs=%u, tbl %p\n", __func__, max_slots, tbl);
4425
4426 /*
4427 * Until we have dynamic slot table adjustment, insist
4428 * upon the same slot table size
4429 */
4430 if (max_slots != old_max_slots) {
 4431 dprintk("%s reset slot table doesn't match old\n",
4432 __func__);
4433 ret = -EINVAL; /*XXX NFS4ERR_REQ_TOO_BIG ? */
4434 goto out;
4435 }
4436 spin_lock(&tbl->slot_tbl_lock);
4437 for (i = 0; i < max_slots; ++i)
4438 tbl->slots[i].seq_nr = ivalue;
4439 tbl->highest_used_slotid = -1;
4440 spin_unlock(&tbl->slot_tbl_lock);
4441 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
4442 tbl, tbl->slots, tbl->max_slots);
4443out:
4444 dprintk("<-- %s: return %d\n", __func__, ret);
4445 return ret;
4446}
4447
4448/*
4449 * Reset the forechannel and backchannel slot tables
4450 */
4451static int nfs4_reset_slot_tables(struct nfs4_session *session)
4452{
4453 int status;
4454
4455 status = nfs4_reset_slot_table(&session->fc_slot_table,
4456 session->fc_attrs.max_reqs,
4457 session->fc_slot_table.max_slots,
4458 1);
4459 if (status)
4460 return status;
4461
4462 status = nfs4_reset_slot_table(&session->bc_slot_table,
4463 session->bc_attrs.max_reqs,
4464 session->bc_slot_table.max_slots,
4465 0);
4466 return status;
4467}
4468
4469/* Destroy the slot table */
4470static void nfs4_destroy_slot_tables(struct nfs4_session *session)
4471{
4472 if (session->fc_slot_table.slots != NULL) {
4473 kfree(session->fc_slot_table.slots);
4474 session->fc_slot_table.slots = NULL;
4475 }
4476 if (session->bc_slot_table.slots != NULL) {
4477 kfree(session->bc_slot_table.slots);
4478 session->bc_slot_table.slots = NULL;
4479 }
4480 return;
4481}
4482
4483/*
4484 * Initialize slot table
4485 */
4486static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
4487 int max_slots, int ivalue)
4488{
4489 int i;
4490 struct nfs4_slot *slot;
4491 int ret = -ENOMEM;
4492
4493 BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE);
4494
4495 dprintk("--> %s: max_reqs=%u\n", __func__, max_slots);
4496
4497 slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL);
4498 if (!slot)
4499 goto out;
4500 for (i = 0; i < max_slots; ++i)
4501 slot[i].seq_nr = ivalue;
4502 ret = 0;
4503
4504 spin_lock(&tbl->slot_tbl_lock);
4505 if (tbl->slots != NULL) {
4506 spin_unlock(&tbl->slot_tbl_lock);
4507 dprintk("%s: slot table already initialized. tbl=%p slots=%p\n",
4508 __func__, tbl, tbl->slots);
4509 WARN_ON(1);
4510 goto out_free;
4511 }
4512 tbl->max_slots = max_slots;
4513 tbl->slots = slot;
4514 tbl->highest_used_slotid = -1; /* no slot is currently used */
4515 spin_unlock(&tbl->slot_tbl_lock);
4516 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
4517 tbl, tbl->slots, tbl->max_slots);
4518out:
4519 dprintk("<-- %s: return %d\n", __func__, ret);
4520 return ret;
4521
4522out_free:
4523 kfree(slot);
4524 goto out;
4525}
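A slot table, then, is a fixed array plus a highest_used_slotid watermark, with -1 meaning the table is idle. A toy user-space model of allocating the lowest free slot and retiring the watermark on free (illustrative only; the kernel's allocator differs in detail and runs under slot_tbl_lock):

#include <stdio.h>

#define MAX_SLOTS 4

static int used[MAX_SLOTS];
static int highest_used = -1;

static int alloc_slot(void)
{
	for (int i = 0; i < MAX_SLOTS; i++) {
		if (!used[i]) {
			used[i] = 1;
			if (i > highest_used)
				highest_used = i;
			return i;
		}
	}
	return -1;	/* caller waits on the slot table waitqueue */
}

static void free_slot(int i)
{
	used[i] = 0;
	while (highest_used >= 0 && !used[highest_used])
		highest_used--;	/* lower the watermark past freed slots */
}

int main(void)
{
	int a = alloc_slot(), b = alloc_slot();
	printf("a=%d b=%d highest=%d\n", a, b, highest_used);
	free_slot(b);
	free_slot(a);
	printf("highest=%d (table idle)\n", highest_used);
	return 0;
}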
4526
4527/*
4528 * Initialize the forechannel and backchannel tables
4529 */
4530static int nfs4_init_slot_tables(struct nfs4_session *session)
4531{
4532 int status;
4533
4534 status = nfs4_init_slot_table(&session->fc_slot_table,
4535 session->fc_attrs.max_reqs, 1);
4536 if (status)
4537 return status;
4538
4539 status = nfs4_init_slot_table(&session->bc_slot_table,
4540 session->bc_attrs.max_reqs, 0);
4541 if (status)
4542 nfs4_destroy_slot_tables(session);
4543
4544 return status;
4545}
4546
4547struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4548{
4549 struct nfs4_session *session;
4550 struct nfs4_slot_table *tbl;
4551
4552 session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL);
4553 if (!session)
4554 return NULL;
4555
4556 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
4557 /*
4558 * The create session reply races with the server back
4559 * channel probe. Mark the client NFS_CS_SESSION_INITING
4560 * so that the client back channel can find the
4561 * nfs_client struct
4562 */
4563 clp->cl_cons_state = NFS_CS_SESSION_INITING;
4564
4565 tbl = &session->fc_slot_table;
4566 spin_lock_init(&tbl->slot_tbl_lock);
4567 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
4568
4569 tbl = &session->bc_slot_table;
4570 spin_lock_init(&tbl->slot_tbl_lock);
4571 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
4572
4573 session->clp = clp;
4574 return session;
4575}
4576
4577void nfs4_destroy_session(struct nfs4_session *session)
4578{
4579 nfs4_proc_destroy_session(session);
4580 dprintk("%s Destroy backchannel for xprt %p\n",
4581 __func__, session->clp->cl_rpcclient->cl_xprt);
4582 xprt_destroy_backchannel(session->clp->cl_rpcclient->cl_xprt,
4583 NFS41_BC_MIN_CALLBACKS);
4584 nfs4_destroy_slot_tables(session);
4585 kfree(session);
4586}
4587
4588/*
 4589 * Initialize the values to be used by the client in CREATE_SESSION.
4590 * If nfs4_init_session set the fore channel request and response sizes,
4591 * use them.
4592 *
4593 * Set the back channel max_resp_sz_cached to zero to force the client to
4594 * always set csa_cachethis to FALSE because the current implementation
4595 * of the back channel DRC only supports caching the CB_SEQUENCE operation.
4596 */
4597static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
4598{
4599 struct nfs4_session *session = args->client->cl_session;
4600 unsigned int mxrqst_sz = session->fc_attrs.max_rqst_sz,
4601 mxresp_sz = session->fc_attrs.max_resp_sz;
4602
4603 if (mxrqst_sz == 0)
4604 mxrqst_sz = NFS_MAX_FILE_IO_SIZE;
4605 if (mxresp_sz == 0)
4606 mxresp_sz = NFS_MAX_FILE_IO_SIZE;
4607 /* Fore channel attributes */
4608 args->fc_attrs.headerpadsz = 0;
4609 args->fc_attrs.max_rqst_sz = mxrqst_sz;
4610 args->fc_attrs.max_resp_sz = mxresp_sz;
4611 args->fc_attrs.max_resp_sz_cached = mxresp_sz;
4612 args->fc_attrs.max_ops = NFS4_MAX_OPS;
4613 args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs;
4614
4615 dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
4616 "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
4617 __func__,
4618 args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz,
4619 args->fc_attrs.max_resp_sz_cached, args->fc_attrs.max_ops,
4620 args->fc_attrs.max_reqs);
4621
4622 /* Back channel attributes */
4623 args->bc_attrs.headerpadsz = 0;
4624 args->bc_attrs.max_rqst_sz = PAGE_SIZE;
4625 args->bc_attrs.max_resp_sz = PAGE_SIZE;
4626 args->bc_attrs.max_resp_sz_cached = 0;
4627 args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
4628 args->bc_attrs.max_reqs = 1;
4629
4630 dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
4631 "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
4632 __func__,
4633 args->bc_attrs.max_rqst_sz, args->bc_attrs.max_resp_sz,
4634 args->bc_attrs.max_resp_sz_cached, args->bc_attrs.max_ops,
4635 args->bc_attrs.max_reqs);
4636}
4637
4638static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd)
4639{
4640 if (rcvd <= sent)
4641 return 0;
4642 printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. "
4643 "sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd);
4644 return -EINVAL;
4645}
4646
4647#define _verify_fore_channel_attr(_name_) \
4648 _verify_channel_attr("fore", #_name_, \
4649 args->fc_attrs._name_, \
4650 session->fc_attrs._name_)
4651
4652#define _verify_back_channel_attr(_name_) \
4653 _verify_channel_attr("back", #_name_, \
4654 args->bc_attrs._name_, \
4655 session->bc_attrs._name_)
4656
4657/*
4658 * The server is not allowed to increase the fore channel header pad size,
4659 * maximum response size, or maximum number of operations.
4660 *
 4661 * The back channel attributes are only negotiated down: we send what the
4662 * (back channel) server insists upon.
4663 */
4664static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
4665 struct nfs4_session *session)
4666{
4667 int ret = 0;
4668
4669 ret |= _verify_fore_channel_attr(headerpadsz);
4670 ret |= _verify_fore_channel_attr(max_resp_sz);
4671 ret |= _verify_fore_channel_attr(max_ops);
4672
4673 ret |= _verify_back_channel_attr(headerpadsz);
4674 ret |= _verify_back_channel_attr(max_rqst_sz);
4675 ret |= _verify_back_channel_attr(max_resp_sz);
4676 ret |= _verify_back_channel_attr(max_resp_sz_cached);
4677 ret |= _verify_back_channel_attr(max_ops);
4678 ret |= _verify_back_channel_attr(max_reqs);
4679
4680 return ret;
4681}
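Put differently, the reply is accepted only if every checked attribute came back at or below the value the client sent. A compact user-space model of that negotiate-down rule (names and numbers are illustrative):

#include <stdio.h>

static int verify_attr(const char *name, unsigned int sent, unsigned int rcvd)
{
	if (rcvd <= sent)
		return 0;
	fprintf(stderr, "session INVALID: %s increased (sent=%u rcvd=%u)\n",
		name, sent, rcvd);
	return -1;
}

int main(void)
{
	int ret = 0;

	ret |= verify_attr("max_resp_sz", 4096, 4096);	/* unchanged: ok */
	ret |= verify_attr("max_ops", 8, 16);		/* raised: rejected */
	return ret ? 1 : 0;
}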
4682
4683static int _nfs4_proc_create_session(struct nfs_client *clp)
4684{
4685 struct nfs4_session *session = clp->cl_session;
4686 struct nfs41_create_session_args args = {
4687 .client = clp,
4688 .cb_program = NFS4_CALLBACK,
4689 };
4690 struct nfs41_create_session_res res = {
4691 .client = clp,
4692 };
4693 struct rpc_message msg = {
4694 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION],
4695 .rpc_argp = &args,
4696 .rpc_resp = &res,
4697 };
4698 int status;
4699
4700 nfs4_init_channel_attrs(&args);
4701 args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
4702
4703 status = rpc_call_sync(session->clp->cl_rpcclient, &msg, 0);
4704
4705 if (!status)
4706 /* Verify the session's negotiated channel_attrs values */
4707 status = nfs4_verify_channel_attrs(&args, session);
4708 if (!status) {
4709 /* Increment the clientid slot sequence id */
4710 clp->cl_seqid++;
4711 }
4712
4713 return status;
4714}
4715
4716/*
4717 * Issues a CREATE_SESSION operation to the server.
4718 * It is the responsibility of the caller to verify the session is
4719 * expired before calling this routine.
4720 */
4721int nfs4_proc_create_session(struct nfs_client *clp, int reset)
4722{
4723 int status;
4724 unsigned *ptr;
4725 struct nfs_fsinfo fsinfo;
4726 struct nfs4_session *session = clp->cl_session;
4727
4728 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
4729
4730 status = _nfs4_proc_create_session(clp);
4731 if (status)
4732 goto out;
4733
4734 /* Init or reset the fore channel */
4735 if (reset)
4736 status = nfs4_reset_slot_tables(session);
4737 else
4738 status = nfs4_init_slot_tables(session);
4739 dprintk("fore channel slot table initialization returned %d\n", status);
4740 if (status)
4741 goto out;
4742
4743 ptr = (unsigned *)&session->sess_id.data[0];
 4744 dprintk("%s clientid seqid %d sessionid %u:%u:%u:%u\n", __func__,
4745 clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]);
4746
4747 if (reset)
 4748 /* Lease time is already set */
4749 goto out;
4750
4751 /* Get the lease time */
4752 status = nfs4_proc_get_lease_time(clp, &fsinfo);
4753 if (status == 0) {
4754 /* Update lease time and schedule renewal */
4755 spin_lock(&clp->cl_lock);
4756 clp->cl_lease_time = fsinfo.lease_time * HZ;
4757 clp->cl_last_renewal = jiffies;
4758 clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
4759 spin_unlock(&clp->cl_lock);
4760
4761 nfs4_schedule_state_renewal(clp);
4762 }
4763out:
4764 dprintk("<-- %s\n", __func__);
4765 return status;
4766}
4767
4768/*
4769 * Issue the over-the-wire RPC DESTROY_SESSION.
4770 * The caller must serialize access to this routine.
4771 */
4772int nfs4_proc_destroy_session(struct nfs4_session *session)
4773{
4774 int status = 0;
4775 struct rpc_message msg;
4776
4777 dprintk("--> nfs4_proc_destroy_session\n");
4778
4779 /* session is still being setup */
4780 if (session->clp->cl_cons_state != NFS_CS_READY)
4781 return status;
4782
4783 msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION];
4784 msg.rpc_argp = session;
4785 msg.rpc_resp = NULL;
4786 msg.rpc_cred = NULL;
4787 status = rpc_call_sync(session->clp->cl_rpcclient, &msg, 0);
4788
4789 if (status)
4790 printk(KERN_WARNING
4791 "Got error %d from the server on DESTROY_SESSION. "
4792 "Session has been destroyed regardless...\n", status);
4793
4794 dprintk("<-- nfs4_proc_destroy_session\n");
4795 return status;
4796}
4797
4798int nfs4_init_session(struct nfs_server *server)
4799{
4800 struct nfs_client *clp = server->nfs_client;
4801 int ret;
4802
4803 if (!nfs4_has_session(clp))
4804 return 0;
4805
4806 clp->cl_session->fc_attrs.max_rqst_sz = server->wsize;
4807 clp->cl_session->fc_attrs.max_resp_sz = server->rsize;
4808 ret = nfs4_recover_expired_lease(server);
4809 if (!ret)
4810 ret = nfs4_check_client_ready(clp);
4811 return ret;
4812}
4813
4814/*
4815 * Renew the cl_session lease.
4816 */
4817static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
4818{
4819 struct nfs4_sequence_args args;
4820 struct nfs4_sequence_res res;
4821
4822 struct rpc_message msg = {
4823 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
4824 .rpc_argp = &args,
4825 .rpc_resp = &res,
4826 .rpc_cred = cred,
4827 };
4828
4829 args.sa_cache_this = 0;
4830
4831 return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args,
4832 &res, 0);
4833}
4834
4835void nfs41_sequence_call_done(struct rpc_task *task, void *data)
4836{
4837 struct nfs_client *clp = (struct nfs_client *)data;
4838
4839 nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status);
4840
4841 if (task->tk_status < 0) {
4842 dprintk("%s ERROR %d\n", __func__, task->tk_status);
4843
4844 if (_nfs4_async_handle_error(task, NULL, clp, NULL)
4845 == -EAGAIN) {
4846 nfs4_restart_rpc(task, clp);
4847 return;
4848 }
4849 }
4850 nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp);
4851 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
4852
4853 put_rpccred(task->tk_msg.rpc_cred);
4854 kfree(task->tk_msg.rpc_argp);
4855 kfree(task->tk_msg.rpc_resp);
4856
4857 dprintk("<-- %s\n", __func__);
4858}
4859
4860static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
4861{
4862 struct nfs_client *clp;
4863 struct nfs4_sequence_args *args;
4864 struct nfs4_sequence_res *res;
4865
4866 clp = (struct nfs_client *)data;
4867 args = task->tk_msg.rpc_argp;
4868 res = task->tk_msg.rpc_resp;
4869
4870 if (nfs4_setup_sequence(clp, args, res, 0, task))
4871 return;
4872 rpc_call_start(task);
4873}
4874
4875static const struct rpc_call_ops nfs41_sequence_ops = {
4876 .rpc_call_done = nfs41_sequence_call_done,
4877 .rpc_call_prepare = nfs41_sequence_prepare,
4878};
4879
4880static int nfs41_proc_async_sequence(struct nfs_client *clp,
4881 struct rpc_cred *cred)
4882{
4883 struct nfs4_sequence_args *args;
4884 struct nfs4_sequence_res *res;
4885 struct rpc_message msg = {
4886 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
4887 .rpc_cred = cred,
4888 };
4889
4890 args = kzalloc(sizeof(*args), GFP_KERNEL);
4891 if (!args)
4892 return -ENOMEM;
4893 res = kzalloc(sizeof(*res), GFP_KERNEL);
4894 if (!res) {
4895 kfree(args);
4896 return -ENOMEM;
4897 }
4898 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
4899 msg.rpc_argp = args;
4900 msg.rpc_resp = res;
4901
4902 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
4903 &nfs41_sequence_ops, (void *)clp);
4904}
4905
4906#endif /* CONFIG_NFS_V4_1 */
4907
4908struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
3727 .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT, 4909 .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
3728 .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, 4910 .state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
3729 .recover_open = nfs4_open_reclaim, 4911 .recover_open = nfs4_open_reclaim,
3730 .recover_lock = nfs4_lock_reclaim, 4912 .recover_lock = nfs4_lock_reclaim,
4913 .establish_clid = nfs4_init_clientid,
4914 .get_clid_cred = nfs4_get_setclientid_cred,
4915};
4916
4917#if defined(CONFIG_NFS_V4_1)
4918struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
4919 .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
4920 .state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
4921 .recover_open = nfs4_open_reclaim,
4922 .recover_lock = nfs4_lock_reclaim,
4923 .establish_clid = nfs4_proc_exchange_id,
4924 .get_clid_cred = nfs4_get_exchange_id_cred,
4925};
4926#endif /* CONFIG_NFS_V4_1 */
4927
4928struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
4929 .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
4930 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
4931 .recover_open = nfs4_open_expired,
4932 .recover_lock = nfs4_lock_expired,
4933 .establish_clid = nfs4_init_clientid,
4934 .get_clid_cred = nfs4_get_setclientid_cred,
3731}; 4935};
3732 4936
3733struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops = { 4937#if defined(CONFIG_NFS_V4_1)
4938struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
3734 .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, 4939 .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
3735 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, 4940 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
3736 .recover_open = nfs4_open_expired, 4941 .recover_open = nfs4_open_expired,
3737 .recover_lock = nfs4_lock_expired, 4942 .recover_lock = nfs4_lock_expired,
4943 .establish_clid = nfs4_proc_exchange_id,
4944 .get_clid_cred = nfs4_get_exchange_id_cred,
4945};
4946#endif /* CONFIG_NFS_V4_1 */
4947
4948struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = {
4949 .sched_state_renewal = nfs4_proc_async_renew,
4950 .get_state_renewal_cred_locked = nfs4_get_renew_cred_locked,
4951 .renew_lease = nfs4_proc_renew,
4952};
4953
4954#if defined(CONFIG_NFS_V4_1)
4955struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
4956 .sched_state_renewal = nfs41_proc_async_sequence,
4957 .get_state_renewal_cred_locked = nfs4_get_machine_cred_locked,
4958 .renew_lease = nfs4_proc_sequence,
4959};
4960#endif
4961
4962/*
4963 * Per minor version reboot and network partition recovery ops
4964 */
4965
4966struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = {
4967 &nfs40_reboot_recovery_ops,
4968#if defined(CONFIG_NFS_V4_1)
4969 &nfs41_reboot_recovery_ops,
4970#endif
4971};
4972
4973struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = {
4974 &nfs40_nograce_recovery_ops,
4975#if defined(CONFIG_NFS_V4_1)
4976 &nfs41_nograce_recovery_ops,
4977#endif
4978};
4979
4980struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = {
4981 &nfs40_state_renewal_ops,
4982#if defined(CONFIG_NFS_V4_1)
4983 &nfs41_state_renewal_ops,
4984#endif
3738}; 4985};
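All three arrays are indexed directly by clp->cl_minorversion, so a v4.0 mount (index 0) and a v4.1 mount (index 1) pick their recovery and renewal behaviour at the same call sites. A minimal sketch of the dispatch (toy types, not the kernel structures):

#include <stdio.h>

struct renewal_ops {
	const char *name;
};

static struct renewal_ops nfs40_ops = { .name = "nfs40" };
static struct renewal_ops nfs41_ops = { .name = "nfs41" };

static struct renewal_ops *renewal_ops[] = {
	&nfs40_ops,
	&nfs41_ops,
};

int main(void)
{
	unsigned int minorversion = 1;	/* stand-in for clp->cl_minorversion */
	printf("using %s renewal ops\n", renewal_ops[minorversion]->name);
	return 0;
}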
3739 4986
3740static const struct inode_operations nfs4_file_inode_operations = { 4987static const struct inode_operations nfs4_file_inode_operations = {
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index f524e932ff7b..e27c6cef18f2 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -59,12 +59,14 @@
59void 59void
60nfs4_renew_state(struct work_struct *work) 60nfs4_renew_state(struct work_struct *work)
61{ 61{
62 struct nfs4_state_maintenance_ops *ops;
62 struct nfs_client *clp = 63 struct nfs_client *clp =
63 container_of(work, struct nfs_client, cl_renewd.work); 64 container_of(work, struct nfs_client, cl_renewd.work);
64 struct rpc_cred *cred; 65 struct rpc_cred *cred;
65 long lease, timeout; 66 long lease, timeout;
66 unsigned long last, now; 67 unsigned long last, now;
67 68
69 ops = nfs4_state_renewal_ops[clp->cl_minorversion];
68 dprintk("%s: start\n", __func__); 70 dprintk("%s: start\n", __func__);
69 /* Are there any active superblocks? */ 71 /* Are there any active superblocks? */
70 if (list_empty(&clp->cl_superblocks)) 72 if (list_empty(&clp->cl_superblocks))
@@ -76,7 +78,7 @@ nfs4_renew_state(struct work_struct *work)
76 timeout = (2 * lease) / 3 + (long)last - (long)now; 78 timeout = (2 * lease) / 3 + (long)last - (long)now;
77 /* Are we close to a lease timeout? */ 79 /* Are we close to a lease timeout? */
78 if (time_after(now, last + lease/3)) { 80 if (time_after(now, last + lease/3)) {
79 cred = nfs4_get_renew_cred_locked(clp); 81 cred = ops->get_state_renewal_cred_locked(clp);
80 spin_unlock(&clp->cl_lock); 82 spin_unlock(&clp->cl_lock);
81 if (cred == NULL) { 83 if (cred == NULL) {
82 if (list_empty(&clp->cl_delegations)) { 84 if (list_empty(&clp->cl_delegations)) {
@@ -86,7 +88,7 @@ nfs4_renew_state(struct work_struct *work)
86 nfs_expire_all_delegations(clp); 88 nfs_expire_all_delegations(clp);
87 } else { 89 } else {
88 /* Queue an asynchronous RENEW. */ 90 /* Queue an asynchronous RENEW. */
89 nfs4_proc_async_renew(clp, cred); 91 ops->sched_state_renewal(clp, cred);
90 put_rpccred(cred); 92 put_rpccred(cred);
91 } 93 }
92 timeout = (2 * lease) / 3; 94 timeout = (2 * lease) / 3;
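The arithmetic in this hunk means the daemon renews once a third of the lease has elapsed and re-arms itself two thirds of a lease ahead, so the lease is refreshed well before it can expire. A worked example assuming a 90 s lease and HZ=100 (ignoring jiffies wraparound, which the kernel's time_after() handles):

#include <stdio.h>

int main(void)
{
	long hz = 100;			/* assumed tick rate */
	long lease = 90 * hz;		/* clp->cl_lease_time for a 90 s lease */
	long last = 0;			/* jiffies at the last renewal */
	long now = 35 * hz;		/* 35 s later */
	long timeout = (2 * lease) / 3 + last - now;	/* 25 s */

	if (now > last + lease / 3)		/* 35 s > 30 s: renew */
		timeout = (2 * lease) / 3;	/* next check in 60 s */
	printf("next check in %ld s\n", timeout / hz);
	return 0;
}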
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0298e909559f..1434080aefeb 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -60,7 +60,7 @@ const nfs4_stateid zero_stateid;
60 60
61static LIST_HEAD(nfs4_clientid_list); 61static LIST_HEAD(nfs4_clientid_list);
62 62
63static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) 63int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
64{ 64{
65 unsigned short port; 65 unsigned short port;
66 int status; 66 int status;
@@ -77,7 +77,7 @@ static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
77 return status; 77 return status;
78} 78}
79 79
80static struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp) 80struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
81{ 81{
82 struct rpc_cred *cred = NULL; 82 struct rpc_cred *cred = NULL;
83 83
@@ -114,17 +114,21 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
114 return cred; 114 return cred;
115} 115}
116 116
117static struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) 117#if defined(CONFIG_NFS_V4_1)
118
119struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
118{ 120{
119 struct rpc_cred *cred; 121 struct rpc_cred *cred;
120 122
121 spin_lock(&clp->cl_lock); 123 spin_lock(&clp->cl_lock);
122 cred = nfs4_get_renew_cred_locked(clp); 124 cred = nfs4_get_machine_cred_locked(clp);
123 spin_unlock(&clp->cl_lock); 125 spin_unlock(&clp->cl_lock);
124 return cred; 126 return cred;
125} 127}
126 128
127static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) 129#endif /* CONFIG_NFS_V4_1 */
130
131struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
128{ 132{
129 struct nfs4_state_owner *sp; 133 struct nfs4_state_owner *sp;
130 struct rb_node *pos; 134 struct rb_node *pos;
@@ -549,6 +553,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
549 INIT_LIST_HEAD(&lsp->ls_sequence.list); 553 INIT_LIST_HEAD(&lsp->ls_sequence.list);
550 lsp->ls_seqid.sequence = &lsp->ls_sequence; 554 lsp->ls_seqid.sequence = &lsp->ls_sequence;
551 atomic_set(&lsp->ls_count, 1); 555 atomic_set(&lsp->ls_count, 1);
556 lsp->ls_state = state;
552 lsp->ls_owner = fl_owner; 557 lsp->ls_owner = fl_owner;
553 spin_lock(&clp->cl_lock); 558 spin_lock(&clp->cl_lock);
554 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); 559 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
@@ -583,7 +588,6 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
583 if (lsp != NULL) 588 if (lsp != NULL)
584 break; 589 break;
585 if (new != NULL) { 590 if (new != NULL) {
586 new->ls_state = state;
587 list_add(&new->ls_locks, &state->lock_states); 591 list_add(&new->ls_locks, &state->lock_states);
588 set_bit(LK_STATE_IN_USE, &state->flags); 592 set_bit(LK_STATE_IN_USE, &state->flags);
589 lsp = new; 593 lsp = new;
@@ -738,12 +742,14 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
738 742
739void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) 743void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
740{ 744{
741 if (status == -NFS4ERR_BAD_SEQID) { 745 struct nfs4_state_owner *sp = container_of(seqid->sequence,
742 struct nfs4_state_owner *sp = container_of(seqid->sequence, 746 struct nfs4_state_owner, so_seqid);
743 struct nfs4_state_owner, so_seqid); 747 struct nfs_server *server = sp->so_server;
748
749 if (status == -NFS4ERR_BAD_SEQID)
744 nfs4_drop_state_owner(sp); 750 nfs4_drop_state_owner(sp);
745 } 751 if (!nfs4_has_session(server->nfs_client))
746 nfs_increment_seqid(status, seqid); 752 nfs_increment_seqid(status, seqid);
747} 753}
748 754
749/* 755/*
@@ -847,32 +853,45 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
847 struct file_lock *fl; 853 struct file_lock *fl;
848 int status = 0; 854 int status = 0;
849 855
856 if (inode->i_flock == NULL)
857 return 0;
858
859 /* Guard against delegation returns and new lock/unlock calls */
850 down_write(&nfsi->rwsem); 860 down_write(&nfsi->rwsem);
861 /* Protect inode->i_flock using the BKL */
862 lock_kernel();
851 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 863 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
852 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 864 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
853 continue; 865 continue;
854 if (nfs_file_open_context(fl->fl_file)->state != state) 866 if (nfs_file_open_context(fl->fl_file)->state != state)
855 continue; 867 continue;
868 unlock_kernel();
856 status = ops->recover_lock(state, fl); 869 status = ops->recover_lock(state, fl);
857 if (status >= 0)
858 continue;
859 switch (status) { 870 switch (status) {
871 case 0:
872 break;
873 case -ESTALE:
874 case -NFS4ERR_ADMIN_REVOKED:
875 case -NFS4ERR_STALE_STATEID:
876 case -NFS4ERR_BAD_STATEID:
877 case -NFS4ERR_EXPIRED:
878 case -NFS4ERR_NO_GRACE:
879 case -NFS4ERR_STALE_CLIENTID:
880 goto out;
860 default: 881 default:
861 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", 882 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
862 __func__, status); 883 __func__, status);
863 case -NFS4ERR_EXPIRED: 884 case -ENOMEM:
864 case -NFS4ERR_NO_GRACE: 885 case -NFS4ERR_DENIED:
865 case -NFS4ERR_RECLAIM_BAD: 886 case -NFS4ERR_RECLAIM_BAD:
866 case -NFS4ERR_RECLAIM_CONFLICT: 887 case -NFS4ERR_RECLAIM_CONFLICT:
867 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 888 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
868 break; 889 status = 0;
869 case -NFS4ERR_STALE_CLIENTID:
870 goto out_err;
871 } 890 }
891 lock_kernel();
872 } 892 }
873 up_write(&nfsi->rwsem); 893 unlock_kernel();
874 return 0; 894out:
875out_err:
876 up_write(&nfsi->rwsem); 895 up_write(&nfsi->rwsem);
877 return status; 896 return status;
878} 897}
@@ -918,6 +937,7 @@ restart:
918 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", 937 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
919 __func__, status); 938 __func__, status);
920 case -ENOENT: 939 case -ENOENT:
940 case -ENOMEM:
921 case -ESTALE: 941 case -ESTALE:
922 /* 942 /*
923 * Open state on this file cannot be recovered 943 * Open state on this file cannot be recovered
@@ -928,6 +948,9 @@ restart:
928 /* Mark the file as being 'closed' */ 948 /* Mark the file as being 'closed' */
929 state->state = 0; 949 state->state = 0;
930 break; 950 break;
951 case -NFS4ERR_ADMIN_REVOKED:
952 case -NFS4ERR_STALE_STATEID:
953 case -NFS4ERR_BAD_STATEID:
931 case -NFS4ERR_RECLAIM_BAD: 954 case -NFS4ERR_RECLAIM_BAD:
932 case -NFS4ERR_RECLAIM_CONFLICT: 955 case -NFS4ERR_RECLAIM_CONFLICT:
933 nfs4_state_mark_reclaim_nograce(sp->so_client, state); 956 nfs4_state_mark_reclaim_nograce(sp->so_client, state);
@@ -1042,6 +1065,14 @@ static void nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1042 case -NFS4ERR_EXPIRED: 1065 case -NFS4ERR_EXPIRED:
1043 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1066 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1044 nfs4_state_start_reclaim_nograce(clp); 1067 nfs4_state_start_reclaim_nograce(clp);
1068 case -NFS4ERR_BADSESSION:
1069 case -NFS4ERR_BADSLOT:
1070 case -NFS4ERR_BAD_HIGH_SLOT:
1071 case -NFS4ERR_DEADSESSION:
1072 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1073 case -NFS4ERR_SEQ_FALSE_RETRY:
1074 case -NFS4ERR_SEQ_MISORDERED:
1075 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
1045 } 1076 }
1046} 1077}
1047 1078
@@ -1075,18 +1106,22 @@ restart:
1075static int nfs4_check_lease(struct nfs_client *clp) 1106static int nfs4_check_lease(struct nfs_client *clp)
1076{ 1107{
1077 struct rpc_cred *cred; 1108 struct rpc_cred *cred;
1109 struct nfs4_state_maintenance_ops *ops =
1110 nfs4_state_renewal_ops[clp->cl_minorversion];
1078 int status = -NFS4ERR_EXPIRED; 1111 int status = -NFS4ERR_EXPIRED;
1079 1112
1080 /* Is the client already known to have an expired lease? */ 1113 /* Is the client already known to have an expired lease? */
1081 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) 1114 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
1082 return 0; 1115 return 0;
1083 cred = nfs4_get_renew_cred(clp); 1116 spin_lock(&clp->cl_lock);
1117 cred = ops->get_state_renewal_cred_locked(clp);
1118 spin_unlock(&clp->cl_lock);
1084 if (cred == NULL) { 1119 if (cred == NULL) {
1085 cred = nfs4_get_setclientid_cred(clp); 1120 cred = nfs4_get_setclientid_cred(clp);
1086 if (cred == NULL) 1121 if (cred == NULL)
1087 goto out; 1122 goto out;
1088 } 1123 }
1089 status = nfs4_proc_renew(clp, cred); 1124 status = ops->renew_lease(clp, cred);
1090 put_rpccred(cred); 1125 put_rpccred(cred);
1091out: 1126out:
1092 nfs4_recovery_handle_error(clp, status); 1127 nfs4_recovery_handle_error(clp, status);
@@ -1096,21 +1131,98 @@ out:
1096static int nfs4_reclaim_lease(struct nfs_client *clp) 1131static int nfs4_reclaim_lease(struct nfs_client *clp)
1097{ 1132{
1098 struct rpc_cred *cred; 1133 struct rpc_cred *cred;
1134 struct nfs4_state_recovery_ops *ops =
1135 nfs4_reboot_recovery_ops[clp->cl_minorversion];
1099 int status = -ENOENT; 1136 int status = -ENOENT;
1100 1137
1101 cred = nfs4_get_setclientid_cred(clp); 1138 cred = ops->get_clid_cred(clp);
1102 if (cred != NULL) { 1139 if (cred != NULL) {
1103 status = nfs4_init_client(clp, cred); 1140 status = ops->establish_clid(clp, cred);
1104 put_rpccred(cred); 1141 put_rpccred(cred);
1105 /* Handle case where the user hasn't set up machine creds */ 1142 /* Handle case where the user hasn't set up machine creds */
1106 if (status == -EACCES && cred == clp->cl_machine_cred) { 1143 if (status == -EACCES && cred == clp->cl_machine_cred) {
1107 nfs4_clear_machine_cred(clp); 1144 nfs4_clear_machine_cred(clp);
1108 status = -EAGAIN; 1145 status = -EAGAIN;
1109 } 1146 }
1147 if (status == -NFS4ERR_MINOR_VERS_MISMATCH)
1148 status = -EPROTONOSUPPORT;
1149 }
1150 return status;
1151}
1152
1153#ifdef CONFIG_NFS_V4_1
1154static void nfs4_session_recovery_handle_error(struct nfs_client *clp, int err)
1155{
1156 switch (err) {
1157 case -NFS4ERR_STALE_CLIENTID:
1158 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1159 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
1160 }
1161}
1162
1163static int nfs4_reset_session(struct nfs_client *clp)
1164{
1165 int status;
1166
1167 status = nfs4_proc_destroy_session(clp->cl_session);
1168 if (status && status != -NFS4ERR_BADSESSION &&
1169 status != -NFS4ERR_DEADSESSION) {
1170 nfs4_session_recovery_handle_error(clp, status);
1171 goto out;
1110 } 1172 }
1173
1174 memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
1175 status = nfs4_proc_create_session(clp, 1);
1176 if (status)
1177 nfs4_session_recovery_handle_error(clp, status);
 1178 /* fall through */
1179out:
1180 /* Wake up the next rpc task even on error */
1181 rpc_wake_up_next(&clp->cl_session->fc_slot_table.slot_tbl_waitq);
1111 return status; 1182 return status;
1112} 1183}
1113 1184
1185static int nfs4_initialize_session(struct nfs_client *clp)
1186{
1187 int status;
1188
1189 status = nfs4_proc_create_session(clp, 0);
1190 if (!status) {
1191 nfs_mark_client_ready(clp, NFS_CS_READY);
1192 } else if (status == -NFS4ERR_STALE_CLIENTID) {
1193 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1194 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
1195 } else {
1196 nfs_mark_client_ready(clp, status);
1197 }
1198 return status;
1199}
1200#else /* CONFIG_NFS_V4_1 */
1201static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
1202static int nfs4_initialize_session(struct nfs_client *clp) { return 0; }
1203#endif /* CONFIG_NFS_V4_1 */
1204
1205/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
1206 * on EXCHANGE_ID for v4.1
1207 */
1208static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
1209{
1210 if (nfs4_has_session(clp)) {
1211 switch (status) {
1212 case -NFS4ERR_DELAY:
1213 case -NFS4ERR_CLID_INUSE:
1214 case -EAGAIN:
1215 break;
1216
 1217 case -NFS4ERR_NOT_SAME: /* FIXME: implement recovery
 1218 * in nfs4_exchange_id */
1219 default:
1220 return;
1221 }
1222 }
1223 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1224}
1225
1114static void nfs4_state_manager(struct nfs_client *clp) 1226static void nfs4_state_manager(struct nfs_client *clp)
1115{ 1227{
1116 int status = 0; 1228 int status = 0;
@@ -1121,9 +1233,12 @@ static void nfs4_state_manager(struct nfs_client *clp)
1121 /* We're going to have to re-establish a clientid */ 1233 /* We're going to have to re-establish a clientid */
1122 status = nfs4_reclaim_lease(clp); 1234 status = nfs4_reclaim_lease(clp);
1123 if (status) { 1235 if (status) {
1124 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1236 nfs4_set_lease_expired(clp, status);
1125 if (status == -EAGAIN) 1237 if (status == -EAGAIN)
1126 continue; 1238 continue;
1239 if (clp->cl_cons_state ==
1240 NFS_CS_SESSION_INITING)
1241 nfs_mark_client_ready(clp, status);
1127 goto out_error; 1242 goto out_error;
1128 } 1243 }
1129 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); 1244 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
@@ -1134,25 +1249,44 @@ static void nfs4_state_manager(struct nfs_client *clp)
1134 if (status != 0) 1249 if (status != 0)
1135 continue; 1250 continue;
1136 } 1251 }
1137 1252 /* Initialize or reset the session */
1253 if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)
1254 && nfs4_has_session(clp)) {
1255 if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
1256 status = nfs4_initialize_session(clp);
1257 else
1258 status = nfs4_reset_session(clp);
1259 if (status) {
1260 if (status == -NFS4ERR_STALE_CLIENTID)
1261 continue;
1262 goto out_error;
1263 }
1264 }
1138 /* First recover reboot state... */ 1265 /* First recover reboot state... */
1139 if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { 1266 if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
1140 status = nfs4_do_reclaim(clp, &nfs4_reboot_recovery_ops); 1267 status = nfs4_do_reclaim(clp,
1268 nfs4_reboot_recovery_ops[clp->cl_minorversion]);
1141 if (status == -NFS4ERR_STALE_CLIENTID) 1269 if (status == -NFS4ERR_STALE_CLIENTID)
1142 continue; 1270 continue;
1271 if (test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
1272 continue;
1143 nfs4_state_end_reclaim_reboot(clp); 1273 nfs4_state_end_reclaim_reboot(clp);
1144 continue; 1274 continue;
1145 } 1275 }
1146 1276
1147 /* Now recover expired state... */ 1277 /* Now recover expired state... */
1148 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { 1278 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
1149 status = nfs4_do_reclaim(clp, &nfs4_nograce_recovery_ops); 1279 status = nfs4_do_reclaim(clp,
1280 nfs4_nograce_recovery_ops[clp->cl_minorversion]);
1150 if (status < 0) { 1281 if (status < 0) {
1151 set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); 1282 set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
1152 if (status == -NFS4ERR_STALE_CLIENTID) 1283 if (status == -NFS4ERR_STALE_CLIENTID)
1153 continue; 1284 continue;
1154 if (status == -NFS4ERR_EXPIRED) 1285 if (status == -NFS4ERR_EXPIRED)
1155 continue; 1286 continue;
1287 if (test_bit(NFS4CLNT_SESSION_SETUP,
1288 &clp->cl_state))
1289 continue;
1156 goto out_error; 1290 goto out_error;
1157 } else 1291 } else
1158 nfs4_state_end_reclaim_nograce(clp); 1292 nfs4_state_end_reclaim_nograce(clp);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1690f0e44b91..617273e7d47f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -192,12 +192,16 @@ static int nfs4_stat_to_errno(int);
192 decode_verifier_maxsz) 192 decode_verifier_maxsz)
193#define encode_remove_maxsz (op_encode_hdr_maxsz + \ 193#define encode_remove_maxsz (op_encode_hdr_maxsz + \
194 nfs4_name_maxsz) 194 nfs4_name_maxsz)
195#define decode_remove_maxsz (op_decode_hdr_maxsz + \
196 decode_change_info_maxsz)
195#define encode_rename_maxsz (op_encode_hdr_maxsz + \ 197#define encode_rename_maxsz (op_encode_hdr_maxsz + \
196 2 * nfs4_name_maxsz) 198 2 * nfs4_name_maxsz)
197#define decode_rename_maxsz (op_decode_hdr_maxsz + 5 + 5) 199#define decode_rename_maxsz (op_decode_hdr_maxsz + \
200 decode_change_info_maxsz + \
201 decode_change_info_maxsz)
198#define encode_link_maxsz (op_encode_hdr_maxsz + \ 202#define encode_link_maxsz (op_encode_hdr_maxsz + \
199 nfs4_name_maxsz) 203 nfs4_name_maxsz)
200#define decode_link_maxsz (op_decode_hdr_maxsz + 5) 204#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz)
201#define encode_lock_maxsz (op_encode_hdr_maxsz + \ 205#define encode_lock_maxsz (op_encode_hdr_maxsz + \
202 7 + \ 206 7 + \
203 1 + encode_stateid_maxsz + 8) 207 1 + encode_stateid_maxsz + 8)
@@ -240,43 +244,115 @@ static int nfs4_stat_to_errno(int);
240 (encode_getattr_maxsz) 244 (encode_getattr_maxsz)
241#define decode_fs_locations_maxsz \ 245#define decode_fs_locations_maxsz \
242 (0) 246 (0)
247
248#if defined(CONFIG_NFS_V4_1)
249#define NFS4_MAX_MACHINE_NAME_LEN (64)
250
251#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
252 encode_verifier_maxsz + \
253 1 /* co_ownerid.len */ + \
254 XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \
255 1 /* flags */ + \
256 1 /* spa_how */ + \
257 0 /* SP4_NONE (for now) */ + \
 258 1 /* zero implementation id array */)
259#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \
260 2 /* eir_clientid */ + \
261 1 /* eir_sequenceid */ + \
262 1 /* eir_flags */ + \
263 1 /* spr_how */ + \
264 0 /* SP4_NONE (for now) */ + \
265 2 /* eir_server_owner.so_minor_id */ + \
266 /* eir_server_owner.so_major_id<> */ \
267 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
268 /* eir_server_scope<> */ \
269 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
270 1 /* eir_server_impl_id array length */ + \
271 0 /* ignored eir_server_impl_id contents */)
272#define encode_channel_attrs_maxsz (6 + 1 /* ca_rdma_ird.len (0) */)
273#define decode_channel_attrs_maxsz (6 + \
274 1 /* ca_rdma_ird.len */ + \
275 1 /* ca_rdma_ird */)
276#define encode_create_session_maxsz (op_encode_hdr_maxsz + \
277 2 /* csa_clientid */ + \
278 1 /* csa_sequence */ + \
279 1 /* csa_flags */ + \
280 encode_channel_attrs_maxsz + \
281 encode_channel_attrs_maxsz + \
282 1 /* csa_cb_program */ + \
283 1 /* csa_sec_parms.len (1) */ + \
284 1 /* cb_secflavor (AUTH_SYS) */ + \
285 1 /* stamp */ + \
286 1 /* machinename.len */ + \
287 XDR_QUADLEN(NFS4_MAX_MACHINE_NAME_LEN) + \
288 1 /* uid */ + \
289 1 /* gid */ + \
290 1 /* gids.len (0) */)
291#define decode_create_session_maxsz (op_decode_hdr_maxsz + \
292 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
293 1 /* csr_sequence */ + \
294 1 /* csr_flags */ + \
295 decode_channel_attrs_maxsz + \
296 decode_channel_attrs_maxsz)
297#define encode_destroy_session_maxsz (op_encode_hdr_maxsz + 4)
298#define decode_destroy_session_maxsz (op_decode_hdr_maxsz)
299#define encode_sequence_maxsz (op_encode_hdr_maxsz + \
300 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4)
301#define decode_sequence_maxsz (op_decode_hdr_maxsz + \
302 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
303#else /* CONFIG_NFS_V4_1 */
304#define encode_sequence_maxsz 0
305#define decode_sequence_maxsz 0
306#endif /* CONFIG_NFS_V4_1 */
307
243#define NFS4_enc_compound_sz (1024) /* XXX: large enough? */ 308#define NFS4_enc_compound_sz (1024) /* XXX: large enough? */
244#define NFS4_dec_compound_sz (1024) /* XXX: large enough? */ 309#define NFS4_dec_compound_sz (1024) /* XXX: large enough? */
245#define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \ 310#define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \
311 encode_sequence_maxsz + \
246 encode_putfh_maxsz + \ 312 encode_putfh_maxsz + \
247 encode_read_maxsz) 313 encode_read_maxsz)
248#define NFS4_dec_read_sz (compound_decode_hdr_maxsz + \ 314#define NFS4_dec_read_sz (compound_decode_hdr_maxsz + \
315 decode_sequence_maxsz + \
249 decode_putfh_maxsz + \ 316 decode_putfh_maxsz + \
250 decode_read_maxsz) 317 decode_read_maxsz)
251#define NFS4_enc_readlink_sz (compound_encode_hdr_maxsz + \ 318#define NFS4_enc_readlink_sz (compound_encode_hdr_maxsz + \
319 encode_sequence_maxsz + \
252 encode_putfh_maxsz + \ 320 encode_putfh_maxsz + \
253 encode_readlink_maxsz) 321 encode_readlink_maxsz)
254#define NFS4_dec_readlink_sz (compound_decode_hdr_maxsz + \ 322#define NFS4_dec_readlink_sz (compound_decode_hdr_maxsz + \
323 decode_sequence_maxsz + \
255 decode_putfh_maxsz + \ 324 decode_putfh_maxsz + \
256 decode_readlink_maxsz) 325 decode_readlink_maxsz)
257#define NFS4_enc_readdir_sz (compound_encode_hdr_maxsz + \ 326#define NFS4_enc_readdir_sz (compound_encode_hdr_maxsz + \
327 encode_sequence_maxsz + \
258 encode_putfh_maxsz + \ 328 encode_putfh_maxsz + \
259 encode_readdir_maxsz) 329 encode_readdir_maxsz)
260#define NFS4_dec_readdir_sz (compound_decode_hdr_maxsz + \ 330#define NFS4_dec_readdir_sz (compound_decode_hdr_maxsz + \
331 decode_sequence_maxsz + \
261 decode_putfh_maxsz + \ 332 decode_putfh_maxsz + \
262 decode_readdir_maxsz) 333 decode_readdir_maxsz)
263#define NFS4_enc_write_sz (compound_encode_hdr_maxsz + \ 334#define NFS4_enc_write_sz (compound_encode_hdr_maxsz + \
335 encode_sequence_maxsz + \
264 encode_putfh_maxsz + \ 336 encode_putfh_maxsz + \
265 encode_write_maxsz + \ 337 encode_write_maxsz + \
266 encode_getattr_maxsz) 338 encode_getattr_maxsz)
267#define NFS4_dec_write_sz (compound_decode_hdr_maxsz + \ 339#define NFS4_dec_write_sz (compound_decode_hdr_maxsz + \
340 decode_sequence_maxsz + \
268 decode_putfh_maxsz + \ 341 decode_putfh_maxsz + \
269 decode_write_maxsz + \ 342 decode_write_maxsz + \
270 decode_getattr_maxsz) 343 decode_getattr_maxsz)
271#define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \ 344#define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \
345 encode_sequence_maxsz + \
272 encode_putfh_maxsz + \ 346 encode_putfh_maxsz + \
273 encode_commit_maxsz + \ 347 encode_commit_maxsz + \
274 encode_getattr_maxsz) 348 encode_getattr_maxsz)
275#define NFS4_dec_commit_sz (compound_decode_hdr_maxsz + \ 349#define NFS4_dec_commit_sz (compound_decode_hdr_maxsz + \
350 decode_sequence_maxsz + \
276 decode_putfh_maxsz + \ 351 decode_putfh_maxsz + \
277 decode_commit_maxsz + \ 352 decode_commit_maxsz + \
278 decode_getattr_maxsz) 353 decode_getattr_maxsz)
279#define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \ 354#define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \
355 encode_sequence_maxsz + \
280 encode_putfh_maxsz + \ 356 encode_putfh_maxsz + \
281 encode_savefh_maxsz + \ 357 encode_savefh_maxsz + \
282 encode_open_maxsz + \ 358 encode_open_maxsz + \
@@ -285,6 +361,7 @@ static int nfs4_stat_to_errno(int);
285 encode_restorefh_maxsz + \ 361 encode_restorefh_maxsz + \
286 encode_getattr_maxsz) 362 encode_getattr_maxsz)
287#define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \ 363#define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \
364 decode_sequence_maxsz + \
288 decode_putfh_maxsz + \ 365 decode_putfh_maxsz + \
289 decode_savefh_maxsz + \ 366 decode_savefh_maxsz + \
290 decode_open_maxsz + \ 367 decode_open_maxsz + \
@@ -301,43 +378,53 @@ static int nfs4_stat_to_errno(int);
301 decode_putfh_maxsz + \ 378 decode_putfh_maxsz + \
302 decode_open_confirm_maxsz) 379 decode_open_confirm_maxsz)
303#define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \ 380#define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \
381 encode_sequence_maxsz + \
304 encode_putfh_maxsz + \ 382 encode_putfh_maxsz + \
305 encode_open_maxsz + \ 383 encode_open_maxsz + \
306 encode_getattr_maxsz) 384 encode_getattr_maxsz)
307#define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \ 385#define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \
386 decode_sequence_maxsz + \
308 decode_putfh_maxsz + \ 387 decode_putfh_maxsz + \
309 decode_open_maxsz + \ 388 decode_open_maxsz + \
310 decode_getattr_maxsz) 389 decode_getattr_maxsz)
311#define NFS4_enc_open_downgrade_sz \ 390#define NFS4_enc_open_downgrade_sz \
312 (compound_encode_hdr_maxsz + \ 391 (compound_encode_hdr_maxsz + \
392 encode_sequence_maxsz + \
313 encode_putfh_maxsz + \ 393 encode_putfh_maxsz + \
314 encode_open_downgrade_maxsz + \ 394 encode_open_downgrade_maxsz + \
315 encode_getattr_maxsz) 395 encode_getattr_maxsz)
316#define NFS4_dec_open_downgrade_sz \ 396#define NFS4_dec_open_downgrade_sz \
317 (compound_decode_hdr_maxsz + \ 397 (compound_decode_hdr_maxsz + \
398 decode_sequence_maxsz + \
318 decode_putfh_maxsz + \ 399 decode_putfh_maxsz + \
319 decode_open_downgrade_maxsz + \ 400 decode_open_downgrade_maxsz + \
320 decode_getattr_maxsz) 401 decode_getattr_maxsz)
321#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \ 402#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \
403 encode_sequence_maxsz + \
322 encode_putfh_maxsz + \ 404 encode_putfh_maxsz + \
323 encode_close_maxsz + \ 405 encode_close_maxsz + \
324 encode_getattr_maxsz) 406 encode_getattr_maxsz)
325#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \ 407#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \
408 decode_sequence_maxsz + \
326 decode_putfh_maxsz + \ 409 decode_putfh_maxsz + \
327 decode_close_maxsz + \ 410 decode_close_maxsz + \
328 decode_getattr_maxsz) 411 decode_getattr_maxsz)
329#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \ 412#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \
413 encode_sequence_maxsz + \
330 encode_putfh_maxsz + \ 414 encode_putfh_maxsz + \
331 encode_setattr_maxsz + \ 415 encode_setattr_maxsz + \
332 encode_getattr_maxsz) 416 encode_getattr_maxsz)
333#define NFS4_dec_setattr_sz (compound_decode_hdr_maxsz + \ 417#define NFS4_dec_setattr_sz (compound_decode_hdr_maxsz + \
418 decode_sequence_maxsz + \
334 decode_putfh_maxsz + \ 419 decode_putfh_maxsz + \
335 decode_setattr_maxsz + \ 420 decode_setattr_maxsz + \
336 decode_getattr_maxsz) 421 decode_getattr_maxsz)
337#define NFS4_enc_fsinfo_sz (compound_encode_hdr_maxsz + \ 422#define NFS4_enc_fsinfo_sz (compound_encode_hdr_maxsz + \
423 encode_sequence_maxsz + \
338 encode_putfh_maxsz + \ 424 encode_putfh_maxsz + \
339 encode_fsinfo_maxsz) 425 encode_fsinfo_maxsz)
340#define NFS4_dec_fsinfo_sz (compound_decode_hdr_maxsz + \ 426#define NFS4_dec_fsinfo_sz (compound_decode_hdr_maxsz + \
427 decode_sequence_maxsz + \
341 decode_putfh_maxsz + \ 428 decode_putfh_maxsz + \
342 decode_fsinfo_maxsz) 429 decode_fsinfo_maxsz)
343#define NFS4_enc_renew_sz (compound_encode_hdr_maxsz + \ 430#define NFS4_enc_renew_sz (compound_encode_hdr_maxsz + \
@@ -359,64 +446,81 @@ static int nfs4_stat_to_errno(int);
359 decode_putrootfh_maxsz + \ 446 decode_putrootfh_maxsz + \
360 decode_fsinfo_maxsz) 447 decode_fsinfo_maxsz)
361#define NFS4_enc_lock_sz (compound_encode_hdr_maxsz + \ 448#define NFS4_enc_lock_sz (compound_encode_hdr_maxsz + \
449 encode_sequence_maxsz + \
362 encode_putfh_maxsz + \ 450 encode_putfh_maxsz + \
363 encode_lock_maxsz) 451 encode_lock_maxsz)
364#define NFS4_dec_lock_sz (compound_decode_hdr_maxsz + \ 452#define NFS4_dec_lock_sz (compound_decode_hdr_maxsz + \
453 decode_sequence_maxsz + \
365 decode_putfh_maxsz + \ 454 decode_putfh_maxsz + \
366 decode_lock_maxsz) 455 decode_lock_maxsz)
367#define NFS4_enc_lockt_sz (compound_encode_hdr_maxsz + \ 456#define NFS4_enc_lockt_sz (compound_encode_hdr_maxsz + \
457 encode_sequence_maxsz + \
368 encode_putfh_maxsz + \ 458 encode_putfh_maxsz + \
369 encode_lockt_maxsz) 459 encode_lockt_maxsz)
370#define NFS4_dec_lockt_sz (compound_decode_hdr_maxsz + \ 460#define NFS4_dec_lockt_sz (compound_decode_hdr_maxsz + \
461 decode_sequence_maxsz + \
371 decode_putfh_maxsz + \ 462 decode_putfh_maxsz + \
372 decode_lockt_maxsz) 463 decode_lockt_maxsz)
373#define NFS4_enc_locku_sz (compound_encode_hdr_maxsz + \ 464#define NFS4_enc_locku_sz (compound_encode_hdr_maxsz + \
465 encode_sequence_maxsz + \
374 encode_putfh_maxsz + \ 466 encode_putfh_maxsz + \
375 encode_locku_maxsz) 467 encode_locku_maxsz)
376#define NFS4_dec_locku_sz (compound_decode_hdr_maxsz + \ 468#define NFS4_dec_locku_sz (compound_decode_hdr_maxsz + \
469 decode_sequence_maxsz + \
377 decode_putfh_maxsz + \ 470 decode_putfh_maxsz + \
378 decode_locku_maxsz) 471 decode_locku_maxsz)
379#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ 472#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \
473 encode_sequence_maxsz + \
380 encode_putfh_maxsz + \ 474 encode_putfh_maxsz + \
381 encode_access_maxsz + \ 475 encode_access_maxsz + \
382 encode_getattr_maxsz) 476 encode_getattr_maxsz)
383#define NFS4_dec_access_sz (compound_decode_hdr_maxsz + \ 477#define NFS4_dec_access_sz (compound_decode_hdr_maxsz + \
478 decode_sequence_maxsz + \
384 decode_putfh_maxsz + \ 479 decode_putfh_maxsz + \
385 decode_access_maxsz + \ 480 decode_access_maxsz + \
386 decode_getattr_maxsz) 481 decode_getattr_maxsz)
387#define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \ 482#define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \
483 encode_sequence_maxsz + \
388 encode_putfh_maxsz + \ 484 encode_putfh_maxsz + \
389 encode_getattr_maxsz) 485 encode_getattr_maxsz)
390#define NFS4_dec_getattr_sz (compound_decode_hdr_maxsz + \ 486#define NFS4_dec_getattr_sz (compound_decode_hdr_maxsz + \
487 decode_sequence_maxsz + \
391 decode_putfh_maxsz + \ 488 decode_putfh_maxsz + \
392 decode_getattr_maxsz) 489 decode_getattr_maxsz)
393#define NFS4_enc_lookup_sz (compound_encode_hdr_maxsz + \ 490#define NFS4_enc_lookup_sz (compound_encode_hdr_maxsz + \
491 encode_sequence_maxsz + \
394 encode_putfh_maxsz + \ 492 encode_putfh_maxsz + \
395 encode_lookup_maxsz + \ 493 encode_lookup_maxsz + \
396 encode_getattr_maxsz + \ 494 encode_getattr_maxsz + \
397 encode_getfh_maxsz) 495 encode_getfh_maxsz)
398#define NFS4_dec_lookup_sz (compound_decode_hdr_maxsz + \ 496#define NFS4_dec_lookup_sz (compound_decode_hdr_maxsz + \
497 decode_sequence_maxsz + \
399 decode_putfh_maxsz + \ 498 decode_putfh_maxsz + \
400 decode_lookup_maxsz + \ 499 decode_lookup_maxsz + \
401 decode_getattr_maxsz + \ 500 decode_getattr_maxsz + \
402 decode_getfh_maxsz) 501 decode_getfh_maxsz)
403#define NFS4_enc_lookup_root_sz (compound_encode_hdr_maxsz + \ 502#define NFS4_enc_lookup_root_sz (compound_encode_hdr_maxsz + \
503 encode_sequence_maxsz + \
404 encode_putrootfh_maxsz + \ 504 encode_putrootfh_maxsz + \
405 encode_getattr_maxsz + \ 505 encode_getattr_maxsz + \
406 encode_getfh_maxsz) 506 encode_getfh_maxsz)
407#define NFS4_dec_lookup_root_sz (compound_decode_hdr_maxsz + \ 507#define NFS4_dec_lookup_root_sz (compound_decode_hdr_maxsz + \
508 decode_sequence_maxsz + \
408 decode_putrootfh_maxsz + \ 509 decode_putrootfh_maxsz + \
409 decode_getattr_maxsz + \ 510 decode_getattr_maxsz + \
410 decode_getfh_maxsz) 511 decode_getfh_maxsz)
411#define NFS4_enc_remove_sz (compound_encode_hdr_maxsz + \ 512#define NFS4_enc_remove_sz (compound_encode_hdr_maxsz + \
513 encode_sequence_maxsz + \
412 encode_putfh_maxsz + \ 514 encode_putfh_maxsz + \
413 encode_remove_maxsz + \ 515 encode_remove_maxsz + \
414 encode_getattr_maxsz) 516 encode_getattr_maxsz)
415#define NFS4_dec_remove_sz (compound_decode_hdr_maxsz + \ 517#define NFS4_dec_remove_sz (compound_decode_hdr_maxsz + \
518 decode_sequence_maxsz + \
416 decode_putfh_maxsz + \ 519 decode_putfh_maxsz + \
417 op_decode_hdr_maxsz + 5 + \ 520 decode_remove_maxsz + \
418 decode_getattr_maxsz) 521 decode_getattr_maxsz)
419#define NFS4_enc_rename_sz (compound_encode_hdr_maxsz + \ 522#define NFS4_enc_rename_sz (compound_encode_hdr_maxsz + \
523 encode_sequence_maxsz + \
420 encode_putfh_maxsz + \ 524 encode_putfh_maxsz + \
421 encode_savefh_maxsz + \ 525 encode_savefh_maxsz + \
422 encode_putfh_maxsz + \ 526 encode_putfh_maxsz + \
@@ -425,6 +529,7 @@ static int nfs4_stat_to_errno(int);
425 encode_restorefh_maxsz + \ 529 encode_restorefh_maxsz + \
426 encode_getattr_maxsz) 530 encode_getattr_maxsz)
427#define NFS4_dec_rename_sz (compound_decode_hdr_maxsz + \ 531#define NFS4_dec_rename_sz (compound_decode_hdr_maxsz + \
532 decode_sequence_maxsz + \
428 decode_putfh_maxsz + \ 533 decode_putfh_maxsz + \
429 decode_savefh_maxsz + \ 534 decode_savefh_maxsz + \
430 decode_putfh_maxsz + \ 535 decode_putfh_maxsz + \
@@ -433,6 +538,7 @@ static int nfs4_stat_to_errno(int);
433 decode_restorefh_maxsz + \ 538 decode_restorefh_maxsz + \
434 decode_getattr_maxsz) 539 decode_getattr_maxsz)
435#define NFS4_enc_link_sz (compound_encode_hdr_maxsz + \ 540#define NFS4_enc_link_sz (compound_encode_hdr_maxsz + \
541 encode_sequence_maxsz + \
436 encode_putfh_maxsz + \ 542 encode_putfh_maxsz + \
437 encode_savefh_maxsz + \ 543 encode_savefh_maxsz + \
438 encode_putfh_maxsz + \ 544 encode_putfh_maxsz + \
@@ -441,6 +547,7 @@ static int nfs4_stat_to_errno(int);
441 encode_restorefh_maxsz + \ 547 encode_restorefh_maxsz + \
442 decode_getattr_maxsz) 548 decode_getattr_maxsz)
443#define NFS4_dec_link_sz (compound_decode_hdr_maxsz + \ 549#define NFS4_dec_link_sz (compound_decode_hdr_maxsz + \
550 decode_sequence_maxsz + \
444 decode_putfh_maxsz + \ 551 decode_putfh_maxsz + \
445 decode_savefh_maxsz + \ 552 decode_savefh_maxsz + \
446 decode_putfh_maxsz + \ 553 decode_putfh_maxsz + \
@@ -449,16 +556,19 @@ static int nfs4_stat_to_errno(int);
449 decode_restorefh_maxsz + \ 556 decode_restorefh_maxsz + \
450 decode_getattr_maxsz) 557 decode_getattr_maxsz)
451#define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \ 558#define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \
559 encode_sequence_maxsz + \
452 encode_putfh_maxsz + \ 560 encode_putfh_maxsz + \
453 encode_symlink_maxsz + \ 561 encode_symlink_maxsz + \
454 encode_getattr_maxsz + \ 562 encode_getattr_maxsz + \
455 encode_getfh_maxsz) 563 encode_getfh_maxsz)
456#define NFS4_dec_symlink_sz (compound_decode_hdr_maxsz + \ 564#define NFS4_dec_symlink_sz (compound_decode_hdr_maxsz + \
565 decode_sequence_maxsz + \
457 decode_putfh_maxsz + \ 566 decode_putfh_maxsz + \
458 decode_symlink_maxsz + \ 567 decode_symlink_maxsz + \
459 decode_getattr_maxsz + \ 568 decode_getattr_maxsz + \
460 decode_getfh_maxsz) 569 decode_getfh_maxsz)
461#define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \ 570#define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \
571 encode_sequence_maxsz + \
462 encode_putfh_maxsz + \ 572 encode_putfh_maxsz + \
463 encode_savefh_maxsz + \ 573 encode_savefh_maxsz + \
464 encode_create_maxsz + \ 574 encode_create_maxsz + \
@@ -467,6 +577,7 @@ static int nfs4_stat_to_errno(int);
467 encode_restorefh_maxsz + \ 577 encode_restorefh_maxsz + \
468 encode_getattr_maxsz) 578 encode_getattr_maxsz)
469#define NFS4_dec_create_sz (compound_decode_hdr_maxsz + \ 579#define NFS4_dec_create_sz (compound_decode_hdr_maxsz + \
580 decode_sequence_maxsz + \
470 decode_putfh_maxsz + \ 581 decode_putfh_maxsz + \
471 decode_savefh_maxsz + \ 582 decode_savefh_maxsz + \
472 decode_create_maxsz + \ 583 decode_create_maxsz + \
@@ -475,52 +586,98 @@ static int nfs4_stat_to_errno(int);
475 decode_restorefh_maxsz + \ 586 decode_restorefh_maxsz + \
476 decode_getattr_maxsz) 587 decode_getattr_maxsz)
477#define NFS4_enc_pathconf_sz (compound_encode_hdr_maxsz + \ 588#define NFS4_enc_pathconf_sz (compound_encode_hdr_maxsz + \
589 encode_sequence_maxsz + \
478 encode_putfh_maxsz + \ 590 encode_putfh_maxsz + \
479 encode_getattr_maxsz) 591 encode_getattr_maxsz)
480#define NFS4_dec_pathconf_sz (compound_decode_hdr_maxsz + \ 592#define NFS4_dec_pathconf_sz (compound_decode_hdr_maxsz + \
593 decode_sequence_maxsz + \
481 decode_putfh_maxsz + \ 594 decode_putfh_maxsz + \
482 decode_getattr_maxsz) 595 decode_getattr_maxsz)
483#define NFS4_enc_statfs_sz (compound_encode_hdr_maxsz + \ 596#define NFS4_enc_statfs_sz (compound_encode_hdr_maxsz + \
597 encode_sequence_maxsz + \
484 encode_putfh_maxsz + \ 598 encode_putfh_maxsz + \
485 encode_statfs_maxsz) 599 encode_statfs_maxsz)
486#define NFS4_dec_statfs_sz (compound_decode_hdr_maxsz + \ 600#define NFS4_dec_statfs_sz (compound_decode_hdr_maxsz + \
601 decode_sequence_maxsz + \
487 decode_putfh_maxsz + \ 602 decode_putfh_maxsz + \
488 decode_statfs_maxsz) 603 decode_statfs_maxsz)
489#define NFS4_enc_server_caps_sz (compound_encode_hdr_maxsz + \ 604#define NFS4_enc_server_caps_sz (compound_encode_hdr_maxsz + \
605 encode_sequence_maxsz + \
490 encode_putfh_maxsz + \ 606 encode_putfh_maxsz + \
491 encode_getattr_maxsz) 607 encode_getattr_maxsz)
492#define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \ 608#define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \
609 decode_sequence_maxsz + \
493 decode_putfh_maxsz + \ 610 decode_putfh_maxsz + \
494 decode_getattr_maxsz) 611 decode_getattr_maxsz)
495#define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \ 612#define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \
613 encode_sequence_maxsz + \
496 encode_putfh_maxsz + \ 614 encode_putfh_maxsz + \
497 encode_delegreturn_maxsz + \ 615 encode_delegreturn_maxsz + \
498 encode_getattr_maxsz) 616 encode_getattr_maxsz)
499#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ 617#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \
618 decode_sequence_maxsz + \
500 decode_delegreturn_maxsz + \ 619 decode_delegreturn_maxsz + \
501 decode_getattr_maxsz) 620 decode_getattr_maxsz)
502#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ 621#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \
622 encode_sequence_maxsz + \
503 encode_putfh_maxsz + \ 623 encode_putfh_maxsz + \
504 encode_getacl_maxsz) 624 encode_getacl_maxsz)
505#define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \ 625#define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \
626 decode_sequence_maxsz + \
506 decode_putfh_maxsz + \ 627 decode_putfh_maxsz + \
507 decode_getacl_maxsz) 628 decode_getacl_maxsz)
508#define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \ 629#define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \
630 encode_sequence_maxsz + \
509 encode_putfh_maxsz + \ 631 encode_putfh_maxsz + \
510 encode_setacl_maxsz) 632 encode_setacl_maxsz)
511#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \ 633#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \
634 decode_sequence_maxsz + \
512 decode_putfh_maxsz + \ 635 decode_putfh_maxsz + \
513 decode_setacl_maxsz) 636 decode_setacl_maxsz)
514#define NFS4_enc_fs_locations_sz \ 637#define NFS4_enc_fs_locations_sz \
515 (compound_encode_hdr_maxsz + \ 638 (compound_encode_hdr_maxsz + \
639 encode_sequence_maxsz + \
516 encode_putfh_maxsz + \ 640 encode_putfh_maxsz + \
517 encode_lookup_maxsz + \ 641 encode_lookup_maxsz + \
518 encode_fs_locations_maxsz) 642 encode_fs_locations_maxsz)
519#define NFS4_dec_fs_locations_sz \ 643#define NFS4_dec_fs_locations_sz \
520 (compound_decode_hdr_maxsz + \ 644 (compound_decode_hdr_maxsz + \
645 decode_sequence_maxsz + \
521 decode_putfh_maxsz + \ 646 decode_putfh_maxsz + \
522 decode_lookup_maxsz + \ 647 decode_lookup_maxsz + \
523 decode_fs_locations_maxsz) 648 decode_fs_locations_maxsz)
649#if defined(CONFIG_NFS_V4_1)
650#define NFS4_enc_exchange_id_sz \
651 (compound_encode_hdr_maxsz + \
652 encode_exchange_id_maxsz)
653#define NFS4_dec_exchange_id_sz \
654 (compound_decode_hdr_maxsz + \
655 decode_exchange_id_maxsz)
656#define NFS4_enc_create_session_sz \
657 (compound_encode_hdr_maxsz + \
658 encode_create_session_maxsz)
659#define NFS4_dec_create_session_sz \
660 (compound_decode_hdr_maxsz + \
661 decode_create_session_maxsz)
662#define NFS4_enc_destroy_session_sz (compound_encode_hdr_maxsz + \
663 encode_destroy_session_maxsz)
664#define NFS4_dec_destroy_session_sz (compound_decode_hdr_maxsz + \
665 decode_destroy_session_maxsz)
666#define NFS4_enc_sequence_sz \
667 (compound_encode_hdr_maxsz + \
668 encode_sequence_maxsz)
669#define NFS4_dec_sequence_sz \
670 (compound_decode_hdr_maxsz + \
671 decode_sequence_maxsz)
672#define NFS4_enc_get_lease_time_sz (compound_encode_hdr_maxsz + \
673 encode_sequence_maxsz + \
674 encode_putrootfh_maxsz + \
675 encode_fsinfo_maxsz)
676#define NFS4_dec_get_lease_time_sz (compound_decode_hdr_maxsz + \
677 decode_sequence_maxsz + \
678 decode_putrootfh_maxsz + \
679 decode_fsinfo_maxsz)
680#endif /* CONFIG_NFS_V4_1 */
524 681
525static const umode_t nfs_type2fmt[] = { 682static const umode_t nfs_type2fmt[] = {
526 [NF4BAD] = 0, 683 [NF4BAD] = 0,
@@ -541,6 +698,8 @@ struct compound_hdr {
541 __be32 * nops_p; 698 __be32 * nops_p;
542 uint32_t taglen; 699 uint32_t taglen;
543 char * tag; 700 char * tag;
701 uint32_t replen; /* expected reply words */
702 u32 minorversion; /* 0 for v4.0; set from the session for v4.1 */
544}; 703};
545 704
546/* 705/*
@@ -576,22 +735,31 @@ static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *
576 xdr_encode_opaque(p, str, len); 735 xdr_encode_opaque(p, str, len);
577} 736}
578 737
579static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) 738static void encode_compound_hdr(struct xdr_stream *xdr,
739 struct rpc_rqst *req,
740 struct compound_hdr *hdr)
580{ 741{
581 __be32 *p; 742 __be32 *p;
743 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
744
745 /* initialize running count of expected words in the reply.
746 * NOTE: the replied tag SHOULD be the same as the one sent,
747 * but the server is not required (no MUST) to do so. */
748 hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen;
582 749
583 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); 750 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
584 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); 751 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
585 RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2)); 752 RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2));
586 WRITE32(hdr->taglen); 753 WRITE32(hdr->taglen);
587 WRITEMEM(hdr->tag, hdr->taglen); 754 WRITEMEM(hdr->tag, hdr->taglen);
588 WRITE32(NFS4_MINOR_VERSION); 755 WRITE32(hdr->minorversion);
589 hdr->nops_p = p; 756 hdr->nops_p = p;
590 WRITE32(hdr->nops); 757 WRITE32(hdr->nops);
591} 758}
592 759
593static void encode_nops(struct compound_hdr *hdr) 760static void encode_nops(struct compound_hdr *hdr)
594{ 761{
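	/* Backpatch the final op count into the slot reserved by encode_compound_hdr(). */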
762 BUG_ON(hdr->nops > NFS4_MAX_OPS);
595 *hdr->nops_p = htonl(hdr->nops); 763 *hdr->nops_p = htonl(hdr->nops);
596} 764}
597 765
@@ -736,6 +904,7 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
736 WRITE32(OP_ACCESS); 904 WRITE32(OP_ACCESS);
737 WRITE32(access); 905 WRITE32(access);
738 hdr->nops++; 906 hdr->nops++;
907 hdr->replen += decode_access_maxsz;
739} 908}
740 909
741static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) 910static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
@@ -747,6 +916,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
747 WRITE32(arg->seqid->sequence->counter); 916 WRITE32(arg->seqid->sequence->counter);
748 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 917 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
749 hdr->nops++; 918 hdr->nops++;
919 hdr->replen += decode_close_maxsz;
750} 920}
751 921
752static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) 922static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
@@ -758,6 +928,7 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
758 WRITE64(args->offset); 928 WRITE64(args->offset);
759 WRITE32(args->count); 929 WRITE32(args->count);
760 hdr->nops++; 930 hdr->nops++;
931 hdr->replen += decode_commit_maxsz;
761} 932}
762 933
763static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr) 934static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)
@@ -789,6 +960,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
789 WRITE32(create->name->len); 960 WRITE32(create->name->len);
790 WRITEMEM(create->name->name, create->name->len); 961 WRITEMEM(create->name->name, create->name->len);
791 hdr->nops++; 962 hdr->nops++;
963 hdr->replen += decode_create_maxsz;
792 964
793 encode_attrs(xdr, create->attrs, create->server); 965 encode_attrs(xdr, create->attrs, create->server);
794} 966}
@@ -802,6 +974,7 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
802 WRITE32(1); 974 WRITE32(1);
803 WRITE32(bitmap); 975 WRITE32(bitmap);
804 hdr->nops++; 976 hdr->nops++;
977 hdr->replen += decode_getattr_maxsz;
805} 978}
806 979
807static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr) 980static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr)
@@ -814,6 +987,7 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
814 WRITE32(bm0); 987 WRITE32(bm0);
815 WRITE32(bm1); 988 WRITE32(bm1);
816 hdr->nops++; 989 hdr->nops++;
990 hdr->replen += decode_getattr_maxsz;
817} 991}
818 992
819static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 993static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
@@ -841,6 +1015,7 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
841 RESERVE_SPACE(4); 1015 RESERVE_SPACE(4);
842 WRITE32(OP_GETFH); 1016 WRITE32(OP_GETFH);
843 hdr->nops++; 1017 hdr->nops++;
1018 hdr->replen += decode_getfh_maxsz;
844} 1019}
845 1020
846static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1021static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
@@ -852,6 +1027,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
852 WRITE32(name->len); 1027 WRITE32(name->len);
853 WRITEMEM(name->name, name->len); 1028 WRITEMEM(name->name, name->len);
854 hdr->nops++; 1029 hdr->nops++;
1030 hdr->replen += decode_link_maxsz;
855} 1031}
856 1032
857static inline int nfs4_lock_type(struct file_lock *fl, int block) 1033static inline int nfs4_lock_type(struct file_lock *fl, int block)
@@ -899,6 +1075,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
899 WRITE32(args->lock_seqid->sequence->counter); 1075 WRITE32(args->lock_seqid->sequence->counter);
900 } 1076 }
901 hdr->nops++; 1077 hdr->nops++;
1078 hdr->replen += decode_lock_maxsz;
902} 1079}
903 1080
904static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr) 1081static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr)
@@ -915,6 +1092,7 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
915 WRITEMEM("lock id:", 8); 1092 WRITEMEM("lock id:", 8);
916 WRITE64(args->lock_owner.id); 1093 WRITE64(args->lock_owner.id);
917 hdr->nops++; 1094 hdr->nops++;
1095 hdr->replen += decode_lockt_maxsz;
918} 1096}
919 1097
920static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr) 1098static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)
@@ -929,6 +1107,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
929 WRITE64(args->fl->fl_start); 1107 WRITE64(args->fl->fl_start);
930 WRITE64(nfs4_lock_length(args->fl)); 1108 WRITE64(nfs4_lock_length(args->fl));
931 hdr->nops++; 1109 hdr->nops++;
1110 hdr->replen += decode_locku_maxsz;
932} 1111}
933 1112
934static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1113static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
@@ -941,6 +1120,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
941 WRITE32(len); 1120 WRITE32(len);
942 WRITEMEM(name->name, len); 1121 WRITEMEM(name->name, len);
943 hdr->nops++; 1122 hdr->nops++;
1123 hdr->replen += decode_lookup_maxsz;
944} 1124}
945 1125
946static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) 1126static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
@@ -1080,6 +1260,7 @@ static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg,
1080 BUG(); 1260 BUG();
1081 } 1261 }
1082 hdr->nops++; 1262 hdr->nops++;
1263 hdr->replen += decode_open_maxsz;
1083} 1264}
1084 1265
1085static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr) 1266static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)
@@ -1091,6 +1272,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
1091 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1272 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
1092 WRITE32(arg->seqid->sequence->counter); 1273 WRITE32(arg->seqid->sequence->counter);
1093 hdr->nops++; 1274 hdr->nops++;
1275 hdr->replen += decode_open_confirm_maxsz;
1094} 1276}
1095 1277
1096static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) 1278static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
@@ -1103,6 +1285,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
1103 WRITE32(arg->seqid->sequence->counter); 1285 WRITE32(arg->seqid->sequence->counter);
1104 encode_share_access(xdr, arg->fmode); 1286 encode_share_access(xdr, arg->fmode);
1105 hdr->nops++; 1287 hdr->nops++;
1288 hdr->replen += decode_open_downgrade_maxsz;
1106} 1289}
1107 1290
1108static void 1291static void
@@ -1116,6 +1299,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
1116 WRITE32(len); 1299 WRITE32(len);
1117 WRITEMEM(fh->data, len); 1300 WRITEMEM(fh->data, len);
1118 hdr->nops++; 1301 hdr->nops++;
1302 hdr->replen += decode_putfh_maxsz;
1119} 1303}
1120 1304
1121static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) 1305static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
@@ -1125,6 +1309,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1125 RESERVE_SPACE(4); 1309 RESERVE_SPACE(4);
1126 WRITE32(OP_PUTROOTFH); 1310 WRITE32(OP_PUTROOTFH);
1127 hdr->nops++; 1311 hdr->nops++;
1312 hdr->replen += decode_putrootfh_maxsz;
1128} 1313}
1129 1314
1130static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) 1315static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
@@ -1153,6 +1338,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1153 WRITE64(args->offset); 1338 WRITE64(args->offset);
1154 WRITE32(args->count); 1339 WRITE32(args->count);
1155 hdr->nops++; 1340 hdr->nops++;
1341 hdr->replen += decode_read_maxsz;
1156} 1342}
1157 1343
1158static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) 1344static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
@@ -1178,6 +1364,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1178 WRITE32(attrs[0] & readdir->bitmask[0]); 1364 WRITE32(attrs[0] & readdir->bitmask[0]);
1179 WRITE32(attrs[1] & readdir->bitmask[1]); 1365 WRITE32(attrs[1] & readdir->bitmask[1]);
1180 hdr->nops++; 1366 hdr->nops++;
1367 hdr->replen += decode_readdir_maxsz;
1181 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", 1368 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
1182 __func__, 1369 __func__,
1183 (unsigned long long)readdir->cookie, 1370 (unsigned long long)readdir->cookie,
@@ -1194,6 +1381,7 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
1194 RESERVE_SPACE(4); 1381 RESERVE_SPACE(4);
1195 WRITE32(OP_READLINK); 1382 WRITE32(OP_READLINK);
1196 hdr->nops++; 1383 hdr->nops++;
1384 hdr->replen += decode_readlink_maxsz;
1197} 1385}
1198 1386
1199static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1387static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
@@ -1205,6 +1393,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
1205 WRITE32(name->len); 1393 WRITE32(name->len);
1206 WRITEMEM(name->name, name->len); 1394 WRITEMEM(name->name, name->len);
1207 hdr->nops++; 1395 hdr->nops++;
1396 hdr->replen += decode_remove_maxsz;
1208} 1397}
1209 1398
1210static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr) 1399static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)
@@ -1220,6 +1409,7 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
1220 WRITE32(newname->len); 1409 WRITE32(newname->len);
1221 WRITEMEM(newname->name, newname->len); 1410 WRITEMEM(newname->name, newname->len);
1222 hdr->nops++; 1411 hdr->nops++;
1412 hdr->replen += decode_rename_maxsz;
1223} 1413}
1224 1414
1225static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr) 1415static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr)
@@ -1230,6 +1420,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
1230 WRITE32(OP_RENEW); 1420 WRITE32(OP_RENEW);
1231 WRITE64(client_stateid->cl_clientid); 1421 WRITE64(client_stateid->cl_clientid);
1232 hdr->nops++; 1422 hdr->nops++;
1423 hdr->replen += decode_renew_maxsz;
1233} 1424}
1234 1425
1235static void 1426static void
@@ -1240,6 +1431,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1240 RESERVE_SPACE(4); 1431 RESERVE_SPACE(4);
1241 WRITE32(OP_RESTOREFH); 1432 WRITE32(OP_RESTOREFH);
1242 hdr->nops++; 1433 hdr->nops++;
1434 hdr->replen += decode_restorefh_maxsz;
1243} 1435}
1244 1436
1245static int 1437static int
@@ -1259,6 +1451,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1259 WRITE32(arg->acl_len); 1451 WRITE32(arg->acl_len);
1260 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1452 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1261 hdr->nops++; 1453 hdr->nops++;
1454 hdr->replen += decode_setacl_maxsz;
1262 return 0; 1455 return 0;
1263} 1456}
1264 1457
@@ -1270,6 +1463,7 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1270 RESERVE_SPACE(4); 1463 RESERVE_SPACE(4);
1271 WRITE32(OP_SAVEFH); 1464 WRITE32(OP_SAVEFH);
1272 hdr->nops++; 1465 hdr->nops++;
1466 hdr->replen += decode_savefh_maxsz;
1273} 1467}
1274 1468
1275static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr) 1469static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)
@@ -1280,6 +1474,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
1280 WRITE32(OP_SETATTR); 1474 WRITE32(OP_SETATTR);
1281 WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); 1475 WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
1282 hdr->nops++; 1476 hdr->nops++;
1477 hdr->replen += decode_setattr_maxsz;
1283 encode_attrs(xdr, arg->iap, server); 1478 encode_attrs(xdr, arg->iap, server);
1284} 1479}
1285 1480
@@ -1299,6 +1494,7 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
1299 RESERVE_SPACE(4); 1494 RESERVE_SPACE(4);
1300 WRITE32(setclientid->sc_cb_ident); 1495 WRITE32(setclientid->sc_cb_ident);
1301 hdr->nops++; 1496 hdr->nops++;
1497 hdr->replen += decode_setclientid_maxsz;
1302} 1498}
1303 1499
1304static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr) 1500static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr)
@@ -1310,6 +1506,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
1310 WRITE64(client_state->cl_clientid); 1506 WRITE64(client_state->cl_clientid);
1311 WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); 1507 WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
1312 hdr->nops++; 1508 hdr->nops++;
1509 hdr->replen += decode_setclientid_confirm_maxsz;
1313} 1510}
1314 1511
1315static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) 1512static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
@@ -1328,6 +1525,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1328 1525
1329 xdr_write_pages(xdr, args->pages, args->pgbase, args->count); 1526 xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
1330 hdr->nops++; 1527 hdr->nops++;
1528 hdr->replen += decode_write_maxsz;
1331} 1529}
1332 1530
1333static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr) 1531static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)
@@ -1339,11 +1537,163 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
1339 WRITE32(OP_DELEGRETURN); 1537 WRITE32(OP_DELEGRETURN);
1340 WRITEMEM(stateid->data, NFS4_STATEID_SIZE); 1538 WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
1341 hdr->nops++; 1539 hdr->nops++;
1540 hdr->replen += decode_delegreturn_maxsz;
1541}
1542
1543#if defined(CONFIG_NFS_V4_1)
1544/* NFSv4.1 operations */
1545static void encode_exchange_id(struct xdr_stream *xdr,
1546 struct nfs41_exchange_id_args *args,
1547 struct compound_hdr *hdr)
1548{
1549 __be32 *p;
1550
1551 RESERVE_SPACE(4 + sizeof(args->verifier->data));
1552 WRITE32(OP_EXCHANGE_ID);
1553 WRITEMEM(args->verifier->data, sizeof(args->verifier->data));
1554
1555 encode_string(xdr, args->id_len, args->id);
1556
1557 RESERVE_SPACE(12);
1558 WRITE32(args->flags);
1559 WRITE32(0); /* zero length state_protect4_a */
1560 WRITE32(0); /* zero length implementation id array */
1561 hdr->nops++;
1562 hdr->replen += decode_exchange_id_maxsz;
1563}
1564
1565static void encode_create_session(struct xdr_stream *xdr,
1566 struct nfs41_create_session_args *args,
1567 struct compound_hdr *hdr)
1568{
1569 __be32 *p;
1570 char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
1571 uint32_t len;
1572 struct nfs_client *clp = args->client;
1573
1574 RESERVE_SPACE(4);
1575 WRITE32(OP_CREATE_SESSION);
1576
1577 RESERVE_SPACE(8);
1578 WRITE64(clp->cl_ex_clid);
1579
1580 RESERVE_SPACE(8);
1581 WRITE32(clp->cl_seqid); /* sequence id */
1582 WRITE32(args->flags); /* flags */
1583
1584 RESERVE_SPACE(2*28); /* 2 channel_attrs */
1585 /* Fore Channel */
1586 WRITE32(args->fc_attrs.headerpadsz); /* header padding size */
1587 WRITE32(args->fc_attrs.max_rqst_sz); /* max req size */
1588 WRITE32(args->fc_attrs.max_resp_sz); /* max resp size */
1589 WRITE32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */
1590 WRITE32(args->fc_attrs.max_ops); /* max operations */
1591 WRITE32(args->fc_attrs.max_reqs); /* max requests */
1592 WRITE32(0); /* rdmachannel_attrs */
1593
1594 /* Back Channel */
1595 WRITE32(args->bc_attrs.headerpadsz); /* header padding size */
1596 WRITE32(args->bc_attrs.max_rqst_sz); /* max req size */
1597 WRITE32(args->bc_attrs.max_resp_sz); /* max resp size */
1598 WRITE32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */
1599 WRITE32(args->bc_attrs.max_ops); /* max operations */
1600 WRITE32(args->bc_attrs.max_reqs); /* max requests */
1601 WRITE32(0); /* rdmachannel_attrs */
1602
1603 RESERVE_SPACE(4);
1604 WRITE32(args->cb_program); /* cb_program */
1605
1606 RESERVE_SPACE(4); /* # of security flavors */
1607 WRITE32(1);
1608
1609 RESERVE_SPACE(4);
1610 WRITE32(RPC_AUTH_UNIX); /* auth_sys */
1611
1612 /* authsys_parms rfc1831 */
1613 RESERVE_SPACE(4);
1614 WRITE32((u32)clp->cl_boot_time.tv_nsec); /* stamp */
1615 len = scnprintf(machine_name, sizeof(machine_name), "%s",
1616 clp->cl_ipaddr);
1617 RESERVE_SPACE(16 + len);
1618 WRITE32(len);
1619 WRITEMEM(machine_name, len);
1620 WRITE32(0); /* UID */
1621 WRITE32(0); /* GID */
1622 WRITE32(0); /* No more gids */
1623 hdr->nops++;
1624 hdr->replen += decode_create_session_maxsz;
1625}
1626
1627static void encode_destroy_session(struct xdr_stream *xdr,
1628 struct nfs4_session *session,
1629 struct compound_hdr *hdr)
1630{
1631 __be32 *p;
1632 RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN);
1633 WRITE32(OP_DESTROY_SESSION);
1634 WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1635 hdr->nops++;
1636 hdr->replen += decode_destroy_session_maxsz;
1342} 1637}
1638#endif /* CONFIG_NFS_V4_1 */
1639
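/* Compiled for both minor versions; without CONFIG_NFS_V4_1 the body below reduces to a no-op. */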
1640static void encode_sequence(struct xdr_stream *xdr,
1641 const struct nfs4_sequence_args *args,
1642 struct compound_hdr *hdr)
1643{
1644#if defined(CONFIG_NFS_V4_1)
1645 struct nfs4_session *session = args->sa_session;
1646 struct nfs4_slot_table *tp;
1647 struct nfs4_slot *slot;
1648 __be32 *p;
1649
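	/* A NULL session means a v4.0 compound: no SEQUENCE op to emit. */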
1650 if (!session)
1651 return;
1652
1653 tp = &session->fc_slot_table;
1654
1655 WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
1656 slot = tp->slots + args->sa_slotid;
1657
1658 RESERVE_SPACE(4);
1659 WRITE32(OP_SEQUENCE);
1660
1661 /*
1662 * Sessionid + seqid + slotid + max slotid + cache_this
1663 */
1664 dprintk("%s: sessionid=%u:%u:%u:%u seqid=%d slotid=%d "
1665 "max_slotid=%d cache_this=%d\n",
1666 __func__,
1667 ((u32 *)session->sess_id.data)[0],
1668 ((u32 *)session->sess_id.data)[1],
1669 ((u32 *)session->sess_id.data)[2],
1670 ((u32 *)session->sess_id.data)[3],
1671 slot->seq_nr, args->sa_slotid,
1672 tp->highest_used_slotid, args->sa_cache_this);
1673 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16);
1674 WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1675 WRITE32(slot->seq_nr);
1676 WRITE32(args->sa_slotid);
1677 WRITE32(tp->highest_used_slotid);
1678 WRITE32(args->sa_cache_this);
1679 hdr->nops++;
1680 hdr->replen += decode_sequence_maxsz;
1681#endif /* CONFIG_NFS_V4_1 */
1682}
1683
1343/* 1684/*
1344 * END OF "GENERIC" ENCODE ROUTINES. 1685 * END OF "GENERIC" ENCODE ROUTINES.
1345 */ 1686 */
1346 1687
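/* Derive the compound's minorversion from the session when one is in use; plain v4.0 requests get 0. */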
1688static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
1689{
1690#if defined(CONFIG_NFS_V4_1)
1691 if (args->sa_session)
1692 return args->sa_session->clp->cl_minorversion;
1693#endif /* CONFIG_NFS_V4_1 */
1694 return 0;
1695}
1696
1347/* 1697/*
1348 * Encode an ACCESS request 1698 * Encode an ACCESS request
1349 */ 1699 */
@@ -1351,11 +1701,12 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs
1351{ 1701{
1352 struct xdr_stream xdr; 1702 struct xdr_stream xdr;
1353 struct compound_hdr hdr = { 1703 struct compound_hdr hdr = {
1354 .nops = 0, 1704 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1355 }; 1705 };
1356 1706
1357 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1707 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1358 encode_compound_hdr(&xdr, &hdr); 1708 encode_compound_hdr(&xdr, req, &hdr);
1709 encode_sequence(&xdr, &args->seq_args, &hdr);
1359 encode_putfh(&xdr, args->fh, &hdr); 1710 encode_putfh(&xdr, args->fh, &hdr);
1360 encode_access(&xdr, args->access, &hdr); 1711 encode_access(&xdr, args->access, &hdr);
1361 encode_getfattr(&xdr, args->bitmask, &hdr); 1712 encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1370,11 +1721,12 @@ static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs
1370{ 1721{
1371 struct xdr_stream xdr; 1722 struct xdr_stream xdr;
1372 struct compound_hdr hdr = { 1723 struct compound_hdr hdr = {
1373 .nops = 0, 1724 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1374 }; 1725 };
1375 1726
1376 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1727 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1377 encode_compound_hdr(&xdr, &hdr); 1728 encode_compound_hdr(&xdr, req, &hdr);
1729 encode_sequence(&xdr, &args->seq_args, &hdr);
1378 encode_putfh(&xdr, args->dir_fh, &hdr); 1730 encode_putfh(&xdr, args->dir_fh, &hdr);
1379 encode_lookup(&xdr, args->name, &hdr); 1731 encode_lookup(&xdr, args->name, &hdr);
1380 encode_getfh(&xdr, &hdr); 1732 encode_getfh(&xdr, &hdr);
@@ -1390,11 +1742,12 @@ static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struc
1390{ 1742{
1391 struct xdr_stream xdr; 1743 struct xdr_stream xdr;
1392 struct compound_hdr hdr = { 1744 struct compound_hdr hdr = {
1393 .nops = 0, 1745 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1394 }; 1746 };
1395 1747
1396 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1748 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1397 encode_compound_hdr(&xdr, &hdr); 1749 encode_compound_hdr(&xdr, req, &hdr);
1750 encode_sequence(&xdr, &args->seq_args, &hdr);
1398 encode_putrootfh(&xdr, &hdr); 1751 encode_putrootfh(&xdr, &hdr);
1399 encode_getfh(&xdr, &hdr); 1752 encode_getfh(&xdr, &hdr);
1400 encode_getfattr(&xdr, args->bitmask, &hdr); 1753 encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1409,11 +1762,12 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
1409{ 1762{
1410 struct xdr_stream xdr; 1763 struct xdr_stream xdr;
1411 struct compound_hdr hdr = { 1764 struct compound_hdr hdr = {
1412 .nops = 0, 1765 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1413 }; 1766 };
1414 1767
1415 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1768 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1416 encode_compound_hdr(&xdr, &hdr); 1769 encode_compound_hdr(&xdr, req, &hdr);
1770 encode_sequence(&xdr, &args->seq_args, &hdr);
1417 encode_putfh(&xdr, args->fh, &hdr); 1771 encode_putfh(&xdr, args->fh, &hdr);
1418 encode_remove(&xdr, &args->name, &hdr); 1772 encode_remove(&xdr, &args->name, &hdr);
1419 encode_getfattr(&xdr, args->bitmask, &hdr); 1773 encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1428,11 +1782,12 @@ static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs
1428{ 1782{
1429 struct xdr_stream xdr; 1783 struct xdr_stream xdr;
1430 struct compound_hdr hdr = { 1784 struct compound_hdr hdr = {
1431 .nops = 0, 1785 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1432 }; 1786 };
1433 1787
1434 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1788 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1435 encode_compound_hdr(&xdr, &hdr); 1789 encode_compound_hdr(&xdr, req, &hdr);
1790 encode_sequence(&xdr, &args->seq_args, &hdr);
1436 encode_putfh(&xdr, args->old_dir, &hdr); 1791 encode_putfh(&xdr, args->old_dir, &hdr);
1437 encode_savefh(&xdr, &hdr); 1792 encode_savefh(&xdr, &hdr);
1438 encode_putfh(&xdr, args->new_dir, &hdr); 1793 encode_putfh(&xdr, args->new_dir, &hdr);
@@ -1451,11 +1806,12 @@ static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_
1451{ 1806{
1452 struct xdr_stream xdr; 1807 struct xdr_stream xdr;
1453 struct compound_hdr hdr = { 1808 struct compound_hdr hdr = {
1454 .nops = 0, 1809 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1455 }; 1810 };
1456 1811
1457 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1812 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1458 encode_compound_hdr(&xdr, &hdr); 1813 encode_compound_hdr(&xdr, req, &hdr);
1814 encode_sequence(&xdr, &args->seq_args, &hdr);
1459 encode_putfh(&xdr, args->fh, &hdr); 1815 encode_putfh(&xdr, args->fh, &hdr);
1460 encode_savefh(&xdr, &hdr); 1816 encode_savefh(&xdr, &hdr);
1461 encode_putfh(&xdr, args->dir_fh, &hdr); 1817 encode_putfh(&xdr, args->dir_fh, &hdr);
@@ -1474,11 +1830,12 @@ static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs
1474{ 1830{
1475 struct xdr_stream xdr; 1831 struct xdr_stream xdr;
1476 struct compound_hdr hdr = { 1832 struct compound_hdr hdr = {
1477 .nops = 0, 1833 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1478 }; 1834 };
1479 1835
1480 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1836 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1481 encode_compound_hdr(&xdr, &hdr); 1837 encode_compound_hdr(&xdr, req, &hdr);
1838 encode_sequence(&xdr, &args->seq_args, &hdr);
1482 encode_putfh(&xdr, args->dir_fh, &hdr); 1839 encode_putfh(&xdr, args->dir_fh, &hdr);
1483 encode_savefh(&xdr, &hdr); 1840 encode_savefh(&xdr, &hdr);
1484 encode_create(&xdr, args, &hdr); 1841 encode_create(&xdr, args, &hdr);
@@ -1505,11 +1862,12 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nf
1505{ 1862{
1506 struct xdr_stream xdr; 1863 struct xdr_stream xdr;
1507 struct compound_hdr hdr = { 1864 struct compound_hdr hdr = {
1508 .nops = 0, 1865 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1509 }; 1866 };
1510 1867
1511 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1868 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1512 encode_compound_hdr(&xdr, &hdr); 1869 encode_compound_hdr(&xdr, req, &hdr);
1870 encode_sequence(&xdr, &args->seq_args, &hdr);
1513 encode_putfh(&xdr, args->fh, &hdr); 1871 encode_putfh(&xdr, args->fh, &hdr);
1514 encode_getfattr(&xdr, args->bitmask, &hdr); 1872 encode_getfattr(&xdr, args->bitmask, &hdr);
1515 encode_nops(&hdr); 1873 encode_nops(&hdr);
@@ -1523,11 +1881,12 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closea
1523{ 1881{
1524 struct xdr_stream xdr; 1882 struct xdr_stream xdr;
1525 struct compound_hdr hdr = { 1883 struct compound_hdr hdr = {
1526 .nops = 0, 1884 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1527 }; 1885 };
1528 1886
1529 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1887 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1530 encode_compound_hdr(&xdr, &hdr); 1888 encode_compound_hdr(&xdr, req, &hdr);
1889 encode_sequence(&xdr, &args->seq_args, &hdr);
1531 encode_putfh(&xdr, args->fh, &hdr); 1890 encode_putfh(&xdr, args->fh, &hdr);
1532 encode_close(&xdr, args, &hdr); 1891 encode_close(&xdr, args, &hdr);
1533 encode_getfattr(&xdr, args->bitmask, &hdr); 1892 encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1542,11 +1901,12 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openarg
1542{ 1901{
1543 struct xdr_stream xdr; 1902 struct xdr_stream xdr;
1544 struct compound_hdr hdr = { 1903 struct compound_hdr hdr = {
1545 .nops = 0, 1904 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1546 }; 1905 };
1547 1906
1548 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1907 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1549 encode_compound_hdr(&xdr, &hdr); 1908 encode_compound_hdr(&xdr, req, &hdr);
1909 encode_sequence(&xdr, &args->seq_args, &hdr);
1550 encode_putfh(&xdr, args->fh, &hdr); 1910 encode_putfh(&xdr, args->fh, &hdr);
1551 encode_savefh(&xdr, &hdr); 1911 encode_savefh(&xdr, &hdr);
1552 encode_open(&xdr, args, &hdr); 1912 encode_open(&xdr, args, &hdr);
@@ -1569,7 +1929,7 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs
1569 }; 1929 };
1570 1930
1571 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1931 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1572 encode_compound_hdr(&xdr, &hdr); 1932 encode_compound_hdr(&xdr, req, &hdr);
1573 encode_putfh(&xdr, args->fh, &hdr); 1933 encode_putfh(&xdr, args->fh, &hdr);
1574 encode_open_confirm(&xdr, args, &hdr); 1934 encode_open_confirm(&xdr, args, &hdr);
1575 encode_nops(&hdr); 1935 encode_nops(&hdr);
@@ -1583,11 +1943,12 @@ static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_
1583{ 1943{
1584 struct xdr_stream xdr; 1944 struct xdr_stream xdr;
1585 struct compound_hdr hdr = { 1945 struct compound_hdr hdr = {
1586 .nops = 0, 1946 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1587 }; 1947 };
1588 1948
1589 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1949 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1590 encode_compound_hdr(&xdr, &hdr); 1950 encode_compound_hdr(&xdr, req, &hdr);
1951 encode_sequence(&xdr, &args->seq_args, &hdr);
1591 encode_putfh(&xdr, args->fh, &hdr); 1952 encode_putfh(&xdr, args->fh, &hdr);
1592 encode_open(&xdr, args, &hdr); 1953 encode_open(&xdr, args, &hdr);
1593 encode_getfattr(&xdr, args->bitmask, &hdr); 1954 encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1602,11 +1963,12 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct n
1602{ 1963{
1603 struct xdr_stream xdr; 1964 struct xdr_stream xdr;
1604 struct compound_hdr hdr = { 1965 struct compound_hdr hdr = {
1605 .nops = 0, 1966 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1606 }; 1967 };
1607 1968
1608 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1969 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1609 encode_compound_hdr(&xdr, &hdr); 1970 encode_compound_hdr(&xdr, req, &hdr);
1971 encode_sequence(&xdr, &args->seq_args, &hdr);
1610 encode_putfh(&xdr, args->fh, &hdr); 1972 encode_putfh(&xdr, args->fh, &hdr);
1611 encode_open_downgrade(&xdr, args, &hdr); 1973 encode_open_downgrade(&xdr, args, &hdr);
1612 encode_getfattr(&xdr, args->bitmask, &hdr); 1974 encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1621,11 +1983,12 @@ static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_ar
1621{ 1983{
1622 struct xdr_stream xdr; 1984 struct xdr_stream xdr;
1623 struct compound_hdr hdr = { 1985 struct compound_hdr hdr = {
1624 .nops = 0, 1986 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1625 }; 1987 };
1626 1988
1627 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1989 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1628 encode_compound_hdr(&xdr, &hdr); 1990 encode_compound_hdr(&xdr, req, &hdr);
1991 encode_sequence(&xdr, &args->seq_args, &hdr);
1629 encode_putfh(&xdr, args->fh, &hdr); 1992 encode_putfh(&xdr, args->fh, &hdr);
1630 encode_lock(&xdr, args, &hdr); 1993 encode_lock(&xdr, args, &hdr);
1631 encode_nops(&hdr); 1994 encode_nops(&hdr);
@@ -1639,11 +2002,12 @@ static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_
1639{ 2002{
1640 struct xdr_stream xdr; 2003 struct xdr_stream xdr;
1641 struct compound_hdr hdr = { 2004 struct compound_hdr hdr = {
1642 .nops = 0, 2005 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1643 }; 2006 };
1644 2007
1645 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2008 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1646 encode_compound_hdr(&xdr, &hdr); 2009 encode_compound_hdr(&xdr, req, &hdr);
2010 encode_sequence(&xdr, &args->seq_args, &hdr);
1647 encode_putfh(&xdr, args->fh, &hdr); 2011 encode_putfh(&xdr, args->fh, &hdr);
1648 encode_lockt(&xdr, args, &hdr); 2012 encode_lockt(&xdr, args, &hdr);
1649 encode_nops(&hdr); 2013 encode_nops(&hdr);
@@ -1657,11 +2021,12 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_
1657{ 2021{
1658 struct xdr_stream xdr; 2022 struct xdr_stream xdr;
1659 struct compound_hdr hdr = { 2023 struct compound_hdr hdr = {
1660 .nops = 0, 2024 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1661 }; 2025 };
1662 2026
1663 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2027 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1664 encode_compound_hdr(&xdr, &hdr); 2028 encode_compound_hdr(&xdr, req, &hdr);
2029 encode_sequence(&xdr, &args->seq_args, &hdr);
1665 encode_putfh(&xdr, args->fh, &hdr); 2030 encode_putfh(&xdr, args->fh, &hdr);
1666 encode_locku(&xdr, args, &hdr); 2031 encode_locku(&xdr, args, &hdr);
1667 encode_nops(&hdr); 2032 encode_nops(&hdr);
@@ -1675,22 +2040,16 @@ static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct n
1675{ 2040{
1676 struct xdr_stream xdr; 2041 struct xdr_stream xdr;
1677 struct compound_hdr hdr = { 2042 struct compound_hdr hdr = {
1678 .nops = 0, 2043 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1679 }; 2044 };
1680 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1681 unsigned int replen;
1682 2045
1683 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2046 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1684 encode_compound_hdr(&xdr, &hdr); 2047 encode_compound_hdr(&xdr, req, &hdr);
2048 encode_sequence(&xdr, &args->seq_args, &hdr);
1685 encode_putfh(&xdr, args->fh, &hdr); 2049 encode_putfh(&xdr, args->fh, &hdr);
1686 encode_readlink(&xdr, args, req, &hdr); 2050 encode_readlink(&xdr, args, req, &hdr);
1687 2051
1688 /* set up reply kvec 2052 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
1689 * toplevel_status + taglen + rescount + OP_PUTFH + status
1690 * + OP_READLINK + status + string length = 8
1691 */
1692 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readlink_sz) << 2;
1693 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages,
1694 args->pgbase, args->pglen); 2053 args->pgbase, args->pglen);
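	/* hdr.replen counts 32-bit words, so << 2 gives the byte offset at which the page data begins. */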
1695 encode_nops(&hdr); 2054 encode_nops(&hdr);
1696 return 0; 2055 return 0;
@@ -1703,25 +2062,19 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
1703{ 2062{
1704 struct xdr_stream xdr; 2063 struct xdr_stream xdr;
1705 struct compound_hdr hdr = { 2064 struct compound_hdr hdr = {
1706 .nops = 0, 2065 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1707 }; 2066 };
1708 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1709 int replen;
1710 2067
1711 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2068 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1712 encode_compound_hdr(&xdr, &hdr); 2069 encode_compound_hdr(&xdr, req, &hdr);
2070 encode_sequence(&xdr, &args->seq_args, &hdr);
1713 encode_putfh(&xdr, args->fh, &hdr); 2071 encode_putfh(&xdr, args->fh, &hdr);
1714 encode_readdir(&xdr, args, req, &hdr); 2072 encode_readdir(&xdr, args, req, &hdr);
1715 2073
1716 /* set up reply kvec 2074 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
1717 * toplevel_status + taglen + rescount + OP_PUTFH + status
1718 * + OP_READDIR + status + verifier(2) = 9
1719 */
1720 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readdir_sz) << 2;
1721 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages,
1722 args->pgbase, args->count); 2075 args->pgbase, args->count);
1723 dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", 2076 dprintk("%s: inlined page args = (%u, %p, %u, %u)\n",
1724 __func__, replen, args->pages, 2077 __func__, hdr.replen << 2, args->pages,
1725 args->pgbase, args->count); 2078 args->pgbase, args->count);
1726 encode_nops(&hdr); 2079 encode_nops(&hdr);
1727 return 0; 2080 return 0;
@@ -1732,24 +2085,18 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
1732 */ 2085 */
1733static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 2086static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
1734{ 2087{
1735 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1736 struct xdr_stream xdr; 2088 struct xdr_stream xdr;
1737 struct compound_hdr hdr = { 2089 struct compound_hdr hdr = {
1738 .nops = 0, 2090 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1739 }; 2091 };
1740 int replen;
1741 2092
1742 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2093 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1743 encode_compound_hdr(&xdr, &hdr); 2094 encode_compound_hdr(&xdr, req, &hdr);
2095 encode_sequence(&xdr, &args->seq_args, &hdr);
1744 encode_putfh(&xdr, args->fh, &hdr); 2096 encode_putfh(&xdr, args->fh, &hdr);
1745 encode_read(&xdr, args, &hdr); 2097 encode_read(&xdr, args, &hdr);
1746 2098
1747 /* set up reply kvec 2099 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
1748 * toplevel status + taglen=0 + rescount + OP_PUTFH + status
1749 * + OP_READ + status + eof + datalen = 9
1750 */
1751 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_read_sz) << 2;
1752 xdr_inline_pages(&req->rq_rcv_buf, replen,
1753 args->pages, args->pgbase, args->count); 2100 args->pages, args->pgbase, args->count);
1754 req->rq_rcv_buf.flags |= XDRBUF_READ; 2101 req->rq_rcv_buf.flags |= XDRBUF_READ;
1755 encode_nops(&hdr); 2102 encode_nops(&hdr);
@@ -1763,11 +2110,12 @@ static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_seta
1763{ 2110{
1764 struct xdr_stream xdr; 2111 struct xdr_stream xdr;
1765 struct compound_hdr hdr = { 2112 struct compound_hdr hdr = {
1766 .nops = 0, 2113 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1767 }; 2114 };
1768 2115
1769 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2116 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1770 encode_compound_hdr(&xdr, &hdr); 2117 encode_compound_hdr(&xdr, req, &hdr);
2118 encode_sequence(&xdr, &args->seq_args, &hdr);
1771 encode_putfh(&xdr, args->fh, &hdr); 2119 encode_putfh(&xdr, args->fh, &hdr);
1772 encode_setattr(&xdr, args, args->server, &hdr); 2120 encode_setattr(&xdr, args, args->server, &hdr);
1773 encode_getfattr(&xdr, args->bitmask, &hdr); 2121 encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1783,20 +2131,19 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
 			struct nfs_getaclargs *args)
 {
 	struct xdr_stream xdr;
-	struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	int replen;
+	uint32_t replen;
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fh, &hdr);
+	replen = hdr.replen + nfs4_fattr_bitmap_maxsz + 1;
 	encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
 
-	/* set up reply buffer: */
-	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2;
-	xdr_inline_pages(&req->rq_rcv_buf, replen,
+	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
 		args->acl_pages, args->acl_pgbase, args->acl_len);
 	encode_nops(&hdr);
 	return 0;
@@ -1809,11 +2156,12 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writea
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fh, &hdr);
 	encode_write(&xdr, args, &hdr);
 	req->rq_snd_buf.flags |= XDRBUF_WRITE;
@@ -1829,11 +2177,12 @@ static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_write
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fh, &hdr);
 	encode_commit(&xdr, args, &hdr);
 	encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1848,11 +2197,12 @@ static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsin
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fh, &hdr);
 	encode_fsinfo(&xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
@@ -1866,11 +2216,12 @@ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct n
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fh, &hdr);
 	encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
 			   &hdr);
@@ -1885,11 +2236,12 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fh, &hdr);
 	encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
 			   args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
@@ -1900,16 +2252,18 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs
 /*
  * GETATTR_BITMAP request
  */
-static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struct nfs_fh *fhandle)
+static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p,
+				    struct nfs4_server_caps_arg *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
-	encode_putfh(&xdr, fhandle, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
+	encode_putfh(&xdr, args->fhandle, &hdr);
 	encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
 			   FATTR4_WORD0_LINK_SUPPORT|
 			   FATTR4_WORD0_SYMLINK_SUPPORT|
@@ -1929,7 +2283,7 @@ static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
 	encode_renew(&xdr, clp, &hdr);
 	encode_nops(&hdr);
 	return 0;
@@ -1946,7 +2300,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
 	encode_setclientid(&xdr, sc, &hdr);
 	encode_nops(&hdr);
 	return 0;
@@ -1964,7 +2318,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
 	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
 	encode_setclientid_confirm(&xdr, clp, &hdr);
 	encode_putrootfh(&xdr, &hdr);
 	encode_fsinfo(&xdr, lease_bitmap, &hdr);
@@ -1979,11 +2333,12 @@ static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struc
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fhandle, &hdr);
 	encode_delegreturn(&xdr, args->stateid, &hdr);
 	encode_getfattr(&xdr, args->bitmask, &hdr);
@@ -1998,28 +2353,119 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
-	int replen;
+	uint32_t replen;
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->dir_fh, &hdr);
 	encode_lookup(&xdr, args->name, &hdr);
+	replen = hdr.replen;	/* get the attribute into args->page */
 	encode_fs_locations(&xdr, args->bitmask, &hdr);
 
-	/* set up reply
-	 *   toplevel_status + OP_PUTFH + status
-	 *   + OP_LOOKUP + status + OP_GETATTR + status = 7
-	 */
-	replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
-	xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page,
+	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
 			 0, PAGE_SIZE);
 	encode_nops(&hdr);
 	return 0;
 }
 
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * EXCHANGE_ID request
+ */
+static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
+				    struct nfs41_exchange_id_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = args->client->cl_minorversion,
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_exchange_id(&xdr, args, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+
+/*
+ * a CREATE_SESSION request
+ */
+static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
+				       struct nfs41_create_session_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = args->client->cl_minorversion,
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_create_session(&xdr, args, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+
+/*
+ * a DESTROY_SESSION request
+ */
+static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p,
+					struct nfs4_session *session)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = session->clp->cl_minorversion,
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_destroy_session(&xdr, session, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+
+/*
+ * a SEQUENCE request
+ */
+static int nfs4_xdr_enc_sequence(struct rpc_rqst *req, uint32_t *p,
+				 struct nfs4_sequence_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(args),
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, args, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+
+/*
+ * a GET_LEASE_TIME request
+ */
+static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
+				       struct nfs4_get_lease_time_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
+	};
+	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->la_seq_args, &hdr);
+	encode_putrootfh(&xdr, &hdr);
+	encode_fsinfo(&xdr, lease_bitmap, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
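
All five NFSv4.1 encoders added above share one skeleton: initialize the stream, emit the compound header, emit the single operation, then backfill the op count. Only the operation encoder and the source of the minor version vary; EXCHANGE_ID and CREATE_SESSION read it straight from the nfs_client, since no session exists yet. As a placeholder sketch (nfs41_foo_args and encode_foo are stand-ins, not kernel symbols):

static int nfs4_xdr_enc_foo(struct rpc_rqst *req, uint32_t *p,
			    struct nfs41_foo_args *args)
{
	struct xdr_stream xdr;
	struct compound_hdr hdr = {
		.minorversion = args->client->cl_minorversion,
	};

	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
	encode_compound_hdr(&xdr, req, &hdr);	/* COMPOUND tag + minorversion */
	encode_foo(&xdr, args, &hdr);		/* the one operation */
	encode_nops(&hdr);			/* backfill numops */
	return 0;
}
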
 /*
  * START OF "GENERIC" DECODE ROUTINES.
  * These may look a little ugly since they are imported from a "generic"
@@ -3657,7 +4103,7 @@ decode_savefh(struct xdr_stream *xdr)
 	return decode_op_hdr(xdr, OP_SAVEFH);
 }
 
-static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
+static int decode_setattr(struct xdr_stream *xdr)
 {
 	__be32 *p;
 	uint32_t bmlen;
@@ -3735,6 +4181,169 @@ static int decode_delegreturn(struct xdr_stream *xdr)
 	return decode_op_hdr(xdr, OP_DELEGRETURN);
 }
 
+#if defined(CONFIG_NFS_V4_1)
+static int decode_exchange_id(struct xdr_stream *xdr,
+			      struct nfs41_exchange_id_res *res)
+{
+	__be32 *p;
+	uint32_t dummy;
+	int status;
+	struct nfs_client *clp = res->client;
+
+	status = decode_op_hdr(xdr, OP_EXCHANGE_ID);
+	if (status)
+		return status;
+
+	READ_BUF(8);
+	READ64(clp->cl_ex_clid);
+	READ_BUF(12);
+	READ32(clp->cl_seqid);
+	READ32(clp->cl_exchange_flags);
+
+	/* We ask for SP4_NONE */
+	READ32(dummy);
+	if (dummy != SP4_NONE)
+		return -EIO;
+
+	/* Throw away minor_id */
+	READ_BUF(8);
+
+	/* Throw away Major id */
+	READ_BUF(4);
+	READ32(dummy);
+	READ_BUF(dummy);
+
+	/* Throw away server_scope */
+	READ_BUF(4);
+	READ32(dummy);
+	READ_BUF(dummy);
+
+	/* Throw away Implementation id array */
+	READ_BUF(4);
+	READ32(dummy);
+	READ_BUF(dummy);
+
+	return 0;
+}
+
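
READ_BUF(), READ32(), READ64() and COPYMEM() in decode_exchange_id() above and the routines below are local decoding macros in nfs4xdr.c that advance the cursor p through the receive buffer; roughly (simplified, with the usual error reporting collapsed to a bare -EIO):

#define READ_BUF(nbytes) do {				\
	p = xdr_inline_decode(xdr, nbytes);		\
	if (unlikely(!p))				\
		return -EIO;				\
} while (0)

#define READ32(x)	((x) = ntohl(*p++))

#define READ64(x) do {					\
	(x) = (u64)ntohl(*p++) << 32;			\
	(x) |= ntohl(*p++);				\
} while (0)

#define COPYMEM(x, nbytes) do {				\
	memcpy((x), p, nbytes);				\
	p += XDR_QUADLEN(nbytes);			\
} while (0)

xdr_inline_decode() rounds its byte count up to a four-byte boundary, which is why the READ32(dummy); READ_BUF(dummy); pairs above suffice to skip variable-length XDR opaques such as the major owner id and server scope.
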
+static int decode_chan_attrs(struct xdr_stream *xdr,
+			     struct nfs4_channel_attrs *attrs)
+{
+	__be32 *p;
+	u32 nr_attrs;
+
+	READ_BUF(28);
+	READ32(attrs->headerpadsz);
+	READ32(attrs->max_rqst_sz);
+	READ32(attrs->max_resp_sz);
+	READ32(attrs->max_resp_sz_cached);
+	READ32(attrs->max_ops);
+	READ32(attrs->max_reqs);
+	READ32(nr_attrs);
+	if (unlikely(nr_attrs > 1)) {
+		printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
+		       __func__, nr_attrs);
+		return -EINVAL;
+	}
+	if (nr_attrs == 1)
+		READ_BUF(4);	/* skip rdma_attrs */
+	return 0;
+}
+
+static int decode_create_session(struct xdr_stream *xdr,
+				 struct nfs41_create_session_res *res)
+{
+	__be32 *p;
+	int status;
+	struct nfs_client *clp = res->client;
+	struct nfs4_session *session = clp->cl_session;
+
+	status = decode_op_hdr(xdr, OP_CREATE_SESSION);
+
+	if (status)
+		return status;
+
+	/* sessionid */
+	READ_BUF(NFS4_MAX_SESSIONID_LEN);
+	COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN);
+
+	/* seqid, flags */
+	READ_BUF(8);
+	READ32(clp->cl_seqid);
+	READ32(session->flags);
+
+	/* Channel attributes */
+	status = decode_chan_attrs(xdr, &session->fc_attrs);
+	if (!status)
+		status = decode_chan_attrs(xdr, &session->bc_attrs);
+	return status;
+}
+
+static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
+{
+	return decode_op_hdr(xdr, OP_DESTROY_SESSION);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static int decode_sequence(struct xdr_stream *xdr,
+			   struct nfs4_sequence_res *res,
+			   struct rpc_rqst *rqstp)
+{
+#if defined(CONFIG_NFS_V4_1)
+	struct nfs4_slot *slot;
+	struct nfs4_sessionid id;
+	u32 dummy;
+	int status;
+	__be32 *p;
+
+	if (!res->sr_session)
+		return 0;
+
+	status = decode_op_hdr(xdr, OP_SEQUENCE);
+	if (status)
+		goto out_err;
+
+	/*
+	 * If the server returns different values for sessionID, slotID or
+	 * sequence number, the server is looney tunes.
+	 */
+	status = -ESERVERFAULT;
+
+	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
+	READ_BUF(NFS4_MAX_SESSIONID_LEN + 20);
+	COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN);
+	if (memcmp(id.data, res->sr_session->sess_id.data,
+		   NFS4_MAX_SESSIONID_LEN)) {
+		dprintk("%s Invalid session id\n", __func__);
+		goto out_err;
+	}
+	/* seqid */
+	READ32(dummy);
+	if (dummy != slot->seq_nr) {
+		dprintk("%s Invalid sequence number\n", __func__);
+		goto out_err;
+	}
+	/* slot id */
+	READ32(dummy);
+	if (dummy != res->sr_slotid) {
+		dprintk("%s Invalid slot id\n", __func__);
+		goto out_err;
+	}
+	/* highest slot id - currently not processed */
+	READ32(dummy);
+	/* target highest slot id - currently not processed */
+	READ32(dummy);
+	/* result flags - currently not processed */
+	READ32(dummy);
+	status = 0;
+out_err:
+	res->sr_status = status;
+	return status;
+#else	/* CONFIG_NFS_V4_1 */
+	return 0;
+#endif /* CONFIG_NFS_V4_1 */
+}
+
 /*
  * END OF "GENERIC" DECODE ROUTINES.
  */
@@ -3752,6 +4361,9 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
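
The decode-side hunks below all graft the same step into the same place: every COMPOUND reply is now unwrapped as header, then SEQUENCE, then PUTFH, then the operation proper. The shared shape, with dec_foo/decode_foo and nfs4_foo_res as placeholders (decode_sequence() is a no-op on v4.0, where res->sr_session is NULL):

static int nfs4_xdr_dec_foo(struct rpc_rqst *rqstp, __be32 *p,
			    struct nfs4_foo_res *res)
{
	struct xdr_stream xdr;
	struct compound_hdr hdr;
	int status;

	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
	status = decode_compound_hdr(&xdr, &hdr);
	if (status)
		goto out;
	status = decode_sequence(&xdr, &res->seq_res, rqstp);	/* returns 0 for v4.0 */
	if (status)
		goto out;
	status = decode_putfh(&xdr);
	if (status)
		goto out;
	status = decode_foo(&xdr, res);
out:
	return status;
}
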
@@ -3773,7 +4385,11 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
 	int status;
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	status = decode_putfh(&xdr);
 	if (status != 0)
@@ -3796,7 +4412,11 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo
 	int status;
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	if ((status = decode_putfh(&xdr)) != 0)
 		goto out;
@@ -3819,7 +4439,11 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	int status;
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	if ((status = decode_putrootfh(&xdr)) != 0)
 		goto out;
@@ -3839,7 +4463,11 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
 	int status;
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	if ((status = decode_putfh(&xdr)) != 0)
 		goto out;
@@ -3860,7 +4488,11 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
 	int status;
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	if ((status = decode_putfh(&xdr)) != 0)
 		goto out;
@@ -3890,7 +4522,11 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link
 	int status;
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	if ((status = decode_putfh(&xdr)) != 0)
 		goto out;
@@ -3923,7 +4559,11 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr
 	int status;
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	if ((status = decode_putfh(&xdr)) != 0)
 		goto out;
@@ -3963,6 +4603,9 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -3979,12 +4622,13 @@ nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 0,
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 	int status;
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, &hdr);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, args->fh, &hdr);
 	status = encode_setacl(&xdr, args, &hdr);
 	encode_nops(&hdr);
@@ -3995,7 +4639,8 @@ nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args
  * Decode SETACL response
  */
 static int
-nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, void *res)
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p,
+		    struct nfs_setaclres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4005,10 +4650,13 @@ nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, void *res)
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
-	status = decode_setattr(&xdr, res);
+	status = decode_setattr(&xdr);
 out:
 	return status;
 }
@@ -4017,7 +4665,8 @@ out:
 /*
  * Decode GETACL response
  */
 static int
-nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, size_t *acl_len)
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p,
+		    struct nfs_getaclres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4027,10 +4676,13 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, size_t *acl_len)
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
-	status = decode_getacl(&xdr, rqstp, acl_len);
+	status = decode_getacl(&xdr, rqstp, &res->acl_len);
 
 out:
 	return status;
@@ -4049,6 +4701,9 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4079,6 +4734,9 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4133,6 +4791,9 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4157,10 +4818,13 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
-	status = decode_setattr(&xdr, res);
+	status = decode_setattr(&xdr);
 	if (status)
 		goto out;
 	decode_getfattr(&xdr, res->fattr, res->server);
@@ -4181,6 +4845,9 @@ static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4202,6 +4869,9 @@ static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4223,6 +4893,9 @@ static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4234,7 +4907,8 @@ out:
 /*
  * Decode READLINK response
  */
-static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, void *res)
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p,
+				 struct nfs4_readlink_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4244,6 +4918,9 @@ static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, void *res)
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4265,6 +4942,9 @@ static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_r
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4286,6 +4966,9 @@ static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readr
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4309,6 +4992,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writ
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4335,6 +5021,9 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_wri
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
@@ -4349,7 +5038,8 @@ out:
 /*
  * FSINFO request
  */
-static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
+			       struct nfs4_fsinfo_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4358,16 +5048,19 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinf
 	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (!status)
+		status = decode_sequence(&xdr, &res->seq_res, req);
+	if (!status)
 		status = decode_putfh(&xdr);
 	if (!status)
-		status = decode_fsinfo(&xdr, fsinfo);
+		status = decode_fsinfo(&xdr, res->fsinfo);
 	return status;
 }
 
 /*
  * PATHCONF request
  */
-static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *pathconf)
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p,
+				 struct nfs4_pathconf_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4376,16 +5069,19 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pat
 	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (!status)
+		status = decode_sequence(&xdr, &res->seq_res, req);
+	if (!status)
 		status = decode_putfh(&xdr);
 	if (!status)
-		status = decode_pathconf(&xdr, pathconf);
+		status = decode_pathconf(&xdr, res->pathconf);
 	return status;
 }
 
 /*
  * STATFS request
  */
-static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *fsstat)
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p,
+			       struct nfs4_statfs_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4394,9 +5090,11 @@ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs_fssta
 	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (!status)
+		status = decode_sequence(&xdr, &res->seq_res, req);
+	if (!status)
 		status = decode_putfh(&xdr);
 	if (!status)
-		status = decode_statfs(&xdr, fsstat);
+		status = decode_statfs(&xdr, res->fsstat);
 	return status;
 }
 
@@ -4410,7 +5108,11 @@ static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4
 	int status;
 
 	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, req);
+	if (status)
 		goto out;
 	if ((status = decode_putfh(&xdr)) != 0)
 		goto out;
@@ -4483,7 +5185,10 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
 	status = decode_compound_hdr(&xdr, &hdr);
-	if (status != 0)
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
 		goto out;
 	status = decode_putfh(&xdr);
 	if (status != 0)
@@ -4497,7 +5202,8 @@ out:
 /*
  * FS_LOCATIONS request
  */
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations *res)
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
+				     struct nfs4_fs_locations_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4505,18 +5211,113 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
 
 	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
 	status = decode_compound_hdr(&xdr, &hdr);
-	if (status != 0)
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, req);
+	if (status)
 		goto out;
 	if ((status = decode_putfh(&xdr)) != 0)
 		goto out;
 	if ((status = decode_lookup(&xdr)) != 0)
 		goto out;
 	xdr_enter_page(&xdr, PAGE_SIZE);
-	status = decode_getfattr(&xdr, &res->fattr, res->server);
+	status = decode_getfattr(&xdr, &res->fs_locations->fattr,
+				 res->fs_locations->server);
 out:
 	return status;
 }
 
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * EXCHANGE_ID request
+ */
+static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
+				    void *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_exchange_id(&xdr, res);
+	return status;
+}
+
+/*
+ * a CREATE_SESSION request
+ */
+static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p,
+				       struct nfs41_create_session_res *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_create_session(&xdr, res);
+	return status;
+}
+
+/*
+ * a DESTROY_SESSION request
+ */
+static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p,
+					void *dummy)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_destroy_session(&xdr, dummy);
+	return status;
+}
+
+/*
+ * a SEQUENCE request
+ */
+static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p,
+				 struct nfs4_sequence_res *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_sequence(&xdr, res, rqstp);
+	return status;
+}
+
+/*
+ * a GET_LEASE_TIME request
+ */
+static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p,
+				       struct nfs4_get_lease_time_res *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_sequence(&xdr, &res->lr_seq_res, rqstp);
+	if (!status)
+		status = decode_putrootfh(&xdr);
+	if (!status)
+		status = decode_fsinfo(&xdr, res->lr_fsinfo);
+	return status;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
 {
 	uint32_t bitmap[2] = {0};
@@ -4686,6 +5487,13 @@ struct rpc_procinfo nfs4_procedures[] = {
 	PROC(GETACL,		enc_getacl,		dec_getacl),
 	PROC(SETACL,		enc_setacl,		dec_setacl),
 	PROC(FS_LOCATIONS,	enc_fs_locations,	dec_fs_locations),
+#if defined(CONFIG_NFS_V4_1)
+	PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id),
+	PROC(CREATE_SESSION,	enc_create_session,	dec_create_session),
+	PROC(DESTROY_SESSION,	enc_destroy_session,	dec_destroy_session),
+	PROC(SEQUENCE,		enc_sequence,		dec_sequence),
+	PROC(GET_LEASE_TIME,	enc_get_lease_time,	dec_get_lease_time),
+#endif /* CONFIG_NFS_V4_1 */
 };
 
 struct rpc_version nfs_version4 = {
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index e3ed5908820b..8c55b27c0de4 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -92,6 +92,9 @@
 #undef NFSROOT_DEBUG
 #define NFSDBG_FACILITY NFSDBG_ROOT
 
+/* Default port to use if server is not running a portmapper */
+#define NFS_MNT_PORT	627
+
 /* Default path we try to mount. "%s" gets replaced by our IP address */
 #define NFS_ROOT "/tftpboot/%s"
 
@@ -487,6 +490,7 @@ static int __init root_nfs_get_handle(void)
 {
 	struct nfs_fh fh;
 	struct sockaddr_in sin;
+	unsigned int auth_flav_len = 0;
 	struct nfs_mount_request request = {
 		.sap		= (struct sockaddr *)&sin,
 		.salen		= sizeof(sin),
@@ -496,6 +500,7 @@ static int __init root_nfs_get_handle(void)
 		.protocol	= (nfs_data.flags & NFS_MOUNT_TCP) ?
 					XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
 		.fh		= &fh,
+		.auth_flav_len	= &auth_flav_len,
 	};
 	int status;
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 4ace3c50a8eb..12c9e66d3f1d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -18,10 +18,10 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
-#include <linux/smp_lock.h>
 
 #include <asm/system.h>
 
+#include "nfs4_fs.h"
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
@@ -46,6 +46,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 	memset(p, 0, sizeof(*p));
 	INIT_LIST_HEAD(&p->pages);
 	p->npages = pagecount;
+	p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	if (pagecount <= ARRAY_SIZE(p->page_array))
 		p->pagevec = p->page_array;
 	else {
@@ -59,17 +60,15 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 	return p;
 }
 
-static void nfs_readdata_free(struct nfs_read_data *p)
+void nfs_readdata_free(struct nfs_read_data *p)
 {
 	if (p && (p->pagevec != &p->page_array[0]))
 		kfree(p->pagevec);
 	mempool_free(p, nfs_rdata_mempool);
 }
 
-void nfs_readdata_release(void *data)
+static void nfs_readdata_release(struct nfs_read_data *rdata)
 {
-	struct nfs_read_data *rdata = data;
-
 	put_nfs_open_context(rdata->args.context);
 	nfs_readdata_free(rdata);
 }
@@ -357,19 +356,25 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
 	struct nfs_readres *resp = &data->res;
 
 	if (resp->eof || resp->count == argp->count)
-		return;
+		goto out;
 
 	/* This is a short read! */
 	nfs_inc_stats(data->inode, NFSIOS_SHORTREAD);
 	/* Has the server at least made some progress? */
 	if (resp->count == 0)
-		return;
+		goto out;
 
 	/* Yes, so retry the read at the end of the data */
 	argp->offset += resp->count;
 	argp->pgbase += resp->count;
 	argp->count -= resp->count;
-	rpc_restart_call(task);
+	nfs4_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client);
+	return;
+out:
+	nfs4_sequence_free_slot(NFS_SERVER(data->inode)->nfs_client,
+				&data->res.seq_res);
+	return;
+
 }
 
 /*
@@ -406,7 +411,23 @@ static void nfs_readpage_release_partial(void *calldata)
 	nfs_readdata_release(calldata);
 }
 
+#if defined(CONFIG_NFS_V4_1)
+void nfs_read_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_read_data *data = calldata;
+
+	if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client,
+				&data->args.seq_args, &data->res.seq_res,
+				0, task))
+		return;
+	rpc_call_start(task);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 static const struct rpc_call_ops nfs_read_partial_ops = {
+#if defined(CONFIG_NFS_V4_1)
+	.rpc_call_prepare = nfs_read_prepare,
+#endif /* CONFIG_NFS_V4_1 */
 	.rpc_call_done = nfs_readpage_result_partial,
 	.rpc_release = nfs_readpage_release_partial,
 };
@@ -470,6 +491,9 @@ static void nfs_readpage_release_full(void *calldata)
 }
 
 static const struct rpc_call_ops nfs_read_full_ops = {
+#if defined(CONFIG_NFS_V4_1)
+	.rpc_call_prepare = nfs_read_prepare,
+#endif /* CONFIG_NFS_V4_1 */
 	.rpc_call_done = nfs_readpage_result_full,
 	.rpc_release = nfs_readpage_release_full,
 };
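
On NFSv4.1 a read RPC may not hit the wire until it owns a session slot, which is why both call-ops tables above gain an .rpc_call_prepare hook. A sketch of the slot lifecycle these hunks assume (function names are from the patch; the body restates the hook's contract, it is not new kernel code):

/* Illustrative slot lifecycle for one v4.1 read RPC. */
static void example_read_prepare(struct rpc_task *task, void *calldata)
{
	struct nfs_read_data *data = calldata;

	/* Nonzero return: no free slot; the task was queued on the
	 * slot table and this callback will run again when one frees. */
	if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client,
				&data->args.seq_args, &data->res.seq_res,
				0, task))
		return;
	rpc_call_start(task);		/* slot held: transmit now */
}

The matching release is visible in nfs_readpage_retry(): a completed or abandoned read frees its slot via nfs4_sequence_free_slot(), while a short-read retry keeps the slot and restarts the RPC with nfs4_restart_rpc(). The sr_slotid = NFS4_MAX_SLOT_TABLE initializer in nfs_readdata_alloc() is the "no slot held yet" sentinel.
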
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d2d67781c579..0b4cbdc60abd 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -42,6 +42,8 @@
 #include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
+#include <linux/mnt_namespace.h>
+#include <linux/namei.h>
 #include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
 #include <linux/inet.h>
@@ -90,6 +92,7 @@ enum {
 	Opt_mountport,
 	Opt_mountvers,
 	Opt_nfsvers,
+	Opt_minorversion,
 
 	/* Mount options that take string arguments */
 	Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
@@ -139,22 +142,23 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_fscache_uniq, "fsc=%s" },
 	{ Opt_nofscache, "nofsc" },
 
-	{ Opt_port, "port=%u" },
-	{ Opt_rsize, "rsize=%u" },
-	{ Opt_wsize, "wsize=%u" },
-	{ Opt_bsize, "bsize=%u" },
-	{ Opt_timeo, "timeo=%u" },
-	{ Opt_retrans, "retrans=%u" },
-	{ Opt_acregmin, "acregmin=%u" },
-	{ Opt_acregmax, "acregmax=%u" },
-	{ Opt_acdirmin, "acdirmin=%u" },
-	{ Opt_acdirmax, "acdirmax=%u" },
-	{ Opt_actimeo, "actimeo=%u" },
-	{ Opt_namelen, "namlen=%u" },
-	{ Opt_mountport, "mountport=%u" },
-	{ Opt_mountvers, "mountvers=%u" },
-	{ Opt_nfsvers, "nfsvers=%u" },
-	{ Opt_nfsvers, "vers=%u" },
+	{ Opt_port, "port=%s" },
+	{ Opt_rsize, "rsize=%s" },
+	{ Opt_wsize, "wsize=%s" },
+	{ Opt_bsize, "bsize=%s" },
+	{ Opt_timeo, "timeo=%s" },
+	{ Opt_retrans, "retrans=%s" },
+	{ Opt_acregmin, "acregmin=%s" },
+	{ Opt_acregmax, "acregmax=%s" },
+	{ Opt_acdirmin, "acdirmin=%s" },
+	{ Opt_acdirmax, "acdirmax=%s" },
+	{ Opt_actimeo, "actimeo=%s" },
+	{ Opt_namelen, "namlen=%s" },
+	{ Opt_mountport, "mountport=%s" },
+	{ Opt_mountvers, "mountvers=%s" },
+	{ Opt_nfsvers, "nfsvers=%s" },
+	{ Opt_nfsvers, "vers=%s" },
+	{ Opt_minorversion, "minorversion=%u" },
 
 	{ Opt_sec, "sec=%s" },
 	{ Opt_proto, "proto=%s" },
@@ -270,10 +274,14 @@ static const struct super_operations nfs_sops = {
 #ifdef CONFIG_NFS_V4
 static int nfs4_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static int nfs4_remote_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static int nfs4_xdev_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static int nfs4_referral_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static void nfs4_kill_super(struct super_block *sb);
 
 static struct file_system_type nfs4_fs_type = {
@@ -284,6 +292,14 @@ static struct file_system_type nfs4_fs_type = {
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
+static struct file_system_type nfs4_remote_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.get_sb		= nfs4_remote_get_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
 struct file_system_type nfs4_xdev_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
@@ -292,6 +308,14 @@ struct file_system_type nfs4_xdev_fs_type = {
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
+static struct file_system_type nfs4_remote_referral_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.get_sb		= nfs4_remote_referral_get_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
 struct file_system_type nfs4_referral_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
@@ -514,7 +538,6 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		const char *nostr;
 	} nfs_info[] = {
 		{ NFS_MOUNT_SOFT, ",soft", ",hard" },
-		{ NFS_MOUNT_INTR, ",intr", ",nointr" },
 		{ NFS_MOUNT_POSIX, ",posix", "" },
 		{ NFS_MOUNT_NOCTO, ",nocto", "" },
 		{ NFS_MOUNT_NOAC, ",noac", "" },
@@ -943,11 +966,6 @@ static int nfs_parse_security_flavors(char *value,
 	return 1;
 }
 
-static void nfs_parse_invalid_value(const char *option)
-{
-	dfprintk(MOUNT, "NFS: bad value specified for %s option\n", option);
-}
-
 /*
  * Error-check and convert a string of mount options from user space into
  * a data structure. The whole mount string is processed; bad options are
@@ -958,7 +976,7 @@ static int nfs_parse_mount_options(char *raw,
 			struct nfs_parsed_mount_data *mnt)
 {
 	char *p, *string, *secdata;
-	int rc, sloppy = 0, errors = 0;
+	int rc, sloppy = 0, invalid_option = 0;
 
 	if (!raw) {
 		dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -982,7 +1000,9 @@ static int nfs_parse_mount_options(char *raw,
 
 	while ((p = strsep(&raw, ",")) != NULL) {
 		substring_t args[MAX_OPT_ARGS];
-		int option, token;
+		unsigned long option;
+		int int_option;
+		int token;
 
 		if (!*p)
 			continue;
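
Each numeric option case below now follows one pattern: duplicate the matched substring, parse it with strict_strtoul() (which, unlike match_int(), accepts the full unsigned range and rejects trailing junk), free the copy, and branch to a common error label. A hypothetical helper capturing the pattern the patch open-codes in every case:

static int nfs_get_option_ul(substring_t args[], unsigned long *option)
{
	char *string;
	int rc;

	string = match_strdup(args);	/* kmalloc'd copy of the "=value" part */
	if (string == NULL)
		return -ENOMEM;
	rc = strict_strtoul(string, 10, option);	/* strict base-10 parse */
	kfree(string);
	return rc;	/* 0 on success */
}

Callers still apply their own range checks (port > USHORT_MAX, timeo == 0, and so on) before assigning the value, and route parse failures to out_invalid_value instead of bumping a per-option error counter.
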
@@ -1091,114 +1111,156 @@ static int nfs_parse_mount_options(char *raw,
 		 * options that take numeric values
 		 */
 		case Opt_port:
-			if (match_int(args, &option) ||
-			    option < 0 || option > USHORT_MAX) {
-				errors++;
-				nfs_parse_invalid_value("port");
-			} else
-				mnt->nfs_server.port = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0 || option > USHORT_MAX)
+				goto out_invalid_value;
+			mnt->nfs_server.port = option;
 			break;
 		case Opt_rsize:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("rsize");
-			} else
-				mnt->rsize = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			mnt->rsize = option;
 			break;
 		case Opt_wsize:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("wsize");
-			} else
-				mnt->wsize = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			mnt->wsize = option;
 			break;
 		case Opt_bsize:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("bsize");
-			} else
-				mnt->bsize = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			mnt->bsize = option;
 			break;
 		case Opt_timeo:
-			if (match_int(args, &option) || option <= 0) {
-				errors++;
-				nfs_parse_invalid_value("timeo");
-			} else
-				mnt->timeo = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0 || option == 0)
+				goto out_invalid_value;
+			mnt->timeo = option;
 			break;
 		case Opt_retrans:
-			if (match_int(args, &option) || option <= 0) {
-				errors++;
-				nfs_parse_invalid_value("retrans");
-			} else
-				mnt->retrans = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0 || option == 0)
+				goto out_invalid_value;
+			mnt->retrans = option;
 			break;
 		case Opt_acregmin:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("acregmin");
-			} else
-				mnt->acregmin = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			mnt->acregmin = option;
 			break;
 		case Opt_acregmax:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("acregmax");
-			} else
-				mnt->acregmax = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			mnt->acregmax = option;
 			break;
 		case Opt_acdirmin:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("acdirmin");
-			} else
-				mnt->acdirmin = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			mnt->acdirmin = option;
 			break;
 		case Opt_acdirmax:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("acdirmax");
-			} else
-				mnt->acdirmax = option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			mnt->acdirmax = option;
 			break;
 		case Opt_actimeo:
-			if (match_int(args, &option) || option < 0) {
-				errors++;
-				nfs_parse_invalid_value("actimeo");
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
1168 } else 1217 rc = strict_strtoul(string, 10, &option);
1169 mnt->acregmin = mnt->acregmax = 1218 kfree(string);
1170 mnt->acdirmin = mnt->acdirmax = option; 1219 if (rc != 0)
1220 goto out_invalid_value;
1221 mnt->acregmin = mnt->acregmax =
1222 mnt->acdirmin = mnt->acdirmax = option;
1171 break; 1223 break;
1172 case Opt_namelen: 1224 case Opt_namelen:
1173 if (match_int(args, &option) || option < 0) { 1225 string = match_strdup(args);
1174 errors++; 1226 if (string == NULL)
1175 nfs_parse_invalid_value("namlen"); 1227 goto out_nomem;
1176 } else 1228 rc = strict_strtoul(string, 10, &option);
1177 mnt->namlen = option; 1229 kfree(string);
1230 if (rc != 0)
1231 goto out_invalid_value;
1232 mnt->namlen = option;
1178 break; 1233 break;
1179 case Opt_mountport: 1234 case Opt_mountport:
1180 if (match_int(args, &option) || 1235 string = match_strdup(args);
1181 option < 0 || option > USHORT_MAX) { 1236 if (string == NULL)
1182 errors++; 1237 goto out_nomem;
1183 nfs_parse_invalid_value("mountport"); 1238 rc = strict_strtoul(string, 10, &option);
1184 } else 1239 kfree(string);
1185 mnt->mount_server.port = option; 1240 if (rc != 0 || option > USHORT_MAX)
1241 goto out_invalid_value;
1242 mnt->mount_server.port = option;
1186 break; 1243 break;
1187 case Opt_mountvers: 1244 case Opt_mountvers:
1188 if (match_int(args, &option) || 1245 string = match_strdup(args);
1246 if (string == NULL)
1247 goto out_nomem;
1248 rc = strict_strtoul(string, 10, &option);
1249 kfree(string);
1250 if (rc != 0 ||
1189 option < NFS_MNT_VERSION || 1251 option < NFS_MNT_VERSION ||
1190 option > NFS_MNT3_VERSION) { 1252 option > NFS_MNT3_VERSION)
1191 errors++; 1253 goto out_invalid_value;
1192 nfs_parse_invalid_value("mountvers"); 1254 mnt->mount_server.version = option;
1193 } else
1194 mnt->mount_server.version = option;
1195 break; 1255 break;
1196 case Opt_nfsvers: 1256 case Opt_nfsvers:
1197 if (match_int(args, &option)) { 1257 string = match_strdup(args);
1198 errors++; 1258 if (string == NULL)
1199 nfs_parse_invalid_value("nfsvers"); 1259 goto out_nomem;
1200 break; 1260 rc = strict_strtoul(string, 10, &option);
1201 } 1261 kfree(string);
1262 if (rc != 0)
1263 goto out_invalid_value;
1202 switch (option) { 1264 switch (option) {
1203 case NFS2_VERSION: 1265 case NFS2_VERSION:
1204 mnt->flags &= ~NFS_MOUNT_VER3; 1266 mnt->flags &= ~NFS_MOUNT_VER3;
@@ -1207,10 +1269,16 @@ static int nfs_parse_mount_options(char *raw,
1207 mnt->flags |= NFS_MOUNT_VER3; 1269 mnt->flags |= NFS_MOUNT_VER3;
1208 break; 1270 break;
1209 default: 1271 default:
1210 errors++; 1272 goto out_invalid_value;
1211 nfs_parse_invalid_value("nfsvers");
1212 } 1273 }
1213 break; 1274 break;
1275 case Opt_minorversion:
1276 if (match_int(args, &int_option))
1277 return 0;
1278 if (int_option < 0 || int_option > NFS4_MAX_MINOR_VERSION)
1279 return 0;
1280 mnt->minorversion = int_option;
1281 break;
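
The hunks above replace match_int() with match_strdup() plus strict_strtoul(), which rejects trailing junk, negative input, and values outside the option's range instead of silently truncating them. A rough userspace equivalent built on strtoul(3) (parse_port() and the local USHORT_MAX are stand-ins for illustration, not kernel APIs):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define USHORT_MAX 65535UL      /* mirrors the kernel's limit for port= */

    static int parse_port(const char *string, unsigned long *out)
    {
            char *end;
            unsigned long val;

            errno = 0;
            val = strtoul(string, &end, 10);
            if (errno != 0 || end == string || *end != '\0')
                    return -1;      /* not a clean decimal number */
            if (val > USHORT_MAX)
                    return -1;      /* out of range for a port */
            *out = val;
            return 0;
    }

    int main(void)
    {
            unsigned long port;

            if (parse_port("2049", &port) == 0)
                    printf("port = %lu\n", port);
            if (parse_port("99999", &port) != 0)
                    printf("99999 rejected, as expected\n");
            return 0;
    }
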
1214 1282
1215 /* 1283 /*
1216 * options that take text values 1284 * options that take text values
@@ -1222,9 +1290,9 @@ static int nfs_parse_mount_options(char *raw,
1222 rc = nfs_parse_security_flavors(string, mnt); 1290 rc = nfs_parse_security_flavors(string, mnt);
1223 kfree(string); 1291 kfree(string);
1224 if (!rc) { 1292 if (!rc) {
1225 errors++;
1226 dfprintk(MOUNT, "NFS: unrecognized " 1293 dfprintk(MOUNT, "NFS: unrecognized "
1227 "security flavor\n"); 1294 "security flavor\n");
1295 return 0;
1228 } 1296 }
1229 break; 1297 break;
1230 case Opt_proto: 1298 case Opt_proto:
@@ -1238,23 +1306,25 @@ static int nfs_parse_mount_options(char *raw,
1238 case Opt_xprt_udp: 1306 case Opt_xprt_udp:
1239 mnt->flags &= ~NFS_MOUNT_TCP; 1307 mnt->flags &= ~NFS_MOUNT_TCP;
1240 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1308 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1309 kfree(string);
1241 break; 1310 break;
1242 case Opt_xprt_tcp: 1311 case Opt_xprt_tcp:
1243 mnt->flags |= NFS_MOUNT_TCP; 1312 mnt->flags |= NFS_MOUNT_TCP;
1244 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1313 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1314 kfree(string);
1245 break; 1315 break;
1246 case Opt_xprt_rdma: 1316 case Opt_xprt_rdma:
1247 /* vector side protocols to TCP */ 1317 /* vector side protocols to TCP */
1248 mnt->flags |= NFS_MOUNT_TCP; 1318 mnt->flags |= NFS_MOUNT_TCP;
1249 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1319 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1250 xprt_load_transport(string); 1320 xprt_load_transport(string);
1321 kfree(string);
1251 break; 1322 break;
1252 default: 1323 default:
1253 errors++;
1254 dfprintk(MOUNT, "NFS: unrecognized " 1324 dfprintk(MOUNT, "NFS: unrecognized "
1255 "transport protocol\n"); 1325 "transport protocol\n");
1326 return 0;
1256 } 1327 }
1257 kfree(string);
1258 break; 1328 break;
1259 case Opt_mountproto: 1329 case Opt_mountproto:
1260 string = match_strdup(args); 1330 string = match_strdup(args);
@@ -1273,9 +1343,9 @@ static int nfs_parse_mount_options(char *raw,
1273 break; 1343 break;
1274 case Opt_xprt_rdma: /* not used for side protocols */ 1344 case Opt_xprt_rdma: /* not used for side protocols */
1275 default: 1345 default:
1276 errors++;
1277 dfprintk(MOUNT, "NFS: unrecognized " 1346 dfprintk(MOUNT, "NFS: unrecognized "
1278 "transport protocol\n"); 1347 "transport protocol\n");
1348 return 0;
1279 } 1349 }
1280 break; 1350 break;
1281 case Opt_addr: 1351 case Opt_addr:
@@ -1331,9 +1401,9 @@ static int nfs_parse_mount_options(char *raw,
1331 mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE; 1401 mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE;
1332 break; 1402 break;
1333 default: 1403 default:
1334 errors++;
1335 dfprintk(MOUNT, "NFS: invalid " 1404 dfprintk(MOUNT, "NFS: invalid "
1336 "lookupcache argument\n"); 1405 "lookupcache argument\n");
1406 return 0;
1337 }; 1407 };
1338 break; 1408 break;
1339 1409
@@ -1351,20 +1421,20 @@ static int nfs_parse_mount_options(char *raw,
1351 break; 1421 break;
1352 1422
1353 default: 1423 default:
1354 errors++; 1424 invalid_option = 1;
1355 dfprintk(MOUNT, "NFS: unrecognized mount option " 1425 dfprintk(MOUNT, "NFS: unrecognized mount option "
1356 "'%s'\n", p); 1426 "'%s'\n", p);
1357 } 1427 }
1358 } 1428 }
1359 1429
1360 if (errors > 0) { 1430 if (!sloppy && invalid_option)
1361 dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n", 1431 return 0;
1362 errors, (errors == 1 ? "" : "s")); 1432
1363 if (!sloppy)
1364 return 0;
1365 }
1366 return 1; 1433 return 1;
1367 1434
1435out_invalid_value:
1436 printk(KERN_INFO "NFS: bad mount option value specified: %s \n", p);
1437 return 0;
1368out_nomem: 1438out_nomem:
1369 printk(KERN_INFO "NFS: not enough memory to parse option\n"); 1439 printk(KERN_INFO "NFS: not enough memory to parse option\n");
1370 return 0; 1440 return 0;
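
With the per-option error counting gone, the parser funnels every failure to the shared out_invalid_value/out_nomem labels, so the success path reads straight through and each error message lives in exactly one place. A compressed sketch of that single-exit style (parse_one() is invented for illustration):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int parse_one(const char *p)
    {
            char *string = strdup(p);       /* stands in for match_strdup() */
            unsigned long val;
            char *end;

            if (string == NULL)
                    goto out_nomem;
            errno = 0;
            val = strtoul(string, &end, 10);
            if (errno != 0 || end == string || *end != '\0') {
                    free(string);
                    goto out_invalid_value;
            }
            free(string);
            printf("parsed %lu\n", val);
            return 1;

    out_invalid_value:
            fprintf(stderr, "bad option value specified: %s\n", p);
            return 0;
    out_nomem:
            fprintf(stderr, "not enough memory to parse option\n");
            return 0;
    }

    int main(void)
    {
            return (parse_one("1500") && !parse_one("15x0")) ? 0 : 1;
    }
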
@@ -1381,6 +1451,7 @@ out_security_failure:
1381static int nfs_try_mount(struct nfs_parsed_mount_data *args, 1451static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1382 struct nfs_fh *root_fh) 1452 struct nfs_fh *root_fh)
1383{ 1453{
1454 unsigned int auth_flavor_len = 0;
1384 struct nfs_mount_request request = { 1455 struct nfs_mount_request request = {
1385 .sap = (struct sockaddr *) 1456 .sap = (struct sockaddr *)
1386 &args->mount_server.address, 1457 &args->mount_server.address,
@@ -1388,6 +1459,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1388 .protocol = args->mount_server.protocol, 1459 .protocol = args->mount_server.protocol,
1389 .fh = root_fh, 1460 .fh = root_fh,
1390 .noresvport = args->flags & NFS_MOUNT_NORESVPORT, 1461 .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
1462 .auth_flav_len = &auth_flavor_len,
1391 }; 1463 };
1392 int status; 1464 int status;
1393 1465
@@ -1813,6 +1885,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
1813 if (data == NULL) 1885 if (data == NULL)
1814 return -ENOMEM; 1886 return -ENOMEM;
1815 1887
1888 lock_kernel();
1816 /* fill out struct with values from existing mount */ 1889 /* fill out struct with values from existing mount */
1817 data->flags = nfss->flags; 1890 data->flags = nfss->flags;
1818 data->rsize = nfss->rsize; 1891 data->rsize = nfss->rsize;
@@ -1837,6 +1910,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
1837 error = nfs_compare_remount_data(nfss, data); 1910 error = nfs_compare_remount_data(nfss, data);
1838out: 1911out:
1839 kfree(data); 1912 kfree(data);
1913 unlock_kernel();
1840 return error; 1914 return error;
1841} 1915}
1842 1916
@@ -2238,6 +2312,11 @@ static void nfs4_fill_super(struct super_block *sb)
2238 nfs_initialise_sb(sb); 2312 nfs_initialise_sb(sb);
2239} 2313}
2240 2314
2315static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
2316{
2317 args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
2318}
2319
2241/* 2320/*
2242 * Validate NFSv4 mount options 2321 * Validate NFSv4 mount options
2243 */ 2322 */
@@ -2261,6 +2340,7 @@ static int nfs4_validate_mount_data(void *options,
2261 args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ 2340 args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */
2262 args->auth_flavors[0] = RPC_AUTH_UNIX; 2341 args->auth_flavors[0] = RPC_AUTH_UNIX;
2263 args->auth_flavor_len = 0; 2342 args->auth_flavor_len = 0;
2343 args->minorversion = 0;
2264 2344
2265 switch (data->version) { 2345 switch (data->version) {
2266 case 1: 2346 case 1:
@@ -2334,6 +2414,8 @@ static int nfs4_validate_mount_data(void *options,
2334 2414
2335 nfs_validate_transport_protocol(args); 2415 nfs_validate_transport_protocol(args);
2336 2416
2417 nfs4_validate_mount_flags(args);
2418
2337 if (args->auth_flavor_len > 1) 2419 if (args->auth_flavor_len > 1)
2338 goto out_inval_auth; 2420 goto out_inval_auth;
2339 2421
@@ -2373,12 +2455,12 @@ out_no_client_address:
2373} 2455}
2374 2456
2375/* 2457/*
2376 * Get the superblock for an NFS4 mountpoint 2458 * Get the superblock for the NFS4 root partition
2377 */ 2459 */
2378static int nfs4_get_sb(struct file_system_type *fs_type, 2460static int nfs4_remote_get_sb(struct file_system_type *fs_type,
2379 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2461 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
2380{ 2462{
2381 struct nfs_parsed_mount_data *data; 2463 struct nfs_parsed_mount_data *data = raw_data;
2382 struct super_block *s; 2464 struct super_block *s;
2383 struct nfs_server *server; 2465 struct nfs_server *server;
2384 struct nfs_fh *mntfh; 2466 struct nfs_fh *mntfh;
@@ -2389,18 +2471,12 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2389 }; 2471 };
2390 int error = -ENOMEM; 2472 int error = -ENOMEM;
2391 2473
2392 data = kzalloc(sizeof(*data), GFP_KERNEL);
2393 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); 2474 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
2394 if (data == NULL || mntfh == NULL) 2475 if (data == NULL || mntfh == NULL)
2395 goto out_free_fh; 2476 goto out_free_fh;
2396 2477
2397 security_init_mnt_opts(&data->lsm_opts); 2478 security_init_mnt_opts(&data->lsm_opts);
2398 2479
2399 /* Validate the mount data */
2400 error = nfs4_validate_mount_data(raw_data, data, dev_name);
2401 if (error < 0)
2402 goto out;
2403
2404 /* Get a volume representation */ 2480 /* Get a volume representation */
2405 server = nfs4_create_server(data, mntfh); 2481 server = nfs4_create_server(data, mntfh);
2406 if (IS_ERR(server)) { 2482 if (IS_ERR(server)) {
@@ -2413,7 +2489,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2413 compare_super = NULL; 2489 compare_super = NULL;
2414 2490
2415 /* Get a superblock - note that we may end up sharing one that already exists */ 2491 /* Get a superblock - note that we may end up sharing one that already exists */
2416 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); 2492 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
2417 if (IS_ERR(s)) { 2493 if (IS_ERR(s)) {
2418 error = PTR_ERR(s); 2494 error = PTR_ERR(s);
2419 goto out_free; 2495 goto out_free;
@@ -2450,14 +2526,9 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2450 error = 0; 2526 error = 0;
2451 2527
2452out: 2528out:
2453 kfree(data->client_address);
2454 kfree(data->nfs_server.export_path);
2455 kfree(data->nfs_server.hostname);
2456 kfree(data->fscache_uniq);
2457 security_free_mnt_opts(&data->lsm_opts); 2529 security_free_mnt_opts(&data->lsm_opts);
2458out_free_fh: 2530out_free_fh:
2459 kfree(mntfh); 2531 kfree(mntfh);
2460 kfree(data);
2461 return error; 2532 return error;
2462 2533
2463out_free: 2534out_free:
@@ -2471,16 +2542,137 @@ error_splat_super:
2471 goto out; 2542 goto out;
2472} 2543}
2473 2544
2545static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type,
2546 int flags, void *data, const char *hostname)
2547{
2548 struct vfsmount *root_mnt;
2549 char *root_devname;
2550 size_t len;
2551
2552 len = strlen(hostname) + 3;
2553 root_devname = kmalloc(len, GFP_KERNEL);
2554 if (root_devname == NULL)
2555 return ERR_PTR(-ENOMEM);
2556 snprintf(root_devname, len, "%s:/", hostname);
2557 root_mnt = vfs_kern_mount(fs_type, flags, root_devname, data);
2558 kfree(root_devname);
2559 return root_mnt;
2560}
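
nfs_do_root_mount() builds the device name "<hostname>:/"; the +3 in the length covers the ':', the '/', and the terminating NUL. The same arithmetic in standalone form (make_root_devname() is a made-up name for the sketch):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static char *make_root_devname(const char *hostname)
    {
            size_t len = strlen(hostname) + 3;      /* host + ':' + '/' + NUL */
            char *devname = malloc(len);

            if (devname == NULL)
                    return NULL;
            snprintf(devname, len, "%s:/", hostname);
            return devname;
    }

    int main(void)
    {
            char *name = make_root_devname("server.example.com");

            if (name != NULL) {
                    printf("%s\n", name);           /* server.example.com:/ */
                    free(name);
            }
            return 0;
    }
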
2561
2562static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt)
2563{
2564 char *page = (char *) __get_free_page(GFP_KERNEL);
2565 char *devname, *tmp;
2566
2567 if (page == NULL)
2568 return;
2569 devname = nfs_path(path->mnt->mnt_devname,
2570 path->mnt->mnt_root, path->dentry,
2571 page, PAGE_SIZE);
2572 if (devname == NULL)
2573 goto out_freepage;
2574 tmp = kstrdup(devname, GFP_KERNEL);
2575 if (tmp == NULL)
2576 goto out_freepage;
2577 kfree(mnt->mnt_devname);
2578 mnt->mnt_devname = tmp;
2579out_freepage:
2580 free_page((unsigned long)page);
2581}
2582
2583static int nfs_follow_remote_path(struct vfsmount *root_mnt,
2584 const char *export_path, struct vfsmount *mnt_target)
2585{
2586 struct mnt_namespace *ns_private;
2587 struct nameidata nd;
2588 struct super_block *s;
2589 int ret;
2590
2591 ns_private = create_mnt_ns(root_mnt);
2592 ret = PTR_ERR(ns_private);
2593 if (IS_ERR(ns_private))
2594 goto out_mntput;
2595
2596 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
2597 export_path, LOOKUP_FOLLOW, &nd);
2598
2599 put_mnt_ns(ns_private);
2600
2601 if (ret != 0)
2602 goto out_err;
2603
2604 s = nd.path.mnt->mnt_sb;
2605 atomic_inc(&s->s_active);
2606 mnt_target->mnt_sb = s;
2607 mnt_target->mnt_root = dget(nd.path.dentry);
2608
2609 /* Correct the device pathname */
2610 nfs_fix_devname(&nd.path, mnt_target);
2611
2612 path_put(&nd.path);
2613 down_write(&s->s_umount);
2614 return 0;
2615out_mntput:
2616 mntput(root_mnt);
2617out_err:
2618 return ret;
2619}
2620
2621/*
2622 * Get the superblock for an NFS4 mountpoint
2623 */
2624static int nfs4_get_sb(struct file_system_type *fs_type,
2625 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
2626{
2627 struct nfs_parsed_mount_data *data;
2628 char *export_path;
2629 struct vfsmount *root_mnt;
2630 int error = -ENOMEM;
2631
2632 data = kzalloc(sizeof(*data), GFP_KERNEL);
2633 if (data == NULL)
2634 goto out_free_data;
2635
2636 /* Validate the mount data */
2637 error = nfs4_validate_mount_data(raw_data, data, dev_name);
2638 if (error < 0)
2639 goto out;
2640
2641 export_path = data->nfs_server.export_path;
2642 data->nfs_server.export_path = "/";
2643 root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
2644 data->nfs_server.hostname);
2645 data->nfs_server.export_path = export_path;
2646
2647 error = PTR_ERR(root_mnt);
2648 if (IS_ERR(root_mnt))
2649 goto out;
2650
2651 error = nfs_follow_remote_path(root_mnt, export_path, mnt);
2652
2653out:
2654 kfree(data->client_address);
2655 kfree(data->nfs_server.export_path);
2656 kfree(data->nfs_server.hostname);
2657 kfree(data->fscache_uniq);
2658out_free_data:
2659 kfree(data);
2660 dprintk("<-- nfs4_get_sb() = %d%s\n", error,
2661 error != 0 ? " [error]" : "");
2662 return error;
2663}
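
nfs4_get_sb() temporarily swaps the parsed export path for "/" so the first mount lands on the server root, then restores it before following the remote path to the real export. The idiom reduced to a few lines (mount_with_path() is a placeholder, not a kernel function):

    #include <stdio.h>

    struct mount_data {
            const char *export_path;
    };

    static void mount_with_path(const struct mount_data *data)
    {
            printf("mounting with path '%s'\n", data->export_path);
    }

    int main(void)
    {
            struct mount_data data = { .export_path = "/exports/home" };
            const char *saved = data.export_path;

            data.export_path = "/";         /* mount the root of the server */
            mount_with_path(&data);
            data.export_path = saved;       /* restore for the follow-up walk */

            printf("now following '%s'\n", data.export_path);
            return 0;
    }
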
2664
2474static void nfs4_kill_super(struct super_block *sb) 2665static void nfs4_kill_super(struct super_block *sb)
2475{ 2666{
2476 struct nfs_server *server = NFS_SB(sb); 2667 struct nfs_server *server = NFS_SB(sb);
2477 2668
2669 dprintk("--> %s\n", __func__);
2478 nfs_super_return_all_delegations(sb); 2670 nfs_super_return_all_delegations(sb);
2479 kill_anon_super(sb); 2671 kill_anon_super(sb);
2480
2481 nfs4_renewd_prepare_shutdown(server); 2672 nfs4_renewd_prepare_shutdown(server);
2482 nfs_fscache_release_super_cookie(sb); 2673 nfs_fscache_release_super_cookie(sb);
2483 nfs_free_server(server); 2674 nfs_free_server(server);
2675 dprintk("<-- %s\n", __func__);
2484} 2676}
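
The added dprintk() pairs give nfs4_kill_super() entry/exit tracing keyed on __func__. A simplified userspace version of the pattern (the real dfprintk() is gated on per-facility debug flags; this one prints unconditionally):

    #include <stdio.h>

    #define dfprintk(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)

    static void teardown(void)
    {
            dfprintk("--> %s\n", __func__);
            /* ... release resources here ... */
            dfprintk("<-- %s\n", __func__);
    }

    int main(void)
    {
            teardown();
            return 0;
    }
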
2485 2677
2486/* 2678/*
@@ -2566,12 +2758,9 @@ error_splat_super:
2566 return error; 2758 return error;
2567} 2759}
2568 2760
2569/* 2761static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2570 * Create an NFS4 server record on referral traversal 2762 int flags, const char *dev_name, void *raw_data,
2571 */ 2763 struct vfsmount *mnt)
2572static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
2573 const char *dev_name, void *raw_data,
2574 struct vfsmount *mnt)
2575{ 2764{
2576 struct nfs_clone_mount *data = raw_data; 2765 struct nfs_clone_mount *data = raw_data;
2577 struct super_block *s; 2766 struct super_block *s;
@@ -2650,4 +2839,36 @@ error_splat_super:
2650 return error; 2839 return error;
2651} 2840}
2652 2841
2842/*
2843 * Create an NFS4 server record on referral traversal
2844 */
2845static int nfs4_referral_get_sb(struct file_system_type *fs_type,
2846 int flags, const char *dev_name, void *raw_data,
2847 struct vfsmount *mnt)
2848{
2849 struct nfs_clone_mount *data = raw_data;
2850 char *export_path;
2851 struct vfsmount *root_mnt;
2852 int error;
2853
2854 dprintk("--> nfs4_referral_get_sb()\n");
2855
2856 export_path = data->mnt_path;
2857 data->mnt_path = "/";
2858
2859 root_mnt = nfs_do_root_mount(&nfs4_remote_referral_fs_type,
2860 flags, data, data->hostname);
2861 data->mnt_path = export_path;
2862
2863 error = PTR_ERR(root_mnt);
2864 if (IS_ERR(root_mnt))
2865 goto out;
2866
2867 error = nfs_follow_remote_path(root_mnt, export_path, mnt);
2868out:
2869 dprintk("<-- nfs4_referral_get_sb() = %d%s\n", error,
2870 error != 0 ? " [error]" : "");
2871 return error;
2872}
2873
2653#endif /* CONFIG_NFS_V4 */ 2874#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index ecc295347775..1064c91ae810 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -15,6 +15,7 @@
15#include <linux/wait.h> 15#include <linux/wait.h>
16 16
17#include "internal.h" 17#include "internal.h"
18#include "nfs4_fs.h"
18 19
19struct nfs_unlinkdata { 20struct nfs_unlinkdata {
20 struct hlist_node list; 21 struct hlist_node list;
@@ -82,7 +83,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
82 struct inode *dir = data->dir; 83 struct inode *dir = data->dir;
83 84
84 if (!NFS_PROTO(dir)->unlink_done(task, dir)) 85 if (!NFS_PROTO(dir)->unlink_done(task, dir))
85 rpc_restart_call(task); 86 nfs4_restart_rpc(task, NFS_SERVER(dir)->nfs_client);
86} 87}
87 88
88/** 89/**
@@ -102,9 +103,25 @@ static void nfs_async_unlink_release(void *calldata)
102 nfs_sb_deactive(sb); 103 nfs_sb_deactive(sb);
103} 104}
104 105
106#if defined(CONFIG_NFS_V4_1)
107void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
108{
109 struct nfs_unlinkdata *data = calldata;
110 struct nfs_server *server = NFS_SERVER(data->dir);
111
112 if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
113 &data->res.seq_res, 1, task))
114 return;
115 rpc_call_start(task);
116}
117#endif /* CONFIG_NFS_V4_1 */
118
105static const struct rpc_call_ops nfs_unlink_ops = { 119static const struct rpc_call_ops nfs_unlink_ops = {
106 .rpc_call_done = nfs_async_unlink_done, 120 .rpc_call_done = nfs_async_unlink_done,
107 .rpc_release = nfs_async_unlink_release, 121 .rpc_release = nfs_async_unlink_release,
122#if defined(CONFIG_NFS_V4_1)
123 .rpc_call_prepare = nfs_unlink_prepare,
124#endif /* CONFIG_NFS_V4_1 */
108}; 125};
109 126
110static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data) 127static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data)
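
nfs_unlink_ops now supplies .rpc_call_prepare only when CONFIG_NFS_V4_1 is set; with designated initializers an omitted member is simply NULL, so callers can test the pointer before invoking it. A self-contained illustration (CONFIG_V4_1 and the ops layout are invented for the example):

    #include <stdio.h>

    struct call_ops {
            void (*prepare)(void);
            void (*done)(void);
    };

    #define CONFIG_V4_1 1

    #if defined(CONFIG_V4_1)
    static void my_prepare(void) { printf("prepare (v4.1 sessions)\n"); }
    #endif
    static void my_done(void) { printf("done\n"); }

    static const struct call_ops ops = {
    #if defined(CONFIG_V4_1)
            .prepare = my_prepare,
    #endif
            .done = my_done,
    };

    int main(void)
    {
            if (ops.prepare)        /* NULL when the config option is off */
                    ops.prepare();
            ops.done();
            return 0;
    }
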
@@ -241,6 +258,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
241 status = PTR_ERR(data->cred); 258 status = PTR_ERR(data->cred);
242 goto out_free; 259 goto out_free;
243 } 260 }
261 data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
244 262
245 status = -EBUSY; 263 status = -EBUSY;
246 spin_lock(&dentry->d_lock); 264 spin_lock(&dentry->d_lock);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e560a78995a3..a34fae21fe10 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -25,6 +25,7 @@
25#include "delegation.h" 25#include "delegation.h"
26#include "internal.h" 26#include "internal.h"
27#include "iostat.h" 27#include "iostat.h"
28#include "nfs4_fs.h"
28 29
29#define NFSDBG_FACILITY NFSDBG_PAGECACHE 30#define NFSDBG_FACILITY NFSDBG_PAGECACHE
30 31
@@ -52,6 +53,7 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
52 if (p) { 53 if (p) {
53 memset(p, 0, sizeof(*p)); 54 memset(p, 0, sizeof(*p));
54 INIT_LIST_HEAD(&p->pages); 55 INIT_LIST_HEAD(&p->pages);
56 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
55 } 57 }
56 return p; 58 return p;
57} 59}
@@ -71,6 +73,7 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
71 memset(p, 0, sizeof(*p)); 73 memset(p, 0, sizeof(*p));
72 INIT_LIST_HEAD(&p->pages); 74 INIT_LIST_HEAD(&p->pages);
73 p->npages = pagecount; 75 p->npages = pagecount;
76 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
74 if (pagecount <= ARRAY_SIZE(p->page_array)) 77 if (pagecount <= ARRAY_SIZE(p->page_array))
75 p->pagevec = p->page_array; 78 p->pagevec = p->page_array;
76 else { 79 else {
@@ -84,17 +87,15 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
84 return p; 87 return p;
85} 88}
86 89
87static void nfs_writedata_free(struct nfs_write_data *p) 90void nfs_writedata_free(struct nfs_write_data *p)
88{ 91{
89 if (p && (p->pagevec != &p->page_array[0])) 92 if (p && (p->pagevec != &p->page_array[0]))
90 kfree(p->pagevec); 93 kfree(p->pagevec);
91 mempool_free(p, nfs_wdata_mempool); 94 mempool_free(p, nfs_wdata_mempool);
92} 95}
93 96
94void nfs_writedata_release(void *data) 97static void nfs_writedata_release(struct nfs_write_data *wdata)
95{ 98{
96 struct nfs_write_data *wdata = data;
97
98 put_nfs_open_context(wdata->args.context); 99 put_nfs_open_context(wdata->args.context);
99 nfs_writedata_free(wdata); 100 nfs_writedata_free(wdata);
100} 101}
@@ -199,8 +200,10 @@ static int nfs_set_page_writeback(struct page *page)
199 struct nfs_server *nfss = NFS_SERVER(inode); 200 struct nfs_server *nfss = NFS_SERVER(inode);
200 201
201 if (atomic_long_inc_return(&nfss->writeback) > 202 if (atomic_long_inc_return(&nfss->writeback) >
202 NFS_CONGESTION_ON_THRESH) 203 NFS_CONGESTION_ON_THRESH) {
203 set_bdi_congested(&nfss->backing_dev_info, WRITE); 204 set_bdi_congested(&nfss->backing_dev_info,
205 BLK_RW_ASYNC);
206 }
204 } 207 }
205 return ret; 208 return ret;
206} 209}
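
The congestion hunks switch to BLK_RW_ASYNC but keep the two-watermark hysteresis: congestion is flagged above NFS_CONGESTION_ON_THRESH and cleared only after in-flight writes fall below NFS_CONGESTION_OFF_THRESH, so the flag cannot flap around a single boundary. The scheme in miniature (the thresholds here are arbitrary, not the kernel's values):

    #include <stdio.h>

    #define CONGESTION_ON_THRESH    64
    #define CONGESTION_OFF_THRESH   48

    static long writeback;
    static int congested;

    static void start_write(void)
    {
            if (++writeback > CONGESTION_ON_THRESH)
                    congested = 1;
    }

    static void end_write(void)
    {
            if (--writeback < CONGESTION_OFF_THRESH)
                    congested = 0;
    }

    int main(void)
    {
            for (int i = 0; i < 70; i++)
                    start_write();
            printf("congested=%d at %ld in flight\n", congested, writeback);
            for (int i = 0; i < 30; i++)
                    end_write();
            printf("congested=%d at %ld in flight\n", congested, writeback);
            return 0;
    }
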
@@ -212,7 +215,7 @@ static void nfs_end_page_writeback(struct page *page)
212 215
213 end_page_writeback(page); 216 end_page_writeback(page);
214 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) 217 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
215 clear_bdi_congested(&nfss->backing_dev_info, WRITE); 218 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
216} 219}
217 220
218/* 221/*
@@ -1048,7 +1051,23 @@ out:
1048 nfs_writedata_release(calldata); 1051 nfs_writedata_release(calldata);
1049} 1052}
1050 1053
1054#if defined(CONFIG_NFS_V4_1)
1055void nfs_write_prepare(struct rpc_task *task, void *calldata)
1056{
1057 struct nfs_write_data *data = calldata;
1058 struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client;
1059
1060 if (nfs4_setup_sequence(clp, &data->args.seq_args,
1061 &data->res.seq_res, 1, task))
1062 return;
1063 rpc_call_start(task);
1064}
1065#endif /* CONFIG_NFS_V4_1 */
1066
1051static const struct rpc_call_ops nfs_write_partial_ops = { 1067static const struct rpc_call_ops nfs_write_partial_ops = {
1068#if defined(CONFIG_NFS_V4_1)
1069 .rpc_call_prepare = nfs_write_prepare,
1070#endif /* CONFIG_NFS_V4_1 */
1052 .rpc_call_done = nfs_writeback_done_partial, 1071 .rpc_call_done = nfs_writeback_done_partial,
1053 .rpc_release = nfs_writeback_release_partial, 1072 .rpc_release = nfs_writeback_release_partial,
1054}; 1073};
@@ -1111,6 +1130,9 @@ remove_request:
1111} 1130}
1112 1131
1113static const struct rpc_call_ops nfs_write_full_ops = { 1132static const struct rpc_call_ops nfs_write_full_ops = {
1133#if defined(CONFIG_NFS_V4_1)
1134 .rpc_call_prepare = nfs_write_prepare,
1135#endif /* CONFIG_NFS_V4_1 */
1114 .rpc_call_done = nfs_writeback_done_full, 1136 .rpc_call_done = nfs_writeback_done_full,
1115 .rpc_release = nfs_writeback_release_full, 1137 .rpc_release = nfs_writeback_release_full,
1116}; 1138};
@@ -1123,6 +1145,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1123{ 1145{
1124 struct nfs_writeargs *argp = &data->args; 1146 struct nfs_writeargs *argp = &data->args;
1125 struct nfs_writeres *resp = &data->res; 1147 struct nfs_writeres *resp = &data->res;
1148 struct nfs_server *server = NFS_SERVER(data->inode);
1126 int status; 1149 int status;
1127 1150
1128 dprintk("NFS: %5u nfs_writeback_done (status %d)\n", 1151 dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
@@ -1155,7 +1178,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1155 if (time_before(complain, jiffies)) { 1178 if (time_before(complain, jiffies)) {
1156 dprintk("NFS: faulty NFS server %s:" 1179 dprintk("NFS: faulty NFS server %s:"
1157 " (committed = %d) != (stable = %d)\n", 1180 " (committed = %d) != (stable = %d)\n",
1158 NFS_SERVER(data->inode)->nfs_client->cl_hostname, 1181 server->nfs_client->cl_hostname,
1159 resp->verf->committed, argp->stable); 1182 resp->verf->committed, argp->stable);
1160 complain = jiffies + 300 * HZ; 1183 complain = jiffies + 300 * HZ;
1161 } 1184 }
@@ -1181,7 +1204,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1181 */ 1204 */
1182 argp->stable = NFS_FILE_SYNC; 1205 argp->stable = NFS_FILE_SYNC;
1183 } 1206 }
1184 rpc_restart_call(task); 1207 nfs4_restart_rpc(task, server->nfs_client);
1185 return -EAGAIN; 1208 return -EAGAIN;
1186 } 1209 }
1187 if (time_before(complain, jiffies)) { 1210 if (time_before(complain, jiffies)) {
@@ -1193,6 +1216,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1193 /* Can't do anything about it except throw an error. */ 1216 /* Can't do anything about it except throw an error. */
1194 task->tk_status = -EIO; 1217 task->tk_status = -EIO;
1195 } 1218 }
1219 nfs4_sequence_free_slot(server->nfs_client, &data->res.seq_res);
1196 return 0; 1220 return 0;
1197} 1221}
1198 1222
@@ -1349,6 +1373,9 @@ static void nfs_commit_release(void *calldata)
1349} 1373}
1350 1374
1351static const struct rpc_call_ops nfs_commit_ops = { 1375static const struct rpc_call_ops nfs_commit_ops = {
1376#if defined(CONFIG_NFS_V4_1)
1377 .rpc_call_prepare = nfs_write_prepare,
1378#endif /* CONFIG_NFS_V4_1 */
1352 .rpc_call_done = nfs_commit_done, 1379 .rpc_call_done = nfs_commit_done,
1353 .rpc_release = nfs_commit_release, 1380 .rpc_release = nfs_commit_release,
1354}; 1381};
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 5839b229cd0e..b92a27629fb7 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -464,16 +464,11 @@ static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp)
464 if (err) 464 if (err)
465 return err; 465 return err;
466 /* 466 /*
467 * Just a quick sanity check; we could also try to check 467 * XXX: It would be nice to also check whether this
468 * whether this pseudoflavor is supported, but at worst 468 * pseudoflavor is supported, so we can discover the
469 * an unsupported pseudoflavor on the export would just 469 * problem at export time instead of when a client fails
470 * be a pseudoflavor that won't match the flavor of any 470 * to authenticate.
471 * authenticated request. The administrator will
472 * probably discover the problem when someone fails to
473 * authenticate.
474 */ 471 */
475 if (f->pseudoflavor < 0)
476 return -EINVAL;
477 err = get_int(mesg, &f->flags); 472 err = get_int(mesg, &f->flags);
478 if (err) 473 if (err)
479 return err; 474 return err;
@@ -847,9 +842,8 @@ exp_get_fsid_key(svc_client *clp, int fsid)
847 return exp_find_key(clp, FSID_NUM, fsidv, NULL); 842 return exp_find_key(clp, FSID_NUM, fsidv, NULL);
848} 843}
849 844
850static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt, 845static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
851 struct dentry *dentry, 846 struct cache_req *reqp)
852 struct cache_req *reqp)
853{ 847{
854 struct svc_export *exp, key; 848 struct svc_export *exp, key;
855 int err; 849 int err;
@@ -858,8 +852,7 @@ static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt,
858 return ERR_PTR(-ENOENT); 852 return ERR_PTR(-ENOENT);
859 853
860 key.ex_client = clp; 854 key.ex_client = clp;
861 key.ex_path.mnt = mnt; 855 key.ex_path = *path;
862 key.ex_path.dentry = dentry;
863 856
864 exp = svc_export_lookup(&key); 857 exp = svc_export_lookup(&key);
865 if (exp == NULL) 858 if (exp == NULL)
@@ -873,24 +866,19 @@ static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt,
873/* 866/*
874 * Find the export entry for a given dentry. 867 * Find the export entry for a given dentry.
875 */ 868 */
876static struct svc_export *exp_parent(svc_client *clp, struct vfsmount *mnt, 869static struct svc_export *exp_parent(svc_client *clp, struct path *path)
877 struct dentry *dentry,
878 struct cache_req *reqp)
879{ 870{
880 svc_export *exp; 871 struct dentry *saved = dget(path->dentry);
881 872 svc_export *exp = exp_get_by_name(clp, path, NULL);
882 dget(dentry); 873
883 exp = exp_get_by_name(clp, mnt, dentry, reqp); 874 while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
884 875 struct dentry *parent = dget_parent(path->dentry);
885 while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { 876 dput(path->dentry);
886 struct dentry *parent; 877 path->dentry = parent;
887 878 exp = exp_get_by_name(clp, path, NULL);
888 parent = dget_parent(dentry);
889 dput(dentry);
890 dentry = parent;
891 exp = exp_get_by_name(clp, mnt, dentry, reqp);
892 } 879 }
893 dput(dentry); 880 dput(path->dentry);
881 path->dentry = saved;
894 return exp; 882 return exp;
895} 883}
896 884
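
The rewritten exp_parent() saves path->dentry, walks toward the root with dget_parent() until exp_get_by_name() stops returning -ENOENT, and restores the caller's path before returning. A string-based analogue of that upward walk (lookup_export() and the fixed-size buffer are illustrative only):

    #include <stdio.h>
    #include <string.h>

    static int lookup_export(const char *path)
    {
            return strcmp(path, "/srv/nfs") == 0;   /* pretend this is exported */
    }

    static int find_export(char *path)
    {
            char saved[256];
            int found;

            strncpy(saved, path, sizeof(saved) - 1);
            saved[sizeof(saved) - 1] = '\0';

            while (!lookup_export(path) && strcmp(path, "/") != 0) {
                    char *slash = strrchr(path, '/');
                    if (slash == path)
                            slash[1] = '\0';        /* trimmed down to "/" */
                    else
                            slash[0] = '\0';        /* drop last component */
            }
            found = lookup_export(path);
            strcpy(path, saved);                    /* restore, like exp_parent() */
            return found;
    }

    int main(void)
    {
            char path[256] = "/srv/nfs/exports/data";
            printf("export found: %d\n", find_export(path));
            return 0;
    }
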
@@ -1018,7 +1006,7 @@ exp_export(struct nfsctl_export *nxp)
1018 goto out_put_clp; 1006 goto out_put_clp;
1019 err = -EINVAL; 1007 err = -EINVAL;
1020 1008
1021 exp = exp_get_by_name(clp, path.mnt, path.dentry, NULL); 1009 exp = exp_get_by_name(clp, &path, NULL);
1022 1010
1023 memset(&new, 0, sizeof(new)); 1011 memset(&new, 0, sizeof(new));
1024 1012
@@ -1135,7 +1123,7 @@ exp_unexport(struct nfsctl_export *nxp)
1135 goto out_domain; 1123 goto out_domain;
1136 1124
1137 err = -EINVAL; 1125 err = -EINVAL;
1138 exp = exp_get_by_name(dom, path.mnt, path.dentry, NULL); 1126 exp = exp_get_by_name(dom, &path, NULL);
1139 path_put(&path); 1127 path_put(&path);
1140 if (IS_ERR(exp)) 1128 if (IS_ERR(exp))
1141 goto out_domain; 1129 goto out_domain;
@@ -1177,7 +1165,7 @@ exp_rootfh(svc_client *clp, char *name, struct knfsd_fh *f, int maxsize)
1177 dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", 1165 dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n",
1178 name, path.dentry, clp->name, 1166 name, path.dentry, clp->name,
1179 inode->i_sb->s_id, inode->i_ino); 1167 inode->i_sb->s_id, inode->i_ino);
1180 exp = exp_parent(clp, path.mnt, path.dentry, NULL); 1168 exp = exp_parent(clp, &path);
1181 if (IS_ERR(exp)) { 1169 if (IS_ERR(exp)) {
1182 err = PTR_ERR(exp); 1170 err = PTR_ERR(exp);
1183 goto out; 1171 goto out;
@@ -1207,7 +1195,7 @@ static struct svc_export *exp_find(struct auth_domain *clp, int fsid_type,
1207 if (IS_ERR(ek)) 1195 if (IS_ERR(ek))
1208 return ERR_CAST(ek); 1196 return ERR_CAST(ek);
1209 1197
1210 exp = exp_get_by_name(clp, ek->ek_path.mnt, ek->ek_path.dentry, reqp); 1198 exp = exp_get_by_name(clp, &ek->ek_path, reqp);
1211 cache_put(&ek->h, &svc_expkey_cache); 1199 cache_put(&ek->h, &svc_expkey_cache);
1212 1200
1213 if (IS_ERR(exp)) 1201 if (IS_ERR(exp))
@@ -1247,8 +1235,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
1247 * use exp_get_by_name() or exp_find(). 1235 * use exp_get_by_name() or exp_find().
1248 */ 1236 */
1249struct svc_export * 1237struct svc_export *
1250rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt, 1238rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path)
1251 struct dentry *dentry)
1252{ 1239{
1253 struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); 1240 struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT);
1254 1241
@@ -1256,8 +1243,7 @@ rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
1256 goto gss; 1243 goto gss;
1257 1244
1258 /* First try the auth_unix client: */ 1245 /* First try the auth_unix client: */
1259 exp = exp_get_by_name(rqstp->rq_client, mnt, dentry, 1246 exp = exp_get_by_name(rqstp->rq_client, path, &rqstp->rq_chandle);
1260 &rqstp->rq_chandle);
1261 if (PTR_ERR(exp) == -ENOENT) 1247 if (PTR_ERR(exp) == -ENOENT)
1262 goto gss; 1248 goto gss;
1263 if (IS_ERR(exp)) 1249 if (IS_ERR(exp))
@@ -1269,8 +1255,7 @@ gss:
1269 /* Otherwise, try falling back on gss client */ 1255 /* Otherwise, try falling back on gss client */
1270 if (rqstp->rq_gssclient == NULL) 1256 if (rqstp->rq_gssclient == NULL)
1271 return exp; 1257 return exp;
1272 gssexp = exp_get_by_name(rqstp->rq_gssclient, mnt, dentry, 1258 gssexp = exp_get_by_name(rqstp->rq_gssclient, path, &rqstp->rq_chandle);
1273 &rqstp->rq_chandle);
1274 if (PTR_ERR(gssexp) == -ENOENT) 1259 if (PTR_ERR(gssexp) == -ENOENT)
1275 return exp; 1260 return exp;
1276 if (!IS_ERR(exp)) 1261 if (!IS_ERR(exp))
@@ -1309,23 +1294,19 @@ gss:
1309} 1294}
1310 1295
1311struct svc_export * 1296struct svc_export *
1312rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt, 1297rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
1313 struct dentry *dentry)
1314{ 1298{
1315 struct svc_export *exp; 1299 struct dentry *saved = dget(path->dentry);
1316 1300 struct svc_export *exp = rqst_exp_get_by_name(rqstp, path);
1317 dget(dentry); 1301
1318 exp = rqst_exp_get_by_name(rqstp, mnt, dentry); 1302 while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
1319 1303 struct dentry *parent = dget_parent(path->dentry);
1320 while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { 1304 dput(path->dentry);
1321 struct dentry *parent; 1305 path->dentry = parent;
1322 1306 exp = rqst_exp_get_by_name(rqstp, path);
1323 parent = dget_parent(dentry);
1324 dput(dentry);
1325 dentry = parent;
1326 exp = rqst_exp_get_by_name(rqstp, mnt, dentry);
1327 } 1307 }
1328 dput(dentry); 1308 dput(path->dentry);
1309 path->dentry = saved;
1329 return exp; 1310 return exp;
1330} 1311}
1331 1312
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 7c9fe838f038..a713c418a922 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -652,8 +652,6 @@ nfsd3_proc_commit(struct svc_rqst * rqstp, struct nfsd3_commitargs *argp,
652 * NFSv3 Server procedures. 652 * NFSv3 Server procedures.
653 * Only the results of non-idempotent operations are cached. 653 * Only the results of non-idempotent operations are cached.
654 */ 654 */
655#define nfs3svc_decode_voidargs NULL
656#define nfs3svc_release_void NULL
657#define nfs3svc_decode_fhandleargs nfs3svc_decode_fhandle 655#define nfs3svc_decode_fhandleargs nfs3svc_decode_fhandle
658#define nfs3svc_encode_attrstatres nfs3svc_encode_attrstat 656#define nfs3svc_encode_attrstatres nfs3svc_encode_attrstat
659#define nfs3svc_encode_wccstatres nfs3svc_encode_wccstat 657#define nfs3svc_encode_wccstatres nfs3svc_encode_wccstat
@@ -686,28 +684,219 @@ struct nfsd3_voidargs { int dummy; };
686#define WC (7+pAT) /* WCC attributes */ 684#define WC (7+pAT) /* WCC attributes */
687 685
688static struct svc_procedure nfsd_procedures3[22] = { 686static struct svc_procedure nfsd_procedures3[22] = {
689 PROC(null, void, void, void, RC_NOCACHE, ST), 687 [NFS3PROC_NULL] = {
690 PROC(getattr, fhandle, attrstat, fhandle, RC_NOCACHE, ST+AT), 688 .pc_func = (svc_procfunc) nfsd3_proc_null,
691 PROC(setattr, sattr, wccstat, fhandle, RC_REPLBUFF, ST+WC), 689 .pc_encode = (kxdrproc_t) nfs3svc_encode_voidres,
692 PROC(lookup, dirop, dirop, fhandle2, RC_NOCACHE, ST+FH+pAT+pAT), 690 .pc_argsize = sizeof(struct nfsd3_voidargs),
693 PROC(access, access, access, fhandle, RC_NOCACHE, ST+pAT+1), 691 .pc_ressize = sizeof(struct nfsd3_voidres),
694 PROC(readlink, readlink, readlink, fhandle, RC_NOCACHE, ST+pAT+1+NFS3_MAXPATHLEN/4), 692 .pc_cachetype = RC_NOCACHE,
695 PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE/4), 693 .pc_xdrressize = ST,
696 PROC(write, write, write, fhandle, RC_REPLBUFF, ST+WC+4), 694 },
697 PROC(create, create, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), 695 [NFS3PROC_GETATTR] = {
698 PROC(mkdir, mkdir, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), 696 .pc_func = (svc_procfunc) nfsd3_proc_getattr,
699 PROC(symlink, symlink, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), 697 .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
700 PROC(mknod, mknod, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), 698 .pc_encode = (kxdrproc_t) nfs3svc_encode_attrstatres,
701 PROC(remove, dirop, wccstat, fhandle, RC_REPLBUFF, ST+WC), 699 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
702 PROC(rmdir, dirop, wccstat, fhandle, RC_REPLBUFF, ST+WC), 700 .pc_argsize = sizeof(struct nfsd3_fhandleargs),
703 PROC(rename, rename, rename, fhandle2, RC_REPLBUFF, ST+WC+WC), 701 .pc_ressize = sizeof(struct nfsd3_attrstatres),
704 PROC(link, link, link, fhandle2, RC_REPLBUFF, ST+pAT+WC), 702 .pc_cachetype = RC_NOCACHE,
705 PROC(readdir, readdir, readdir, fhandle, RC_NOCACHE, 0), 703 .pc_xdrressize = ST+AT,
706 PROC(readdirplus,readdirplus, readdir, fhandle, RC_NOCACHE, 0), 704 },
707 PROC(fsstat, fhandle, fsstat, void, RC_NOCACHE, ST+pAT+2*6+1), 705 [NFS3PROC_SETATTR] = {
708 PROC(fsinfo, fhandle, fsinfo, void, RC_NOCACHE, ST+pAT+12), 706 .pc_func = (svc_procfunc) nfsd3_proc_setattr,
709 PROC(pathconf, fhandle, pathconf, void, RC_NOCACHE, ST+pAT+6), 707 .pc_decode = (kxdrproc_t) nfs3svc_decode_sattrargs,
710 PROC(commit, commit, commit, fhandle, RC_NOCACHE, ST+WC+2), 708 .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres,
709 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
710 .pc_argsize = sizeof(struct nfsd3_sattrargs),
711 .pc_ressize = sizeof(struct nfsd3_wccstatres),
712 .pc_cachetype = RC_REPLBUFF,
713 .pc_xdrressize = ST+WC,
714 },
715 [NFS3PROC_LOOKUP] = {
716 .pc_func = (svc_procfunc) nfsd3_proc_lookup,
717 .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs,
718 .pc_encode = (kxdrproc_t) nfs3svc_encode_diropres,
719 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
720 .pc_argsize = sizeof(struct nfsd3_diropargs),
721 .pc_ressize = sizeof(struct nfsd3_diropres),
722 .pc_cachetype = RC_NOCACHE,
723 .pc_xdrressize = ST+FH+pAT+pAT,
724 },
725 [NFS3PROC_ACCESS] = {
726 .pc_func = (svc_procfunc) nfsd3_proc_access,
727 .pc_decode = (kxdrproc_t) nfs3svc_decode_accessargs,
728 .pc_encode = (kxdrproc_t) nfs3svc_encode_accessres,
729 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
730 .pc_argsize = sizeof(struct nfsd3_accessargs),
731 .pc_ressize = sizeof(struct nfsd3_accessres),
732 .pc_cachetype = RC_NOCACHE,
733 .pc_xdrressize = ST+pAT+1,
734 },
735 [NFS3PROC_READLINK] = {
736 .pc_func = (svc_procfunc) nfsd3_proc_readlink,
737 .pc_decode = (kxdrproc_t) nfs3svc_decode_readlinkargs,
738 .pc_encode = (kxdrproc_t) nfs3svc_encode_readlinkres,
739 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
740 .pc_argsize = sizeof(struct nfsd3_readlinkargs),
741 .pc_ressize = sizeof(struct nfsd3_readlinkres),
742 .pc_cachetype = RC_NOCACHE,
743 .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4,
744 },
745 [NFS3PROC_READ] = {
746 .pc_func = (svc_procfunc) nfsd3_proc_read,
747 .pc_decode = (kxdrproc_t) nfs3svc_decode_readargs,
748 .pc_encode = (kxdrproc_t) nfs3svc_encode_readres,
749 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
750 .pc_argsize = sizeof(struct nfsd3_readargs),
751 .pc_ressize = sizeof(struct nfsd3_readres),
752 .pc_cachetype = RC_NOCACHE,
753 .pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4,
754 },
755 [NFS3PROC_WRITE] = {
756 .pc_func = (svc_procfunc) nfsd3_proc_write,
757 .pc_decode = (kxdrproc_t) nfs3svc_decode_writeargs,
758 .pc_encode = (kxdrproc_t) nfs3svc_encode_writeres,
759 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
760 .pc_argsize = sizeof(struct nfsd3_writeargs),
761 .pc_ressize = sizeof(struct nfsd3_writeres),
762 .pc_cachetype = RC_REPLBUFF,
763 .pc_xdrressize = ST+WC+4,
764 },
765 [NFS3PROC_CREATE] = {
766 .pc_func = (svc_procfunc) nfsd3_proc_create,
767 .pc_decode = (kxdrproc_t) nfs3svc_decode_createargs,
768 .pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
769 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
770 .pc_argsize = sizeof(struct nfsd3_createargs),
771 .pc_ressize = sizeof(struct nfsd3_createres),
772 .pc_cachetype = RC_REPLBUFF,
773 .pc_xdrressize = ST+(1+FH+pAT)+WC,
774 },
775 [NFS3PROC_MKDIR] = {
776 .pc_func = (svc_procfunc) nfsd3_proc_mkdir,
777 .pc_decode = (kxdrproc_t) nfs3svc_decode_mkdirargs,
778 .pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
779 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
780 .pc_argsize = sizeof(struct nfsd3_mkdirargs),
781 .pc_ressize = sizeof(struct nfsd3_createres),
782 .pc_cachetype = RC_REPLBUFF,
783 .pc_xdrressize = ST+(1+FH+pAT)+WC,
784 },
785 [NFS3PROC_SYMLINK] = {
786 .pc_func = (svc_procfunc) nfsd3_proc_symlink,
787 .pc_decode = (kxdrproc_t) nfs3svc_decode_symlinkargs,
788 .pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
789 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
790 .pc_argsize = sizeof(struct nfsd3_symlinkargs),
791 .pc_ressize = sizeof(struct nfsd3_createres),
792 .pc_cachetype = RC_REPLBUFF,
793 .pc_xdrressize = ST+(1+FH+pAT)+WC,
794 },
795 [NFS3PROC_MKNOD] = {
796 .pc_func = (svc_procfunc) nfsd3_proc_mknod,
797 .pc_decode = (kxdrproc_t) nfs3svc_decode_mknodargs,
798 .pc_encode = (kxdrproc_t) nfs3svc_encode_createres,
799 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
800 .pc_argsize = sizeof(struct nfsd3_mknodargs),
801 .pc_ressize = sizeof(struct nfsd3_createres),
802 .pc_cachetype = RC_REPLBUFF,
803 .pc_xdrressize = ST+(1+FH+pAT)+WC,
804 },
805 [NFS3PROC_REMOVE] = {
806 .pc_func = (svc_procfunc) nfsd3_proc_remove,
807 .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs,
808 .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres,
809 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
810 .pc_argsize = sizeof(struct nfsd3_diropargs),
811 .pc_ressize = sizeof(struct nfsd3_wccstatres),
812 .pc_cachetype = RC_REPLBUFF,
813 .pc_xdrressize = ST+WC,
814 },
815 [NFS3PROC_RMDIR] = {
816 .pc_func = (svc_procfunc) nfsd3_proc_rmdir,
817 .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs,
818 .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres,
819 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
820 .pc_argsize = sizeof(struct nfsd3_diropargs),
821 .pc_ressize = sizeof(struct nfsd3_wccstatres),
822 .pc_cachetype = RC_REPLBUFF,
823 .pc_xdrressize = ST+WC,
824 },
825 [NFS3PROC_RENAME] = {
826 .pc_func = (svc_procfunc) nfsd3_proc_rename,
827 .pc_decode = (kxdrproc_t) nfs3svc_decode_renameargs,
828 .pc_encode = (kxdrproc_t) nfs3svc_encode_renameres,
829 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
830 .pc_argsize = sizeof(struct nfsd3_renameargs),
831 .pc_ressize = sizeof(struct nfsd3_renameres),
832 .pc_cachetype = RC_REPLBUFF,
833 .pc_xdrressize = ST+WC+WC,
834 },
835 [NFS3PROC_LINK] = {
836 .pc_func = (svc_procfunc) nfsd3_proc_link,
837 .pc_decode = (kxdrproc_t) nfs3svc_decode_linkargs,
838 .pc_encode = (kxdrproc_t) nfs3svc_encode_linkres,
839 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2,
840 .pc_argsize = sizeof(struct nfsd3_linkargs),
841 .pc_ressize = sizeof(struct nfsd3_linkres),
842 .pc_cachetype = RC_REPLBUFF,
843 .pc_xdrressize = ST+pAT+WC,
844 },
845 [NFS3PROC_READDIR] = {
846 .pc_func = (svc_procfunc) nfsd3_proc_readdir,
847 .pc_decode = (kxdrproc_t) nfs3svc_decode_readdirargs,
848 .pc_encode = (kxdrproc_t) nfs3svc_encode_readdirres,
849 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
850 .pc_argsize = sizeof(struct nfsd3_readdirargs),
851 .pc_ressize = sizeof(struct nfsd3_readdirres),
852 .pc_cachetype = RC_NOCACHE,
853 },
854 [NFS3PROC_READDIRPLUS] = {
855 .pc_func = (svc_procfunc) nfsd3_proc_readdirplus,
856 .pc_decode = (kxdrproc_t) nfs3svc_decode_readdirplusargs,
857 .pc_encode = (kxdrproc_t) nfs3svc_encode_readdirres,
858 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
859 .pc_argsize = sizeof(struct nfsd3_readdirplusargs),
860 .pc_ressize = sizeof(struct nfsd3_readdirres),
861 .pc_cachetype = RC_NOCACHE,
862 },
863 [NFS3PROC_FSSTAT] = {
864 .pc_func = (svc_procfunc) nfsd3_proc_fsstat,
865 .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
866 .pc_encode = (kxdrproc_t) nfs3svc_encode_fsstatres,
867 .pc_argsize = sizeof(struct nfsd3_fhandleargs),
868 .pc_ressize = sizeof(struct nfsd3_fsstatres),
869 .pc_cachetype = RC_NOCACHE,
870 .pc_xdrressize = ST+pAT+2*6+1,
871 },
872 [NFS3PROC_FSINFO] = {
873 .pc_func = (svc_procfunc) nfsd3_proc_fsinfo,
874 .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
875 .pc_encode = (kxdrproc_t) nfs3svc_encode_fsinfores,
876 .pc_argsize = sizeof(struct nfsd3_fhandleargs),
877 .pc_ressize = sizeof(struct nfsd3_fsinfores),
878 .pc_cachetype = RC_NOCACHE,
879 .pc_xdrressize = ST+pAT+12,
880 },
881 [NFS3PROC_PATHCONF] = {
882 .pc_func = (svc_procfunc) nfsd3_proc_pathconf,
883 .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs,
884 .pc_encode = (kxdrproc_t) nfs3svc_encode_pathconfres,
885 .pc_argsize = sizeof(struct nfsd3_fhandleargs),
886 .pc_ressize = sizeof(struct nfsd3_pathconfres),
887 .pc_cachetype = RC_NOCACHE,
888 .pc_xdrressize = ST+pAT+6,
889 },
890 [NFS3PROC_COMMIT] = {
891 .pc_func = (svc_procfunc) nfsd3_proc_commit,
892 .pc_decode = (kxdrproc_t) nfs3svc_decode_commitargs,
893 .pc_encode = (kxdrproc_t) nfs3svc_encode_commitres,
894 .pc_release = (kxdrproc_t) nfs3svc_release_fhandle,
895 .pc_argsize = sizeof(struct nfsd3_commitargs),
896 .pc_ressize = sizeof(struct nfsd3_commitres),
897 .pc_cachetype = RC_NOCACHE,
898 .pc_xdrressize = ST+WC+2,
899 },
711}; 900};
712 901
713struct svc_version nfsd_version3 = { 902struct svc_version nfsd_version3 = {
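
The table rewrite above trades the positional PROC() macro for designated array initializers indexed by procedure number, so each entry names its fields explicitly and survives reordering or gaps. The construct in isolation (the enum and fields are invented for the example):

    #include <stdio.h>

    enum { PROC_NULL = 0, PROC_GETATTR = 1, PROC_SETATTR = 2, NPROCS };

    struct procedure {
            const char *name;
            unsigned int ressize;
    };

    static const struct procedure procedures[NPROCS] = {
            [PROC_NULL] = {
                    .name    = "NULL",
                    .ressize = 1,
            },
            [PROC_GETATTR] = {
                    .name    = "GETATTR",
                    .ressize = 21,
            },
            [PROC_SETATTR] = {
                    .name    = "SETATTR",
                    .ressize = 15,
            },
    };

    int main(void)
    {
            for (int i = 0; i < NPROCS; i++)
                    printf("%d: %s (ressize %u)\n", i,
                           procedures[i].name, procedures[i].ressize);
            return 0;
    }
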
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 17d0dd997204..01d4ec1c88e0 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -272,6 +272,7 @@ void fill_post_wcc(struct svc_fh *fhp)
272 272
273 err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, 273 err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
274 &fhp->fh_post_attr); 274 &fhp->fh_post_attr);
275 fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
275 if (err) 276 if (err)
276 fhp->fh_post_saved = 0; 277 fhp->fh_post_saved = 0;
277 else 278 else
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 290289bd44f7..3fd23f7aceca 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -140,8 +140,10 @@ struct nfs4_cb_compound_hdr {
140 int status; 140 int status;
141 u32 ident; 141 u32 ident;
142 u32 nops; 142 u32 nops;
143 __be32 *nops_p;
144 u32 minorversion;
143 u32 taglen; 145 u32 taglen;
144 char * tag; 146 char *tag;
145}; 147};
146 148
147static struct { 149static struct {
@@ -201,33 +203,39 @@ nfs_cb_stat_to_errno(int stat)
201 * XDR encode 203 * XDR encode
202 */ 204 */
203 205
204static int 206static void
205encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) 207encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
206{ 208{
207 __be32 * p; 209 __be32 * p;
208 210
209 RESERVE_SPACE(16); 211 RESERVE_SPACE(16);
210 WRITE32(0); /* tag length is always 0 */ 212 WRITE32(0); /* tag length is always 0 */
211 WRITE32(NFS4_MINOR_VERSION); 213 WRITE32(hdr->minorversion);
212 WRITE32(hdr->ident); 214 WRITE32(hdr->ident);
215 hdr->nops_p = p;
213 WRITE32(hdr->nops); 216 WRITE32(hdr->nops);
214 return 0;
215} 217}
216 218
217static int 219static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
218encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) 220{
221 *hdr->nops_p = htonl(hdr->nops);
222}
223
224static void
225encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
226 struct nfs4_cb_compound_hdr *hdr)
219{ 227{
220 __be32 *p; 228 __be32 *p;
221 int len = cb_rec->cbr_fh.fh_size; 229 int len = dp->dl_fh.fh_size;
222 230
223 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); 231 RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len);
224 WRITE32(OP_CB_RECALL); 232 WRITE32(OP_CB_RECALL);
225 WRITE32(cb_rec->cbr_stateid.si_generation); 233 WRITE32(dp->dl_stateid.si_generation);
226 WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); 234 WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t));
227 WRITE32(cb_rec->cbr_trunc); 235 WRITE32(0); /* truncate optimization not implemented */
228 WRITE32(len); 236 WRITE32(len);
229 WRITEMEM(&cb_rec->cbr_fh.fh_base, len); 237 WRITEMEM(&dp->dl_fh.fh_base, len);
230 return 0; 238 hdr->nops++;
231} 239}
232 240
233static int 241static int
@@ -241,17 +249,18 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
241} 249}
242 250
243static int 251static int
244nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_cb_recall *args) 252nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_delegation *args)
245{ 253{
246 struct xdr_stream xdr; 254 struct xdr_stream xdr;
247 struct nfs4_cb_compound_hdr hdr = { 255 struct nfs4_cb_compound_hdr hdr = {
248 .ident = args->cbr_ident, 256 .ident = args->dl_ident,
249 .nops = 1,
250 }; 257 };
251 258
252 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 259 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
253 encode_cb_compound_hdr(&xdr, &hdr); 260 encode_cb_compound_hdr(&xdr, &hdr);
254 return (encode_cb_recall(&xdr, args)); 261 encode_cb_recall(&xdr, args, &hdr);
262 encode_cb_nops(&hdr);
263 return 0;
255} 264}
256 265
257 266
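
encode_cb_compound_hdr() now stashes a pointer to the nops word and encode_cb_nops() patches the true count after the operations have been appended, so callers no longer have to predict nops up front. A small buffer-writer sketch of that back-patching (struct encoder is invented; the kernel operates on an xdr_stream):

    #include <arpa/inet.h>          /* htonl() */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct encoder {
            unsigned char buf[64];
            size_t pos;
            uint32_t *nops_p;       /* where the count word was reserved */
            uint32_t nops;
    };

    static void put32(struct encoder *enc, uint32_t val)
    {
            uint32_t be = htonl(val);
            memcpy(enc->buf + enc->pos, &be, 4);
            enc->pos += 4;
    }

    static void encode_hdr(struct encoder *enc)
    {
            enc->nops_p = (uint32_t *)(enc->buf + enc->pos);
            put32(enc, 0);                  /* placeholder, patched later */
    }

    static void encode_op(struct encoder *enc, uint32_t opcode)
    {
            put32(enc, opcode);
            enc->nops++;
    }

    static void encode_nops(struct encoder *enc)
    {
            uint32_t be = htonl(enc->nops);
            memcpy(enc->nops_p, &be, 4);    /* back-patch the real count */
    }

    int main(void)
    {
            struct encoder enc = { .pos = 0 };

            encode_hdr(&enc);
            encode_op(&enc, 4);             /* e.g. OP_CB_RECALL */
            encode_nops(&enc);
            printf("encoded %zu bytes, %u op(s)\n", enc.pos, enc.nops);
            return 0;
    }
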
@@ -358,18 +367,21 @@ static struct rpc_program cb_program = {
358 .pipe_dir_name = "/nfsd4_cb", 367 .pipe_dir_name = "/nfsd4_cb",
359}; 368};
360 369
370static int max_cb_time(void)
371{
372 return max(NFSD_LEASE_TIME/10, (time_t)1) * HZ;
373}
374
361/* Reference counting, callback cleanup, etc., all look racy as heck. 375/* Reference counting, callback cleanup, etc., all look racy as heck.
362 * And why is cb_set an atomic? */ 376 * And why is cb_set an atomic? */
363 377
364static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp) 378int setup_callback_client(struct nfs4_client *clp)
365{ 379{
366 struct sockaddr_in addr; 380 struct sockaddr_in addr;
367 struct nfs4_callback *cb = &clp->cl_callback; 381 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
368 struct rpc_timeout timeparms = { 382 struct rpc_timeout timeparms = {
369 .to_initval = (NFSD_LEASE_TIME/4) * HZ, 383 .to_initval = max_cb_time(),
370 .to_retries = 5, 384 .to_retries = 0,
371 .to_maxval = (NFSD_LEASE_TIME/2) * HZ,
372 .to_exponential = 1,
373 }; 385 };
374 struct rpc_create_args args = { 386 struct rpc_create_args args = {
375 .protocol = IPPROTO_TCP, 387 .protocol = IPPROTO_TCP,
@@ -386,7 +398,7 @@ static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
386 struct rpc_clnt *client; 398 struct rpc_clnt *client;
387 399
388 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 400 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
389 return ERR_PTR(-EINVAL); 401 return -EINVAL;
390 402
391 /* Initialize address */ 403 /* Initialize address */
392 memset(&addr, 0, sizeof(addr)); 404 memset(&addr, 0, sizeof(addr));
@@ -396,48 +408,77 @@ static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
396 408
397 /* Create RPC client */ 409 /* Create RPC client */
398 client = rpc_create(&args); 410 client = rpc_create(&args);
399 if (IS_ERR(client)) 411 if (IS_ERR(client)) {
400 dprintk("NFSD: couldn't create callback client: %ld\n", 412 dprintk("NFSD: couldn't create callback client: %ld\n",
401 PTR_ERR(client)); 413 PTR_ERR(client));
402 return client; 414 return PTR_ERR(client);
415 }
416 cb->cb_client = client;
417 return 0;
418
419}
420
421static void warn_no_callback_path(struct nfs4_client *clp, int reason)
422{
423 dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
424 (int)clp->cl_name.len, clp->cl_name.data, reason);
425}
426
427static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
428{
429 struct nfs4_client *clp = calldata;
430
431 if (task->tk_status)
432 warn_no_callback_path(clp, task->tk_status);
433 else
434 atomic_set(&clp->cl_cb_conn.cb_set, 1);
435 put_nfs4_client(clp);
436}
437
438static const struct rpc_call_ops nfsd4_cb_probe_ops = {
439 .rpc_call_done = nfsd4_cb_probe_done,
440};
403 441
442static struct rpc_cred *lookup_cb_cred(struct nfs4_cb_conn *cb)
443{
444 struct auth_cred acred = {
445 .machine_cred = 1
446 };
447
448 /*
449 * Note in the gss case this doesn't actually have to wait for a
450 * gss upcall (or any calls to the client); this just creates a
451 * non-uptodate cred which the rpc state machine will fill in with
452 * a refresh_upcall later.
453 */
454 return rpcauth_lookup_credcache(cb->cb_client->cl_auth, &acred,
455 RPCAUTH_LOOKUP_NEW);
404} 456}
405 457
406static int do_probe_callback(void *data) 458void do_probe_callback(struct nfs4_client *clp)
407{ 459{
408 struct nfs4_client *clp = data; 460 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
409 struct nfs4_callback *cb = &clp->cl_callback;
410 struct rpc_message msg = { 461 struct rpc_message msg = {
411 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 462 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
412 .rpc_argp = clp, 463 .rpc_argp = clp,
413 }; 464 };
414 struct rpc_clnt *client; 465 struct rpc_cred *cred;
415 int status; 466 int status;
416 467
417 client = setup_callback_client(clp); 468 cred = lookup_cb_cred(cb);
418 if (IS_ERR(client)) { 469 if (IS_ERR(cred)) {
419 status = PTR_ERR(client); 470 status = PTR_ERR(cred);
420 dprintk("NFSD: couldn't create callback client: %d\n", 471 goto out;
421 status); 472 }
422 goto out_err; 473 cb->cb_cred = cred;
474 msg.rpc_cred = cb->cb_cred;
475 status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT,
476 &nfsd4_cb_probe_ops, (void *)clp);
477out:
478 if (status) {
479 warn_no_callback_path(clp, status);
480 put_nfs4_client(clp);
423 } 481 }
424
425 status = rpc_call_sync(client, &msg, RPC_TASK_SOFT);
426
427 if (status)
428 goto out_release_client;
429
430 cb->cb_client = client;
431 atomic_set(&cb->cb_set, 1);
432 put_nfs4_client(clp);
433 return 0;
434out_release_client:
435 rpc_shutdown_client(client);
436out_err:
437 dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
438 (int)clp->cl_name.len, clp->cl_name.data, status);
439 put_nfs4_client(clp);
440 return 0;
441} 482}
442 483
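The reference discipline in the new probe path is worth spelling out: nfsd4_probe_callback() bumps cl_count, and ownership of that reference passes to the async task once rpc_call_async() succeeds — nfsd4_cb_probe_done() is then the one that calls put_nfs4_client(). Only if submission (or the preceding cred lookup) fails does do_probe_callback() drop the reference itself. A toy, compilable model of the hand-off, with pthread_create() standing in for rpc_call_async():

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int refcount = 1;     /* the client's base reference */

    static void put(void)
    {
        if (atomic_fetch_sub(&refcount, 1) == 1)
            puts("last reference dropped");
    }

    static void *probe_task(void *arg)
    {
        (void)arg;
        puts("probe done");
        put();              /* the task owns the submitter's reference */
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        atomic_fetch_add(&refcount, 1);        /* ref for the task */
        if (pthread_create(&t, NULL, probe_task, NULL) != 0)
            put();                             /* never ran: undo  */
        else
            pthread_join(t, NULL);
        put();                                 /* drop the base ref */
        return 0;
    }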
443/* 484/*
@@ -446,21 +487,65 @@ out_err:
446void 487void
447nfsd4_probe_callback(struct nfs4_client *clp) 488nfsd4_probe_callback(struct nfs4_client *clp)
448{ 489{
449 struct task_struct *t; 490 int status;
450 491
451 BUG_ON(atomic_read(&clp->cl_callback.cb_set)); 492 BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set));
493
494 status = setup_callback_client(clp);
495 if (status) {
496 warn_no_callback_path(clp, status);
497 return;
498 }
452 499
453 /* the task holds a reference to the nfs4_client struct */ 500 /* the task holds a reference to the nfs4_client struct */
454 atomic_inc(&clp->cl_count); 501 atomic_inc(&clp->cl_count);
455 502
456 t = kthread_run(do_probe_callback, clp, "nfs4_cb_probe"); 503 do_probe_callback(clp);
504}
457 505
458 if (IS_ERR(t)) 506static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
459 atomic_dec(&clp->cl_count); 507{
508 struct nfs4_delegation *dp = calldata;
509 struct nfs4_client *clp = dp->dl_client;
460 510
461 return; 511 switch (task->tk_status) {
512 case -EIO:
513 /* Network partition? */
514 atomic_set(&clp->cl_cb_conn.cb_set, 0);
515 warn_no_callback_path(clp, task->tk_status);
516 case -EBADHANDLE:
517 case -NFS4ERR_BAD_STATEID:
518 /* Race: client probably got cb_recall
519 * before open reply granting delegation */
520 break;
521 default:
522 /* success, or error we can't handle */
523 return;
524 }
525 if (dp->dl_retries--) {
526 rpc_delay(task, 2*HZ);
527 task->tk_status = 0;
528 rpc_restart_call(task);
529 } else {
530 atomic_set(&clp->cl_cb_conn.cb_set, 0);
531 warn_no_callback_path(clp, task->tk_status);
532 }
533}
534
535static void nfsd4_cb_recall_release(void *calldata)
536{
537 struct nfs4_delegation *dp = calldata;
538 struct nfs4_client *clp = dp->dl_client;
539
540 nfs4_put_delegation(dp);
541 put_nfs4_client(clp);
462} 542}
463 543
544static const struct rpc_call_ops nfsd4_cb_recall_ops = {
545 .rpc_call_done = nfsd4_cb_recall_done,
546 .rpc_release = nfsd4_cb_recall_release,
547};
548
464/* 549/*
465 * called with dp->dl_count inc'ed. 550 * called with dp->dl_count inc'ed.
466 */ 551 */
@@ -468,41 +553,19 @@ void
468nfsd4_cb_recall(struct nfs4_delegation *dp) 553nfsd4_cb_recall(struct nfs4_delegation *dp)
469{ 554{
470 struct nfs4_client *clp = dp->dl_client; 555 struct nfs4_client *clp = dp->dl_client;
471 struct rpc_clnt *clnt = clp->cl_callback.cb_client; 556 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
472 struct nfs4_cb_recall *cbr = &dp->dl_recall;
473 struct rpc_message msg = { 557 struct rpc_message msg = {
474 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], 558 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
475 .rpc_argp = cbr, 559 .rpc_argp = dp,
560 .rpc_cred = clp->cl_cb_conn.cb_cred
476 }; 561 };
477 int retries = 1; 562 int status;
478 int status = 0; 563
479 564 dp->dl_retries = 1;
480 cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */ 565 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
481 cbr->cbr_dp = dp; 566 &nfsd4_cb_recall_ops, dp);
482 567 if (status) {
483 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT); 568 put_nfs4_client(clp);
484 while (retries--) { 569 nfs4_put_delegation(dp);
485 switch (status) {
486 case -EIO:
487 /* Network partition? */
488 atomic_set(&clp->cl_callback.cb_set, 0);
489 case -EBADHANDLE:
490 case -NFS4ERR_BAD_STATEID:
491 /* Race: client probably got cb_recall
492 * before open reply granting delegation */
493 break;
494 default:
495 goto out_put_cred;
496 }
497 ssleep(2);
498 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT);
499 } 570 }
500out_put_cred:
501 /*
502 * Success or failure, now we're either waiting for lease expiration
503 * or deleg_return.
504 */
505 put_nfs4_client(clp);
506 nfs4_put_delegation(dp);
507 return;
508} 571}
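The recall retry also moves out of thread context: where the old code ran in a kthread, slept with ssleep(2), and re-issued rpc_call_sync(), nfsd4_cb_recall_done() now re-arms the same task from the completion callback — rpc_delay(task, 2*HZ) parks it for the same two seconds, task->tk_status = 0 clears the soft error, and rpc_restart_call() re-queues it, all without tying up a thread. dp->dl_retries = 1 preserves the old single-retry behaviour, and the .rpc_release callback guarantees the client and delegation references are dropped exactly once however the task finishes.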
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index b2883e9c6381..7c8801769a3c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -51,6 +51,78 @@
51 51
52#define NFSDDBG_FACILITY NFSDDBG_PROC 52#define NFSDDBG_FACILITY NFSDDBG_PROC
53 53
54static u32 nfsd_attrmask[] = {
55 NFSD_WRITEABLE_ATTRS_WORD0,
56 NFSD_WRITEABLE_ATTRS_WORD1,
57 NFSD_WRITEABLE_ATTRS_WORD2
58};
59
60static u32 nfsd41_ex_attrmask[] = {
61 NFSD_SUPPATTR_EXCLCREAT_WORD0,
62 NFSD_SUPPATTR_EXCLCREAT_WORD1,
63 NFSD_SUPPATTR_EXCLCREAT_WORD2
64};
65
66static __be32
67check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
68 u32 *bmval, u32 *writable)
69{
70 struct dentry *dentry = cstate->current_fh.fh_dentry;
71 struct svc_export *exp = cstate->current_fh.fh_export;
72
73 /*
 74	 * Check whether the requested attributes are supported by this NFSv4
 75	 * server. Per the spec, unsupported attributes return ERR_ATTRNOTSUPP.
76 */
77 if ((bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) ||
78 (bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) ||
79 (bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
80 return nfserr_attrnotsupp;
81
82 /*
 83	 * Check whether FATTR4_WORD0_ACL and FATTR4_WORD0_FS_LOCATIONS can be
 84	 * supported in the current environment.
85 */
86 if (bmval[0] & FATTR4_WORD0_ACL) {
87 if (!IS_POSIXACL(dentry->d_inode))
88 return nfserr_attrnotsupp;
89 }
90 if (bmval[0] & FATTR4_WORD0_FS_LOCATIONS) {
91 if (exp->ex_fslocs.locations == NULL)
92 return nfserr_attrnotsupp;
93 }
94
95 /*
 96	 * Per the spec, attempts to set read-only attributes return ERR_INVAL.
97 */
98 if (writable) {
99 if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
100 (bmval[2] & ~writable[2]))
101 return nfserr_inval;
102 }
103
104 return nfs_ok;
105}
106
107static __be32
108nfsd4_check_open_attributes(struct svc_rqst *rqstp,
109 struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
110{
111 __be32 status = nfs_ok;
112
113 if (open->op_create == NFS4_OPEN_CREATE) {
114 if (open->op_createmode == NFS4_CREATE_UNCHECKED
115 || open->op_createmode == NFS4_CREATE_GUARDED)
116 status = check_attr_support(rqstp, cstate,
117 open->op_bmval, nfsd_attrmask);
118 else if (open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1)
119 status = check_attr_support(rqstp, cstate,
120 open->op_bmval, nfsd41_ex_attrmask);
121 }
122
123 return status;
124}
125
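check_attr_support() boils down to two mask tests: bits outside the supported set draw NFS4ERR_ATTRNOTSUPP, and bits inside it but outside the writable set draw NFS4ERR_INVAL — with writable == NULL (as the VERIFY path passes below) skipping the second test, since verifying a read-only attribute is legitimate. A toy, compilable illustration; the mask values are made up, the real ones come from nfsd's FATTR word definitions:

    #include <stdint.h>
    #include <stdio.h>

    #define SUPPORTED 0x00ffu   /* attributes this server knows about */
    #define WRITEABLE 0x000fu   /* subset a client may set            */

    static const char *check(uint32_t bmval, const uint32_t *writable)
    {
        if (bmval & ~SUPPORTED)
            return "NFS4ERR_ATTRNOTSUPP"; /* unknown attribute   */
        if (writable && (bmval & ~*writable))
            return "NFS4ERR_INVAL";       /* known but read-only */
        return "ok";
    }

    int main(void)
    {
        const uint32_t w = WRITEABLE;
        printf("%s\n", check(0x0003, &w));   /* ok                  */
        printf("%s\n", check(0x0010, &w));   /* NFS4ERR_INVAL       */
        printf("%s\n", check(0x1000, &w));   /* NFS4ERR_ATTRNOTSUPP */
        printf("%s\n", check(0x0010, NULL)); /* ok: verify-style    */
        return 0;
    }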
54static inline void 126static inline void
55fh_dup2(struct svc_fh *dst, struct svc_fh *src) 127fh_dup2(struct svc_fh *dst, struct svc_fh *src)
56{ 128{
@@ -225,6 +297,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
225 if (status) 297 if (status)
226 goto out; 298 goto out;
227 299
300 status = nfsd4_check_open_attributes(rqstp, cstate, open);
301 if (status)
302 goto out;
303
228 /* Openowner is now set, so sequence id will get bumped. Now we need 304 /* Openowner is now set, so sequence id will get bumped. Now we need
229 * these checks before we do any creates: */ 305 * these checks before we do any creates: */
230 status = nfserr_grace; 306 status = nfserr_grace;
@@ -395,6 +471,11 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
395 if (status) 471 if (status)
396 return status; 472 return status;
397 473
474 status = check_attr_support(rqstp, cstate, create->cr_bmval,
475 nfsd_attrmask);
476 if (status)
477 return status;
478
398 switch (create->cr_type) { 479 switch (create->cr_type) {
399 case NF4LNK: 480 case NF4LNK:
400 /* ugh! we have to null-terminate the linktext, or 481 /* ugh! we have to null-terminate the linktext, or
@@ -689,6 +770,12 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
689 if (status) 770 if (status)
690 return status; 771 return status;
691 status = nfs_ok; 772 status = nfs_ok;
773
774 status = check_attr_support(rqstp, cstate, setattr->sa_bmval,
775 nfsd_attrmask);
776 if (status)
777 goto out;
778
692 if (setattr->sa_acl != NULL) 779 if (setattr->sa_acl != NULL)
693 status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh, 780 status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh,
694 setattr->sa_acl); 781 setattr->sa_acl);
@@ -763,10 +850,10 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
763 if (status) 850 if (status)
764 return status; 851 return status;
765 852
766 if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) 853 status = check_attr_support(rqstp, cstate, verify->ve_bmval, NULL);
767 || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) 854 if (status)
768 || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion))) 855 return status;
769 return nfserr_attrnotsupp; 856
770 if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR) 857 if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
771 || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)) 858 || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
772 return nfserr_inval; 859 return nfserr_inval;
@@ -1226,24 +1313,9 @@ static const char *nfsd4_op_name(unsigned opnum)
1226 return "unknown_operation"; 1313 return "unknown_operation";
1227} 1314}
1228 1315
1229#define nfs4svc_decode_voidargs NULL
1230#define nfs4svc_release_void NULL
1231#define nfsd4_voidres nfsd4_voidargs 1316#define nfsd4_voidres nfsd4_voidargs
1232#define nfs4svc_release_compound NULL
1233struct nfsd4_voidargs { int dummy; }; 1317struct nfsd4_voidargs { int dummy; };
1234 1318
1235#define PROC(name, argt, rest, relt, cache, respsize) \
1236 { (svc_procfunc) nfsd4_proc_##name, \
1237 (kxdrproc_t) nfs4svc_decode_##argt##args, \
1238 (kxdrproc_t) nfs4svc_encode_##rest##res, \
1239 (kxdrproc_t) nfs4svc_release_##relt, \
1240 sizeof(struct nfsd4_##argt##args), \
1241 sizeof(struct nfsd4_##rest##res), \
1242 0, \
1243 cache, \
1244 respsize, \
1245 }
1246
1247/* 1319/*
1248 * TODO: At the present time, the NFSv4 server does not do XID caching 1320 * TODO: At the present time, the NFSv4 server does not do XID caching
1249 * of requests. Implementing XID caching would not be a serious problem, 1321 * of requests. Implementing XID caching would not be a serious problem,
@@ -1255,8 +1327,23 @@ struct nfsd4_voidargs { int dummy; };
1255 * better XID's. 1327 * better XID's.
1256 */ 1328 */
1257static struct svc_procedure nfsd_procedures4[2] = { 1329static struct svc_procedure nfsd_procedures4[2] = {
1258 PROC(null, void, void, void, RC_NOCACHE, 1), 1330 [NFSPROC4_NULL] = {
1259 PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE/4) 1331 .pc_func = (svc_procfunc) nfsd4_proc_null,
1332 .pc_encode = (kxdrproc_t) nfs4svc_encode_voidres,
1333 .pc_argsize = sizeof(struct nfsd4_voidargs),
1334 .pc_ressize = sizeof(struct nfsd4_voidres),
1335 .pc_cachetype = RC_NOCACHE,
1336 .pc_xdrressize = 1,
1337 },
1338 [NFSPROC4_COMPOUND] = {
1339 .pc_func = (svc_procfunc) nfsd4_proc_compound,
1340 .pc_decode = (kxdrproc_t) nfs4svc_decode_compoundargs,
1341 .pc_encode = (kxdrproc_t) nfs4svc_encode_compoundres,
1342 .pc_argsize = sizeof(struct nfsd4_compoundargs),
1343 .pc_ressize = sizeof(struct nfsd4_compoundres),
1344 .pc_cachetype = RC_NOCACHE,
1345 .pc_xdrressize = NFSD_BUFSIZE/4,
1346 },
1260}; 1347};
1261 1348
1262struct svc_version nfsd_version4 = { 1349struct svc_version nfsd_version4 = {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3b711f5147a7..980a216a48c8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -182,7 +182,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
182{ 182{
183 struct nfs4_delegation *dp; 183 struct nfs4_delegation *dp;
184 struct nfs4_file *fp = stp->st_file; 184 struct nfs4_file *fp = stp->st_file;
185 struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback; 185 struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
186 186
187 dprintk("NFSD alloc_init_deleg\n"); 187 dprintk("NFSD alloc_init_deleg\n");
188 if (fp->fi_had_conflict) 188 if (fp->fi_had_conflict)
@@ -203,10 +203,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
203 get_file(stp->st_vfs_file); 203 get_file(stp->st_vfs_file);
204 dp->dl_vfs_file = stp->st_vfs_file; 204 dp->dl_vfs_file = stp->st_vfs_file;
205 dp->dl_type = type; 205 dp->dl_type = type;
206 dp->dl_recall.cbr_dp = NULL; 206 dp->dl_ident = cb->cb_ident;
207 dp->dl_recall.cbr_ident = cb->cb_ident; 207 dp->dl_stateid.si_boot = get_seconds();
208 dp->dl_recall.cbr_trunc = 0;
209 dp->dl_stateid.si_boot = boot_time;
210 dp->dl_stateid.si_stateownerid = current_delegid++; 208 dp->dl_stateid.si_stateownerid = current_delegid++;
211 dp->dl_stateid.si_fileid = 0; 209 dp->dl_stateid.si_fileid = 0;
212 dp->dl_stateid.si_generation = 0; 210 dp->dl_stateid.si_generation = 0;
@@ -427,6 +425,11 @@ static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
427{ 425{
428 int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT; 426 int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
429 427
428 if (fchan->maxreqs < 1)
429 return nfserr_inval;
430 else if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
431 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
432
430 spin_lock(&nfsd_serv->sv_lock); 433 spin_lock(&nfsd_serv->sv_lock);
431 if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages) 434 if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
432 np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used; 435 np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
@@ -446,8 +449,8 @@ static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
446 * fchan holds the client values on input, and the server values on output 449 * fchan holds the client values on input, and the server values on output
447 */ 450 */
448static int init_forechannel_attrs(struct svc_rqst *rqstp, 451static int init_forechannel_attrs(struct svc_rqst *rqstp,
449 struct nfsd4_session *session, 452 struct nfsd4_channel_attrs *session_fchan,
450 struct nfsd4_channel_attrs *fchan) 453 struct nfsd4_channel_attrs *fchan)
451{ 454{
452 int status = 0; 455 int status = 0;
453 __u32 maxcount = svc_max_payload(rqstp); 456 __u32 maxcount = svc_max_payload(rqstp);
@@ -457,21 +460,21 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
457 /* Use the client's max request and max response size if possible */ 460 /* Use the client's max request and max response size if possible */
458 if (fchan->maxreq_sz > maxcount) 461 if (fchan->maxreq_sz > maxcount)
459 fchan->maxreq_sz = maxcount; 462 fchan->maxreq_sz = maxcount;
460 session->se_fmaxreq_sz = fchan->maxreq_sz; 463 session_fchan->maxreq_sz = fchan->maxreq_sz;
461 464
462 if (fchan->maxresp_sz > maxcount) 465 if (fchan->maxresp_sz > maxcount)
463 fchan->maxresp_sz = maxcount; 466 fchan->maxresp_sz = maxcount;
464 session->se_fmaxresp_sz = fchan->maxresp_sz; 467 session_fchan->maxresp_sz = fchan->maxresp_sz;
465 468
466 /* Set the max response cached size our default which is 469 /* Set the max response cached size our default which is
467 * a multiple of PAGE_SIZE and small */ 470 * a multiple of PAGE_SIZE and small */
468 session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE; 471 session_fchan->maxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
469 fchan->maxresp_cached = session->se_fmaxresp_cached; 472 fchan->maxresp_cached = session_fchan->maxresp_cached;
470 473
471 /* Use the client's maxops if possible */ 474 /* Use the client's maxops if possible */
472 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) 475 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
473 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; 476 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
474 session->se_fmaxops = fchan->maxops; 477 session_fchan->maxops = fchan->maxops;
475 478
476 /* try to use the client requested number of slots */ 479 /* try to use the client requested number of slots */
477 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION) 480 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
@@ -483,7 +486,7 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
483 */ 486 */
484 status = set_forechannel_maxreqs(fchan); 487 status = set_forechannel_maxreqs(fchan);
485 488
486 session->se_fnumslots = fchan->maxreqs; 489 session_fchan->maxreqs = fchan->maxreqs;
487 return status; 490 return status;
488} 491}
489 492
@@ -497,12 +500,14 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
497 memset(&tmp, 0, sizeof(tmp)); 500 memset(&tmp, 0, sizeof(tmp));
498 501
499 /* FIXME: For now, we just accept the client back channel attributes. */ 502 /* FIXME: For now, we just accept the client back channel attributes. */
500 status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel); 503 tmp.se_bchannel = cses->back_channel;
504 status = init_forechannel_attrs(rqstp, &tmp.se_fchannel,
505 &cses->fore_channel);
501 if (status) 506 if (status)
502 goto out; 507 goto out;
503 508
504 /* allocate struct nfsd4_session and slot table in one piece */ 509 /* allocate struct nfsd4_session and slot table in one piece */
505 slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot); 510 slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot);
506 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL); 511 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
507 if (!new) 512 if (!new)
508 goto out; 513 goto out;
@@ -576,7 +581,7 @@ free_session(struct kref *kref)
576 int i; 581 int i;
577 582
578 ses = container_of(kref, struct nfsd4_session, se_ref); 583 ses = container_of(kref, struct nfsd4_session, se_ref);
579 for (i = 0; i < ses->se_fnumslots; i++) { 584 for (i = 0; i < ses->se_fchannel.maxreqs; i++) {
580 struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry; 585 struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
581 nfsd4_release_respages(e->ce_respages, e->ce_resused); 586 nfsd4_release_respages(e->ce_respages, e->ce_resused);
582 } 587 }
@@ -632,16 +637,20 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
632static void 637static void
633shutdown_callback_client(struct nfs4_client *clp) 638shutdown_callback_client(struct nfs4_client *clp)
634{ 639{
635 struct rpc_clnt *clnt = clp->cl_callback.cb_client; 640 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
636 641
637 if (clnt) { 642 if (clnt) {
638 /* 643 /*
639 * Callback threads take a reference on the client, so there 644 * Callback threads take a reference on the client, so there
640 * should be no outstanding callbacks at this point. 645 * should be no outstanding callbacks at this point.
641 */ 646 */
642 clp->cl_callback.cb_client = NULL; 647 clp->cl_cb_conn.cb_client = NULL;
643 rpc_shutdown_client(clnt); 648 rpc_shutdown_client(clnt);
644 } 649 }
650 if (clp->cl_cb_conn.cb_cred) {
651 put_rpccred(clp->cl_cb_conn.cb_cred);
652 clp->cl_cb_conn.cb_cred = NULL;
653 }
645} 654}
646 655
647static inline void 656static inline void
@@ -714,7 +723,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
714 return NULL; 723 return NULL;
715 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); 724 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
716 atomic_set(&clp->cl_count, 1); 725 atomic_set(&clp->cl_count, 1);
717 atomic_set(&clp->cl_callback.cb_set, 0); 726 atomic_set(&clp->cl_cb_conn.cb_set, 0);
718 INIT_LIST_HEAD(&clp->cl_idhash); 727 INIT_LIST_HEAD(&clp->cl_idhash);
719 INIT_LIST_HEAD(&clp->cl_strhash); 728 INIT_LIST_HEAD(&clp->cl_strhash);
720 INIT_LIST_HEAD(&clp->cl_openowners); 729 INIT_LIST_HEAD(&clp->cl_openowners);
@@ -966,7 +975,7 @@ parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigne
966static void 975static void
967gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) 976gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
968{ 977{
969 struct nfs4_callback *cb = &clp->cl_callback; 978 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
970 979
971 /* Currently, we only support tcp for the callback channel */ 980 /* Currently, we only support tcp for the callback channel */
972 if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3)) 981 if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3))
@@ -975,6 +984,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
975 if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val, 984 if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val,
976 &cb->cb_addr, &cb->cb_port))) 985 &cb->cb_addr, &cb->cb_port)))
977 goto out_err; 986 goto out_err;
987 cb->cb_minorversion = 0;
978 cb->cb_prog = se->se_callback_prog; 988 cb->cb_prog = se->se_callback_prog;
979 cb->cb_ident = se->se_callback_ident; 989 cb->cb_ident = se->se_callback_ident;
980 return; 990 return;
@@ -1128,7 +1138,7 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
1128 * is sent (lease renewal). 1138 * is sent (lease renewal).
1129 */ 1139 */
1130 if (seq && nfsd4_not_cached(resp)) { 1140 if (seq && nfsd4_not_cached(resp)) {
1131 seq->maxslots = resp->cstate.session->se_fnumslots; 1141 seq->maxslots = resp->cstate.session->se_fchannel.maxreqs;
1132 return nfs_ok; 1142 return nfs_ok;
1133 } 1143 }
1134 1144
@@ -1238,12 +1248,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1238 expire_client(conf); 1248 expire_client(conf);
1239 goto out_new; 1249 goto out_new;
1240 } 1250 }
1241 if (ip_addr != conf->cl_addr &&
1242 !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) {
1243 /* Client collision. 18.35.4 case 3 */
1244 status = nfserr_clid_inuse;
1245 goto out;
1246 }
1247 /* 1251 /*
1248 * Set bit when the owner id and verifier map to an already 1252 * Set bit when the owner id and verifier map to an already
1249 * confirmed client id (18.35.3). 1253 * confirmed client id (18.35.3).
@@ -1257,12 +1261,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1257 copy_verf(conf, &verf); 1261 copy_verf(conf, &verf);
1258 new = conf; 1262 new = conf;
1259 goto out_copy; 1263 goto out_copy;
1260 } else { 1264 }
1261 /* 18.35.4 case 7 */ 1265
1262 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { 1266 /* 18.35.4 case 7 */
1263 status = nfserr_noent; 1267 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
1264 goto out; 1268 status = nfserr_noent;
1265 } 1269 goto out;
1266 } 1270 }
1267 1271
1268 unconf = find_unconfirmed_client_by_str(dname, strhashval, true); 1272 unconf = find_unconfirmed_client_by_str(dname, strhashval, true);
@@ -1471,7 +1475,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1471 goto out; 1475 goto out;
1472 1476
1473 status = nfserr_badslot; 1477 status = nfserr_badslot;
1474 if (seq->slotid >= session->se_fnumslots) 1478 if (seq->slotid >= session->se_fchannel.maxreqs)
1475 goto out; 1479 goto out;
1476 1480
1477 slot = &session->se_slots[seq->slotid]; 1481 slot = &session->se_slots[seq->slotid];
@@ -1686,9 +1690,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1686 else { 1690 else {
1687 /* XXX: We just turn off callbacks until we can handle 1691 /* XXX: We just turn off callbacks until we can handle
1688 * change request correctly. */ 1692 * change request correctly. */
1689 atomic_set(&conf->cl_callback.cb_set, 0); 1693 atomic_set(&conf->cl_cb_conn.cb_set, 0);
1690 gen_confirm(conf);
1691 nfsd4_remove_clid_dir(unconf);
1692 expire_client(unconf); 1694 expire_client(unconf);
1693 status = nfs_ok; 1695 status = nfs_ok;
1694 1696
@@ -1882,7 +1884,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
1882 stp->st_stateowner = sop; 1884 stp->st_stateowner = sop;
1883 get_nfs4_file(fp); 1885 get_nfs4_file(fp);
1884 stp->st_file = fp; 1886 stp->st_file = fp;
1885 stp->st_stateid.si_boot = boot_time; 1887 stp->st_stateid.si_boot = get_seconds();
1886 stp->st_stateid.si_stateownerid = sop->so_id; 1888 stp->st_stateid.si_stateownerid = sop->so_id;
1887 stp->st_stateid.si_fileid = fp->fi_id; 1889 stp->st_stateid.si_fileid = fp->fi_id;
1888 stp->st_stateid.si_generation = 0; 1890 stp->st_stateid.si_generation = 0;
@@ -2059,19 +2061,6 @@ nfs4_file_downgrade(struct file *filp, unsigned int share_access)
2059} 2061}
2060 2062
2061/* 2063/*
2062 * Recall a delegation
2063 */
2064static int
2065do_recall(void *__dp)
2066{
2067 struct nfs4_delegation *dp = __dp;
2068
2069 dp->dl_file->fi_had_conflict = true;
2070 nfsd4_cb_recall(dp);
2071 return 0;
2072}
2073
2074/*
2075 * Spawn a thread to perform a recall on the delegation represented 2064 * Spawn a thread to perform a recall on the delegation represented
2076 * by the lease (file_lock) 2065 * by the lease (file_lock)
2077 * 2066 *
@@ -2082,8 +2071,7 @@ do_recall(void *__dp)
2082static 2071static
2083void nfsd_break_deleg_cb(struct file_lock *fl) 2072void nfsd_break_deleg_cb(struct file_lock *fl)
2084{ 2073{
2085 struct nfs4_delegation *dp= (struct nfs4_delegation *)fl->fl_owner; 2074 struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
2086 struct task_struct *t;
2087 2075
2088 dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl); 2076 dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl);
2089 if (!dp) 2077 if (!dp)
@@ -2111,16 +2099,8 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2111 */ 2099 */
2112 fl->fl_break_time = 0; 2100 fl->fl_break_time = 0;
2113 2101
2114 t = kthread_run(do_recall, dp, "%s", "nfs4_cb_recall"); 2102 dp->dl_file->fi_had_conflict = true;
2115 if (IS_ERR(t)) { 2103 nfsd4_cb_recall(dp);
2116 struct nfs4_client *clp = dp->dl_client;
2117
2118 printk(KERN_INFO "NFSD: Callback thread failed for "
2119 "for client (clientid %08x/%08x)\n",
2120 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
2121 put_nfs4_client(dp->dl_client);
2122 nfs4_put_delegation(dp);
2123 }
2124} 2104}
2125 2105
2126/* 2106/*
@@ -2422,7 +2402,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2422{ 2402{
2423 struct nfs4_delegation *dp; 2403 struct nfs4_delegation *dp;
2424 struct nfs4_stateowner *sop = stp->st_stateowner; 2404 struct nfs4_stateowner *sop = stp->st_stateowner;
2425 struct nfs4_callback *cb = &sop->so_client->cl_callback; 2405 struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn;
2426 struct file_lock fl, *flp = &fl; 2406 struct file_lock fl, *flp = &fl;
2427 int status, flag = 0; 2407 int status, flag = 0;
2428 2408
@@ -2614,7 +2594,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2614 renew_client(clp); 2594 renew_client(clp);
2615 status = nfserr_cb_path_down; 2595 status = nfserr_cb_path_down;
2616 if (!list_empty(&clp->cl_delegations) 2596 if (!list_empty(&clp->cl_delegations)
2617 && !atomic_read(&clp->cl_callback.cb_set)) 2597 && !atomic_read(&clp->cl_cb_conn.cb_set))
2618 goto out; 2598 goto out;
2619 status = nfs_ok; 2599 status = nfs_ok;
2620out: 2600out:
@@ -2738,12 +2718,42 @@ nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
2738static int 2718static int
2739STALE_STATEID(stateid_t *stateid) 2719STALE_STATEID(stateid_t *stateid)
2740{ 2720{
2741 if (stateid->si_boot == boot_time) 2721 if (time_after((unsigned long)boot_time,
2742 return 0; 2722 (unsigned long)stateid->si_boot)) {
2743 dprintk("NFSD: stale stateid (%08x/%08x/%08x/%08x)!\n", 2723 dprintk("NFSD: stale stateid (%08x/%08x/%08x/%08x)!\n",
2744 stateid->si_boot, stateid->si_stateownerid, stateid->si_fileid, 2724 stateid->si_boot, stateid->si_stateownerid,
2745 stateid->si_generation); 2725 stateid->si_fileid, stateid->si_generation);
2746 return 1; 2726 return 1;
2727 }
2728 return 0;
2729}
2730
2731static int
2732EXPIRED_STATEID(stateid_t *stateid)
2733{
2734 if (time_before((unsigned long)boot_time,
2735 ((unsigned long)stateid->si_boot)) &&
2736 time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) {
2737 dprintk("NFSD: expired stateid (%08x/%08x/%08x/%08x)!\n",
2738 stateid->si_boot, stateid->si_stateownerid,
2739 stateid->si_fileid, stateid->si_generation);
2740 return 1;
2741 }
2742 return 0;
2743}
2744
2745static __be32
2746stateid_error_map(stateid_t *stateid)
2747{
2748 if (STALE_STATEID(stateid))
2749 return nfserr_stale_stateid;
2750 if (EXPIRED_STATEID(stateid))
2751 return nfserr_expired;
2752
2753 dprintk("NFSD: bad stateid (%08x/%08x/%08x/%08x)!\n",
2754 stateid->si_boot, stateid->si_stateownerid,
2755 stateid->si_fileid, stateid->si_generation);
2756 return nfserr_bad_stateid;
2747} 2757}
2748 2758
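Since si_boot is now stamped with get_seconds() at stateid-creation time rather than copied from the server's boot time, validity becomes a pair of ordering tests: a stateid minted before this server instance booted is stale, one minted during this instance whose lease has since lapsed is expired, and anything else that still failed lookup is simply bad. The comparisons use the kernel's wraparound-safe time_after()/time_before(); a compilable sketch with made-up timestamps (the macros defined as the kernel defines them):

    #include <stdio.h>

    #define time_after(a, b)  ((long)((b) - (a)) < 0)
    #define time_before(a, b) time_after(b, a)

    static const char *classify(unsigned long si_boot, unsigned long boot,
                                unsigned long lease, unsigned long now)
    {
        if (time_after(boot, si_boot))
            return "stale";     /* minted before this server instance */
        if (time_before(boot, si_boot) &&
            time_before(si_boot + lease, now))
            return "expired";   /* minted here, but the lease lapsed  */
        return "bad_stateid";   /* lookup already failed              */
    }

    int main(void)
    {
        unsigned long boot = 1000, lease = 90, now = 1500;
        printf("%s\n", classify( 900, boot, lease, now)); /* stale       */
        printf("%s\n", classify(1100, boot, lease, now)); /* expired     */
        printf("%s\n", classify(1499, boot, lease, now)); /* bad_stateid */
        return 0;
    }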
2749static inline int 2759static inline int
@@ -2867,8 +2877,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2867 status = nfserr_bad_stateid; 2877 status = nfserr_bad_stateid;
2868 if (is_delegation_stateid(stateid)) { 2878 if (is_delegation_stateid(stateid)) {
2869 dp = find_delegation_stateid(ino, stateid); 2879 dp = find_delegation_stateid(ino, stateid);
2870 if (!dp) 2880 if (!dp) {
2881 status = stateid_error_map(stateid);
2871 goto out; 2882 goto out;
2883 }
2872 status = check_stateid_generation(stateid, &dp->dl_stateid, 2884 status = check_stateid_generation(stateid, &dp->dl_stateid,
2873 flags); 2885 flags);
2874 if (status) 2886 if (status)
@@ -2881,8 +2893,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2881 *filpp = dp->dl_vfs_file; 2893 *filpp = dp->dl_vfs_file;
2882 } else { /* open or lock stateid */ 2894 } else { /* open or lock stateid */
2883 stp = find_stateid(stateid, flags); 2895 stp = find_stateid(stateid, flags);
2884 if (!stp) 2896 if (!stp) {
2897 status = stateid_error_map(stateid);
2885 goto out; 2898 goto out;
2899 }
2886 if (nfs4_check_fh(current_fh, stp)) 2900 if (nfs4_check_fh(current_fh, stp))
2887 goto out; 2901 goto out;
2888 if (!stp->st_stateowner->so_confirmed) 2902 if (!stp->st_stateowner->so_confirmed)
@@ -2956,7 +2970,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
2956 */ 2970 */
2957 sop = search_close_lru(stateid->si_stateownerid, flags); 2971 sop = search_close_lru(stateid->si_stateownerid, flags);
2958 if (sop == NULL) 2972 if (sop == NULL)
2959 return nfserr_bad_stateid; 2973 return stateid_error_map(stateid);
2960 *sopp = sop; 2974 *sopp = sop;
2961 goto check_replay; 2975 goto check_replay;
2962 } 2976 }
@@ -3227,8 +3241,10 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3227 if (!is_delegation_stateid(stateid)) 3241 if (!is_delegation_stateid(stateid))
3228 goto out; 3242 goto out;
3229 dp = find_delegation_stateid(inode, stateid); 3243 dp = find_delegation_stateid(inode, stateid);
3230 if (!dp) 3244 if (!dp) {
3245 status = stateid_error_map(stateid);
3231 goto out; 3246 goto out;
3247 }
3232 status = check_stateid_generation(stateid, &dp->dl_stateid, flags); 3248 status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
3233 if (status) 3249 if (status)
3234 goto out; 3250 goto out;
@@ -3455,7 +3471,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
3455 stp->st_stateowner = sop; 3471 stp->st_stateowner = sop;
3456 get_nfs4_file(fp); 3472 get_nfs4_file(fp);
3457 stp->st_file = fp; 3473 stp->st_file = fp;
3458 stp->st_stateid.si_boot = boot_time; 3474 stp->st_stateid.si_boot = get_seconds();
3459 stp->st_stateid.si_stateownerid = sop->so_id; 3475 stp->st_stateid.si_stateownerid = sop->so_id;
3460 stp->st_stateid.si_fileid = fp->fi_id; 3476 stp->st_stateid.si_fileid = fp->fi_id;
3461 stp->st_stateid.si_generation = 0; 3477 stp->st_stateid.si_generation = 0;
@@ -3987,6 +4003,7 @@ nfs4_state_init(void)
3987 INIT_LIST_HEAD(&conf_str_hashtbl[i]); 4003 INIT_LIST_HEAD(&conf_str_hashtbl[i]);
3988 INIT_LIST_HEAD(&unconf_str_hashtbl[i]); 4004 INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
3989 INIT_LIST_HEAD(&unconf_id_hashtbl[i]); 4005 INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
4006 INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
3990 } 4007 }
3991 for (i = 0; i < SESSION_HASH_SIZE; i++) 4008 for (i = 0; i < SESSION_HASH_SIZE; i++)
3992 INIT_LIST_HEAD(&sessionid_hashtbl[i]); 4009 INIT_LIST_HEAD(&sessionid_hashtbl[i]);
@@ -4009,8 +4026,6 @@ nfs4_state_init(void)
4009 INIT_LIST_HEAD(&close_lru); 4026 INIT_LIST_HEAD(&close_lru);
4010 INIT_LIST_HEAD(&client_lru); 4027 INIT_LIST_HEAD(&client_lru);
4011 INIT_LIST_HEAD(&del_recall_lru); 4028 INIT_LIST_HEAD(&del_recall_lru);
4012 for (i = 0; i < CLIENT_HASH_SIZE; i++)
4013 INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
4014 reclaim_str_hashtbl_size = 0; 4029 reclaim_str_hashtbl_size = 0;
4015 return 0; 4030 return 0;
4016} 4031}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index b73549d293be..2dcc7feaa6ff 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -83,16 +83,6 @@ check_filename(char *str, int len, __be32 err)
83 return 0; 83 return 0;
84} 84}
85 85
86/*
87 * START OF "GENERIC" DECODE ROUTINES.
88 * These may look a little ugly since they are imported from a "generic"
89 * set of XDR encode/decode routines which are intended to be shared by
90 * all of our NFSv4 implementations (OpenBSD, MacOS X...).
91 *
92 * If the pain of reading these is too great, it should be a straightforward
93 * task to translate them into Linux-specific versions which are more
94 * consistent with the style used in NFSv2/v3...
95 */
96#define DECODE_HEAD \ 86#define DECODE_HEAD \
97 __be32 *p; \ 87 __be32 *p; \
98 __be32 status 88 __be32 status
@@ -254,20 +244,8 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
254 DECODE_TAIL; 244 DECODE_TAIL;
255} 245}
256 246
257static u32 nfsd_attrmask[] = {
258 NFSD_WRITEABLE_ATTRS_WORD0,
259 NFSD_WRITEABLE_ATTRS_WORD1,
260 NFSD_WRITEABLE_ATTRS_WORD2
261};
262
263static u32 nfsd41_ex_attrmask[] = {
264 NFSD_SUPPATTR_EXCLCREAT_WORD0,
265 NFSD_SUPPATTR_EXCLCREAT_WORD1,
266 NFSD_SUPPATTR_EXCLCREAT_WORD2
267};
268
269static __be32 247static __be32
270nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable, 248nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
271 struct iattr *iattr, struct nfs4_acl **acl) 249 struct iattr *iattr, struct nfs4_acl **acl)
272{ 250{
273 int expected_len, len = 0; 251 int expected_len, len = 0;
@@ -280,18 +258,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
280 if ((status = nfsd4_decode_bitmap(argp, bmval))) 258 if ((status = nfsd4_decode_bitmap(argp, bmval)))
281 return status; 259 return status;
282 260
283 /*
284 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
285 * read-only attributes return ERR_INVAL.
286 */
287 if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) ||
288 (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) ||
289 (bmval[2] & ~nfsd_suppattrs2(argp->minorversion)))
290 return nfserr_attrnotsupp;
291 if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
292 (bmval[2] & ~writable[2]))
293 return nfserr_inval;
294
295 READ_BUF(4); 261 READ_BUF(4);
296 READ32(expected_len); 262 READ32(expected_len);
297 263
@@ -424,8 +390,11 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
424 goto xdr_error; 390 goto xdr_error;
425 } 391 }
426 } 392 }
427 BUG_ON(bmval[2]); /* no such writeable attr supported yet */ 393 if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
428 if (len != expected_len) 394 || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
395 || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2)
396 READ_BUF(expected_len - len);
397 else if (len != expected_len)
429 goto xdr_error; 398 goto xdr_error;
430 399
431 DECODE_TAIL; 400 DECODE_TAIL;
@@ -518,8 +487,8 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
518 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) 487 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
519 return status; 488 return status;
520 489
521 status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask, 490 status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
522 &create->cr_iattr, &create->cr_acl); 491 &create->cr_acl);
523 if (status) 492 if (status)
524 goto out; 493 goto out;
525 494
@@ -682,7 +651,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
682 case NFS4_CREATE_UNCHECKED: 651 case NFS4_CREATE_UNCHECKED:
683 case NFS4_CREATE_GUARDED: 652 case NFS4_CREATE_GUARDED:
684 status = nfsd4_decode_fattr(argp, open->op_bmval, 653 status = nfsd4_decode_fattr(argp, open->op_bmval,
685 nfsd_attrmask, &open->op_iattr, &open->op_acl); 654 &open->op_iattr, &open->op_acl);
686 if (status) 655 if (status)
687 goto out; 656 goto out;
688 break; 657 break;
@@ -696,8 +665,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
696 READ_BUF(8); 665 READ_BUF(8);
697 COPYMEM(open->op_verf.data, 8); 666 COPYMEM(open->op_verf.data, 8);
698 status = nfsd4_decode_fattr(argp, open->op_bmval, 667 status = nfsd4_decode_fattr(argp, open->op_bmval,
699 nfsd41_ex_attrmask, &open->op_iattr, 668 &open->op_iattr, &open->op_acl);
700 &open->op_acl);
701 if (status) 669 if (status)
702 goto out; 670 goto out;
703 break; 671 break;
@@ -893,8 +861,8 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
893 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); 861 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
894 if (status) 862 if (status)
895 return status; 863 return status;
896 return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask, 864 return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr,
897 &setattr->sa_iattr, &setattr->sa_acl); 865 &setattr->sa_acl);
898} 866}
899 867
900static __be32 868static __be32
@@ -1328,64 +1296,64 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1328}; 1296};
1329 1297
1330static nfsd4_dec nfsd41_dec_ops[] = { 1298static nfsd4_dec nfsd41_dec_ops[] = {
1331 [OP_ACCESS] (nfsd4_dec)nfsd4_decode_access, 1299 [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access,
1332 [OP_CLOSE] (nfsd4_dec)nfsd4_decode_close, 1300 [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close,
1333 [OP_COMMIT] (nfsd4_dec)nfsd4_decode_commit, 1301 [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit,
1334 [OP_CREATE] (nfsd4_dec)nfsd4_decode_create, 1302 [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create,
1335 [OP_DELEGPURGE] (nfsd4_dec)nfsd4_decode_notsupp, 1303 [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp,
1336 [OP_DELEGRETURN] (nfsd4_dec)nfsd4_decode_delegreturn, 1304 [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn,
1337 [OP_GETATTR] (nfsd4_dec)nfsd4_decode_getattr, 1305 [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr,
1338 [OP_GETFH] (nfsd4_dec)nfsd4_decode_noop, 1306 [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop,
1339 [OP_LINK] (nfsd4_dec)nfsd4_decode_link, 1307 [OP_LINK] = (nfsd4_dec)nfsd4_decode_link,
1340 [OP_LOCK] (nfsd4_dec)nfsd4_decode_lock, 1308 [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock,
1341 [OP_LOCKT] (nfsd4_dec)nfsd4_decode_lockt, 1309 [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt,
1342 [OP_LOCKU] (nfsd4_dec)nfsd4_decode_locku, 1310 [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku,
1343 [OP_LOOKUP] (nfsd4_dec)nfsd4_decode_lookup, 1311 [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup,
1344 [OP_LOOKUPP] (nfsd4_dec)nfsd4_decode_noop, 1312 [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop,
1345 [OP_NVERIFY] (nfsd4_dec)nfsd4_decode_verify, 1313 [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify,
1346 [OP_OPEN] (nfsd4_dec)nfsd4_decode_open, 1314 [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open,
1347 [OP_OPENATTR] (nfsd4_dec)nfsd4_decode_notsupp, 1315 [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp,
1348 [OP_OPEN_CONFIRM] (nfsd4_dec)nfsd4_decode_notsupp, 1316 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_notsupp,
1349 [OP_OPEN_DOWNGRADE] (nfsd4_dec)nfsd4_decode_open_downgrade, 1317 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
1350 [OP_PUTFH] (nfsd4_dec)nfsd4_decode_putfh, 1318 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
1351 [OP_PUTPUBFH] (nfsd4_dec)nfsd4_decode_notsupp, 1319 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp,
1352 [OP_PUTROOTFH] (nfsd4_dec)nfsd4_decode_noop, 1320 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
1353 [OP_READ] (nfsd4_dec)nfsd4_decode_read, 1321 [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
1354 [OP_READDIR] (nfsd4_dec)nfsd4_decode_readdir, 1322 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
1355 [OP_READLINK] (nfsd4_dec)nfsd4_decode_noop, 1323 [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop,
1356 [OP_REMOVE] (nfsd4_dec)nfsd4_decode_remove, 1324 [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove,
1357 [OP_RENAME] (nfsd4_dec)nfsd4_decode_rename, 1325 [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename,
1358 [OP_RENEW] (nfsd4_dec)nfsd4_decode_notsupp, 1326 [OP_RENEW] = (nfsd4_dec)nfsd4_decode_notsupp,
1359 [OP_RESTOREFH] (nfsd4_dec)nfsd4_decode_noop, 1327 [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop,
1360 [OP_SAVEFH] (nfsd4_dec)nfsd4_decode_noop, 1328 [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop,
1361 [OP_SECINFO] (nfsd4_dec)nfsd4_decode_secinfo, 1329 [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo,
1362 [OP_SETATTR] (nfsd4_dec)nfsd4_decode_setattr, 1330 [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr,
1363 [OP_SETCLIENTID] (nfsd4_dec)nfsd4_decode_notsupp, 1331 [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
1364 [OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_notsupp, 1332 [OP_SETCLIENTID_CONFIRM]= (nfsd4_dec)nfsd4_decode_notsupp,
1365 [OP_VERIFY] (nfsd4_dec)nfsd4_decode_verify, 1333 [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify,
1366 [OP_WRITE] (nfsd4_dec)nfsd4_decode_write, 1334 [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write,
1367 [OP_RELEASE_LOCKOWNER] (nfsd4_dec)nfsd4_decode_notsupp, 1335 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp,
1368 1336
1369 /* new operations for NFSv4.1 */ 1337 /* new operations for NFSv4.1 */
1370 [OP_BACKCHANNEL_CTL] (nfsd4_dec)nfsd4_decode_notsupp, 1338 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp,
1371 [OP_BIND_CONN_TO_SESSION](nfsd4_dec)nfsd4_decode_notsupp, 1339 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp,
1372 [OP_EXCHANGE_ID] (nfsd4_dec)nfsd4_decode_exchange_id, 1340 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
1373 [OP_CREATE_SESSION] (nfsd4_dec)nfsd4_decode_create_session, 1341 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
1374 [OP_DESTROY_SESSION] (nfsd4_dec)nfsd4_decode_destroy_session, 1342 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
1375 [OP_FREE_STATEID] (nfsd4_dec)nfsd4_decode_notsupp, 1343 [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
1376 [OP_GET_DIR_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp, 1344 [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1377 [OP_GETDEVICEINFO] (nfsd4_dec)nfsd4_decode_notsupp, 1345 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
1378 [OP_GETDEVICELIST] (nfsd4_dec)nfsd4_decode_notsupp, 1346 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
1379 [OP_LAYOUTCOMMIT] (nfsd4_dec)nfsd4_decode_notsupp, 1347 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
1380 [OP_LAYOUTGET] (nfsd4_dec)nfsd4_decode_notsupp, 1348 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
1381 [OP_LAYOUTRETURN] (nfsd4_dec)nfsd4_decode_notsupp, 1349 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
1382 [OP_SECINFO_NO_NAME] (nfsd4_dec)nfsd4_decode_notsupp, 1350 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp,
1383 [OP_SEQUENCE] (nfsd4_dec)nfsd4_decode_sequence, 1351 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
1384 [OP_SET_SSV] (nfsd4_dec)nfsd4_decode_notsupp, 1352 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
1385 [OP_TEST_STATEID] (nfsd4_dec)nfsd4_decode_notsupp, 1353 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
1386 [OP_WANT_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp, 1354 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1387 [OP_DESTROY_CLIENTID] (nfsd4_dec)nfsd4_decode_notsupp, 1355 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
1388 [OP_RECLAIM_COMPLETE] (nfsd4_dec)nfsd4_decode_notsupp, 1356 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_notsupp,
1389}; 1357};
1390 1358
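Same cleanup as nfsd_procedures4 above, but here the fix is purely syntactic: `[OP_ACCESS] (nfsd4_dec)fn` uses an obsolete GCC extension (a designator with no `=`), whereas `[OP_ACCESS] = (nfsd4_dec)fn` is standard C99. The two mean exactly the same thing:

    int main(void)
    {
        /* obsolete GCC spelling:  int tbl[4] = { [2] 42 };  (no '=') */
        int tbl[4] = { [2] = 42 };  /* standard C99: {0, 0, 42, 0}    */
        return tbl[2] == 42 ? 0 : 1;
    }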
1391struct nfsd4_minorversion_ops { 1359struct nfsd4_minorversion_ops {
@@ -1489,21 +1457,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1489 1457
1490 DECODE_TAIL; 1458 DECODE_TAIL;
1491} 1459}
1492/*
1493 * END OF "GENERIC" DECODE ROUTINES.
1494 */
1495
1496/*
1497 * START OF "GENERIC" ENCODE ROUTINES.
1498 * These may look a little ugly since they are imported from a "generic"
1499 * set of XDR encode/decode routines which are intended to be shared by
1500 * all of our NFSv4 implementations (OpenBSD, MacOS X...).
1501 *
1502 * If the pain of reading these is too great, it should be a straightforward
1503 * task to translate them into Linux-specific versions which are more
1504 * consistent with the style used in NFSv2/v3...
1505 */
1506#define ENCODE_HEAD __be32 *p
1507 1460
1508#define WRITE32(n) *p++ = htonl(n) 1461#define WRITE32(n) *p++ = htonl(n)
1509#define WRITE64(n) do { \ 1462#define WRITE64(n) do { \
@@ -1515,13 +1468,41 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1515 memcpy(p, ptr, nbytes); \ 1468 memcpy(p, ptr, nbytes); \
1516 p += XDR_QUADLEN(nbytes); \ 1469 p += XDR_QUADLEN(nbytes); \
1517}} while (0) 1470}} while (0)
1518#define WRITECINFO(c) do { \ 1471
1519 *p++ = htonl(c.atomic); \ 1472static void write32(__be32 **p, u32 n)
1520 *p++ = htonl(c.before_ctime_sec); \ 1473{
1521 *p++ = htonl(c.before_ctime_nsec); \ 1474 *(*p)++ = n;
1522 *p++ = htonl(c.after_ctime_sec); \ 1475}
1523 *p++ = htonl(c.after_ctime_nsec); \ 1476
1524} while (0) 1477static void write64(__be32 **p, u64 n)
1478{
1479 write32(p, (u32)(n >> 32));
1480 write32(p, (u32)n);
1481}
1482
1483static void write_change(__be32 **p, struct kstat *stat, struct inode *inode)
1484{
1485 if (IS_I_VERSION(inode)) {
1486 write64(p, inode->i_version);
1487 } else {
1488 write32(p, stat->ctime.tv_sec);
1489 write32(p, stat->ctime.tv_nsec);
1490 }
1491}
1492
1493static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
1494{
1495 write32(p, c->atomic);
1496 if (c->change_supported) {
1497 write64(p, c->before_change);
1498 write64(p, c->after_change);
1499 } else {
1500 write32(p, c->before_ctime_sec);
1501 write32(p, c->before_ctime_nsec);
1502 write32(p, c->after_ctime_sec);
1503 write32(p, c->after_ctime_nsec);
1504 }
1505}
1525 1506
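The WRITECINFO macro becomes the write32()/write64()/write_cinfo() helpers, and change_info can now carry a genuine 64-bit before/after change attribute when the filesystem maintains one (c->change_supported, matching write_change()'s use of i_version), falling back to the old ctime sec/nsec pairs otherwise. An XDR hyper is simply two 32-bit words, most significant first; a compilable sketch of that split (byte-swapping omitted — the real helpers deal in __be32):

    #include <stdint.h>
    #include <stdio.h>

    static void put32(uint32_t **p, uint32_t n) { *(*p)++ = n; }

    static void put64(uint32_t **p, uint64_t n)
    {
        put32(p, (uint32_t)(n >> 32)); /* high word first */
        put32(p, (uint32_t)n);         /* then low word   */
    }

    int main(void)
    {
        uint32_t buf[2], *p = buf;
        put64(&p, 0x1122334455667788ULL);
        printf("%08x %08x\n", buf[0], buf[1]); /* 11223344 55667788 */
        return 0;
    }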
1526#define RESERVE_SPACE(nbytes) do { \ 1507#define RESERVE_SPACE(nbytes) do { \
1527 p = resp->p; \ 1508 p = resp->p; \
@@ -1874,16 +1855,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1874 WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME); 1855 WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME);
1875 } 1856 }
1876 if (bmval0 & FATTR4_WORD0_CHANGE) { 1857 if (bmval0 & FATTR4_WORD0_CHANGE) {
1877 /*
1878 * Note: This _must_ be consistent with the scheme for writing
1879 * change_info, so any changes made here must be reflected there
1880 * as well. (See xdr4.h:set_change_info() and the WRITECINFO()
1881 * macro above.)
1882 */
1883 if ((buflen -= 8) < 0) 1858 if ((buflen -= 8) < 0)
1884 goto out_resource; 1859 goto out_resource;
1885 WRITE32(stat.ctime.tv_sec); 1860 write_change(&p, &stat, dentry->d_inode);
1886 WRITE32(stat.ctime.tv_nsec);
1887 } 1861 }
1888 if (bmval0 & FATTR4_WORD0_SIZE) { 1862 if (bmval0 & FATTR4_WORD0_SIZE) {
1889 if ((buflen -= 8) < 0) 1863 if ((buflen -= 8) < 0)
@@ -2348,7 +2322,7 @@ fail:
2348static void 2322static void
2349nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid) 2323nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
2350{ 2324{
2351 ENCODE_HEAD; 2325 __be32 *p;
2352 2326
2353 RESERVE_SPACE(sizeof(stateid_t)); 2327 RESERVE_SPACE(sizeof(stateid_t));
2354 WRITE32(sid->si_generation); 2328 WRITE32(sid->si_generation);
@@ -2359,7 +2333,7 @@ nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
2359static __be32 2333static __be32
2360nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) 2334nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
2361{ 2335{
2362 ENCODE_HEAD; 2336 __be32 *p;
2363 2337
2364 if (!nfserr) { 2338 if (!nfserr) {
2365 RESERVE_SPACE(8); 2339 RESERVE_SPACE(8);
@@ -2386,7 +2360,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
2386static __be32 2360static __be32
2387nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) 2361nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
2388{ 2362{
2389 ENCODE_HEAD; 2363 __be32 *p;
2390 2364
2391 if (!nfserr) { 2365 if (!nfserr) {
2392 RESERVE_SPACE(8); 2366 RESERVE_SPACE(8);
@@ -2399,11 +2373,11 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2399static __be32 2373static __be32
2400nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) 2374nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
2401{ 2375{
2402 ENCODE_HEAD; 2376 __be32 *p;
2403 2377
2404 if (!nfserr) { 2378 if (!nfserr) {
2405 RESERVE_SPACE(32); 2379 RESERVE_SPACE(32);
2406 WRITECINFO(create->cr_cinfo); 2380 write_cinfo(&p, &create->cr_cinfo);
2407 WRITE32(2); 2381 WRITE32(2);
2408 WRITE32(create->cr_bmval[0]); 2382 WRITE32(create->cr_bmval[0]);
2409 WRITE32(create->cr_bmval[1]); 2383 WRITE32(create->cr_bmval[1]);
@@ -2435,7 +2409,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
2435{ 2409{
2436 struct svc_fh *fhp = *fhpp; 2410 struct svc_fh *fhp = *fhpp;
2437 unsigned int len; 2411 unsigned int len;
2438 ENCODE_HEAD; 2412 __be32 *p;
2439 2413
2440 if (!nfserr) { 2414 if (!nfserr) {
2441 len = fhp->fh_handle.fh_size; 2415 len = fhp->fh_handle.fh_size;
@@ -2454,7 +2428,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
2454static void 2428static void
2455nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) 2429nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld)
2456{ 2430{
2457 ENCODE_HEAD; 2431 __be32 *p;
2458 2432
2459 RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0)); 2433 RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0));
2460 WRITE64(ld->ld_start); 2434 WRITE64(ld->ld_start);
@@ -2510,11 +2484,11 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
2510static __be32 2484static __be32
2511nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) 2485nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
2512{ 2486{
2513 ENCODE_HEAD; 2487 __be32 *p;
2514 2488
2515 if (!nfserr) { 2489 if (!nfserr) {
2516 RESERVE_SPACE(20); 2490 RESERVE_SPACE(20);
2517 WRITECINFO(link->li_cinfo); 2491 write_cinfo(&p, &link->li_cinfo);
2518 ADJUST_ARGS(); 2492 ADJUST_ARGS();
2519 } 2493 }
2520 return nfserr; 2494 return nfserr;
@@ -2524,7 +2498,7 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
2524static __be32 2498static __be32
2525nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) 2499nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
2526{ 2500{
2527 ENCODE_HEAD; 2501 __be32 *p;
2528 ENCODE_SEQID_OP_HEAD; 2502 ENCODE_SEQID_OP_HEAD;
2529 2503
2530 if (nfserr) 2504 if (nfserr)
@@ -2532,7 +2506,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
2532 2506
2533 nfsd4_encode_stateid(resp, &open->op_stateid); 2507 nfsd4_encode_stateid(resp, &open->op_stateid);
2534 RESERVE_SPACE(40); 2508 RESERVE_SPACE(40);
2535 WRITECINFO(open->op_cinfo); 2509 write_cinfo(&p, &open->op_cinfo);
2536 WRITE32(open->op_rflags); 2510 WRITE32(open->op_rflags);
2537 WRITE32(2); 2511 WRITE32(2);
2538 WRITE32(open->op_bmval[0]); 2512 WRITE32(open->op_bmval[0]);
@@ -2619,7 +2593,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
2619 int v, pn; 2593 int v, pn;
2620 unsigned long maxcount; 2594 unsigned long maxcount;
2621 long len; 2595 long len;
2622 ENCODE_HEAD; 2596 __be32 *p;
2623 2597
2624 if (nfserr) 2598 if (nfserr)
2625 return nfserr; 2599 return nfserr;
@@ -2681,7 +2655,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
2681{ 2655{
2682 int maxcount; 2656 int maxcount;
2683 char *page; 2657 char *page;
2684 ENCODE_HEAD; 2658 __be32 *p;
2685 2659
2686 if (nfserr) 2660 if (nfserr)
2687 return nfserr; 2661 return nfserr;
@@ -2730,7 +2704,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
2730 int maxcount; 2704 int maxcount;
2731 loff_t offset; 2705 loff_t offset;
2732 __be32 *page, *savep, *tailbase; 2706 __be32 *page, *savep, *tailbase;
2733 ENCODE_HEAD; 2707 __be32 *p;
2734 2708
2735 if (nfserr) 2709 if (nfserr)
2736 return nfserr; 2710 return nfserr;
@@ -2806,11 +2780,11 @@ err_no_verf:
2806static __be32 2780static __be32
2807nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) 2781nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
2808{ 2782{
2809 ENCODE_HEAD; 2783 __be32 *p;
2810 2784
2811 if (!nfserr) { 2785 if (!nfserr) {
2812 RESERVE_SPACE(20); 2786 RESERVE_SPACE(20);
2813 WRITECINFO(remove->rm_cinfo); 2787 write_cinfo(&p, &remove->rm_cinfo);
2814 ADJUST_ARGS(); 2788 ADJUST_ARGS();
2815 } 2789 }
2816 return nfserr; 2790 return nfserr;
@@ -2819,12 +2793,12 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2819static __be32 2793static __be32
2820nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) 2794nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
2821{ 2795{
2822 ENCODE_HEAD; 2796 __be32 *p;
2823 2797
2824 if (!nfserr) { 2798 if (!nfserr) {
2825 RESERVE_SPACE(40); 2799 RESERVE_SPACE(40);
2826 WRITECINFO(rename->rn_sinfo); 2800 write_cinfo(&p, &rename->rn_sinfo);
2827 WRITECINFO(rename->rn_tinfo); 2801 write_cinfo(&p, &rename->rn_tinfo);
2828 ADJUST_ARGS(); 2802 ADJUST_ARGS();
2829 } 2803 }
2830 return nfserr; 2804 return nfserr;
@@ -2839,7 +2813,7 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
2839 u32 nflavs; 2813 u32 nflavs;
2840 struct exp_flavor_info *flavs; 2814 struct exp_flavor_info *flavs;
2841 struct exp_flavor_info def_flavs[2]; 2815 struct exp_flavor_info def_flavs[2];
2842 ENCODE_HEAD; 2816 __be32 *p;
2843 2817
2844 if (nfserr) 2818 if (nfserr)
2845 goto out; 2819 goto out;
@@ -2904,7 +2878,7 @@ out:
2904static __be32 2878static __be32
2905nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) 2879nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
2906{ 2880{
2907 ENCODE_HEAD; 2881 __be32 *p;
2908 2882
2909 RESERVE_SPACE(12); 2883 RESERVE_SPACE(12);
2910 if (nfserr) { 2884 if (nfserr) {
@@ -2924,7 +2898,7 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
2924static __be32 2898static __be32
2925nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) 2899nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
2926{ 2900{
2927 ENCODE_HEAD; 2901 __be32 *p;
2928 2902
2929 if (!nfserr) { 2903 if (!nfserr) {
2930 RESERVE_SPACE(8 + sizeof(nfs4_verifier)); 2904 RESERVE_SPACE(8 + sizeof(nfs4_verifier));
@@ -2944,7 +2918,7 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n
2944static __be32 2918static __be32
2945nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) 2919nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
2946{ 2920{
2947 ENCODE_HEAD; 2921 __be32 *p;
2948 2922
2949 if (!nfserr) { 2923 if (!nfserr) {
2950 RESERVE_SPACE(16); 2924 RESERVE_SPACE(16);
@@ -2960,7 +2934,7 @@ static __be32
2960nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr, 2934nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
2961 struct nfsd4_exchange_id *exid) 2935 struct nfsd4_exchange_id *exid)
2962{ 2936{
2963 ENCODE_HEAD; 2937 __be32 *p;
2964 char *major_id; 2938 char *major_id;
2965 char *server_scope; 2939 char *server_scope;
2966 int major_id_sz; 2940 int major_id_sz;
@@ -3015,7 +2989,7 @@ static __be32
3015nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr, 2989nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
3016 struct nfsd4_create_session *sess) 2990 struct nfsd4_create_session *sess)
3017{ 2991{
3018 ENCODE_HEAD; 2992 __be32 *p;
3019 2993
3020 if (nfserr) 2994 if (nfserr)
3021 return nfserr; 2995 return nfserr;
@@ -3071,7 +3045,7 @@ __be32
3071nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, 3045nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3072 struct nfsd4_sequence *seq) 3046 struct nfsd4_sequence *seq)
3073{ 3047{
3074 ENCODE_HEAD; 3048 __be32 *p;
3075 3049
3076 if (nfserr) 3050 if (nfserr)
3077 return nfserr; 3051 return nfserr;
@@ -3209,7 +3183,7 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
3209 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, 3183 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
3210 length, xb->page_len, tlen, pad); 3184 length, xb->page_len, tlen, pad);
3211 3185
3212 if (length <= session->se_fmaxresp_cached) 3186 if (length <= session->se_fchannel.maxresp_cached)
3213 return status; 3187 return status;
3214 else 3188 else
3215 return nfserr_rep_too_big_to_cache; 3189 return nfserr_rep_too_big_to_cache;
@@ -3219,7 +3193,7 @@ void
3219nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) 3193nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
3220{ 3194{
3221 __be32 *statp; 3195 __be32 *statp;
3222 ENCODE_HEAD; 3196 __be32 *p;
3223 3197
3224 RESERVE_SPACE(8); 3198 RESERVE_SPACE(8);
3225 WRITE32(op->opnum); 3199 WRITE32(op->opnum);
@@ -3253,7 +3227,7 @@ status:
3253void 3227void
3254nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op) 3228nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
3255{ 3229{
3256 ENCODE_HEAD; 3230 __be32 *p;
3257 struct nfs4_replay *rp = op->replay; 3231 struct nfs4_replay *rp = op->replay;
3258 3232
3259 BUG_ON(!rp); 3233 BUG_ON(!rp);
@@ -3268,10 +3242,6 @@ nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
3268 ADJUST_ARGS(); 3242 ADJUST_ARGS();
3269} 3243}
3270 3244
3271/*
3272 * END OF "GENERIC" ENCODE ROUTINES.
3273 */
3274
3275int 3245int
3276nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy) 3246nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
3277{ 3247{
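
The nfs4xdr.c hunks above mechanically expand two macros: ENCODE_HEAD becomes an explicit "__be32 *p;" local, and WRITECINFO(c) becomes a call to the write_cinfo(&p, &c) helper. For illustration only, the shared reserve/write/adjust shape of these encoders can be modelled in stand-alone C (a plain array stands in for struct nfsd4_compoundres, and htonl() for the kernel's byte-order conversion; this sketch is not part of the patch):

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>

    static uint32_t xdrbuf[64];

    static int encode_example(uint32_t rflags, uint32_t bmval0)
    {
            uint32_t *p = xdrbuf;     /* was hidden inside ENCODE_HEAD */

            *p++ = htonl(rflags);     /* WRITE32(open->op_rflags) */
            *p++ = htonl(2);          /* bitmap word count, as in the hunk */
            *p++ = htonl(bmval0);     /* WRITE32(open->op_bmval[0]) */
            return (int)(p - xdrbuf); /* ADJUST_ARGS() advances resp->p */
    }

    int main(void)
    {
            printf("encoded %d words\n", encode_example(0x1, 0x7));
            return 0;
    }
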
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 5bfc2ac60d54..4638635c5d87 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -29,15 +29,24 @@
29 */ 29 */
30#define CACHESIZE 1024 30#define CACHESIZE 1024
31#define HASHSIZE 64 31#define HASHSIZE 64
32#define REQHASH(xid) (((((__force __u32)xid) >> 24) ^ ((__force __u32)xid)) & (HASHSIZE-1))
33 32
34static struct hlist_head * hash_list; 33static struct hlist_head * cache_hash;
35static struct list_head lru_head; 34static struct list_head lru_head;
36static int cache_disabled = 1; 35static int cache_disabled = 1;
37 36
37/*
38 * Calculate the hash index from an XID.
39 */
40static inline u32 request_hash(u32 xid)
41{
42 u32 h = xid;
43 h ^= (xid >> 24);
44 return h & (HASHSIZE-1);
45}
46
38static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); 47static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
39 48
40/* 49/*
41 * locking for the reply cache: 50 * locking for the reply cache:
42 * A cache entry is "single use" if c_state == RC_INPROG 51 * A cache entry is "single use" if c_state == RC_INPROG
43 * Otherwise, when accessing _prev or _next, the lock must be held. 52 * Otherwise, when accessing _prev or _next, the lock must be held.
@@ -62,8 +71,8 @@ int nfsd_reply_cache_init(void)
62 i--; 71 i--;
63 } 72 }
64 73
65 hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); 74 cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
66 if (!hash_list) 75 if (!cache_hash)
67 goto out_nomem; 76 goto out_nomem;
68 77
69 cache_disabled = 0; 78 cache_disabled = 0;
@@ -88,8 +97,8 @@ void nfsd_reply_cache_shutdown(void)
88 97
89 cache_disabled = 1; 98 cache_disabled = 1;
90 99
91 kfree (hash_list); 100 kfree (cache_hash);
92 hash_list = NULL; 101 cache_hash = NULL;
93} 102}
94 103
95/* 104/*
@@ -108,7 +117,7 @@ static void
108hash_refile(struct svc_cacherep *rp) 117hash_refile(struct svc_cacherep *rp)
109{ 118{
110 hlist_del_init(&rp->c_hash); 119 hlist_del_init(&rp->c_hash);
111 hlist_add_head(&rp->c_hash, hash_list + REQHASH(rp->c_xid)); 120 hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid));
112} 121}
113 122
114/* 123/*
@@ -138,7 +147,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type)
138 spin_lock(&cache_lock); 147 spin_lock(&cache_lock);
139 rtn = RC_DOIT; 148 rtn = RC_DOIT;
140 149
141 rh = &hash_list[REQHASH(xid)]; 150 rh = &cache_hash[request_hash(xid)];
142 hlist_for_each_entry(rp, hn, rh, c_hash) { 151 hlist_for_each_entry(rp, hn, rh, c_hash) {
143 if (rp->c_state != RC_UNUSED && 152 if (rp->c_state != RC_UNUSED &&
144 xid == rp->c_xid && proc == rp->c_proc && 153 xid == rp->c_xid && proc == rp->c_proc &&
@@ -165,8 +174,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type)
165 } 174 }
166 } 175 }
167 176
168 /* This should not happen */ 177 /* All entries on the LRU are in-progress. This should not happen */
169 if (rp == NULL) { 178 if (&rp->c_lru == &lru_head) {
170 static int complaints; 179 static int complaints;
171 180
172 printk(KERN_WARNING "nfsd: all repcache entries locked!\n"); 181 printk(KERN_WARNING "nfsd: all repcache entries locked!\n");
@@ -264,7 +273,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
264 273
265 len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); 274 len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
266 len >>= 2; 275 len >>= 2;
267 276
268 /* Don't cache excessive amounts of data and XDR failures */ 277 /* Don't cache excessive amounts of data and XDR failures */
269 if (!statp || len > (256 >> 2)) { 278 if (!statp || len > (256 >> 2)) {
270 rp->c_state = RC_UNUSED; 279 rp->c_state = RC_UNUSED;
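
The nfscache.c change replaces the REQHASH() macro with a typed inline function and renames hash_list to cache_hash. The hash itself is unchanged and small enough to compile stand-alone; a sketch (uint32_t standing in for the kernel's u32):

    #include <stdint.h>
    #include <stdio.h>

    #define HASHSIZE 64    /* same value as in fs/nfsd/nfscache.c */

    static inline uint32_t request_hash(uint32_t xid)
    {
            uint32_t h = xid;

            h ^= (xid >> 24);          /* fold the high byte in */
            return h & (HASHSIZE - 1);
    }

    int main(void)
    {
            printf("%u\n", request_hash(0xdeadbeefU)); /* bucket index < 64 */
            return 0;
    }
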
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index af16849d243a..6d0847562d87 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -25,7 +25,6 @@
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/inet.h> 26#include <linux/inet.h>
27#include <linux/string.h> 27#include <linux/string.h>
28#include <linux/smp_lock.h>
29#include <linux/ctype.h> 28#include <linux/ctype.h>
30 29
31#include <linux/nfs.h> 30#include <linux/nfs.h>
@@ -207,10 +206,14 @@ static struct file_operations pool_stats_operations = {
207static ssize_t write_svc(struct file *file, char *buf, size_t size) 206static ssize_t write_svc(struct file *file, char *buf, size_t size)
208{ 207{
209 struct nfsctl_svc *data; 208 struct nfsctl_svc *data;
209 int err;
210 if (size < sizeof(*data)) 210 if (size < sizeof(*data))
211 return -EINVAL; 211 return -EINVAL;
212 data = (struct nfsctl_svc*) buf; 212 data = (struct nfsctl_svc*) buf;
213 return nfsd_svc(data->svc_port, data->svc_nthreads); 213 err = nfsd_svc(data->svc_port, data->svc_nthreads);
214 if (err < 0)
215 return err;
216 return 0;
214} 217}
215 218
216/** 219/**
@@ -692,11 +695,12 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
692 if (newthreads < 0) 695 if (newthreads < 0)
693 return -EINVAL; 696 return -EINVAL;
694 rv = nfsd_svc(NFS_PORT, newthreads); 697 rv = nfsd_svc(NFS_PORT, newthreads);
695 if (rv) 698 if (rv < 0)
696 return rv; 699 return rv;
697 } 700 } else
698 sprintf(buf, "%d\n", nfsd_nrthreads()); 701 rv = nfsd_nrthreads();
699 return strlen(buf); 702
703 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv);
700} 704}
701 705
702/** 706/**
@@ -793,7 +797,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
793{ 797{
794 char *mesg = buf; 798 char *mesg = buf;
795 char *vers, *minorp, sign; 799 char *vers, *minorp, sign;
796 int len, num; 800 int len, num, remaining;
797 unsigned minor; 801 unsigned minor;
798 ssize_t tlen = 0; 802 ssize_t tlen = 0;
799 char *sep; 803 char *sep;
@@ -840,32 +844,50 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
840 } 844 }
841 next: 845 next:
842 vers += len + 1; 846 vers += len + 1;
843 tlen += len;
844 } while ((len = qword_get(&mesg, vers, size)) > 0); 847 } while ((len = qword_get(&mesg, vers, size)) > 0);
845 /* If all get turned off, turn them back on, as 848 /* If all get turned off, turn them back on, as
846 * having no versions is BAD 849 * having no versions is BAD
847 */ 850 */
848 nfsd_reset_versions(); 851 nfsd_reset_versions();
849 } 852 }
853
850 /* Now write current state into reply buffer */ 854 /* Now write current state into reply buffer */
851 len = 0; 855 len = 0;
852 sep = ""; 856 sep = "";
857 remaining = SIMPLE_TRANSACTION_LIMIT;
853 for (num=2 ; num <= 4 ; num++) 858 for (num=2 ; num <= 4 ; num++)
854 if (nfsd_vers(num, NFSD_AVAIL)) { 859 if (nfsd_vers(num, NFSD_AVAIL)) {
855 len += sprintf(buf+len, "%s%c%d", sep, 860 len = snprintf(buf, remaining, "%s%c%d", sep,
856 nfsd_vers(num, NFSD_TEST)?'+':'-', 861 nfsd_vers(num, NFSD_TEST)?'+':'-',
857 num); 862 num);
858 sep = " "; 863 sep = " ";
864
865 if (len > remaining)
866 break;
867 remaining -= len;
868 buf += len;
869 tlen += len;
859 } 870 }
860 if (nfsd_vers(4, NFSD_AVAIL)) 871 if (nfsd_vers(4, NFSD_AVAIL))
861 for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++) 872 for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION;
862 len += sprintf(buf+len, " %c4.%u", 873 minor++) {
874 len = snprintf(buf, remaining, " %c4.%u",
863 (nfsd_vers(4, NFSD_TEST) && 875 (nfsd_vers(4, NFSD_TEST) &&
864 nfsd_minorversion(minor, NFSD_TEST)) ? 876 nfsd_minorversion(minor, NFSD_TEST)) ?
865 '+' : '-', 877 '+' : '-',
866 minor); 878 minor);
867 len += sprintf(buf+len, "\n"); 879
868 return len; 880 if (len > remaining)
881 break;
882 remaining -= len;
883 buf += len;
884 tlen += len;
885 }
886
887 len = snprintf(buf, remaining, "\n");
888 if (len > remaining)
889 return -EINVAL;
890 return tlen + len;
869} 891}
870 892
871/** 893/**
@@ -910,104 +932,143 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)
910 return rv; 932 return rv;
911} 933}
912 934
913static ssize_t __write_ports(struct file *file, char *buf, size_t size) 935/*
936 * Zero-length write. Return a list of NFSD's current listener
937 * transports.
938 */
939static ssize_t __write_ports_names(char *buf)
914{ 940{
915 if (size == 0) { 941 if (nfsd_serv == NULL)
916 int len = 0; 942 return 0;
943 return svc_xprt_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT);
944}
917 945
918 if (nfsd_serv) 946/*
919 len = svc_xprt_names(nfsd_serv, buf, 0); 947 * A single 'fd' number was written, in which case it must be for
920 return len; 948 * a socket of a supported family/protocol, and we use it as an
921 } 949 * nfsd listener.
922 /* Either a single 'fd' number is written, in which 950 */
923 * case it must be for a socket of a supported family/protocol, 951static ssize_t __write_ports_addfd(char *buf)
924 * and we use it as an nfsd socket, or 952{
925 * A '-' followed by the 'name' of a socket in which case 953 char *mesg = buf;
926 * we close the socket. 954 int fd, err;
927 */ 955
928 if (isdigit(buf[0])) { 956 err = get_int(&mesg, &fd);
929 char *mesg = buf; 957 if (err != 0 || fd < 0)
930 int fd; 958 return -EINVAL;
931 int err; 959
932 err = get_int(&mesg, &fd); 960 err = nfsd_create_serv();
933 if (err) 961 if (err != 0)
934 return -EINVAL; 962 return err;
935 if (fd < 0) 963
936 return -EINVAL; 964 err = lockd_up();
937 err = nfsd_create_serv(); 965 if (err != 0)
938 if (!err) { 966 goto out;
939 err = svc_addsock(nfsd_serv, fd, buf); 967
940 if (err >= 0) { 968 err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
941 err = lockd_up(); 969 if (err < 0)
942 if (err < 0) 970 lockd_down();
943 svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf); 971
944 } 972out:
945 /* Decrease the count, but don't shutdown the 973 /* Decrease the count, but don't shut down the service */
946 * the service 974 nfsd_serv->sv_nrthreads--;
947 */ 975 return err;
948 nfsd_serv->sv_nrthreads--; 976}
949 } 977
950 return err < 0 ? err : 0; 978/*
951 } 979 * A '-' followed by the 'name' of a socket means we close the socket.
952 if (buf[0] == '-' && isdigit(buf[1])) { 980 */
953 char *toclose = kstrdup(buf+1, GFP_KERNEL); 981static ssize_t __write_ports_delfd(char *buf)
954 int len = 0; 982{
955 if (!toclose) 983 char *toclose;
956 return -ENOMEM; 984 int len = 0;
957 if (nfsd_serv) 985
958 len = svc_sock_names(buf, nfsd_serv, toclose); 986 toclose = kstrdup(buf + 1, GFP_KERNEL);
959 if (len >= 0) 987 if (toclose == NULL)
960 lockd_down(); 988 return -ENOMEM;
961 kfree(toclose); 989
962 return len; 990 if (nfsd_serv != NULL)
963 } 991 len = svc_sock_names(nfsd_serv, buf,
964 /* 992 SIMPLE_TRANSACTION_LIMIT, toclose);
965 * Add a transport listener by writing its transport name 993 if (len >= 0)
966 */ 994 lockd_down();
967 if (isalpha(buf[0])) { 995
968 int err; 996 kfree(toclose);
969 char transport[16]; 997 return len;
970 int port; 998}
971 if (sscanf(buf, "%15s %4d", transport, &port) == 2) { 999
972 if (port < 1 || port > 65535) 1000/*
973 return -EINVAL; 1001 * A transport listener is added by writing its transport name and
974 err = nfsd_create_serv(); 1002 * a port number.
975 if (!err) { 1003 */
976 err = svc_create_xprt(nfsd_serv, 1004static ssize_t __write_ports_addxprt(char *buf)
977 transport, PF_INET, port, 1005{
978 SVC_SOCK_ANONYMOUS); 1006 char transport[16];
979 if (err == -ENOENT) 1007 int port, err;
980 /* Give a reasonable perror msg for 1008
981 * bad transport string */ 1009 if (sscanf(buf, "%15s %4u", transport, &port) != 2)
982 err = -EPROTONOSUPPORT; 1010 return -EINVAL;
983 } 1011
984 return err < 0 ? err : 0; 1012 if (port < 1 || port > USHORT_MAX)
985 } 1013 return -EINVAL;
986 } 1014
987 /* 1015 err = nfsd_create_serv();
988 * Remove a transport by writing its transport name and port number 1016 if (err != 0)
989 */ 1017 return err;
990 if (buf[0] == '-' && isalpha(buf[1])) { 1018
991 struct svc_xprt *xprt; 1019 err = svc_create_xprt(nfsd_serv, transport,
992 int err = -EINVAL; 1020 PF_INET, port, SVC_SOCK_ANONYMOUS);
993 char transport[16]; 1021 if (err < 0) {
994 int port; 1022 /* Give a reasonable perror msg for bad transport string */
995 if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { 1023 if (err == -ENOENT)
996 if (port < 1 || port > 65535) 1024 err = -EPROTONOSUPPORT;
997 return -EINVAL; 1025 return err;
998 if (nfsd_serv) {
999 xprt = svc_find_xprt(nfsd_serv, transport,
1000 AF_UNSPEC, port);
1001 if (xprt) {
1002 svc_close_xprt(xprt);
1003 svc_xprt_put(xprt);
1004 err = 0;
1005 } else
1006 err = -ENOTCONN;
1007 }
1008 return err < 0 ? err : 0;
1009 }
1010 } 1026 }
1027 return 0;
1028}
1029
1030/*
1031 * A transport listener is removed by writing a "-", its transport
1032 * name, and its port number.
1033 */
1034static ssize_t __write_ports_delxprt(char *buf)
1035{
1036 struct svc_xprt *xprt;
1037 char transport[16];
1038 int port;
1039
1040 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
1041 return -EINVAL;
1042
1043 if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL)
1044 return -EINVAL;
1045
1046 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
1047 if (xprt == NULL)
1048 return -ENOTCONN;
1049
1050 svc_close_xprt(xprt);
1051 svc_xprt_put(xprt);
1052 return 0;
1053}
1054
1055static ssize_t __write_ports(struct file *file, char *buf, size_t size)
1056{
1057 if (size == 0)
1058 return __write_ports_names(buf);
1059
1060 if (isdigit(buf[0]))
1061 return __write_ports_addfd(buf);
1062
1063 if (buf[0] == '-' && isdigit(buf[1]))
1064 return __write_ports_delfd(buf);
1065
1066 if (isalpha(buf[0]))
1067 return __write_ports_addxprt(buf);
1068
1069 if (buf[0] == '-' && isalpha(buf[1]))
1070 return __write_ports_delxprt(buf);
1071
1011 return -EINVAL; 1072 return -EINVAL;
1012} 1073}
1013 1074
@@ -1030,7 +1091,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
1030 * buf: C string containing an unsigned 1091 * buf: C string containing an unsigned
1031 * integer value representing a bound 1092 * integer value representing a bound
1032 * but unconnected socket that is to be 1093 * but unconnected socket that is to be
1033 * used as an NFSD listener 1094 * used as an NFSD listener; listen(3)
1095 * must be called for a SOCK_STREAM
1096 * socket, otherwise it is ignored
1034 * size: non-zero length of C string in @buf 1097 * size: non-zero length of C string in @buf
1035 * Output: 1098 * Output:
1036 * On success: NFS service is started; 1099 * On success: NFS service is started;
@@ -1138,7 +1201,9 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
1138 nfsd_max_blksize = bsize; 1201 nfsd_max_blksize = bsize;
1139 mutex_unlock(&nfsd_mutex); 1202 mutex_unlock(&nfsd_mutex);
1140 } 1203 }
1141 return sprintf(buf, "%d\n", nfsd_max_blksize); 1204
1205 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n",
1206 nfsd_max_blksize);
1142} 1207}
1143 1208
1144#ifdef CONFIG_NFSD_V4 1209#ifdef CONFIG_NFSD_V4
@@ -1162,8 +1227,9 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
1162 return -EINVAL; 1227 return -EINVAL;
1163 nfs4_reset_lease(lease); 1228 nfs4_reset_lease(lease);
1164 } 1229 }
1165 sprintf(buf, "%ld\n", nfs4_lease_time()); 1230
1166 return strlen(buf); 1231 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n",
1232 nfs4_lease_time());
1167} 1233}
1168 1234
1169/** 1235/**
@@ -1219,8 +1285,9 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
1219 1285
1220 status = nfs4_reset_recoverydir(recdir); 1286 status = nfs4_reset_recoverydir(recdir);
1221 } 1287 }
1222 sprintf(buf, "%s\n", nfs4_recoverydir()); 1288
1223 return strlen(buf); 1289 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n",
1290 nfs4_recoverydir());
1224} 1291}
1225 1292
1226/** 1293/**
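
Throughout nfsctl.c the read-side formatting switches from unbounded sprintf() into the transaction buffer to snprintf()/scnprintf() bounded by SIMPLE_TRANSACTION_LIMIT. The distinction the __write_versions() loop relies on: snprintf() returns the length the output would have had, so "len > remaining" detects overflow, while the kernel's scnprintf() returns the bytes actually stored. A stand-alone model of that accumulation loop (toy LIMIT, user-space snprintf; for illustration only):

    #include <stdio.h>

    #define LIMIT 32    /* stands in for SIMPLE_TRANSACTION_LIMIT */

    int main(void)
    {
            char out[LIMIT], *buf = out;
            int remaining = LIMIT, tlen = 0, len, num;
            const char *sep = "";

            for (num = 2; num <= 4; num++) {
                    len = snprintf(buf, remaining, "%s+%d", sep, num);
                    sep = " ";
                    if (len > remaining)
                            break;    /* out of space: stop, as in the patch */
                    remaining -= len;
                    buf += len;
                    tlen += len;
            }
            printf("wrote %d bytes: %s\n", tlen, out);
            return 0;
    }
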
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 9f1ca17293d3..8847f3fbfc1e 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -27,9 +27,6 @@
27#define NFSDDBG_FACILITY NFSDDBG_FH 27#define NFSDDBG_FACILITY NFSDDBG_FH
28 28
29 29
30static int nfsd_nr_verified;
31static int nfsd_nr_put;
32
33/* 30/*
34 * our acceptability function. 31 * our acceptability function.
35 * if NOSUBTREECHECK, accept anything 32 * if NOSUBTREECHECK, accept anything
@@ -251,7 +248,6 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
251 248
252 fhp->fh_dentry = dentry; 249 fhp->fh_dentry = dentry;
253 fhp->fh_export = exp; 250 fhp->fh_export = exp;
254 nfsd_nr_verified++;
255 return 0; 251 return 0;
256out: 252out:
257 exp_put(exp); 253 exp_put(exp);
@@ -552,7 +548,6 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
552 return nfserr_opnotsupp; 548 return nfserr_opnotsupp;
553 } 549 }
554 550
555 nfsd_nr_verified++;
556 return 0; 551 return 0;
557} 552}
558 553
@@ -609,7 +604,6 @@ fh_put(struct svc_fh *fhp)
609 fhp->fh_pre_saved = 0; 604 fhp->fh_pre_saved = 0;
610 fhp->fh_post_saved = 0; 605 fhp->fh_post_saved = 0;
611#endif 606#endif
612 nfsd_nr_put++;
613 } 607 }
614 if (exp) { 608 if (exp) {
615 cache_put(&exp->h, &svc_export_cache); 609 cache_put(&exp->h, &svc_export_cache);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index e298e260b5f1..0eb9c820b7a6 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -533,45 +533,179 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
533 * NFSv2 Server procedures. 533 * NFSv2 Server procedures.
534 * Only the results of non-idempotent operations are cached. 534 * Only the results of non-idempotent operations are cached.
535 */ 535 */
536#define nfsd_proc_none NULL
537#define nfssvc_release_none NULL
538struct nfsd_void { int dummy; }; 536struct nfsd_void { int dummy; };
539 537
540#define PROC(name, argt, rest, relt, cache, respsize) \
541 { (svc_procfunc) nfsd_proc_##name, \
542 (kxdrproc_t) nfssvc_decode_##argt, \
543 (kxdrproc_t) nfssvc_encode_##rest, \
544 (kxdrproc_t) nfssvc_release_##relt, \
545 sizeof(struct nfsd_##argt), \
546 sizeof(struct nfsd_##rest), \
547 0, \
548 cache, \
549 respsize, \
550 }
551
552#define ST 1 /* status */ 538#define ST 1 /* status */
553#define FH 8 /* filehandle */ 539#define FH 8 /* filehandle */
554#define AT 18 /* attributes */ 540#define AT 18 /* attributes */
555 541
556static struct svc_procedure nfsd_procedures2[18] = { 542static struct svc_procedure nfsd_procedures2[18] = {
557 PROC(null, void, void, none, RC_NOCACHE, ST), 543 [NFSPROC_NULL] = {
558 PROC(getattr, fhandle, attrstat, fhandle, RC_NOCACHE, ST+AT), 544 .pc_func = (svc_procfunc) nfsd_proc_null,
559 PROC(setattr, sattrargs, attrstat, fhandle, RC_REPLBUFF, ST+AT), 545 .pc_decode = (kxdrproc_t) nfssvc_decode_void,
560 PROC(none, void, void, none, RC_NOCACHE, ST), 546 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
561 PROC(lookup, diropargs, diropres, fhandle, RC_NOCACHE, ST+FH+AT), 547 .pc_argsize = sizeof(struct nfsd_void),
562 PROC(readlink, readlinkargs, readlinkres, none, RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4), 548 .pc_ressize = sizeof(struct nfsd_void),
563 PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4), 549 .pc_cachetype = RC_NOCACHE,
564 PROC(none, void, void, none, RC_NOCACHE, ST), 550 .pc_xdrressize = ST,
565 PROC(write, writeargs, attrstat, fhandle, RC_REPLBUFF, ST+AT), 551 },
566 PROC(create, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT), 552 [NFSPROC_GETATTR] = {
567 PROC(remove, diropargs, void, none, RC_REPLSTAT, ST), 553 .pc_func = (svc_procfunc) nfsd_proc_getattr,
568 PROC(rename, renameargs, void, none, RC_REPLSTAT, ST), 554 .pc_decode = (kxdrproc_t) nfssvc_decode_fhandle,
569 PROC(link, linkargs, void, none, RC_REPLSTAT, ST), 555 .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat,
570 PROC(symlink, symlinkargs, void, none, RC_REPLSTAT, ST), 556 .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
571 PROC(mkdir, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT), 557 .pc_argsize = sizeof(struct nfsd_fhandle),
572 PROC(rmdir, diropargs, void, none, RC_REPLSTAT, ST), 558 .pc_ressize = sizeof(struct nfsd_attrstat),
573 PROC(readdir, readdirargs, readdirres, none, RC_NOCACHE, 0), 559 .pc_cachetype = RC_NOCACHE,
574 PROC(statfs, fhandle, statfsres, none, RC_NOCACHE, ST+5), 560 .pc_xdrressize = ST+AT,
561 },
562 [NFSPROC_SETATTR] = {
563 .pc_func = (svc_procfunc) nfsd_proc_setattr,
564 .pc_decode = (kxdrproc_t) nfssvc_decode_sattrargs,
565 .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat,
566 .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
567 .pc_argsize = sizeof(struct nfsd_sattrargs),
568 .pc_ressize = sizeof(struct nfsd_attrstat),
569 .pc_cachetype = RC_REPLBUFF,
570 .pc_xdrressize = ST+AT,
571 },
572 [NFSPROC_ROOT] = {
573 .pc_decode = (kxdrproc_t) nfssvc_decode_void,
574 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
575 .pc_argsize = sizeof(struct nfsd_void),
576 .pc_ressize = sizeof(struct nfsd_void),
577 .pc_cachetype = RC_NOCACHE,
578 .pc_xdrressize = ST,
579 },
580 [NFSPROC_LOOKUP] = {
581 .pc_func = (svc_procfunc) nfsd_proc_lookup,
582 .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs,
583 .pc_encode = (kxdrproc_t) nfssvc_encode_diropres,
584 .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
585 .pc_argsize = sizeof(struct nfsd_diropargs),
586 .pc_ressize = sizeof(struct nfsd_diropres),
587 .pc_cachetype = RC_NOCACHE,
588 .pc_xdrressize = ST+FH+AT,
589 },
590 [NFSPROC_READLINK] = {
591 .pc_func = (svc_procfunc) nfsd_proc_readlink,
592 .pc_decode = (kxdrproc_t) nfssvc_decode_readlinkargs,
593 .pc_encode = (kxdrproc_t) nfssvc_encode_readlinkres,
594 .pc_argsize = sizeof(struct nfsd_readlinkargs),
595 .pc_ressize = sizeof(struct nfsd_readlinkres),
596 .pc_cachetype = RC_NOCACHE,
597 .pc_xdrressize = ST+1+NFS_MAXPATHLEN/4,
598 },
599 [NFSPROC_READ] = {
600 .pc_func = (svc_procfunc) nfsd_proc_read,
601 .pc_decode = (kxdrproc_t) nfssvc_decode_readargs,
602 .pc_encode = (kxdrproc_t) nfssvc_encode_readres,
603 .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
604 .pc_argsize = sizeof(struct nfsd_readargs),
605 .pc_ressize = sizeof(struct nfsd_readres),
606 .pc_cachetype = RC_NOCACHE,
607 .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
608 },
609 [NFSPROC_WRITECACHE] = {
610 .pc_decode = (kxdrproc_t) nfssvc_decode_void,
611 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
612 .pc_argsize = sizeof(struct nfsd_void),
613 .pc_ressize = sizeof(struct nfsd_void),
614 .pc_cachetype = RC_NOCACHE,
615 .pc_xdrressize = ST,
616 },
617 [NFSPROC_WRITE] = {
618 .pc_func = (svc_procfunc) nfsd_proc_write,
619 .pc_decode = (kxdrproc_t) nfssvc_decode_writeargs,
620 .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat,
621 .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
622 .pc_argsize = sizeof(struct nfsd_writeargs),
623 .pc_ressize = sizeof(struct nfsd_attrstat),
624 .pc_cachetype = RC_REPLBUFF,
625 .pc_xdrressize = ST+AT,
626 },
627 [NFSPROC_CREATE] = {
628 .pc_func = (svc_procfunc) nfsd_proc_create,
629 .pc_decode = (kxdrproc_t) nfssvc_decode_createargs,
630 .pc_encode = (kxdrproc_t) nfssvc_encode_diropres,
631 .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
632 .pc_argsize = sizeof(struct nfsd_createargs),
633 .pc_ressize = sizeof(struct nfsd_diropres),
634 .pc_cachetype = RC_REPLBUFF,
635 .pc_xdrressize = ST+FH+AT,
636 },
637 [NFSPROC_REMOVE] = {
638 .pc_func = (svc_procfunc) nfsd_proc_remove,
639 .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs,
640 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
641 .pc_argsize = sizeof(struct nfsd_diropargs),
642 .pc_ressize = sizeof(struct nfsd_void),
643 .pc_cachetype = RC_REPLSTAT,
644 .pc_xdrressize = ST,
645 },
646 [NFSPROC_RENAME] = {
647 .pc_func = (svc_procfunc) nfsd_proc_rename,
648 .pc_decode = (kxdrproc_t) nfssvc_decode_renameargs,
649 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
650 .pc_argsize = sizeof(struct nfsd_renameargs),
651 .pc_ressize = sizeof(struct nfsd_void),
652 .pc_cachetype = RC_REPLSTAT,
653 .pc_xdrressize = ST,
654 },
655 [NFSPROC_LINK] = {
656 .pc_func = (svc_procfunc) nfsd_proc_link,
657 .pc_decode = (kxdrproc_t) nfssvc_decode_linkargs,
658 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
659 .pc_argsize = sizeof(struct nfsd_linkargs),
660 .pc_ressize = sizeof(struct nfsd_void),
661 .pc_cachetype = RC_REPLSTAT,
662 .pc_xdrressize = ST,
663 },
664 [NFSPROC_SYMLINK] = {
665 .pc_func = (svc_procfunc) nfsd_proc_symlink,
666 .pc_decode = (kxdrproc_t) nfssvc_decode_symlinkargs,
667 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
668 .pc_argsize = sizeof(struct nfsd_symlinkargs),
669 .pc_ressize = sizeof(struct nfsd_void),
670 .pc_cachetype = RC_REPLSTAT,
671 .pc_xdrressize = ST,
672 },
673 [NFSPROC_MKDIR] = {
674 .pc_func = (svc_procfunc) nfsd_proc_mkdir,
675 .pc_decode = (kxdrproc_t) nfssvc_decode_createargs,
676 .pc_encode = (kxdrproc_t) nfssvc_encode_diropres,
677 .pc_release = (kxdrproc_t) nfssvc_release_fhandle,
678 .pc_argsize = sizeof(struct nfsd_createargs),
679 .pc_ressize = sizeof(struct nfsd_diropres),
680 .pc_cachetype = RC_REPLBUFF,
681 .pc_xdrressize = ST+FH+AT,
682 },
683 [NFSPROC_RMDIR] = {
684 .pc_func = (svc_procfunc) nfsd_proc_rmdir,
685 .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs,
686 .pc_encode = (kxdrproc_t) nfssvc_encode_void,
687 .pc_argsize = sizeof(struct nfsd_diropargs),
688 .pc_ressize = sizeof(struct nfsd_void),
689 .pc_cachetype = RC_REPLSTAT,
690 .pc_xdrressize = ST,
691 },
692 [NFSPROC_READDIR] = {
693 .pc_func = (svc_procfunc) nfsd_proc_readdir,
694 .pc_decode = (kxdrproc_t) nfssvc_decode_readdirargs,
695 .pc_encode = (kxdrproc_t) nfssvc_encode_readdirres,
696 .pc_argsize = sizeof(struct nfsd_readdirargs),
697 .pc_ressize = sizeof(struct nfsd_readdirres),
698 .pc_cachetype = RC_NOCACHE,
699 },
700 [NFSPROC_STATFS] = {
701 .pc_func = (svc_procfunc) nfsd_proc_statfs,
702 .pc_decode = (kxdrproc_t) nfssvc_decode_fhandle,
703 .pc_encode = (kxdrproc_t) nfssvc_encode_statfsres,
704 .pc_argsize = sizeof(struct nfsd_fhandle),
705 .pc_ressize = sizeof(struct nfsd_statfsres),
706 .pc_cachetype = RC_NOCACHE,
707 .pc_xdrressize = ST+5,
708 },
575}; 709};
576 710
577 711
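
The nfsproc.c hunk converts the PROC() macro table to C99 designated initializers indexed by the NFSPROC_* constants, so unimplemented slots (NFSPROC_ROOT, NFSPROC_WRITECACHE) can simply leave .pc_func unset instead of pointing at an nfsd_proc_none placeholder. A toy table in the same style (made-up struct, not struct svc_procedure; for illustration only):

    #include <stdio.h>

    enum { PROC_NULL, PROC_GETATTR, PROC_ROOT, PROC_MAX };

    struct proc_entry {
            const char *name;
            int (*func)(void);    /* NULL for vacant slots */
    };

    static int do_null(void)    { return 0; }
    static int do_getattr(void) { return 0; }

    static const struct proc_entry procs[PROC_MAX] = {
            [PROC_NULL]    = { .name = "null",    .func = do_null },
            [PROC_GETATTR] = { .name = "getattr", .func = do_getattr },
            /* [PROC_ROOT] omitted: slot is zero-filled, .func stays NULL */
    };

    int main(void)
    {
            printf("%s has handler: %s\n", procs[PROC_GETATTR].name,
                   procs[PROC_GETATTR].func ? "yes" : "no");
            return 0;
    }
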
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index cbba4a935786..492c79b7800b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -18,7 +18,6 @@
18#include <linux/unistd.h> 18#include <linux/unistd.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/smp.h> 20#include <linux/smp.h>
21#include <linux/smp_lock.h>
22#include <linux/freezer.h> 21#include <linux/freezer.h>
23#include <linux/fs_struct.h> 22#include <linux/fs_struct.h>
24#include <linux/kthread.h> 23#include <linux/kthread.h>
@@ -390,12 +389,14 @@ nfsd_svc(unsigned short port, int nrservs)
390 389
391 mutex_lock(&nfsd_mutex); 390 mutex_lock(&nfsd_mutex);
392 dprintk("nfsd: creating service\n"); 391 dprintk("nfsd: creating service\n");
393 error = -EINVAL;
394 if (nrservs <= 0) 392 if (nrservs <= 0)
395 nrservs = 0; 393 nrservs = 0;
396 if (nrservs > NFSD_MAXSERVS) 394 if (nrservs > NFSD_MAXSERVS)
397 nrservs = NFSD_MAXSERVS; 395 nrservs = NFSD_MAXSERVS;
398 396 error = 0;
397 if (nrservs == 0 && nfsd_serv == NULL)
398 goto out;
399
399 /* Readahead param cache - will no-op if it already exists */ 400 /* Readahead param cache - will no-op if it already exists */
400 error = nfsd_racache_init(2*nrservs); 401 error = nfsd_racache_init(2*nrservs);
401 if (error<0) 402 if (error<0)
@@ -413,6 +414,12 @@ nfsd_svc(unsigned short port, int nrservs)
413 goto failure; 414 goto failure;
414 415
415 error = svc_set_num_threads(nfsd_serv, NULL, nrservs); 416 error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
417 if (error == 0)
418 /* We are holding a reference to nfsd_serv which
419 * we don't want to count in the return value,
420 * so subtract 1
421 */
422 error = nfsd_serv->sv_nrthreads - 1;
416 failure: 423 failure:
417 svc_destroy(nfsd_serv); /* Release server */ 424 svc_destroy(nfsd_serv); /* Release server */
418 out: 425 out:
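
With this hunk nfsd_svc() stops returning 0 on success and instead returns the resulting thread count, minus the one reference the caller itself holds on nfsd_serv; that is why write_svc() and write_threads() above now treat only negative values as errors. A minimal model of the new caller convention (fake_nfsd_svc() is invented for the sketch):

    #include <stdio.h>

    static int fake_nfsd_svc(int port, int nrservs)
    {
            (void)port;
            return nrservs;    /* >= 0: running threads; < 0: -errno */
    }

    int main(void)
    {
            int err = fake_nfsd_svc(2049, 8);

            if (err < 0)
                    return 1;            /* write_svc() returns err here */
            printf("%d threads\n", err); /* write_svc() returns 0 here */
            return 0;
    }
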
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b660435978d2..23341c1063bc 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -55,6 +55,7 @@
55#include <linux/security.h> 55#include <linux/security.h>
56#endif /* CONFIG_NFSD_V4 */ 56#endif /* CONFIG_NFSD_V4 */
57#include <linux/jhash.h> 57#include <linux/jhash.h>
58#include <linux/ima.h>
58 59
59#include <asm/uaccess.h> 60#include <asm/uaccess.h>
60 61
@@ -100,36 +101,35 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
100{ 101{
101 struct svc_export *exp = *expp, *exp2 = NULL; 102 struct svc_export *exp = *expp, *exp2 = NULL;
102 struct dentry *dentry = *dpp; 103 struct dentry *dentry = *dpp;
103 struct vfsmount *mnt = mntget(exp->ex_path.mnt); 104 struct path path = {.mnt = mntget(exp->ex_path.mnt),
104 struct dentry *mounts = dget(dentry); 105 .dentry = dget(dentry)};
105 int err = 0; 106 int err = 0;
106 107
107 while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)); 108 while (d_mountpoint(path.dentry) && follow_down(&path))
109 ;
108 110
109 exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts); 111 exp2 = rqst_exp_get_by_name(rqstp, &path);
110 if (IS_ERR(exp2)) { 112 if (IS_ERR(exp2)) {
111 if (PTR_ERR(exp2) != -ENOENT) 113 if (PTR_ERR(exp2) != -ENOENT)
112 err = PTR_ERR(exp2); 114 err = PTR_ERR(exp2);
113 dput(mounts); 115 path_put(&path);
114 mntput(mnt);
115 goto out; 116 goto out;
116 } 117 }
117 if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { 118 if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
118 /* successfully crossed mount point */ 119 /* successfully crossed mount point */
119 /* 120 /*
120 * This is subtle: dentry is *not* under mnt at this point. 121 * This is subtle: path.dentry is *not* on path.mnt
121 * The only reason we are safe is that original mnt is pinned 122 * at this point. The only reason we are safe is that
122 * down by exp, so we should dput before putting exp. 123 * original mnt is pinned down by exp, so we should
124 * put path *before* putting exp
123 */ 125 */
124 dput(dentry); 126 *dpp = path.dentry;
125 *dpp = mounts; 127 path.dentry = dentry;
126 exp_put(exp);
127 *expp = exp2; 128 *expp = exp2;
128 } else { 129 exp2 = exp;
129 exp_put(exp2);
130 dput(mounts);
131 } 130 }
132 mntput(mnt); 131 path_put(&path);
132 exp_put(exp2);
133out: 133out:
134 return err; 134 return err;
135} 135}
@@ -168,28 +168,29 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
168 /* checking mountpoint crossing is very different when stepping up */ 168 /* checking mountpoint crossing is very different when stepping up */
169 struct svc_export *exp2 = NULL; 169 struct svc_export *exp2 = NULL;
170 struct dentry *dp; 170 struct dentry *dp;
171 struct vfsmount *mnt = mntget(exp->ex_path.mnt); 171 struct path path = {.mnt = mntget(exp->ex_path.mnt),
172 dentry = dget(dparent); 172 .dentry = dget(dparent)};
173 while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry)) 173
174 while (path.dentry == path.mnt->mnt_root &&
175 follow_up(&path))
174 ; 176 ;
175 dp = dget_parent(dentry); 177 dp = dget_parent(path.dentry);
176 dput(dentry); 178 dput(path.dentry);
177 dentry = dp; 179 path.dentry = dp;
178 180
179 exp2 = rqst_exp_parent(rqstp, mnt, dentry); 181 exp2 = rqst_exp_parent(rqstp, &path);
180 if (PTR_ERR(exp2) == -ENOENT) { 182 if (PTR_ERR(exp2) == -ENOENT) {
181 dput(dentry);
182 dentry = dget(dparent); 183 dentry = dget(dparent);
183 } else if (IS_ERR(exp2)) { 184 } else if (IS_ERR(exp2)) {
184 host_err = PTR_ERR(exp2); 185 host_err = PTR_ERR(exp2);
185 dput(dentry); 186 path_put(&path);
186 mntput(mnt);
187 goto out_nfserr; 187 goto out_nfserr;
188 } else { 188 } else {
189 dentry = dget(path.dentry);
189 exp_put(exp); 190 exp_put(exp);
190 exp = exp2; 191 exp = exp2;
191 } 192 }
192 mntput(mnt); 193 path_put(&path);
193 } 194 }
194 } else { 195 } else {
195 fh_lock(fhp); 196 fh_lock(fhp);
@@ -677,7 +678,6 @@ __be32
677nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, 678nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
678 int access, struct file **filp) 679 int access, struct file **filp)
679{ 680{
680 const struct cred *cred = current_cred();
681 struct dentry *dentry; 681 struct dentry *dentry;
682 struct inode *inode; 682 struct inode *inode;
683 int flags = O_RDONLY|O_LARGEFILE; 683 int flags = O_RDONLY|O_LARGEFILE;
@@ -732,9 +732,11 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
732 vfs_dq_init(inode); 732 vfs_dq_init(inode);
733 } 733 }
734 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), 734 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
735 flags, cred); 735 flags, current_cred());
736 if (IS_ERR(*filp)) 736 if (IS_ERR(*filp))
737 host_err = PTR_ERR(*filp); 737 host_err = PTR_ERR(*filp);
738 else
739 ima_counts_get(*filp);
738out_nfserr: 740out_nfserr:
739 err = nfserrno(host_err); 741 err = nfserrno(host_err);
740out: 742out:
@@ -963,6 +965,43 @@ static void kill_suid(struct dentry *dentry)
963 mutex_unlock(&dentry->d_inode->i_mutex); 965 mutex_unlock(&dentry->d_inode->i_mutex);
964} 966}
965 967
968/*
969 * Gathered writes: If another process is currently writing to the file,
970 * there's a high chance this is another nfsd (triggered by a bulk write
971 * from a client's biod). Rather than syncing the file with each write
972 * request, we sleep for 10 msec.
973 *
974 * I don't know if this roughly approximates C. Juszak's idea of
975 * gathered writes, but it's a nice and simple solution (IMHO), and it
976 * seems to work:-)
977 *
978 * Note: we do this only in the NFSv2 case, since v3 and higher have a
979 * better tool (separate unstable writes and commits) for solving this
980 * problem.
981 */
982static int wait_for_concurrent_writes(struct file *file)
983{
984 struct inode *inode = file->f_path.dentry->d_inode;
985 static ino_t last_ino;
986 static dev_t last_dev;
987 int err = 0;
988
989 if (atomic_read(&inode->i_writecount) > 1
990 || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
991 dprintk("nfsd: write defer %d\n", task_pid_nr(current));
992 msleep(10);
993 dprintk("nfsd: write resume %d\n", task_pid_nr(current));
994 }
995
996 if (inode->i_state & I_DIRTY) {
997 dprintk("nfsd: write sync %d\n", task_pid_nr(current));
998 err = nfsd_sync(file);
999 }
1000 last_ino = inode->i_ino;
1001 last_dev = inode->i_sb->s_dev;
1002 return err;
1003}
1004
966static __be32 1005static __be32
967nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1006nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
968 loff_t offset, struct kvec *vec, int vlen, 1007 loff_t offset, struct kvec *vec, int vlen,
@@ -975,6 +1014,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
975 __be32 err = 0; 1014 __be32 err = 0;
976 int host_err; 1015 int host_err;
977 int stable = *stablep; 1016 int stable = *stablep;
1017 int use_wgather;
978 1018
979#ifdef MSNFS 1019#ifdef MSNFS
980 err = nfserr_perm; 1020 err = nfserr_perm;
@@ -993,9 +1033,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
993 * - the sync export option has been set, or 1033 * - the sync export option has been set, or
994 * - the client requested O_SYNC behavior (NFSv3 feature). 1034 * - the client requested O_SYNC behavior (NFSv3 feature).
995 * - The file system doesn't support fsync(). 1035 * - The file system doesn't support fsync().
996 * When gathered writes have been configured for this volume, 1036 * When NFSv2 gathered writes have been configured for this volume,
997 * flushing the data to disk is handled separately below. 1037 * flushing the data to disk is handled separately below.
998 */ 1038 */
1039 use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
999 1040
1000 if (!file->f_op->fsync) {/* COMMIT3 cannot work */ 1041 if (!file->f_op->fsync) {/* COMMIT3 cannot work */
1001 stable = 2; 1042 stable = 2;
@@ -1004,7 +1045,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1004 1045
1005 if (!EX_ISSYNC(exp)) 1046 if (!EX_ISSYNC(exp))
1006 stable = 0; 1047 stable = 0;
1007 if (stable && !EX_WGATHER(exp)) { 1048 if (stable && !use_wgather) {
1008 spin_lock(&file->f_lock); 1049 spin_lock(&file->f_lock);
1009 file->f_flags |= O_SYNC; 1050 file->f_flags |= O_SYNC;
1010 spin_unlock(&file->f_lock); 1051 spin_unlock(&file->f_lock);
@@ -1014,52 +1055,20 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1014 oldfs = get_fs(); set_fs(KERNEL_DS); 1055 oldfs = get_fs(); set_fs(KERNEL_DS);
1015 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); 1056 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
1016 set_fs(oldfs); 1057 set_fs(oldfs);
1017 if (host_err >= 0) { 1058 if (host_err < 0)
1018 *cnt = host_err; 1059 goto out_nfserr;
1019 nfsdstats.io_write += host_err; 1060 *cnt = host_err;
1020 fsnotify_modify(file->f_path.dentry); 1061 nfsdstats.io_write += host_err;
1021 } 1062 fsnotify_modify(file->f_path.dentry);
1022 1063
1023 /* clear setuid/setgid flag after write */ 1064 /* clear setuid/setgid flag after write */
1024 if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) 1065 if (inode->i_mode & (S_ISUID | S_ISGID))
1025 kill_suid(dentry); 1066 kill_suid(dentry);
1026 1067
1027 if (host_err >= 0 && stable) { 1068 if (stable && use_wgather)
1028 static ino_t last_ino; 1069 host_err = wait_for_concurrent_writes(file);
1029 static dev_t last_dev;
1030
1031 /*
1032 * Gathered writes: If another process is currently
1033 * writing to the file, there's a high chance
1034 * this is another nfsd (triggered by a bulk write
1035 * from a client's biod). Rather than syncing the
1036 * file with each write request, we sleep for 10 msec.
1037 *
1038 * I don't know if this roughly approximates
1039 * C. Juszak's idea of gathered writes, but it's a
1040 * nice and simple solution (IMHO), and it seems to
1041 * work:-)
1042 */
1043 if (EX_WGATHER(exp)) {
1044 if (atomic_read(&inode->i_writecount) > 1
1045 || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
1046 dprintk("nfsd: write defer %d\n", task_pid_nr(current));
1047 msleep(10);
1048 dprintk("nfsd: write resume %d\n", task_pid_nr(current));
1049 }
1050
1051 if (inode->i_state & I_DIRTY) {
1052 dprintk("nfsd: write sync %d\n", task_pid_nr(current));
1053 host_err=nfsd_sync(file);
1054 }
1055#if 0
1056 wake_up(&inode->i_wait);
1057#endif
1058 }
1059 last_ino = inode->i_ino;
1060 last_dev = inode->i_sb->s_dev;
1061 }
1062 1070
1071out_nfserr:
1063 dprintk("nfsd: write complete host_err=%d\n", host_err); 1072 dprintk("nfsd: write complete host_err=%d\n", host_err);
1064 if (host_err >= 0) 1073 if (host_err >= 0)
1065 err = 0; 1074 err = 0;
@@ -2024,6 +2033,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2024 struct dentry *dentry, int acc) 2033 struct dentry *dentry, int acc)
2025{ 2034{
2026 struct inode *inode = dentry->d_inode; 2035 struct inode *inode = dentry->d_inode;
2036 struct path path;
2027 int err; 2037 int err;
2028 2038
2029 if (acc == NFSD_MAY_NOP) 2039 if (acc == NFSD_MAY_NOP)
@@ -2096,7 +2106,17 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2096 if (err == -EACCES && S_ISREG(inode->i_mode) && 2106 if (err == -EACCES && S_ISREG(inode->i_mode) &&
2097 acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) 2107 acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
2098 err = inode_permission(inode, MAY_EXEC); 2108 err = inode_permission(inode, MAY_EXEC);
2109 if (err)
2110 goto nfsd_out;
2099 2111
2112 /* Do integrity (permission) checking now, but defer incrementing
2113 * IMA counts to the actual file open.
2114 */
2115 path.mnt = exp->ex_path.mnt;
2116 path.dentry = dentry;
2117 err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC),
2118 IMA_COUNT_LEAVE);
2119nfsd_out:
2100 return err? nfserrno(err) : 0; 2120 return err? nfserrno(err) : 0;
2101} 2121}
2102 2122
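
The vfs.c hunks bundle each vfsmount/dentry pair into a struct path so that a single path_put() releases both references in the correct order, and they hoist the NFSv2 write-gathering heuristic into wait_for_concurrent_writes(). The pairing idea, modelled with toy refcounts (struct pair and the get/put helpers are invented for the sketch):

    #include <stdio.h>

    struct ref { int count; };

    struct pair {                    /* stands in for struct path */
            struct ref *mnt;
            struct ref *dentry;
    };

    static void get(struct ref *r) { r->count++; }
    static void put(struct ref *r) { r->count--; }

    static void pair_put(struct pair *p)  /* like path_put() */
    {
            put(p->dentry);          /* dentry dropped before mount */
            put(p->mnt);
    }

    int main(void)
    {
            struct ref m = { 1 }, d = { 1 };
            struct pair p = { &m, &d };

            get(p.mnt);              /* mntget() at setup */
            get(p.dentry);           /* dget() at setup */
            pair_put(&p);            /* one call drops both references */
            printf("%d %d\n", m.count, d.count); /* back to 1 1 */
            return 0;
    }
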
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
new file mode 100644
index 000000000000..72da095d4009
--- /dev/null
+++ b/fs/nilfs2/Kconfig
@@ -0,0 +1,25 @@
1config NILFS2_FS
2 tristate "NILFS2 file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 select CRC32
5 help
6 NILFS2 is a log-structured file system (LFS) supporting continuous
7 snapshotting. In addition to versioning capability of the entire
8 file system, users can even restore files mistakenly overwritten or
9 destroyed just a few seconds ago. Since this file system can keep
10 consistency like conventional LFS, it achieves quick recovery after
11 system crashes.
12
13 NILFS2 creates a number of checkpoints every few seconds or on a
14 per-synchronous-write basis (unless there is no change). Users can
15 select significant versions among continuously created checkpoints,
16 and can change them into snapshots which will be preserved for long
17 periods until they are changed back to checkpoints. Each
18 snapshot is mountable as a read-only file system concurrently with
19 its writable mount, and this feature is convenient for online backup.
20
21 Some features including atime, extended attributes, and POSIX ACLs,
22 are not supported yet.
23
24 To compile this file system support as a module, choose M here: the
25 module will be called nilfs2. If unsure, say N.
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 064279e33bbb..99d58a028b94 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -31,21 +31,26 @@
31#include "dat.h" 31#include "dat.h"
32#include "alloc.h" 32#include "alloc.h"
33 33
34struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
35{
36 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
37}
38
34int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, 39int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
35 __u64 *ptrp) 40 __u64 *ptrp)
36{ 41{
37 __u64 ptr; 42 sector_t blocknr;
38 int ret; 43 int ret;
39 44
40 down_read(&bmap->b_sem); 45 down_read(&bmap->b_sem);
41 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); 46 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
42 if (ret < 0) 47 if (ret < 0)
43 goto out; 48 goto out;
44 if (bmap->b_pops->bpop_translate != NULL) { 49 if (NILFS_BMAP_USE_VBN(bmap)) {
45 ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr); 50 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp,
46 if (ret < 0) 51 &blocknr);
47 goto out; 52 if (!ret)
48 *ptrp = ptr; 53 *ptrp = blocknr;
49 } 54 }
50 55
51 out: 56 out:
@@ -53,6 +58,16 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
53 return ret; 58 return ret;
54} 59}
55 60
61int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
62 unsigned maxblocks)
63{
64 int ret;
65
66 down_read(&bmap->b_sem);
67 ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks);
68 up_read(&bmap->b_sem);
69 return ret;
70}
56 71
57/** 72/**
58 * nilfs_bmap_lookup - find a record 73 * nilfs_bmap_lookup - find a record
@@ -101,8 +116,7 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
101 if (n < 0) 116 if (n < 0)
102 return n; 117 return n;
103 ret = nilfs_btree_convert_and_insert( 118 ret = nilfs_btree_convert_and_insert(
104 bmap, key, ptr, keys, ptrs, n, 119 bmap, key, ptr, keys, ptrs, n);
105 NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH);
106 if (ret == 0) 120 if (ret == 0)
107 bmap->b_u.u_flags |= NILFS_BMAP_LARGE; 121 bmap->b_u.u_flags |= NILFS_BMAP_LARGE;
108 122
@@ -158,8 +172,7 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
158 if (n < 0) 172 if (n < 0)
159 return n; 173 return n;
160 ret = nilfs_direct_delete_and_convert( 174 ret = nilfs_direct_delete_and_convert(
161 bmap, key, keys, ptrs, n, 175 bmap, key, keys, ptrs, n);
162 NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH);
163 if (ret == 0) 176 if (ret == 0)
164 bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE; 177 bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE;
165 178
@@ -417,38 +430,6 @@ void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
417 mark_inode_dirty(bmap->b_inode); 430 mark_inode_dirty(bmap->b_inode);
418} 431}
419 432
420int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr,
421 struct buffer_head **bhp)
422{
423 return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
424 ptr, 0, bhp, 0);
425}
426
427void nilfs_bmap_put_block(const struct nilfs_bmap *bmap,
428 struct buffer_head *bh)
429{
430 brelse(bh);
431}
432
433int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr,
434 struct buffer_head **bhp)
435{
436 int ret;
437
438 ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
439 ptr, 0, bhp, 1);
440 if (ret < 0)
441 return ret;
442 set_buffer_nilfs_volatile(*bhp);
443 return 0;
444}
445
446void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap,
447 struct buffer_head *bh)
448{
449 nilfs_btnode_delete(bh);
450}
451
452__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, 433__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
453 const struct buffer_head *bh) 434 const struct buffer_head *bh)
454{ 435{
@@ -476,11 +457,6 @@ __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key)
476 return NILFS_BMAP_INVALID_PTR; 457 return NILFS_BMAP_INVALID_PTR;
477} 458}
478 459
479static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
480{
481 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
482}
483
484#define NILFS_BMAP_GROUP_DIV 8 460#define NILFS_BMAP_GROUP_DIV 8
485__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) 461__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
486{ 462{
@@ -493,64 +469,51 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
493 (entries_per_group / NILFS_BMAP_GROUP_DIV); 469 (entries_per_group / NILFS_BMAP_GROUP_DIV);
494} 470}
495 471
496static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap, 472int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
497 union nilfs_bmap_ptr_req *req) 473 union nilfs_bmap_ptr_req *req)
498{ 474{
499 return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); 475 return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
500} 476}
501 477
502static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap, 478void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
503 union nilfs_bmap_ptr_req *req) 479 union nilfs_bmap_ptr_req *req)
504{ 480{
505 nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); 481 nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
506} 482}
507 483
508static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap, 484void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
509 union nilfs_bmap_ptr_req *req) 485 union nilfs_bmap_ptr_req *req)
510{ 486{
511 nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); 487 nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
512} 488}
513 489
514static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap, 490int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req,
515 union nilfs_bmap_ptr_req *req) 491 sector_t blocknr)
516{ 492{
517 return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req); 493 struct inode *dat = nilfs_bmap_get_dat(bmap);
518} 494 int ret;
519
520static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap,
521 union nilfs_bmap_ptr_req *req,
522 sector_t blocknr)
523{
524 nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req,
525 blocknr);
526}
527 495
528static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap, 496 ret = nilfs_dat_prepare_start(dat, &req->bpr_req);
529 union nilfs_bmap_ptr_req *req) 497 if (likely(!ret))
530{ 498 nilfs_dat_commit_start(dat, &req->bpr_req, blocknr);
531 nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req); 499 return ret;
532} 500}
533 501
534static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap, 502int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
535 union nilfs_bmap_ptr_req *req) 503 union nilfs_bmap_ptr_req *req)
536{ 504{
537 return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); 505 return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
538} 506}
539 507
540static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap, 508void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
541 union nilfs_bmap_ptr_req *req) 509 union nilfs_bmap_ptr_req *req)
542{
543 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0);
544}
545
546static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap,
547 union nilfs_bmap_ptr_req *req)
548{ 510{
549 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1); 511 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req,
512 bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
550} 513}
551 514
552static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap, 515void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
553 union nilfs_bmap_ptr_req *req) 516 union nilfs_bmap_ptr_req *req)
554{ 517{
555 nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); 518 nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
556} 519}
@@ -566,129 +529,46 @@ int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
566 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr); 529 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
567} 530}
568 531
569int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap, 532int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap,
570 union nilfs_bmap_ptr_req *oldreq, 533 union nilfs_bmap_ptr_req *oldreq,
571 union nilfs_bmap_ptr_req *newreq) 534 union nilfs_bmap_ptr_req *newreq)
572{ 535{
+	struct inode *dat = nilfs_bmap_get_dat(bmap);
 	int ret;
 
-	ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq);
+	ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req);
 	if (ret < 0)
 		return ret;
-	ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq);
+	ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req);
 	if (ret < 0)
-		bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
+		nilfs_dat_abort_end(dat, &oldreq->bpr_req);
 
 	return ret;
 }
 
-void nilfs_bmap_commit_update(struct nilfs_bmap *bmap,
+void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap,
 			      union nilfs_bmap_ptr_req *oldreq,
 			      union nilfs_bmap_ptr_req *newreq)
 {
-	bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq);
-	bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq);
-}
+	struct inode *dat = nilfs_bmap_get_dat(bmap);
 
-void nilfs_bmap_abort_update(struct nilfs_bmap *bmap,
-			     union nilfs_bmap_ptr_req *oldreq,
-			     union nilfs_bmap_ptr_req *newreq)
-{
-	bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
-	bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq);
+	nilfs_dat_commit_end(dat, &oldreq->bpr_req,
+			     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
+	nilfs_dat_commit_alloc(dat, &newreq->bpr_req);
 }
 
-static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr,
-				  __u64 *ptrp)
+void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap,
+			       union nilfs_bmap_ptr_req *oldreq,
+			       union nilfs_bmap_ptr_req *newreq)
 {
-	sector_t blocknr;
-	int ret;
-
-	ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr);
-	if (ret < 0)
-		return ret;
-	if (ptrp != NULL)
-		*ptrp = blocknr;
-	return 0;
-}
+	struct inode *dat = nilfs_bmap_get_dat(bmap);
 
-static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap,
-				      union nilfs_bmap_ptr_req *req)
-{
-	/* ignore target ptr */
-	req->bpr_ptr = bmap->b_last_allocated_ptr++;
-	return 0;
+	nilfs_dat_abort_end(dat, &oldreq->bpr_req);
+	nilfs_dat_abort_alloc(dat, &newreq->bpr_req);
 }
 
-static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap,
-				      union nilfs_bmap_ptr_req *req)
-{
-	/* do nothing */
-}
-
-static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap,
-				     union nilfs_bmap_ptr_req *req)
-{
-	bmap->b_last_allocated_ptr--;
-}
-
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = {
-	.bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
-	.bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
-	.bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
-	.bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
-	.bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
-	.bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
-	.bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
-	.bpop_commit_end_ptr = nilfs_bmap_commit_end_v,
-	.bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
-
-	.bpop_translate = nilfs_bmap_translate_v,
-};
-
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = {
-	.bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
-	.bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
-	.bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
-	.bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
-	.bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
-	.bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
-	.bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
-	.bpop_commit_end_ptr = nilfs_bmap_commit_end_vmdt,
-	.bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
-
-	.bpop_translate = nilfs_bmap_translate_v,
-};
-
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = {
-	.bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_p,
-	.bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_p,
-	.bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_p,
-	.bpop_prepare_start_ptr = NULL,
-	.bpop_commit_start_ptr = NULL,
-	.bpop_abort_start_ptr = NULL,
-	.bpop_prepare_end_ptr = NULL,
-	.bpop_commit_end_ptr = NULL,
-	.bpop_abort_end_ptr = NULL,
-
-	.bpop_translate = NULL,
-};
-
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = {
-	.bpop_prepare_alloc_ptr = NULL,
-	.bpop_commit_alloc_ptr = NULL,
-	.bpop_abort_alloc_ptr = NULL,
-	.bpop_prepare_start_ptr = NULL,
-	.bpop_commit_start_ptr = NULL,
-	.bpop_abort_start_ptr = NULL,
-	.bpop_prepare_end_ptr = NULL,
-	.bpop_commit_end_ptr = NULL,
-	.bpop_abort_end_ptr = NULL,
-
-	.bpop_translate = NULL,
-};
-
 static struct lock_class_key nilfs_bmap_dat_lock_key;
+static struct lock_class_key nilfs_bmap_mdt_lock_key;
 
 /**
  * nilfs_bmap_read - read a bmap from an inode
@@ -714,31 +594,30 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
 	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
 	switch (bmap->b_inode->i_ino) {
 	case NILFS_DAT_INO:
-		bmap->b_pops = &nilfs_bmap_ptr_ops_p;
-		bmap->b_last_allocated_key = 0;	/* XXX: use macro */
+		bmap->b_ptr_type = NILFS_BMAP_PTR_P;
+		bmap->b_last_allocated_key = 0;
 		bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
 		lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
 		break;
 	case NILFS_CPFILE_INO:
 	case NILFS_SUFILE_INO:
-		bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt;
-		bmap->b_last_allocated_key = 0;	/* XXX: use macro */
+		bmap->b_ptr_type = NILFS_BMAP_PTR_VS;
+		bmap->b_last_allocated_key = 0;
 		bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
+		lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key);
 		break;
+	case NILFS_IFILE_INO:
+		lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key);
+		/* Fall through */
 	default:
-		bmap->b_pops = &nilfs_bmap_ptr_ops_v;
-		bmap->b_last_allocated_key = 0;	/* XXX: use macro */
+		bmap->b_ptr_type = NILFS_BMAP_PTR_VM;
+		bmap->b_last_allocated_key = 0;
 		bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
 		break;
 	}
 
 	return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ?
-		nilfs_btree_init(bmap,
-				 NILFS_BMAP_LARGE_LOW,
-				 NILFS_BMAP_LARGE_HIGH) :
-		nilfs_direct_init(bmap,
-				  NILFS_BMAP_SMALL_LOW,
-				  NILFS_BMAP_SMALL_HIGH);
+		nilfs_btree_init(bmap) : nilfs_direct_init(bmap);
 }
 
 /**
@@ -764,7 +643,7 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
 	memset(&bmap->b_u, 0, NILFS_BMAP_SIZE);
 	init_rwsem(&bmap->b_sem);
 	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
-	bmap->b_pops = &nilfs_bmap_ptr_ops_gc;
+	bmap->b_ptr_type = NILFS_BMAP_PTR_U;
 	bmap->b_last_allocated_key = 0;
 	bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
 	bmap->b_state = 0;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 4f2708abb1ba..b2890cdcef12 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -64,6 +64,8 @@ struct nilfs_bmap_stats {
  */
 struct nilfs_bmap_operations {
 	int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
+	int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *,
+				 unsigned);
 	int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
 	int (*bop_delete)(struct nilfs_bmap *, __u64);
 	void (*bop_clear)(struct nilfs_bmap *);
@@ -86,34 +88,6 @@ struct nilfs_bmap_operations {
 };
 
 
-/**
- * struct nilfs_bmap_ptr_operations - bmap ptr operation table
- */
-struct nilfs_bmap_ptr_operations {
-	int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *,
-				      union nilfs_bmap_ptr_req *);
-	void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *,
-				      union nilfs_bmap_ptr_req *);
-	void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *,
-				     union nilfs_bmap_ptr_req *);
-	int (*bpop_prepare_start_ptr)(struct nilfs_bmap *,
-				      union nilfs_bmap_ptr_req *);
-	void (*bpop_commit_start_ptr)(struct nilfs_bmap *,
-				      union nilfs_bmap_ptr_req *,
-				      sector_t);
-	void (*bpop_abort_start_ptr)(struct nilfs_bmap *,
-				     union nilfs_bmap_ptr_req *);
-	int (*bpop_prepare_end_ptr)(struct nilfs_bmap *,
-				    union nilfs_bmap_ptr_req *);
-	void (*bpop_commit_end_ptr)(struct nilfs_bmap *,
-				    union nilfs_bmap_ptr_req *);
-	void (*bpop_abort_end_ptr)(struct nilfs_bmap *,
-				   union nilfs_bmap_ptr_req *);
-
-	int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *);
-};
-
-
 #define NILFS_BMAP_SIZE		(NILFS_INODE_BMAP_SIZE * sizeof(__le64))
 #define NILFS_BMAP_KEY_BIT	(sizeof(unsigned long) * 8 /* CHAR_BIT */)
 #define NILFS_BMAP_NEW_PTR_INIT	\
@@ -131,11 +105,9 @@ static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
  * @b_sem: semaphore
  * @b_inode: owner of bmap
  * @b_ops: bmap operation table
- * @b_pops: bmap ptr operation table
- * @b_low: low watermark of conversion
- * @b_high: high watermark of conversion
  * @b_last_allocated_key: last allocated key for data block
 * @b_last_allocated_ptr: last allocated ptr for data block
+ * @b_ptr_type: pointer type
  * @b_state: state
  */
 struct nilfs_bmap {
@@ -146,14 +118,22 @@ struct nilfs_bmap {
 	struct rw_semaphore b_sem;
 	struct inode *b_inode;
 	const struct nilfs_bmap_operations *b_ops;
-	const struct nilfs_bmap_ptr_operations *b_pops;
-	__u64 b_low;
-	__u64 b_high;
 	__u64 b_last_allocated_key;
 	__u64 b_last_allocated_ptr;
+	int b_ptr_type;
 	int b_state;
 };
 
+/* pointer type */
+#define NILFS_BMAP_PTR_P	0	/* physical block number (i.e. LBN) */
+#define NILFS_BMAP_PTR_VS	1	/* virtual block number (single
+					   version) */
+#define NILFS_BMAP_PTR_VM	2	/* virtual block number (has multiple
+					   versions) */
+#define NILFS_BMAP_PTR_U	(-1)	/* never perform pointer operations */
+
+#define NILFS_BMAP_USE_VBN(bmap)	((bmap)->b_ptr_type > 0)
+
 /* state */
 #define NILFS_BMAP_DIRTY	0x00000001
 
@@ -162,6 +142,7 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
 void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
 int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
+int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
 int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
 int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
 int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
@@ -182,7 +163,67 @@ void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 /*
  * Internal use only
  */
+struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *);
+int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *,
+			       union nilfs_bmap_ptr_req *);
+void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *,
+			       union nilfs_bmap_ptr_req *);
+void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *,
+			      union nilfs_bmap_ptr_req *);
 
+static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap,
+					       union nilfs_bmap_ptr_req *req)
+{
+	if (NILFS_BMAP_USE_VBN(bmap))
+		return nilfs_bmap_prepare_alloc_v(bmap, req);
+	/* ignore target ptr */
+	req->bpr_ptr = bmap->b_last_allocated_ptr++;
+	return 0;
+}
+
+static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap,
+					       union nilfs_bmap_ptr_req *req)
+{
+	if (NILFS_BMAP_USE_VBN(bmap))
+		nilfs_bmap_commit_alloc_v(bmap, req);
+}
+
+static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap,
+					      union nilfs_bmap_ptr_req *req)
+{
+	if (NILFS_BMAP_USE_VBN(bmap))
+		nilfs_bmap_abort_alloc_v(bmap, req);
+	else
+		bmap->b_last_allocated_ptr--;
+}
+
+int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
+void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
+void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
+
+static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap,
+					     union nilfs_bmap_ptr_req *req)
+{
+	return NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_bmap_prepare_end_v(bmap, req) : 0;
+}
+
+static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap,
+					     union nilfs_bmap_ptr_req *req)
+{
+	if (NILFS_BMAP_USE_VBN(bmap))
+		nilfs_bmap_commit_end_v(bmap, req);
+}
+
+static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
+					    union nilfs_bmap_ptr_req *req)
+{
+	if (NILFS_BMAP_USE_VBN(bmap))
+		nilfs_bmap_abort_end_v(bmap, req);
+}
+
+int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *,
+		       sector_t);
 int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
 int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
 
@@ -193,28 +234,20 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
 __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
 __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
 
-int nilfs_bmap_prepare_update(struct nilfs_bmap *,
+int nilfs_bmap_prepare_update_v(struct nilfs_bmap *,
 			      union nilfs_bmap_ptr_req *,
 			      union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_update(struct nilfs_bmap *,
+void nilfs_bmap_commit_update_v(struct nilfs_bmap *,
 			      union nilfs_bmap_ptr_req *,
 			      union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_update(struct nilfs_bmap *,
+void nilfs_bmap_abort_update_v(struct nilfs_bmap *,
 			     union nilfs_bmap_ptr_req *,
 			     union nilfs_bmap_ptr_req *);
 
 void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
 void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
 
 
-int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64,
-			 struct buffer_head **);
-void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *);
-int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64,
-			     struct buffer_head **);
-void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *);
-
-
 /* Assume that bmap semaphore is locked. */
 static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
 {
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 4cc07b2c30e0..7e0b61be212e 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -46,15 +46,18 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc)
 	INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
 }
 
-static struct address_space_operations def_btnode_aops;
+static struct address_space_operations def_btnode_aops = {
+	.sync_page = block_sync_page,
+};
 
-void nilfs_btnode_cache_init(struct address_space *btnc)
+void nilfs_btnode_cache_init(struct address_space *btnc,
+			     struct backing_dev_info *bdi)
 {
 	btnc->host = NULL;  /* can safely set to host inode ? */
 	btnc->flags = 0;
 	mapping_set_gfp_mask(btnc, GFP_NOFS);
 	btnc->assoc_mapping = NULL;
-	btnc->backing_dev_info = &default_backing_dev_info;
+	btnc->backing_dev_info = bdi;
 	btnc->a_ops = &def_btnode_aops;
 }
 
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 35faa86444a7..3e2275172ed6 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -38,7 +38,7 @@ struct nilfs_btnode_chkey_ctxt {
 };
 
 void nilfs_btnode_cache_init_once(struct address_space *);
-void nilfs_btnode_cache_init(struct address_space *);
+void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
 			      struct buffer_head **, int);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 6b37a2767293..aa412724b64e 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -29,6 +29,7 @@
 #include "btnode.h"
 #include "btree.h"
 #include "alloc.h"
+#include "dat.h"
 
 /**
  * struct nilfs_btree_path - A path on which B-tree operations are executed
@@ -109,8 +110,7 @@ static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
 	     level < NILFS_BTREE_LEVEL_MAX;
 	     level++) {
 		if (path[level].bp_bh != NULL) {
-			nilfs_bmap_put_block(&btree->bt_bmap,
-					     path[level].bp_bh);
+			brelse(path[level].bp_bh);
 			path[level].bp_bh = NULL;
 		}
 		/* sib_bh is released or deleted by prepare or commit
@@ -123,10 +123,29 @@ static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
 	}
 }
 
-
 /*
  * B-tree node operations
  */
+static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr,
+				 struct buffer_head **bhp)
+{
+	struct address_space *btnc =
+		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
+	return nilfs_btnode_get(btnc, ptr, 0, bhp, 0);
+}
+
+static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
+				     __u64 ptr, struct buffer_head **bhp)
+{
+	struct address_space *btnc =
+		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
+	int ret;
+
+	ret = nilfs_btnode_get(btnc, ptr, 0, bhp, 1);
+	if (!ret)
+		set_buffer_nilfs_volatile(*bhp);
+	return ret;
+}
 
 static inline int
 nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
@@ -488,8 +507,7 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 	path[level].bp_index = index;
 
 	for (level--; level >= minlevel; level--) {
-		ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
-					   &path[level].bp_bh);
+		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
 		if (ret < 0)
 			return ret;
 		node = nilfs_btree_get_nonroot_node(btree, path, level);
@@ -535,8 +553,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 	path[level].bp_index = index;
 
 	for (level--; level > 0; level--) {
-		ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
-					   &path[level].bp_bh);
+		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
 		if (ret < 0)
 			return ret;
 		node = nilfs_btree_get_nonroot_node(btree, path, level);
@@ -579,6 +596,87 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
 	return ret;
 }
 
+static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
+				     __u64 key, __u64 *ptrp, unsigned maxblocks)
+{
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+	struct nilfs_btree_path *path;
+	struct nilfs_btree_node *node;
+	struct inode *dat = NULL;
+	__u64 ptr, ptr2;
+	sector_t blocknr;
+	int level = NILFS_BTREE_LEVEL_NODE_MIN;
+	int ret, cnt, index, maxlevel;
+
+	path = nilfs_btree_alloc_path(btree);
+	if (path == NULL)
+		return -ENOMEM;
+	nilfs_btree_init_path(btree, path);
+	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
+	if (ret < 0)
+		goto out;
+
+	if (NILFS_BMAP_USE_VBN(bmap)) {
+		dat = nilfs_bmap_get_dat(bmap);
+		ret = nilfs_dat_translate(dat, ptr, &blocknr);
+		if (ret < 0)
+			goto out;
+		ptr = blocknr;
+	}
+	cnt = 1;
+	if (cnt == maxblocks)
+		goto end;
+
+	maxlevel = nilfs_btree_height(btree) - 1;
+	node = nilfs_btree_get_node(btree, path, level);
+	index = path[level].bp_index + 1;
+	for (;;) {
+		while (index < nilfs_btree_node_get_nchildren(btree, node)) {
+			if (nilfs_btree_node_get_key(btree, node, index) !=
+			    key + cnt)
+				goto end;
+			ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
+			if (dat) {
+				ret = nilfs_dat_translate(dat, ptr2, &blocknr);
+				if (ret < 0)
+					goto out;
+				ptr2 = blocknr;
+			}
+			if (ptr2 != ptr + cnt || ++cnt == maxblocks)
+				goto end;
+			index++;
+			continue;
+		}
+		if (level == maxlevel)
+			break;
+
+		/* look-up right sibling node */
+		node = nilfs_btree_get_node(btree, path, level + 1);
+		index = path[level + 1].bp_index + 1;
+		if (index >= nilfs_btree_node_get_nchildren(btree, node) ||
+		    nilfs_btree_node_get_key(btree, node, index) != key + cnt)
+			break;
+		ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
+		path[level + 1].bp_index = index;
+
+		brelse(path[level].bp_bh);
+		path[level].bp_bh = NULL;
+		ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
+		if (ret < 0)
+			goto out;
+		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		index = 0;
+		path[level].bp_index = index;
+	}
+ end:
+	*ptrp = ptr;
+	ret = cnt;
+ out:
+	nilfs_btree_clear_path(btree, path);
+	nilfs_btree_free_path(btree, path);
+	return ret;
+}
+
 static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 				    struct nilfs_btree_path *path,
 				    int level, __u64 key)
@@ -669,13 +767,13 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 			nilfs_btree_node_get_key(btree, node, 0));
 
 	if (move) {
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+		brelse(path[level].bp_bh);
 		path[level].bp_bh = path[level].bp_sib_bh;
 		path[level].bp_sib_bh = NULL;
 		path[level].bp_index += lnchildren;
 		path[level + 1].bp_index--;
 	} else {
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+		brelse(path[level].bp_sib_bh);
 		path[level].bp_sib_bh = NULL;
 		path[level].bp_index -= n;
 	}
@@ -722,14 +820,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 	path[level + 1].bp_index--;
 
 	if (move) {
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+		brelse(path[level].bp_bh);
 		path[level].bp_bh = path[level].bp_sib_bh;
 		path[level].bp_sib_bh = NULL;
 		path[level].bp_index -=
 			nilfs_btree_node_get_nchildren(btree, node);
 		path[level + 1].bp_index++;
 	} else {
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+		brelse(path[level].bp_sib_bh);
 		path[level].bp_sib_bh = NULL;
 	}
 
@@ -781,7 +879,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 		*keyp = nilfs_btree_node_get_key(btree, right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
 
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+		brelse(path[level].bp_bh);
 		path[level].bp_bh = path[level].bp_sib_bh;
 		path[level].bp_sib_bh = NULL;
 	} else {
@@ -790,7 +888,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 		*keyp = nilfs_btree_node_get_key(btree, right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
 
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+		brelse(path[level].bp_sib_bh);
 		path[level].bp_sib_bh = NULL;
 	}
 
@@ -897,12 +995,12 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	level = NILFS_BTREE_LEVEL_DATA;
 
 	/* allocate a new ptr for data block */
-	if (btree->bt_ops->btop_find_target != NULL)
+	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
 		path[level].bp_newreq.bpr_ptr =
-			btree->bt_ops->btop_find_target(btree, path, key);
+			nilfs_btree_find_target_v(btree, path, key);
 
-	ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
-		&btree->bt_bmap, &path[level].bp_newreq);
+	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
+					   &path[level].bp_newreq);
 	if (ret < 0)
 		goto err_out_data;
 
@@ -924,8 +1022,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 		if (pindex > 0) {
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex - 1);
-			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
-						   &bh);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
@@ -936,7 +1033,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 				stats->bs_nblocks++;
 				goto out;
 			} else
-				nilfs_bmap_put_block(&btree->bt_bmap, bh);
+				brelse(bh);
 		}
 
 		/* right sibling */
@@ -944,8 +1041,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 		    nilfs_btree_node_get_nchildren(btree, parent) - 1) {
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex + 1);
-			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
-						   &bh);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
@@ -956,19 +1052,19 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 			stats->bs_nblocks++;
 			goto out;
 		} else
-			nilfs_bmap_put_block(&btree->bt_bmap, bh);
+			brelse(bh);
 	}
 
 	/* split */
 	path[level].bp_newreq.bpr_ptr =
 		path[level - 1].bp_newreq.bpr_ptr + 1;
-	ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
-		&btree->bt_bmap, &path[level].bp_newreq);
+	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
+					   &path[level].bp_newreq);
 	if (ret < 0)
 		goto err_out_child_node;
-	ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
+	ret = nilfs_btree_get_new_block(btree,
 				       path[level].bp_newreq.bpr_ptr,
 				       &bh);
 	if (ret < 0)
 		goto err_out_curr_node;
 
@@ -994,12 +1090,12 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* grow */
 	path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
-	ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
-		&btree->bt_bmap, &path[level].bp_newreq);
+	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
+					   &path[level].bp_newreq);
 	if (ret < 0)
 		goto err_out_child_node;
-	ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
-				       path[level].bp_newreq.bpr_ptr, &bh);
+	ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
+					&bh);
 	if (ret < 0)
 		goto err_out_curr_node;
 
@@ -1023,18 +1119,16 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
-						    &path[level].bp_newreq);
+	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
 err_out_child_node:
 	for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
-		nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
-		btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(
-			&btree->bt_bmap, &path[level].bp_newreq);
+		nilfs_btnode_delete(path[level].bp_sib_bh);
+		nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
+					   &path[level].bp_newreq);
 
 	}
 
-	btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
-						    &path[level].bp_newreq);
+	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
 err_out_data:
 	*levelp = level;
 	stats->bs_nblocks = 0;
@@ -1049,14 +1143,12 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
 
 	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
 	ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
-	if (btree->bt_ops->btop_set_target != NULL)
-		btree->bt_ops->btop_set_target(btree, key, ptr);
+	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
+		nilfs_btree_set_target_v(btree, key, ptr);
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
-		if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) {
-			btree->bt_bmap.b_pops->bpop_commit_alloc_ptr(
-				&btree->bt_bmap, &path[level - 1].bp_newreq);
-		}
+		nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
+					    &path[level - 1].bp_newreq);
 		path[level].bp_op(btree, path, level, &key, &ptr);
 	}
 
@@ -1153,7 +1245,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
 	nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(btree, node, 0));
 
-	nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+	brelse(path[level].bp_sib_bh);
 	path[level].bp_sib_bh = NULL;
 	path[level].bp_index += n;
 }
@@ -1192,7 +1284,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
 				nilfs_btree_node_get_key(btree, right, 0));
 	path[level + 1].bp_index--;
 
-	nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+	brelse(path[level].bp_sib_bh);
 	path[level].bp_sib_bh = NULL;
 }
 
@@ -1221,7 +1313,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_bh);
 	unlock_buffer(path[level].bp_sib_bh);
 
-	nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
+	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = path[level].bp_sib_bh;
 	path[level].bp_sib_bh = NULL;
 	path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
@@ -1252,7 +1344,7 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_bh);
 	unlock_buffer(path[level].bp_sib_bh);
 
-	nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
+	nilfs_btnode_delete(path[level].bp_sib_bh);
 	path[level].bp_sib_bh = NULL;
 	path[level + 1].bp_index++;
 }
@@ -1276,7 +1368,7 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
 	nilfs_btree_node_move_left(btree, root, child, n);
 	unlock_buffer(path[level].bp_bh);
 
-	nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
+	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = NULL;
 }
 
@@ -1300,12 +1392,10 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 		path[level].bp_oldreq.bpr_ptr =
 			nilfs_btree_node_get_ptr(btree, node,
 						 path[level].bp_index);
-		if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
-			ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
-				&btree->bt_bmap, &path[level].bp_oldreq);
-			if (ret < 0)
-				goto err_out_child_node;
-		}
+		ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
+						 &path[level].bp_oldreq);
+		if (ret < 0)
+			goto err_out_child_node;
 
 		if (nilfs_btree_node_get_nchildren(btree, node) >
 		    nilfs_btree_node_nchildren_min(btree, node)) {
@@ -1321,8 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			/* left sibling */
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex - 1);
-			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
-						   &bh);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
@@ -1343,8 +1432,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			/* right sibling */
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex + 1);
-			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
-						   &bh);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
@@ -1381,12 +1469,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 		node = nilfs_btree_get_root(btree);
 		path[level].bp_oldreq.bpr_ptr =
 			nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
-		if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
-			ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
-				&btree->bt_bmap, &path[level].bp_oldreq);
+
+		ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
+						 &path[level].bp_oldreq);
 		if (ret < 0)
 			goto err_out_child_node;
-		}
+
 		/* child of the root node is deleted */
 		path[level].bp_op = nilfs_btree_do_delete;
 		stats->bs_nblocks++;
@@ -1398,15 +1486,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
-		btree->bt_bmap.b_pops->bpop_abort_end_ptr(
-			&btree->bt_bmap, &path[level].bp_oldreq);
+	nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq);
 err_out_child_node:
 	for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
-		if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
-			btree->bt_bmap.b_pops->bpop_abort_end_ptr(
-				&btree->bt_bmap, &path[level].bp_oldreq);
+		brelse(path[level].bp_sib_bh);
+		nilfs_bmap_abort_end_ptr(&btree->bt_bmap,
+					 &path[level].bp_oldreq);
 	}
 	*levelp = level;
 	stats->bs_nblocks = 0;
@@ -1420,9 +1505,8 @@ static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
 	int level;
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
-		if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL)
-			btree->bt_bmap.b_pops->bpop_commit_end_ptr(
-				&btree->bt_bmap, &path[level].bp_oldreq);
+		nilfs_bmap_commit_end_ptr(&btree->bt_bmap,
+					  &path[level].bp_oldreq);
 		path[level].bp_op(btree, path, level, NULL, NULL);
 	}
 
@@ -1501,7 +1585,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
 	if (nchildren > 1)
 		return 0;
 	ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
-	ret = nilfs_bmap_get_block(bmap, ptr, &bh);
+	ret = nilfs_btree_get_block(btree, ptr, &bh);
 	if (ret < 0)
 		return ret;
 	node = (struct nilfs_btree_node *)bh->b_data;
@@ -1515,9 +1599,9 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
 	nextmaxkey = (nchildren > 1) ?
 		nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
 	if (bh != NULL)
-		nilfs_bmap_put_block(bmap, bh);
+		brelse(bh);
 
-	return (maxkey == key) && (nextmaxkey < bmap->b_low);
+	return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
 }
 
 static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
@@ -1542,7 +1626,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
 	nchildren = nilfs_btree_node_get_nchildren(btree, root);
 	WARN_ON(nchildren > 1);
 	ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
-	ret = nilfs_bmap_get_block(bmap, ptr, &bh);
+	ret = nilfs_btree_get_block(btree, ptr, &bh);
 	if (ret < 0)
 		return ret;
 	node = (struct nilfs_btree_node *)bh->b_data;
@@ -1563,7 +1647,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
 	}
 
 	if (bh != NULL)
-		nilfs_bmap_put_block(bmap, bh);
+		brelse(bh);
 
 	return nitems;
 }
@@ -1584,10 +1668,10 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 
 	/* for data */
 	/* cannot find near ptr */
-	if (btree->bt_ops->btop_find_target != NULL)
-		dreq->bpr_ptr
-			= btree->bt_ops->btop_find_target(btree, NULL, key);
-	ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq);
+	if (NILFS_BMAP_USE_VBN(bmap))
+		dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
+
+	ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq);
 	if (ret < 0)
 		return ret;
 
@@ -1595,11 +1679,11 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 	stats->bs_nblocks++;
 	if (nreq != NULL) {
 		nreq->bpr_ptr = dreq->bpr_ptr + 1;
-		ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq);
+		ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq);
 		if (ret < 0)
 			goto err_out_dreq;
 
-		ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh);
+		ret = nilfs_btree_get_new_block(btree, nreq->bpr_ptr, &bh);
 		if (ret < 0)
 			goto err_out_nreq;
 
@@ -1612,9 +1696,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 
 	/* error */
  err_out_nreq:
-	bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq);
+	nilfs_bmap_abort_alloc_ptr(bmap, nreq);
 err_out_dreq:
-	bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq);
+	nilfs_bmap_abort_alloc_ptr(bmap, dreq);
 	stats->bs_nblocks = 0;
 	return ret;
 
@@ -1624,7 +1708,7 @@ static void
 nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 				      __u64 key, __u64 ptr,
 				      const __u64 *keys, const __u64 *ptrs,
-				      int n, __u64 low, __u64 high,
+				      int n,
 				      union nilfs_bmap_ptr_req *dreq,
 				      union nilfs_bmap_ptr_req *nreq,
 				      struct buffer_head *bh)
@@ -1642,12 +1726,10 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 
 	/* convert and insert */
 	btree = (struct nilfs_btree *)bmap;
-	nilfs_btree_init(bmap, low, high);
+	nilfs_btree_init(bmap);
 	if (nreq != NULL) {
-		if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) {
-			bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
-			bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq);
-		}
+		nilfs_bmap_commit_alloc_ptr(bmap, dreq);
+		nilfs_bmap_commit_alloc_ptr(bmap, nreq);
 
 		/* create child node at level 1 */
 		lock_buffer(bh);
@@ -1661,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 		nilfs_bmap_set_dirty(bmap);
 
 		unlock_buffer(bh);
-		nilfs_bmap_put_block(bmap, bh);
+		brelse(bh);
 
 		/* create root node at level 2 */
 		node = nilfs_btree_get_root(btree);
@@ -1669,8 +1751,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 		nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
 				      2, 1, &keys[0], &tmpptr);
 	} else {
-		if (bmap->b_pops->bpop_commit_alloc_ptr != NULL)
-			bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
+		nilfs_bmap_commit_alloc_ptr(bmap, dreq);
 
 		/* create root node at level 1 */
 		node = nilfs_btree_get_root(btree);
@@ -1682,8 +1763,8 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 		nilfs_bmap_set_dirty(bmap);
 	}
 
-	if (btree->bt_ops->btop_set_target != NULL)
-		btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr);
+	if (NILFS_BMAP_USE_VBN(bmap))
+		nilfs_btree_set_target_v(btree, key, dreq->bpr_ptr);
 }
 
 /**
@@ -1694,13 +1775,10 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
  * @keys:
  * @ptrs:
  * @n:
- * @low:
- * @high:
  */
 int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
 				   __u64 key, __u64 ptr,
-				   const __u64 *keys, const __u64 *ptrs,
-				   int n, __u64 low, __u64 high)
+				   const __u64 *keys, const __u64 *ptrs, int n)
 {
 	struct buffer_head *bh;
 	union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
@@ -1725,7 +1803,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
 	if (ret < 0)
 		return ret;
 	nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n,
-					      low, high, di, ni, bh);
+					      di, ni, bh);
 	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
 	return 0;
 }
@@ -1754,9 +1832,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 		nilfs_btree_node_get_ptr(btree, parent,
 					 path[level + 1].bp_index);
 	path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
-	ret = nilfs_bmap_prepare_update(&btree->bt_bmap,
+	ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap,
 					&path[level].bp_oldreq,
 					&path[level].bp_newreq);
 	if (ret < 0)
 		return ret;
 
@@ -1768,9 +1846,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
 			&path[level].bp_ctxt);
 		if (ret < 0) {
-			nilfs_bmap_abort_update(&btree->bt_bmap,
+			nilfs_bmap_abort_update_v(&btree->bt_bmap,
 						&path[level].bp_oldreq,
 						&path[level].bp_newreq);
 			return ret;
 		}
 	}
@@ -1784,9 +1862,9 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
 {
 	struct nilfs_btree_node *parent;
 
-	nilfs_bmap_commit_update(&btree->bt_bmap,
+	nilfs_bmap_commit_update_v(&btree->bt_bmap,
 				 &path[level].bp_oldreq,
 				 &path[level].bp_newreq);
 
 	if (buffer_nilfs_node(path[level].bp_bh)) {
 		nilfs_btnode_commit_change_key(
@@ -1805,9 +1883,9 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
 				       struct nilfs_btree_path *path,
 				       int level)
 {
-	nilfs_bmap_abort_update(&btree->bt_bmap,
+	nilfs_bmap_abort_update_v(&btree->bt_bmap,
 				&path[level].bp_oldreq,
 				&path[level].bp_newreq);
 	if (buffer_nilfs_node(path[level].bp_bh))
 		nilfs_btnode_abort_change_key(
 			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
@@ -1930,7 +2008,9 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 		goto out;
 	}
 
-	ret = btree->bt_ops->btop_propagate(btree, path, level, bh);
+	ret = NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_btree_propagate_v(btree, path, level, bh) :
+		nilfs_btree_propagate_p(btree, path, level, bh);
 
  out:
 	nilfs_btree_clear_path(btree, path);
@@ -2066,12 +2146,9 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
 	ptr = nilfs_btree_node_get_ptr(btree, parent,
 				       path[level + 1].bp_index);
 	req.bpr_ptr = ptr;
-	ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap,
-							    &req);
-	if (ret < 0)
+	ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr);
+	if (unlikely(ret < 0))
 		return ret;
-	btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap,
-						     &req, blocknr);
 
 	key = nilfs_btree_node_get_key(btree, parent,
 				       path[level + 1].bp_index);
@@ -2114,8 +2191,9 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 		goto out;
 	}
 
-	ret = btree->bt_ops->btop_assign(btree, path, level, bh,
-					 blocknr, binfo);
+	ret = NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) :
+		nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
 
  out:
 	nilfs_btree_clear_path(btree, path);
@@ -2171,7 +2249,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 		WARN_ON(ret == -ENOENT);
 		goto out;
 	}
-	ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh);
+	ret = nilfs_btree_get_block(btree, ptr, &bh);
 	if (ret < 0) {
 		WARN_ON(ret == -ENOENT);
 		goto out;
@@ -2179,7 +2257,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 
 	if (!buffer_dirty(bh))
 		nilfs_btnode_mark_dirty(bh);
-	nilfs_bmap_put_block(&btree->bt_bmap, bh);
+	brelse(bh);
 	if (!nilfs_bmap_dirty(&btree->bt_bmap))
 		nilfs_bmap_set_dirty(&btree->bt_bmap);
 
@@ -2191,6 +2269,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 
 static const struct nilfs_bmap_operations nilfs_btree_ops = {
 	.bop_lookup = nilfs_btree_lookup,
+	.bop_lookup_contig = nilfs_btree_lookup_contig,
 	.bop_insert = nilfs_btree_insert,
 	.bop_delete = nilfs_btree_delete,
 	.bop_clear = NULL,
@@ -2210,6 +2289,7 @@ static const struct nilfs_bmap_operations nilfs_btree_ops = {
 
 static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
 	.bop_lookup = NULL,
+	.bop_lookup_contig = NULL,
 	.bop_insert = NULL,
 	.bop_delete = NULL,
 	.bop_clear = NULL,
@@ -2227,43 +2307,13 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
 	.bop_gather_data = NULL,
 };
 
-static const struct nilfs_btree_operations nilfs_btree_ops_v = {
-	.btop_find_target = nilfs_btree_find_target_v,
-	.btop_set_target = nilfs_btree_set_target_v,
-	.btop_propagate = nilfs_btree_propagate_v,
-	.btop_assign = nilfs_btree_assign_v,
-};
-
-static const struct nilfs_btree_operations nilfs_btree_ops_p = {
-	.btop_find_target = NULL,
-	.btop_set_target = NULL,
-	.btop_propagate = nilfs_btree_propagate_p,
-	.btop_assign = nilfs_btree_assign_p,
-};
-
-int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
+int nilfs_btree_init(struct nilfs_bmap *bmap)
 {
-	struct nilfs_btree *btree;
-
-	btree = (struct nilfs_btree *)bmap;
 	bmap->b_ops = &nilfs_btree_ops;
-	bmap->b_low = low;
-	bmap->b_high = high;
-	switch (bmap->b_inode->i_ino) {
-	case NILFS_DAT_INO:
-		btree->bt_ops = &nilfs_btree_ops_p;
-		break;
-	default:
-		btree->bt_ops = &nilfs_btree_ops_v;
-		break;
-	}
-
 	return 0;
 }
 
 void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
 {
-	bmap->b_low = NILFS_BMAP_LARGE_LOW;
-	bmap->b_high = NILFS_BMAP_LARGE_HIGH;
 	bmap->b_ops = &nilfs_btree_ops_gc;
 }
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 4766deb52fb1..0e72bbbc6b64 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -34,28 +34,6 @@ struct nilfs_btree;
 struct nilfs_btree_path;
 
 /**
- * struct nilfs_btree_operations - B-tree operation table
- */
-struct nilfs_btree_operations {
-	__u64 (*btop_find_target)(const struct nilfs_btree *,
-				  const struct nilfs_btree_path *, __u64);
-	void (*btop_set_target)(struct nilfs_btree *, __u64, __u64);
-
-	struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *);
-
-	int (*btop_propagate)(struct nilfs_btree *,
-			      struct nilfs_btree_path *,
-			      int,
-			      struct buffer_head *);
-	int (*btop_assign)(struct nilfs_btree *,
-			   struct nilfs_btree_path *,
-			   int,
-			   struct buffer_head **,
-			   sector_t,
-			   union nilfs_binfo *);
-};
-
-/**
  * struct nilfs_btree_node - B-tree node
  * @bn_flags: flags
  * @bn_level: level
@@ -80,13 +58,9 @@ struct nilfs_btree_node {
 /**
  * struct nilfs_btree - B-tree structure
  * @bt_bmap: bmap base structure
- * @bt_ops: B-tree operation table
  */
 struct nilfs_btree {
 	struct nilfs_bmap bt_bmap;
-
-	/* B-tree-specific members */
-	const struct nilfs_btree_operations *bt_ops;
 };
 
 
@@ -108,10 +82,9 @@ struct nilfs_btree {
 
 int nilfs_btree_path_cache_init(void);
 void nilfs_btree_path_cache_destroy(void);
-int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64);
+int nilfs_btree_init(struct nilfs_bmap *);
 int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
-				   const __u64 *, const __u64 *,
-				   int, __u64, __u64);
+				   const __u64 *, const __u64 *, int);
 void nilfs_btree_init_gc(struct nilfs_bmap *);
 
 #endif	/* _NILFS_BTREE_H */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 300f1cdfa862..aec942cf79e3 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -295,10 +295,6 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
295 return -EINVAL; 295 return -EINVAL;
296 } 296 }
297 297
298 /* cannot delete the latest checkpoint */
299 if (start == nilfs_mdt_cno(cpfile) - 1)
300 return -EPERM;
301
302 down_write(&NILFS_MDT(cpfile)->mi_sem); 298 down_write(&NILFS_MDT(cpfile)->mi_sem);
303 299
304 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); 300 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
@@ -311,7 +307,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 		ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
 		if (ret < 0) {
 			if (ret != -ENOENT)
-				goto out_header;
+				break;
 			/* skip hole */
 			ret = 0;
 			continue;
@@ -344,7 +340,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 					continue;
 				printk(KERN_ERR "%s: cannot delete block\n",
 				       __func__);
-				goto out_header;
+				break;
 			}
 		}
 
@@ -362,7 +358,6 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 		kunmap_atomic(kaddr, KM_USER0);
 	}
 
- out_header:
 	brelse(header_bh);
 
  out_sem:
@@ -384,9 +379,10 @@ static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
 }
 
 static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
-					  struct nilfs_cpinfo *ci, size_t nci)
+					  void *buf, unsigned cisz, size_t nci)
 {
 	struct nilfs_checkpoint *cp;
+	struct nilfs_cpinfo *ci = buf;
 	struct buffer_head *bh;
 	size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
 	__u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop;
@@ -410,17 +406,22 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
 		kaddr = kmap_atomic(bh->b_page, KM_USER0);
 		cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
 		for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
-			if (!nilfs_checkpoint_invalid(cp))
-				nilfs_cpfile_checkpoint_to_cpinfo(
-					cpfile, cp, &ci[n++]);
+			if (!nilfs_checkpoint_invalid(cp)) {
+				nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp,
+								  ci);
+				ci = (void *)ci + cisz;
+				n++;
+			}
 		}
 		kunmap_atomic(kaddr, KM_USER0);
 		brelse(bh);
 	}
 
 	ret = n;
-	if (n > 0)
-		*cnop = ci[n - 1].ci_cno + 1;
+	if (n > 0) {
+		ci = (void *)ci - cisz;
+		*cnop = ci->ci_cno + 1;
+	}
 
  out:
 	up_read(&NILFS_MDT(cpfile)->mi_sem);
@@ -428,11 +429,12 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
 }
 
 static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
-					  struct nilfs_cpinfo *ci, size_t nci)
+					  void *buf, unsigned cisz, size_t nci)
 {
 	struct buffer_head *bh;
 	struct nilfs_cpfile_header *header;
 	struct nilfs_checkpoint *cp;
+	struct nilfs_cpinfo *ci = buf;
 	__u64 curr = *cnop, next;
 	unsigned long curr_blkoff, next_blkoff;
 	void *kaddr;
@@ -472,7 +474,9 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
 			if (unlikely(nilfs_checkpoint_invalid(cp) ||
 				     !nilfs_checkpoint_snapshot(cp)))
 				break;
-			nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]);
+			nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, ci);
+			ci = (void *)ci + cisz;
+			n++;
 			next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
 			if (next == 0)
 				break; /* reach end of the snapshot list */
@@ -511,13 +515,13 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
  */
 
 ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
-				struct nilfs_cpinfo *ci, size_t nci)
+				void *buf, unsigned cisz, size_t nci)
 {
 	switch (mode) {
 	case NILFS_CHECKPOINT:
-		return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci);
+		return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, buf, cisz, nci);
 	case NILFS_SNAPSHOT:
-		return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci);
+		return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, buf, cisz, nci);
 	default:
 		return -EINVAL;
 	}
@@ -533,20 +537,14 @@ int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno)
 	struct nilfs_cpinfo ci;
 	__u64 tcno = cno;
 	ssize_t nci;
-	int ret;
 
-	nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1);
+	nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, sizeof(ci), 1);
 	if (nci < 0)
 		return nci;
 	else if (nci == 0 || ci.ci_cno != cno)
 		return -ENOENT;
-
-	/* cannot delete the latest checkpoint nor snapshots */
-	ret = nilfs_cpinfo_snapshot(&ci);
-	if (ret < 0)
-		return ret;
-	else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1)
-		return -EPERM;
+	else if (nilfs_cpinfo_snapshot(&ci))
+		return -EBUSY;
 
 	return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1);
 }
@@ -864,11 +862,11 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
 	case NILFS_CHECKPOINT:
 		/*
 		 * Check for protecting existing snapshot mounts:
-		 * bd_mount_sem is used to make this operation atomic and
+		 * ns_mount_mutex is used to make this operation atomic and
 		 * exclusive with a new mount job.  Though it doesn't cover
 		 * umount, it's enough for the purpose.
 		 */
-		down(&nilfs->ns_bdev->bd_mount_sem);
+		mutex_lock(&nilfs->ns_mount_mutex);
 		if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
 			/* Current implementation does not have to protect
 			   plain read-only mounts since they are exclusive
@@ -877,7 +875,7 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
 			ret = -EBUSY;
 		} else
 			ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
-		up(&nilfs->ns_bdev->bd_mount_sem);
+		mutex_unlock(&nilfs->ns_mount_mutex);
 		return ret;
 	case NILFS_SNAPSHOT:
 		return nilfs_cpfile_set_snapshot(cpfile, cno);
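
The bd_mount_sem to ns_mount_mutex switch above keeps the same protocol: take the mount lock, test whether the checkpoint is mounted as a snapshot, and only then clear the snapshot flag. A minimal userspace sketch of that check-then-act pattern under a mutex; is_mounted() and clear_snapshot() are illustrative stand-ins, not kernel APIs:

#include <pthread.h>
#include <errno.h>

static pthread_mutex_t mount_mutex = PTHREAD_MUTEX_INITIALIZER;

static int is_mounted(unsigned long long cno) { (void)cno; return 0; }
static int clear_snapshot(unsigned long long cno) { (void)cno; return 0; }

/* The lock makes the "is it mounted?" test and the flag change one
 * atomic step with respect to a concurrent snapshot mount.
 */
static int change_to_plain_checkpoint(unsigned long long cno)
{
        int ret;

        pthread_mutex_lock(&mount_mutex);
        ret = is_mounted(cno) ? -EBUSY : clear_snapshot(cno);
        pthread_mutex_unlock(&mount_mutex);
        return ret;
}
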
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 1a8a1008c342..788a45950197 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -39,7 +39,7 @@ int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
 int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
 int nilfs_cpfile_is_snapshot(struct inode *, __u64);
 int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
-ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int,
-				struct nilfs_cpinfo *, size_t);
+ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
+				size_t);
 
 #endif	/* _NILFS_CPFILE_H */
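
The cpfile changes above all serve one mechanism: the caller now passes an opaque buffer plus its own record size (cisz), and the kernel advances through the buffer by that stride instead of indexing a fixed struct array. A compilable sketch of the strided fill, assuming the source record is no larger than the caller's stride (names are illustrative):

#include <stddef.h>
#include <string.h>

/* Fill nrec slots of recsz bytes each with a membsz-byte record.
 * recsz >= membsz lets userland grow its structs without breaking
 * the fill loop; trailing bytes of each slot are left untouched.
 */
static size_t fill_strided(void *buf, size_t recsz, size_t nrec,
                           const void *rec, size_t membsz)
{
        char *p = buf;
        size_t n;

        for (n = 0; n < nrec; n++, p += recsz)
                memcpy(p, rec, membsz);
        return n;
}
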
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index bb8a5818e7f1..8927ca27e6f7 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -92,21 +92,6 @@ void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
 	nilfs_palloc_abort_alloc_entry(dat, req);
 }
 
-int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req)
-{
-	int ret;
-
-	ret = nilfs_palloc_prepare_free_entry(dat, req);
-	if (ret < 0)
-		return ret;
-	ret = nilfs_dat_prepare_entry(dat, req, 0);
-	if (ret < 0) {
-		nilfs_palloc_abort_free_entry(dat, req);
-		return ret;
-	}
-	return 0;
-}
-
 void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
 {
 	struct nilfs_dat_entry *entry;
@@ -149,15 +134,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 					     req->pr_entry_bh, kaddr);
 	entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
-	if (entry->de_blocknr != cpu_to_le64(0) ||
-	    entry->de_end != cpu_to_le64(NILFS_CNO_MAX)) {
-		printk(KERN_CRIT
-		       "%s: vbn = %llu, start = %llu, end = %llu, pbn = %llu\n",
-		       __func__, (unsigned long long)req->pr_entry_nr,
-		       (unsigned long long)le64_to_cpu(entry->de_start),
-		       (unsigned long long)le64_to_cpu(entry->de_end),
-		       (unsigned long long)le64_to_cpu(entry->de_blocknr));
-	}
 	entry->de_blocknr = cpu_to_le64(blocknr);
 	kunmap_atomic(kaddr, KM_USER0);
 
@@ -391,36 +367,37 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 	return ret;
 }
 
-ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo,
+ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
 			    size_t nvi)
 {
 	struct buffer_head *entry_bh;
 	struct nilfs_dat_entry *entry;
+	struct nilfs_vinfo *vinfo = buf;
 	__u64 first, last;
 	void *kaddr;
 	unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block;
 	int i, j, n, ret;
 
 	for (i = 0; i < nvi; i += n) {
-		ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr,
+		ret = nilfs_palloc_get_entry_block(dat, vinfo->vi_vblocknr,
 						   0, &entry_bh);
 		if (ret < 0)
 			return ret;
 		kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
 		/* last virtual block number in this block */
-		first = vinfo[i].vi_vblocknr;
+		first = vinfo->vi_vblocknr;
 		do_div(first, entries_per_block);
 		first *= entries_per_block;
 		last = first + entries_per_block - 1;
 		for (j = i, n = 0;
-		     j < nvi && vinfo[j].vi_vblocknr >= first &&
-			     vinfo[j].vi_vblocknr <= last;
-		     j++, n++) {
+		     j < nvi && vinfo->vi_vblocknr >= first &&
+			     vinfo->vi_vblocknr <= last;
+		     j++, n++, vinfo = (void *)vinfo + visz) {
 			entry = nilfs_palloc_block_get_entry(
-				dat, vinfo[j].vi_vblocknr, entry_bh, kaddr);
-			vinfo[j].vi_start = le64_to_cpu(entry->de_start);
-			vinfo[j].vi_end = le64_to_cpu(entry->de_end);
-			vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr);
+				dat, vinfo->vi_vblocknr, entry_bh, kaddr);
+			vinfo->vi_start = le64_to_cpu(entry->de_start);
+			vinfo->vi_end = le64_to_cpu(entry->de_end);
+			vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr);
 		}
 		kunmap_atomic(kaddr, KM_USER0);
 		brelse(entry_bh);
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d9560654a4b7..d328b81eead4 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -47,6 +47,6 @@ void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
 int nilfs_dat_mark_dirty(struct inode *, __u64);
 int nilfs_dat_freev(struct inode *, __u64 *, size_t);
 int nilfs_dat_move(struct inode *, __u64, sector_t);
-ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t);
+ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
 
 #endif	/* _NILFS_DAT_H */
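
nilfs_dat_get_vinfo() above batches requests per allocator block: it rounds the current virtual block number down to the first entry of its block, derives the last, and serves every request in [first, last] from one mapped page before moving on. The arithmetic, as a compilable sketch (entries_per_block stands in for NILFS_MDT(dat)->mi_entries_per_block):

#include <stdint.h>
#include <stdio.h>

static void entry_block_range(uint64_t vblocknr, uint64_t entries_per_block,
                              uint64_t *first, uint64_t *last)
{
        *first = vblocknr / entries_per_block * entries_per_block;
        *last = *first + entries_per_block - 1;
}

int main(void)
{
        uint64_t first, last;

        entry_block_range(130, 64, &first, &last);
        printf("entries %llu..%llu share one block\n",
               (unsigned long long)first, (unsigned long long)last);
        return 0;
}
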
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 54100acc1102..1a4fa04cf071 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -43,7 +43,6 @@
  */
 
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include "nilfs.h"
 #include "page.h"
 
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index c6379e482781..342d9765df8d 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -25,6 +25,7 @@
25#include "page.h" 25#include "page.h"
26#include "direct.h" 26#include "direct.h"
27#include "alloc.h" 27#include "alloc.h"
28#include "dat.h"
28 29
29static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct) 30static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
30{ 31{
@@ -62,6 +63,47 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
 	return 0;
 }
 
+static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
+				      __u64 key, __u64 *ptrp,
+				      unsigned maxblocks)
+{
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
+	struct inode *dat = NULL;
+	__u64 ptr, ptr2;
+	sector_t blocknr;
+	int ret, cnt;
+
+	if (key > NILFS_DIRECT_KEY_MAX ||
+	    (ptr = nilfs_direct_get_ptr(direct, key)) ==
+	    NILFS_BMAP_INVALID_PTR)
+		return -ENOENT;
+
+	if (NILFS_BMAP_USE_VBN(bmap)) {
+		dat = nilfs_bmap_get_dat(bmap);
+		ret = nilfs_dat_translate(dat, ptr, &blocknr);
+		if (ret < 0)
+			return ret;
+		ptr = blocknr;
+	}
+
+	maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1);
+	for (cnt = 1; cnt < maxblocks &&
+	     (ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) !=
+	     NILFS_BMAP_INVALID_PTR;
+	     cnt++) {
+		if (dat) {
+			ret = nilfs_dat_translate(dat, ptr2, &blocknr);
+			if (ret < 0)
+				return ret;
+			ptr2 = blocknr;
+		}
+		if (ptr2 != ptr + cnt)
+			break;
+	}
+	*ptrp = ptr;
+	return cnt;
+}
+
 static __u64
 nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
 {
@@ -90,10 +132,9 @@ static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
 {
 	int ret;
 
-	if (direct->d_ops->dop_find_target != NULL)
-		req->bpr_ptr = direct->d_ops->dop_find_target(direct, key);
-	ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap,
-							    req);
+	if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
+		req->bpr_ptr = nilfs_direct_find_target_v(direct, key);
+	ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req);
 	if (ret < 0)
 		return ret;
 
@@ -111,16 +152,14 @@ static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
 	bh = (struct buffer_head *)((unsigned long)ptr);
 	set_buffer_nilfs_volatile(bh);
 
-	if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL)
-		direct->d_bmap.b_pops->bpop_commit_alloc_ptr(
-			&direct->d_bmap, req);
+	nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req);
 	nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
 
 	if (!nilfs_bmap_dirty(&direct->d_bmap))
 		nilfs_bmap_set_dirty(&direct->d_bmap);
 
-	if (direct->d_ops->dop_set_target != NULL)
-		direct->d_ops->dop_set_target(direct, key, req->bpr_ptr);
+	if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
+		nilfs_direct_set_target_v(direct, key, req->bpr_ptr);
 }
 
 static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
@@ -152,25 +191,18 @@ static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
 {
 	int ret;
 
-	if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
-		req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
-		ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr(
-			&direct->d_bmap, req);
-		if (ret < 0)
-			return ret;
-	}
-
-	stats->bs_nblocks = 1;
-	return 0;
+	req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
+	ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req);
+	if (!ret)
+		stats->bs_nblocks = 1;
+	return ret;
 }
 
 static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
 				       union nilfs_bmap_ptr_req *req,
 				       __u64 key)
 {
-	if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL)
-		direct->d_bmap.b_pops->bpop_commit_end_ptr(
-			&direct->d_bmap, req);
+	nilfs_bmap_commit_end_ptr(&direct->d_bmap, req);
 	nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
 }
 
@@ -244,8 +276,7 @@ static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
 }
 
 int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
-				    __u64 key, __u64 *keys, __u64 *ptrs,
-				    int n, __u64 low, __u64 high)
+				    __u64 key, __u64 *keys, __u64 *ptrs, int n)
 {
 	struct nilfs_direct *direct;
 	__le64 *dptrs;
@@ -275,8 +306,7 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 		dptrs[i] = NILFS_BMAP_INVALID_PTR;
 	}
 
-	nilfs_direct_init(bmap, low, high);
-
+	nilfs_direct_init(bmap);
 	return 0;
 }
 
@@ -293,11 +323,11 @@ static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
 	if (!buffer_nilfs_volatile(bh)) {
 		oldreq.bpr_ptr = ptr;
 		newreq.bpr_ptr = ptr;
-		ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq,
-						&newreq);
+		ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq,
+						  &newreq);
 		if (ret < 0)
 			return ret;
-		nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq);
+		nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq);
 		set_buffer_nilfs_volatile(bh);
 		nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
 	} else
@@ -309,12 +339,10 @@ static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
 static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
 				  struct buffer_head *bh)
 {
-	struct nilfs_direct *direct;
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 
-	direct = (struct nilfs_direct *)bmap;
-	return (direct->d_ops->dop_propagate != NULL) ?
-		direct->d_ops->dop_propagate(direct, bh) :
-		0;
+	return NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_direct_propagate_v(direct, bh) : 0;
 }
 
 static int nilfs_direct_assign_v(struct nilfs_direct *direct,
320static int nilfs_direct_assign_v(struct nilfs_direct *direct, 348static int nilfs_direct_assign_v(struct nilfs_direct *direct,
@@ -327,12 +355,9 @@ static int nilfs_direct_assign_v(struct nilfs_direct *direct,
327 int ret; 355 int ret;
328 356
329 req.bpr_ptr = ptr; 357 req.bpr_ptr = ptr;
330 ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr( 358 ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr);
331 &direct->d_bmap, &req); 359 if (unlikely(ret < 0))
332 if (ret < 0)
333 return ret; 360 return ret;
334 direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap,
335 &req, blocknr);
336 361
337 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); 362 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
338 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 363 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
@@ -377,12 +402,14 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
 		return -EINVAL;
 	}
 
-	return direct->d_ops->dop_assign(direct, key, ptr, bh,
-					 blocknr, binfo);
+	return NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_direct_assign_v(direct, key, ptr, bh, blocknr, binfo) :
+		nilfs_direct_assign_p(direct, key, ptr, bh, blocknr, binfo);
 }
 
 static const struct nilfs_bmap_operations nilfs_direct_ops = {
 	.bop_lookup = nilfs_direct_lookup,
+	.bop_lookup_contig = nilfs_direct_lookup_contig,
 	.bop_insert = nilfs_direct_insert,
 	.bop_delete = nilfs_direct_delete,
 	.bop_clear = NULL,
@@ -401,36 +428,8 @@ static const struct nilfs_bmap_operations nilfs_direct_ops = {
 };
 
 
-static const struct nilfs_direct_operations nilfs_direct_ops_v = {
-	.dop_find_target = nilfs_direct_find_target_v,
-	.dop_set_target = nilfs_direct_set_target_v,
-	.dop_propagate = nilfs_direct_propagate_v,
-	.dop_assign = nilfs_direct_assign_v,
-};
-
-static const struct nilfs_direct_operations nilfs_direct_ops_p = {
-	.dop_find_target = NULL,
-	.dop_set_target = NULL,
-	.dop_propagate = NULL,
-	.dop_assign = nilfs_direct_assign_p,
-};
-
-int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
+int nilfs_direct_init(struct nilfs_bmap *bmap)
 {
-	struct nilfs_direct *direct;
-
-	direct = (struct nilfs_direct *)bmap;
 	bmap->b_ops = &nilfs_direct_ops;
-	bmap->b_low = low;
-	bmap->b_high = high;
-	switch (bmap->b_inode->i_ino) {
-	case NILFS_DAT_INO:
-		direct->d_ops = &nilfs_direct_ops_p;
-		break;
-	default:
-		direct->d_ops = &nilfs_direct_ops_v;
-		break;
-	}
-
 	return 0;
 }
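
The deleted nilfs_direct_ops_v/nilfs_direct_ops_p tables distinguished files addressed by virtual block number from the DAT itself, and every slot reduced to "do the v-variant or do nothing/the p-variant". A two-way predicate is enough, which is what NILFS_BMAP_USE_VBN() expresses. A compilable sketch of the shape of that simplification (all names illustrative):

#include <stdio.h>

struct bmap { int use_vbn; };

static int propagate_v(struct bmap *b) { (void)b; return 0; }

/* Before: b->ops->propagate ? b->ops->propagate(b) : 0, with two
 * nearly empty op tables selected at init time by inode number.
 * After: one branch on a per-bmap property, no tables at all.
 */
static int propagate(struct bmap *b)
{
        return b->use_vbn ? propagate_v(b) : 0;
}

int main(void)
{
        struct bmap b = { 1 };

        printf("%d\n", propagate(&b));
        return 0;
}
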
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index 45d2c5cda812..a5ffd66e25d0 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -31,18 +31,6 @@
 struct nilfs_direct;
 
 /**
- * struct nilfs_direct_operations - direct mapping operation table
- */
-struct nilfs_direct_operations {
-	__u64 (*dop_find_target)(const struct nilfs_direct *, __u64);
-	void (*dop_set_target)(struct nilfs_direct *, __u64, __u64);
-	int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *);
-	int (*dop_assign)(struct nilfs_direct *, __u64, __u64,
-			  struct buffer_head **, sector_t,
-			  union nilfs_binfo *);
-};
-
-/**
  * struct nilfs_direct_node - direct node
  * @dn_flags: flags
  * @dn_pad: padding
@@ -55,13 +43,9 @@ struct nilfs_direct_node {
 /**
  * struct nilfs_direct - direct mapping
  * @d_bmap: bmap structure
- * @d_ops: direct mapping operation table
  */
 struct nilfs_direct {
 	struct nilfs_bmap d_bmap;
-
-	/* direct-mapping-specific members */
-	const struct nilfs_direct_operations *d_ops;
 };
 
 
@@ -70,9 +54,9 @@ struct nilfs_direct {
 #define NILFS_DIRECT_KEY_MAX	(NILFS_DIRECT_NBLOCKS - 1)
 
 
-int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64);
+int nilfs_direct_init(struct nilfs_bmap *);
 int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *,
-				    __u64 *, int, __u64, __u64);
+				    __u64 *, int);
 
 
 #endif	/* _NILFS_DIRECT_H */
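
nilfs_direct_lookup_contig(), added in the direct.c hunk above, turns the flat pointer array into extents: translate the first pointer, then count forward while each following key maps to exactly the next physical block. A self-contained sketch over a plain array, with UINT64_MAX standing in for NILFS_BMAP_INVALID_PTR and the DAT translation step omitted:

#include <stdint.h>
#include <stdio.h>

static int count_contig(const uint64_t *ptrs, unsigned nkeys,
                        unsigned key, unsigned maxblocks)
{
        uint64_t base;
        unsigned cnt;

        if (key >= nkeys || ptrs[key] == UINT64_MAX)
                return -1;                      /* -ENOENT analogue */
        base = ptrs[key];
        if (maxblocks > nkeys - key)
                maxblocks = nkeys - key;
        for (cnt = 1; cnt < maxblocks; cnt++)
                if (ptrs[key + cnt] != base + cnt)
                        break;                  /* run of blocks ends */
        return (int)cnt;                        /* extent length */
}

int main(void)
{
        uint64_t ptrs[] = { 100, 101, 102, 200, UINT64_MAX };

        printf("%d\n", count_contig(ptrs, 5, 0, 4));    /* prints 3 */
        return 0;
}
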
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 19d2102b6a69..1b3c2bb20da9 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -52,8 +52,9 @@
52#include "dat.h" 52#include "dat.h"
53#include "ifile.h" 53#include "ifile.h"
54 54
55static struct address_space_operations def_gcinode_aops = {}; 55static struct address_space_operations def_gcinode_aops = {
56/* XXX need def_gcinode_iops/fops? */ 56 .sync_page = block_sync_page,
57};
57 58
58/* 59/*
59 * nilfs_gccache_submit_read_data() - add data buffer and submit read request 60 * nilfs_gccache_submit_read_data() - add data buffer and submit read request
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 49ab4a49bb4f..fe9d8f2a13f8 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -43,22 +43,23 @@
  *
  * This function does not issue actual read request of the specified data
  * block. It is done by VFS.
- * Bulk read for direct-io is not supported yet. (should be supported)
  */
 int nilfs_get_block(struct inode *inode, sector_t blkoff,
 		    struct buffer_head *bh_result, int create)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
-	unsigned long blknum = 0;
+	__u64 blknum = 0;
 	int err = 0, ret;
 	struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode));
+	unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
 
-	/* This exclusion control is a workaround; should be revised */
-	down_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
-	ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum);
-	up_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
-	if (ret == 0) {	/* found */
+	down_read(&NILFS_MDT(dat)->mi_sem);
+	ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
+	up_read(&NILFS_MDT(dat)->mi_sem);
+	if (ret >= 0) {	/* found */
 		map_bh(bh_result, inode->i_sb, blknum);
+		if (ret > 0)
+			bh_result->b_size = (ret << inode->i_blkbits);
 		goto out;
 	}
 	/* data block was not found */
@@ -240,7 +241,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 struct address_space_operations nilfs_aops = {
 	.writepage		= nilfs_writepage,
 	.readpage		= nilfs_readpage,
-	/* .sync_page		= nilfs_sync_page, */
+	.sync_page		= block_sync_page,
 	.writepages		= nilfs_writepages,
 	.set_page_dirty		= nilfs_set_page_dirty,
 	.readpages		= nilfs_readpages,
@@ -249,6 +250,7 @@ struct address_space_operations nilfs_aops = {
 	/* .releasepage		= nilfs_releasepage, */
 	.invalidatepage		= block_invalidatepage,
 	.direct_IO		= nilfs_direct_IO,
+	.is_partially_uptodate	= block_is_partially_uptodate,
 };
 
 struct inode *nilfs_new_inode(struct inode *dir, int mode)
@@ -307,10 +309,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	/* ii->i_file_acl = 0; */
 	/* ii->i_dir_acl = 0; */
 	ii->i_dir_start_lookup = 0;
-#ifdef CONFIG_NILFS_FS_POSIX_ACL
-	ii->i_acl = NULL;
-	ii->i_default_acl = NULL;
-#endif
 	ii->i_cno = 0;
 	nilfs_set_inode_flags(inode);
 	spin_lock(&sbi->s_next_gen_lock);
@@ -432,10 +430,6 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 
 	raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
 
-#ifdef CONFIG_NILFS_FS_POSIX_ACL
-	ii->i_acl = NILFS_ACL_NOT_CACHED;
-	ii->i_default_acl = NILFS_ACL_NOT_CACHED;
-#endif
 	if (nilfs_read_inode_common(inode, raw_inode))
 		goto failed_unmap;
 
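
With lookup_contig in place, nilfs_get_block() above can answer a multi-block request: the VFS passes the wanted byte count in bh_result->b_size, the lookup returns the contiguous block count, and the callee maps the first block and scales b_size down to what it actually found. A sketch of that contract over a stand-in buffer head (struct and helper names are illustrative):

#include <stddef.h>
#include <stdio.h>

struct bh_sketch {
        unsigned long long b_blocknr;
        size_t b_size;          /* in: bytes wanted, out: bytes mapped */
        int mapped;
};

static void report_extent(struct bh_sketch *bh, unsigned long long blknum,
                          int nblocks, unsigned blkbits)
{
        bh->b_blocknr = blknum;         /* map_bh() analogue */
        bh->mapped = 1;
        if (nblocks > 0)
                bh->b_size = (size_t)nblocks << blkbits;
}

int main(void)
{
        struct bh_sketch bh = { 0, 16384, 0 };  /* asked for 4 x 4k blocks */

        report_extent(&bh, 1234, 2, 12);        /* only 2 were contiguous */
        printf("mapped %zu bytes at block %llu\n", bh.b_size, bh.b_blocknr);
        return 0;
}
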
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index d6759b92006f..6ea5f872e2de 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -152,7 +152,7 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
 
 	down_read(&nilfs->ns_segctor_sem);
 	ret = nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf,
-				      nmembs);
+				      size, nmembs);
 	up_read(&nilfs->ns_segctor_sem);
 	return ret;
 }
@@ -182,7 +182,8 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
 	int ret;
 
 	down_read(&nilfs->ns_segctor_sem);
-	ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs);
+	ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, size,
+				      nmembs);
 	up_read(&nilfs->ns_segctor_sem);
 	return ret;
 }
@@ -212,7 +213,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
 	int ret;
 
 	down_read(&nilfs->ns_segctor_sem);
-	ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs);
+	ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs);
 	up_read(&nilfs->ns_segctor_sem);
 	return ret;
 }
@@ -435,24 +436,6 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
 	return nmembs;
 }
 
-static int nilfs_ioctl_free_segments(struct the_nilfs *nilfs,
-				     struct nilfs_argv *argv, void *buf)
-{
-	size_t nmembs = argv->v_nmembs;
-	struct nilfs_sb_info *sbi = nilfs->ns_writer;
-	int ret;
-
-	if (unlikely(!sbi)) {
-		/* never happens because called for a writable mount */
-		WARN_ON(1);
-		return -EROFS;
-	}
-	ret = nilfs_segctor_add_segments_to_be_freed(
-		NILFS_SC(sbi), buf, nmembs);
-
-	return (ret < 0) ? ret : nmembs;
-}
-
 int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
 				       struct nilfs_argv *argv, void **kbufs)
 {
@@ -491,14 +474,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
 		msg = "cannot mark copying blocks dirty";
 		goto failed;
 	}
-	ret = nilfs_ioctl_free_segments(nilfs, &argv[4], kbufs[4]);
-	if (ret < 0) {
-		/*
-		 * can safely abort because this operation is atomic.
-		 */
-		msg = "cannot set segments to be freed";
-		goto failed;
-	}
 	return 0;
 
  failed:
@@ -615,7 +590,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
 	if (copy_from_user(&argv, argp, sizeof(argv)))
 		return -EFAULT;
 
-	if (argv.v_size != membsz)
+	if (argv.v_size < membsz)
 		return -EINVAL;
 
 	ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), dofunc);
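
The last ioctl hunk relaxes the argv check from an exact match to a lower bound: with the record stride now caller-supplied, userland built against a larger (newer) record layout still works, and only records too small to hold the kernel's struct are refused. The check, as a hedged sketch (membsz is the kernel-side record size, v_size the caller's claimed slot size that becomes the copy stride):

#include <errno.h>
#include <stddef.h>

static int check_record_size(size_t v_size, size_t membsz)
{
        if (v_size < membsz)
                return -EINVAL; /* caller's slots can't hold our record */
        return 0;               /* equal or larger strides are fine */
}
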
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index bb78745a0e30..2dfd47714ae5 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -412,8 +412,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 		return 0; /* Do not request flush for shadow page cache */
 	if (!sb) {
 		writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
-		if (!writer)
+		if (!writer) {
+			nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
 			return -EROFS;
+		}
 		sb = writer->s_super;
 	}
 
@@ -430,6 +432,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 
 static struct address_space_operations def_mdt_aops = {
 	.writepage		= nilfs_mdt_write_page,
+	.sync_page		= block_sync_page,
 };
 
 static struct inode_operations def_mdt_iops;
@@ -449,7 +452,7 @@ struct inode *
 nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
 		     ino_t ino, gfp_t gfp_mask)
 {
-	struct inode *inode = nilfs_alloc_inode(sb);
+	struct inode *inode = nilfs_alloc_inode_common(nilfs);
 
 	if (!inode)
 		return NULL;
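
The mdt.c fix above looks like a reference-count pairing bug: nilfs_get_writer() apparently takes a reference even when it has no writer to return, so the early -EROFS exit must drop it. A hedged sketch of the acquire/test/release shape, with a toy counter standing in for the writer reference (names illustrative):

#include <errno.h>

struct owner { int refs; void *writer; };

static void *get_writer(struct owner *o) { o->refs++; return o->writer; }
static void put_writer(struct owner *o) { o->refs--; }

static int use_writer(struct owner *o)
{
        void *w = get_writer(o);

        if (!w) {
                put_writer(o);  /* the fix: balance the failed acquire */
                return -EROFS;
        }
        /* ... use w ... */
        put_writer(o);
        return 0;
}
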
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index da6fc0bba2e5..724c63766e82 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -58,10 +58,6 @@ struct nilfs_inode_info {
 	 */
 	struct rw_semaphore xattr_sem;
 #endif
-#ifdef CONFIG_NILFS_POSIX_ACL
-	struct posix_acl *i_acl;
-	struct posix_acl *i_default_acl;
-#endif
 	struct buffer_head *i_bh;	/* i_bh contains a new or dirty
 					   disk inode */
 	struct inode vfs_inode;
@@ -263,6 +259,7 @@ extern void nilfs_dirty_inode(struct inode *);
 extern struct dentry *nilfs_get_parent(struct dentry *);
 
 /* super.c */
+extern struct inode *nilfs_alloc_inode_common(struct the_nilfs *);
 extern struct inode *nilfs_alloc_inode(struct super_block *);
 extern void nilfs_destroy_inode(struct inode *);
 extern void nilfs_error(struct super_block *, const char *, const char *, ...)
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 57afa9d24061..d80cc71be749 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -28,7 +28,6 @@
28#include "segment.h" 28#include "segment.h"
29#include "sufile.h" 29#include "sufile.h"
30#include "page.h" 30#include "page.h"
31#include "seglist.h"
32#include "segbuf.h" 31#include "segbuf.h"
33 32
34/* 33/*
@@ -395,6 +394,24 @@ static void dispose_recovery_list(struct list_head *head)
 	}
 }
 
+struct nilfs_segment_entry {
+	struct list_head	list;
+	__u64			segnum;
+};
+
+static int nilfs_segment_list_add(struct list_head *head, __u64 segnum)
+{
+	struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
+
+	if (unlikely(!ent))
+		return -ENOMEM;
+
+	ent->segnum = segnum;
+	INIT_LIST_HEAD(&ent->list);
+	list_add_tail(&ent->list, head);
+	return 0;
+}
+
 void nilfs_dispose_segment_list(struct list_head *head)
 {
 	while (!list_empty(head)) {
@@ -402,7 +419,7 @@ void nilfs_dispose_segment_list(struct list_head *head)
 			= list_entry(head->next,
 				     struct nilfs_segment_entry, list);
 		list_del(&ent->list);
-		nilfs_free_segment_entry(ent);
+		kfree(ent);
 	}
 }
 
@@ -431,12 +448,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 	if (unlikely(err))
 		goto failed;
 
-	err = -ENOMEM;
 	for (i = 1; i < 4; i++) {
-		ent = nilfs_alloc_segment_entry(segnum[i]);
-		if (unlikely(!ent))
+		err = nilfs_segment_list_add(head, segnum[i]);
+		if (unlikely(err))
 			goto failed;
-		list_add_tail(&ent->list, head);
 	}
 
 	/*
@@ -450,7 +465,7 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 			goto failed;
 		}
 		list_del(&ent->list);
-		nilfs_free_segment_entry(ent);
+		kfree(ent);
 	}
 
 	/* Allocate new segments for recovery */
@@ -791,7 +806,6 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 	u64 seg_seq;
 	__u64 segnum, nextnum = 0;
 	__u64 cno;
-	struct nilfs_segment_entry *ent;
 	LIST_HEAD(segments);
 	int empty_seg = 0, scan_newer = 0;
 	int ret;
@@ -892,12 +906,9 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 		if (empty_seg++)
 			goto super_root_found; /* found a valid super root */
 
-		ent = nilfs_alloc_segment_entry(segnum);
-		if (unlikely(!ent)) {
-			ret = -ENOMEM;
+		ret = nilfs_segment_list_add(&segments, segnum);
+		if (unlikely(ret))
 			goto failed;
-		}
-		list_add_tail(&ent->list, &segments);
 
 		seg_seq++;
 		segnum = nextnum;
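
recovery.c now carries its own minimal segment-list entry and a single helper that allocates, initializes, and queues it, returning -ENOMEM on failure; disposal is a plain list_del plus kfree. The same shape in freestanding C with a toy singly-linked list (push-front instead of the kernel's tail-add, purely for brevity):

#include <stdlib.h>
#include <errno.h>
#include <stdint.h>

struct seg_entry {
        struct seg_entry *next;
        uint64_t segnum;
};

static int segment_list_add(struct seg_entry **head, uint64_t segnum)
{
        struct seg_entry *ent = malloc(sizeof(*ent));

        if (!ent)
                return -ENOMEM;         /* caller propagates the error */
        ent->segnum = segnum;
        ent->next = *head;
        *head = ent;
        return 0;
}

static void dispose_segment_list(struct seg_entry **head)
{
        while (*head) {
                struct seg_entry *ent = *head;

                *head = ent->next;
                free(ent);
        }
}
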
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index adccd4fc654e..0776ccc2504a 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -60,6 +60,7 @@ struct nilfs_sb_info {
 	struct super_block *s_super;	/* reverse pointer to super_block */
 	struct the_nilfs *s_nilfs;
 	struct list_head s_list;	/* list head for nilfs->ns_supers */
+	atomic_t s_count;		/* reference count */
 
 	/* Segment constructor */
 	struct list_head s_dirty_files;	/* dirty files list */
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 1e68821b4a9b..9e3fe17bb96b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -26,7 +26,6 @@
 #include <linux/crc32.h>
 #include "page.h"
 #include "segbuf.h"
-#include "seglist.h"
 
 
 static struct kmem_cache *nilfs_segbuf_cachep;
@@ -394,7 +393,7 @@ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 			 * Last BIO is always sent through the following
 			 * submission.
 			 */
-			rw |= (1 << BIO_RW_SYNCIO);
+			rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 			res = nilfs_submit_seg_bio(wi, rw);
 			if (unlikely(res))
 				goto failed_bio;
diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h
deleted file mode 100644
index d39df9144e99..000000000000
--- a/fs/nilfs2/seglist.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * seglist.h - expediential structure and routines to handle list of segments
- *             (would be removed in a future release)
- *
- * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
- *
- */
-#ifndef _NILFS_SEGLIST_H
-#define _NILFS_SEGLIST_H
-
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
-#include "sufile.h"
-
-struct nilfs_segment_entry {
-	__u64			segnum;
-
-#define NILFS_SLH_FREED		0x0001	/* The segment was freed provisonally.
-					   It must be cancelled if
-					   construction aborted */
-
-	unsigned		flags;
-	struct list_head	list;
-	struct buffer_head     *bh_su;
-	struct nilfs_segment_usage *raw_su;
-};
-
-
-void nilfs_dispose_segment_list(struct list_head *);
-
-static inline struct nilfs_segment_entry *
-nilfs_alloc_segment_entry(__u64 segnum)
-{
-	struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
-
-	if (likely(ent)) {
-		ent->segnum = segnum;
-		ent->flags = 0;
-		ent->bh_su = NULL;
-		ent->raw_su = NULL;
-		INIT_LIST_HEAD(&ent->list);
-	}
-	return ent;
-}
-
-static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent,
-					   struct inode *sufile)
-{
-	return nilfs_sufile_get_segment_usage(sufile, ent->segnum,
-					      &ent->raw_su, &ent->bh_su);
-}
-
-static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent,
-					     struct inode *sufile)
-{
-	if (!ent->bh_su)
-		return;
-	nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su);
-	ent->bh_su = NULL;
-	ent->raw_su = NULL;
-}
-
-static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent)
-{
-	kfree(ent);
-}
-
-#endif /* _NILFS_SEGLIST_H */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 22c7f65c2403..51ff3d0a4ee2 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -39,7 +39,6 @@
39#include "sufile.h" 39#include "sufile.h"
40#include "cpfile.h" 40#include "cpfile.h"
41#include "ifile.h" 41#include "ifile.h"
42#include "seglist.h"
43#include "segbuf.h" 42#include "segbuf.h"
44 43
45 44
@@ -79,7 +78,8 @@ enum {
 /* State flags of collection */
 #define NILFS_CF_NODE		0x0001	/* Collecting node blocks */
 #define NILFS_CF_IFILE_STARTED	0x0002	/* IFILE stage has started */
-#define NILFS_CF_HISTORY_MASK	(NILFS_CF_IFILE_STARTED)
+#define NILFS_CF_SUFREED	0x0004	/* segment usages has been freed */
+#define NILFS_CF_HISTORY_MASK	(NILFS_CF_IFILE_STARTED | NILFS_CF_SUFREED)
 
 /* Operations depending on the construction mode and file type */
 struct nilfs_sc_operations {
@@ -810,7 +810,7 @@ static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
 {
 	return list_empty(&sci->sc_dirty_files) &&
 		!test_bit(NILFS_SC_DIRTY, &sci->sc_flags) &&
-		list_empty(&sci->sc_cleaning_segments) &&
+		sci->sc_nfreesegs == 0 &&
 		(!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes));
 }
 
@@ -1005,44 +1005,6 @@ static void nilfs_drop_collected_inodes(struct list_head *head)
 	}
 }
 
-static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci,
-					       struct inode *sufile)
-
-{
-	struct list_head *head = &sci->sc_cleaning_segments;
-	struct nilfs_segment_entry *ent;
-	int err;
-
-	list_for_each_entry(ent, head, list) {
-		if (!(ent->flags & NILFS_SLH_FREED))
-			break;
-		err = nilfs_sufile_cancel_free(sufile, ent->segnum);
-		WARN_ON(err); /* do not happen */
-		ent->flags &= ~NILFS_SLH_FREED;
-	}
-}
-
-static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci,
-					       struct inode *sufile)
-{
-	struct list_head *head = &sci->sc_cleaning_segments;
-	struct nilfs_segment_entry *ent;
-	int err;
-
-	list_for_each_entry(ent, head, list) {
-		err = nilfs_sufile_free(sufile, ent->segnum);
-		if (unlikely(err))
-			return err;
-		ent->flags |= NILFS_SLH_FREED;
-	}
-	return 0;
-}
-
-static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci)
-{
-	nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
-}
-
 static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
 					struct inode *inode,
 					struct list_head *listp,
@@ -1161,6 +1123,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct list_head *head;
 	struct nilfs_inode_info *ii;
+	size_t ndone;
 	int err = 0;
 
 	switch (sci->sc_stage.scnt) {
@@ -1250,10 +1213,16 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 			break;
 		sci->sc_stage.scnt++;  /* Fall through */
 	case NILFS_ST_SUFILE:
-		err = nilfs_segctor_prepare_free_segments(sci,
-							  nilfs->ns_sufile);
-		if (unlikely(err))
+		err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs,
+					 sci->sc_nfreesegs, &ndone);
+		if (unlikely(err)) {
+			nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+						  sci->sc_freesegs, ndone,
+						  NULL);
 			break;
+		}
+		sci->sc_stage.flags |= NILFS_CF_SUFREED;
+
 		err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile,
 					      &nilfs_sc_file_ops);
 		if (unlikely(err))
@@ -1486,7 +1455,15 @@ static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
 {
 	if (unlikely(err)) {
 		nilfs_segctor_free_incomplete_segments(sci, nilfs);
-		nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
+		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
+			int ret;
+
+			ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+							sci->sc_freesegs,
+							sci->sc_nfreesegs,
+							NULL);
+			WARN_ON(ret); /* do not happen */
+		}
 	}
 	nilfs_segctor_clear_segment_buffers(sci);
 }
@@ -1585,7 +1562,13 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 		if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
 			break;
 
-		nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
+		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
+			err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+							sci->sc_freesegs,
+							sci->sc_nfreesegs,
+							NULL);
+			WARN_ON(err); /* do not happen */
+		}
 		nilfs_segctor_clear_segment_buffers(sci);
 
 		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
@@ -1846,26 +1829,13 @@ static int nilfs_segctor_write(struct nilfs_sc_info *sci,
 		err = nilfs_segbuf_write(segbuf, &wi);
 
 		res = nilfs_segbuf_wait(segbuf, &wi);
-		err = unlikely(err) ? : res;
-		if (unlikely(err))
+		err = err ? : res;
+		if (err)
 			return err;
 	}
 	return 0;
 }
 
-static int nilfs_page_has_uncleared_buffer(struct page *page)
-{
-	struct buffer_head *head, *bh;
-
-	head = bh = page_buffers(page);
-	do {
-		if (buffer_dirty(bh) && !list_empty(&bh->b_assoc_buffers))
-			return 1;
-		bh = bh->b_this_page;
-	} while (bh != head);
-	return 0;
-}
-
 static void __nilfs_end_page_io(struct page *page, int err)
 {
 	if (!err) {
@@ -1889,13 +1859,26 @@ static void nilfs_end_page_io(struct page *page, int err)
 	if (!page)
 		return;
 
-	if (buffer_nilfs_node(page_buffers(page)) &&
-	    nilfs_page_has_uncleared_buffer(page))
-		/* For b-tree node pages, this function may be called twice
-		   or more because they might be split in a segment.
-		   This check assures that cleanup has been done for all
-		   buffers in a split btnode page. */
+	if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) {
+		/*
+		 * For b-tree node pages, this function may be called twice
+		 * or more because they might be split in a segment.
+		 */
+		if (PageDirty(page)) {
+			/*
+			 * For pages holding split b-tree node buffers, dirty
+			 * flag on the buffers may be cleared discretely.
+			 * In that case, the page is once redirtied for
+			 * remaining buffers, and it must be cancelled if
+			 * all the buffers get cleaned later.
+			 */
+			lock_page(page);
+			if (nilfs_page_buffers_clean(page))
+				__nilfs_clear_page_dirty(page);
+			unlock_page(page);
+		}
 		return;
+	}
 
 	__nilfs_end_page_io(page, err);
 }
@@ -1957,7 +1940,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
 		}
 		if (bh->b_page != fs_page) {
 			nilfs_end_page_io(fs_page, err);
-			if (unlikely(fs_page == failed_page))
+			if (fs_page && fs_page == failed_page)
 				goto done;
 			fs_page = bh->b_page;
 		}
@@ -2224,10 +2207,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 	nilfs_segctor_complete_write(sci);
 
 	/* Commit segments */
-	if (has_sr) {
-		nilfs_segctor_commit_free_segments(sci);
+	if (has_sr)
 		nilfs_segctor_clear_metadata_dirty(sci);
-	}
 
 	nilfs_segctor_end_construction(sci, nilfs, 0);
 
@@ -2301,48 +2282,6 @@ void nilfs_flush_segment(struct super_block *sb, ino_t ino)
 	/* assign bit 0 to data files */
 }
 
-int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci,
-					   __u64 *segnum, size_t nsegs)
-{
-	struct nilfs_segment_entry *ent;
-	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
-	struct inode *sufile = nilfs->ns_sufile;
-	LIST_HEAD(list);
-	__u64 *pnum;
-	size_t i;
-	int err;
-
-	for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) {
-		ent = nilfs_alloc_segment_entry(*pnum);
-		if (unlikely(!ent)) {
-			err = -ENOMEM;
-			goto failed;
-		}
-		list_add_tail(&ent->list, &list);
-
-		err = nilfs_open_segment_entry(ent, sufile);
-		if (unlikely(err))
-			goto failed;
-
-		if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su)))
-			printk(KERN_WARNING "NILFS: unused segment is "
-			       "requested to be cleaned (segnum=%llu)\n",
-			       (unsigned long long)ent->segnum);
-		nilfs_close_segment_entry(ent, sufile);
-	}
-	list_splice(&list, sci->sc_cleaning_segments.prev);
-	return 0;
-
- failed:
-	nilfs_dispose_segment_list(&list);
-	return err;
-}
-
-void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci)
-{
-	nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
-}
-
 struct nilfs_segctor_wait_request {
 	wait_queue_t wq;
 	__u32 seq;
@@ -2607,10 +2546,13 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 	err = nilfs_init_gcdat_inode(nilfs);
 	if (unlikely(err))
 		goto out_unlock;
+
 	err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
 	if (unlikely(err))
 		goto out_unlock;
 
+	sci->sc_freesegs = kbufs[4];
+	sci->sc_nfreesegs = argv[4].v_nmembs;
 	list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev);
 
 	for (;;) {
@@ -2629,6 +2571,8 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 	}
 
  out_unlock:
+	sci->sc_freesegs = NULL;
+	sci->sc_nfreesegs = 0;
 	nilfs_clear_gcdat_inode(nilfs);
 	nilfs_transaction_unlock(sbi);
 	return err;
@@ -2835,7 +2779,6 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
 	INIT_LIST_HEAD(&sci->sc_dirty_files);
 	INIT_LIST_HEAD(&sci->sc_segbufs);
 	INIT_LIST_HEAD(&sci->sc_gc_inodes);
-	INIT_LIST_HEAD(&sci->sc_cleaning_segments);
 	INIT_LIST_HEAD(&sci->sc_copied_buffers);
 
 	sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2901,9 +2844,6 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 		nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1);
 	}
 
-	if (!list_empty(&sci->sc_cleaning_segments))
-		nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
-
 	WARN_ON(!list_empty(&sci->sc_segbufs));
 
 	down_write(&sbi->s_nilfs->ns_segctor_sem);
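
The segment-constructor rework above replaces the NILFS_SLH_FREED flagging dance with an array: nilfs_sufile_freev() frees sc_nfreesegs segments and reports how many it completed (ndone), and on any later failure nilfs_sufile_cancel_freev() re-frees exactly that prefix, with NILFS_CF_SUFREED recording whether there is anything to undo. The prepare/cancel prefix pattern in miniature (stubs stand in for the sufile operations):

#include <stddef.h>

static int free_one(unsigned long long seg) { (void)seg; return 0; }
static void cancel_one(unsigned long long seg) { (void)seg; }

/* Free segs[0..nsegs); on failure report how many succeeded so the
 * caller can cancel exactly that prefix and nothing more.
 */
static int freev(const unsigned long long *segs, size_t nsegs, size_t *ndone)
{
        size_t i;
        int err;

        for (i = 0; i < nsegs; i++) {
                err = free_one(segs[i]);
                if (err) {
                        *ndone = i;     /* partial progress */
                        return err;
                }
        }
        *ndone = nsegs;
        return 0;
}

static void cancel_freev(const unsigned long long *segs, size_t ndone)
{
        while (ndone--)
                cancel_one(segs[ndone]);
}
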
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 476bdd5df5be..0d2a475a741b 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -90,8 +90,9 @@ struct nilfs_segsum_pointer {
90 * @sc_nblk_inc: Block count of current generation 90 * @sc_nblk_inc: Block count of current generation
91 * @sc_dirty_files: List of files to be written 91 * @sc_dirty_files: List of files to be written
92 * @sc_gc_inodes: List of GC inodes having blocks to be written 92 * @sc_gc_inodes: List of GC inodes having blocks to be written
93 * @sc_cleaning_segments: List of segments to be freed through construction
94 * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data 93 * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data
94 * @sc_freesegs: array of segment numbers to be freed
95 * @sc_nfreesegs: number of segments in @sc_freesegs
95 * @sc_dsync_inode: inode whose data pages are written for a sync operation 96 * @sc_dsync_inode: inode whose data pages are written for a sync operation
96 * @sc_dsync_start: start byte offset of data pages 97 * @sc_dsync_start: start byte offset of data pages
97 * @sc_dsync_end: end byte offset of data pages (inclusive) 98 * @sc_dsync_end: end byte offset of data pages (inclusive)
@@ -131,9 +132,11 @@ struct nilfs_sc_info {
131 132
132 struct list_head sc_dirty_files; 133 struct list_head sc_dirty_files;
133 struct list_head sc_gc_inodes; 134 struct list_head sc_gc_inodes;
134 struct list_head sc_cleaning_segments;
135 struct list_head sc_copied_buffers; 135 struct list_head sc_copied_buffers;
136 136
137 __u64 *sc_freesegs;
138 size_t sc_nfreesegs;
139
137 struct nilfs_inode_info *sc_dsync_inode; 140 struct nilfs_inode_info *sc_dsync_inode;
138 loff_t sc_dsync_start; 141 loff_t sc_dsync_start;
139 loff_t sc_dsync_end; 142 loff_t sc_dsync_end;
@@ -225,10 +228,6 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
225extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, 228extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
226 void **); 229 void **);
227 230
228extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *,
229 __u64 *, size_t);
230extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *);
231
232extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *); 231extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
233extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *); 232extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
234 233
@@ -240,5 +239,6 @@ extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
240extern int nilfs_recover_logical_segments(struct the_nilfs *, 239extern int nilfs_recover_logical_segments(struct the_nilfs *,
241 struct nilfs_sb_info *, 240 struct nilfs_sb_info *,
242 struct nilfs_recovery_info *); 241 struct nilfs_recovery_info *);
242extern void nilfs_dispose_segment_list(struct list_head *);
243 243
244#endif /* _NILFS_SEGMENT_H */ 244#endif /* _NILFS_SEGMENT_H */
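
The cleaning-segment bookkeeping moves from a kmalloc'd list of nilfs_segment_entry objects to a caller-supplied __u64 array (@sc_freesegs) plus a count (@sc_nfreesegs), so consumers can test membership with a simple scan. A minimal sketch of such a check, assuming a hypothetical helper name (the segment-constructor code that consumes the array is outside these hunks):

	/* Hypothetical helper: is @segnum scheduled to be freed? */
	static int nilfs_segnum_is_marked_free(const struct nilfs_sc_info *sci,
					       __u64 segnum)
	{
		size_t i;

		for (i = 0; i < sci->sc_nfreesegs; i++)
			if (sci->sc_freesegs[i] == segnum)
				return 1;
		return 0;
	}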
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 98e68677f045..37994d4a59cc 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -18,6 +18,7 @@
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 * 19 *
20 * Written by Koji Sato <koji@osrg.net>. 20 * Written by Koji Sato <koji@osrg.net>.
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
21 */ 22 */
22 23
23#include <linux/kernel.h> 24#include <linux/kernel.h>
@@ -108,6 +109,102 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
108 nilfs_mdt_mark_buffer_dirty(header_bh); 109 nilfs_mdt_mark_buffer_dirty(header_bh);
109} 110}
110 111
112/**
113 * nilfs_sufile_updatev - modify multiple segment usages at a time
114 * @sufile: inode of segment usage file
115 * @segnumv: array of segment numbers
116 * @nsegs: size of @segnumv array
117 * @create: creation flag
118 * @ndone: place to store number of modified segments on @segnumv
119 * @dofunc: primitive operation for the update
120 *
121 * Description: nilfs_sufile_updatev() repeatedly calls @dofunc
122 * against the given array of segments. The @dofunc is called with
123 * buffers of a header block and the sufile block in which the target
124 * segment usage entry is contained. If @ndone is given, the number
125 * of successfully modified segments from the head is stored in the
126 * place @ndone points to.
127 *
128 * Return Value: On success, zero is returned. On error, one of the
129 * following negative error codes is returned.
130 *
131 * %-EIO - I/O error.
132 *
133 * %-ENOMEM - Insufficient amount of memory available.
134 *
135 * %-ENOENT - Given segment usage is in hole block (may be returned if
136 * @create is zero)
137 *
138 * %-EINVAL - Invalid segment usage number
139 */
140int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs,
141 int create, size_t *ndone,
142 void (*dofunc)(struct inode *, __u64,
143 struct buffer_head *,
144 struct buffer_head *))
145{
146 struct buffer_head *header_bh, *bh;
147 unsigned long blkoff, prev_blkoff;
148 __u64 *seg;
149 size_t nerr = 0, n = 0;
150 int ret = 0;
151
152 if (unlikely(nsegs == 0))
153 goto out;
154
155 down_write(&NILFS_MDT(sufile)->mi_sem);
156 for (seg = segnumv; seg < segnumv + nsegs; seg++) {
157 if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) {
158 printk(KERN_WARNING
159 "%s: invalid segment number: %llu\n", __func__,
160 (unsigned long long)*seg);
161 nerr++;
162 }
163 }
164 if (nerr > 0) {
165 ret = -EINVAL;
166 goto out_sem;
167 }
168
169 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
170 if (ret < 0)
171 goto out_sem;
172
173 seg = segnumv;
174 blkoff = nilfs_sufile_get_blkoff(sufile, *seg);
175 ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh);
176 if (ret < 0)
177 goto out_header;
178
179 for (;;) {
180 dofunc(sufile, *seg, header_bh, bh);
181
182 if (++seg >= segnumv + nsegs)
183 break;
184 prev_blkoff = blkoff;
185 blkoff = nilfs_sufile_get_blkoff(sufile, *seg);
186 if (blkoff == prev_blkoff)
187 continue;
188
189 /* get different block */
190 brelse(bh);
191 ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh);
192 if (unlikely(ret < 0))
193 goto out_header;
194 }
195 brelse(bh);
196
197 out_header:
198 n = seg - segnumv;
199 brelse(header_bh);
200 out_sem:
201 up_write(&NILFS_MDT(sufile)->mi_sem);
202 out:
203 if (ndone)
204 *ndone = n;
205 return ret;
206}
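
Note that nilfs_sufile_updatev() above only re-reads a sufile block when the next segment number maps to a different block offset, so sorted, clustered input is cheap. A worked illustration under assumed geometry (the block and entry sizes here are hypothetical, not taken from this patch):

	/*
	 * Assume 4096-byte blocks and 16-byte segment usage entries,
	 * i.e. 4096 / 16 = 256 entries per block.  Then:
	 *   segnumv = { 0, 1, ..., 255 }  ->  1 nilfs_mdt_get_block() call
	 *   segnumv = { 0, 256, 512 }     ->  3 nilfs_mdt_get_block() calls
	 */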
207
111int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, 208int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
112 void (*dofunc)(struct inode *, __u64, 209 void (*dofunc)(struct inode *, __u64,
113 struct buffer_head *, 210 struct buffer_head *,
@@ -490,7 +587,8 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
490 * nilfs_sufile_get_suinfo - get segment usage information 587 * nilfs_sufile_get_suinfo - get segment usage information
491 * @sufile: inode of segment usage file 588 * @sufile: inode of segment usage file
492 * @segnum: segment number to start looking 589 * @segnum: segment number to start looking
493 * @si: array of suinfo 590 * @buf: array of suinfo
591 * @sisz: byte size of suinfo
494 * @nsi: size of suinfo array 592 * @nsi: size of suinfo array
495 * 593 *
496 * Description: 594 * Description:
@@ -502,11 +600,12 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
502 * 600 *
503 * %-ENOMEM - Insufficient amount of memory available. 601 * %-ENOMEM - Insufficient amount of memory available.
504 */ 602 */
505ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, 603ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
506 struct nilfs_suinfo *si, size_t nsi) 604 unsigned sisz, size_t nsi)
507{ 605{
508 struct buffer_head *su_bh; 606 struct buffer_head *su_bh;
509 struct nilfs_segment_usage *su; 607 struct nilfs_segment_usage *su;
608 struct nilfs_suinfo *si = buf;
510 size_t susz = NILFS_MDT(sufile)->mi_entry_size; 609 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
511 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; 610 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
512 void *kaddr; 611 void *kaddr;
@@ -531,20 +630,22 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
531 if (ret != -ENOENT) 630 if (ret != -ENOENT)
532 goto out; 631 goto out;
533 /* hole */ 632 /* hole */
534 memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n); 633 memset(si, 0, sisz * n);
634 si = (void *)si + sisz * n;
535 continue; 635 continue;
536 } 636 }
537 637
538 kaddr = kmap_atomic(su_bh->b_page, KM_USER0); 638 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
539 su = nilfs_sufile_block_get_segment_usage( 639 su = nilfs_sufile_block_get_segment_usage(
540 sufile, segnum, su_bh, kaddr); 640 sufile, segnum, su_bh, kaddr);
541 for (j = 0; j < n; j++, su = (void *)su + susz) { 641 for (j = 0; j < n;
542 si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod); 642 j++, su = (void *)su + susz, si = (void *)si + sisz) {
543 si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks); 643 si->sui_lastmod = le64_to_cpu(su->su_lastmod);
544 si[i + j].sui_flags = le32_to_cpu(su->su_flags) & 644 si->sui_nblocks = le32_to_cpu(su->su_nblocks);
645 si->sui_flags = le32_to_cpu(su->su_flags) &
545 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); 646 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
546 if (nilfs_segment_is_active(nilfs, segnum + j)) 647 if (nilfs_segment_is_active(nilfs, segnum + j))
547 si[i + j].sui_flags |= 648 si->sui_flags |=
548 (1UL << NILFS_SEGMENT_USAGE_ACTIVE); 649 (1UL << NILFS_SEGMENT_USAGE_ACTIVE);
549 } 650 }
550 kunmap_atomic(kaddr, KM_USER0); 651 kunmap_atomic(kaddr, KM_USER0);
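
Threading the record size @sisz through nilfs_sufile_get_suinfo() decouples the kernel's stride over the output buffer from sizeof(struct nilfs_suinfo), so the record can grow without breaking older callers. A sketch of a call site under that reading (the batch size and loop framing are illustrative):

	enum { NCHUNK = 16 };			/* illustrative batch size */
	struct nilfs_suinfo info[NCHUNK];
	ssize_t n;

	n = nilfs_sufile_get_suinfo(sufile, segnum, info,
				    sizeof(struct nilfs_suinfo), NCHUNK);
	if (n > 0)
		segnum += n;	/* resume the scan after the last segment */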
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a2e2efd4ade1..a2c4d76c3366 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -43,43 +43,27 @@ void nilfs_sufile_put_segment_usage(struct inode *, __u64,
43 struct buffer_head *); 43 struct buffer_head *);
44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); 44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
45int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *); 45int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
46ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *, 46ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
47 size_t); 47 size_t);
48 48
49int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *,
50 void (*dofunc)(struct inode *, __u64,
51 struct buffer_head *,
52 struct buffer_head *));
49int nilfs_sufile_update(struct inode *, __u64, int, 53int nilfs_sufile_update(struct inode *, __u64, int,
50 void (*dofunc)(struct inode *, __u64, 54 void (*dofunc)(struct inode *, __u64,
51 struct buffer_head *, 55 struct buffer_head *,
52 struct buffer_head *)); 56 struct buffer_head *));
53void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
54 struct buffer_head *);
55void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *, 57void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *,
56 struct buffer_head *); 58 struct buffer_head *);
57void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *, 59void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *,
58 struct buffer_head *); 60 struct buffer_head *);
61void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
62 struct buffer_head *);
59void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, 63void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
60 struct buffer_head *); 64 struct buffer_head *);
61 65
62/** 66/**
63 * nilfs_sufile_cancel_free -
64 * @sufile: inode of segment usage file
65 * @segnum: segment number
66 *
67 * Description:
68 *
69 * Return Value: On success, 0 is returned. On error, one of the following
70 * negative error codes is returned.
71 *
72 * %-EIO - I/O error.
73 *
74 * %-ENOMEM - Insufficient amount of memory available.
75 */
76static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
77{
78 return nilfs_sufile_update(sufile, segnum, 0,
79 nilfs_sufile_do_cancel_free);
80}
81
82/**
83 * nilfs_sufile_scrap - make a segment garbage 67 * nilfs_sufile_scrap - make a segment garbage
84 * @sufile: inode of segment usage file 68 * @sufile: inode of segment usage file
85 * @segnum: segment number to be freed 69 * @segnum: segment number to be freed
@@ -100,6 +84,38 @@ static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
100} 84}
101 85
102/** 86/**
87 * nilfs_sufile_freev - free segments
88 * @sufile: inode of segment usage file
89 * @segnumv: array of segment numbers
90 * @nsegs: size of @segnumv array
91 * @ndone: place to store the number of freed segments
92 */
93static inline int nilfs_sufile_freev(struct inode *sufile, __u64 *segnumv,
94 size_t nsegs, size_t *ndone)
95{
96 return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone,
97 nilfs_sufile_do_free);
98}
99
100/**
101 * nilfs_sufile_cancel_freev - cancel freeing of segments
102 * @sufile: inode of segment usage file
103 * @segnumv: array of segment numbers
104 * @nsegs: size of @segnumv array
105 * @ndone: place to store the number of cancelled segments
106 *
107 * Return Value: On success, 0 is returned. On error, a negative error
108 * code is returned.
109 */
110static inline int nilfs_sufile_cancel_freev(struct inode *sufile,
111 __u64 *segnumv, size_t nsegs,
112 size_t *ndone)
113{
114 return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone,
115 nilfs_sufile_do_cancel_free);
116}
117
118/**
103 * nilfs_sufile_set_error - mark a segment as erroneous 119 * nilfs_sufile_set_error - mark a segment as erroneous
104 * @sufile: inode of segment usage file 120 * @sufile: inode of segment usage file
105 * @segnum: segment number 121 * @segnum: segment number
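
nilfs_sufile_freev() and nilfs_sufile_cancel_freev() are designed as a pair: since nilfs_sufile_updatev() reports through @ndone how many entries were processed before a failure, a caller can undo a partial free. A minimal sketch of that rollback pattern (this mirrors what a cleaner-side caller would plausibly do; it is not code from this patch):

	size_t ndone, ncancelled;
	int ret;

	ret = nilfs_sufile_freev(sufile, segnumv, nsegs, &ndone);
	if (ret < 0 && ndone > 0)
		/* re-mark the first @ndone segments as still in use */
		nilfs_sufile_cancel_freev(sufile, segnumv, ndone,
					  &ncancelled);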
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 6989b03e97ab..151964f0de4c 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -65,9 +65,8 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
65 "(NILFS)"); 65 "(NILFS)");
66MODULE_LICENSE("GPL"); 66MODULE_LICENSE("GPL");
67 67
68static void nilfs_write_super(struct super_block *sb);
68static int nilfs_remount(struct super_block *sb, int *flags, char *data); 69static int nilfs_remount(struct super_block *sb, int *flags, char *data);
69static int test_exclusive_mount(struct file_system_type *fs_type,
70 struct block_device *bdev, int flags);
71 70
72/** 71/**
73 * nilfs_error() - report failure condition on a filesystem 72 * nilfs_error() - report failure condition on a filesystem
@@ -134,7 +133,7 @@ void nilfs_warning(struct super_block *sb, const char *function,
134 133
135static struct kmem_cache *nilfs_inode_cachep; 134static struct kmem_cache *nilfs_inode_cachep;
136 135
137struct inode *nilfs_alloc_inode(struct super_block *sb) 136struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
138{ 137{
139 struct nilfs_inode_info *ii; 138 struct nilfs_inode_info *ii;
140 139
@@ -144,10 +143,15 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
144 ii->i_bh = NULL; 143 ii->i_bh = NULL;
145 ii->i_state = 0; 144 ii->i_state = 0;
146 ii->vfs_inode.i_version = 1; 145 ii->vfs_inode.i_version = 1;
147 nilfs_btnode_cache_init(&ii->i_btnode_cache); 146 nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi);
148 return &ii->vfs_inode; 147 return &ii->vfs_inode;
149} 148}
150 149
150struct inode *nilfs_alloc_inode(struct super_block *sb)
151{
152 return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs);
153}
154
151void nilfs_destroy_inode(struct inode *inode) 155void nilfs_destroy_inode(struct inode *inode)
152{ 156{
153 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); 157 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
@@ -185,16 +189,6 @@ static void nilfs_clear_inode(struct inode *inode)
185{ 189{
186 struct nilfs_inode_info *ii = NILFS_I(inode); 190 struct nilfs_inode_info *ii = NILFS_I(inode);
187 191
188#ifdef CONFIG_NILFS_POSIX_ACL
189 if (ii->i_acl && ii->i_acl != NILFS_ACL_NOT_CACHED) {
190 posix_acl_release(ii->i_acl);
191 ii->i_acl = NILFS_ACL_NOT_CACHED;
192 }
193 if (ii->i_default_acl && ii->i_default_acl != NILFS_ACL_NOT_CACHED) {
194 posix_acl_release(ii->i_default_acl);
195 ii->i_default_acl = NILFS_ACL_NOT_CACHED;
196 }
197#endif
198 /* 192 /*
199 * Free resources allocated in nilfs_read_inode(), here. 193 * Free resources allocated in nilfs_read_inode(), here.
200 */ 194 */
@@ -315,6 +309,11 @@ static void nilfs_put_super(struct super_block *sb)
315 struct nilfs_sb_info *sbi = NILFS_SB(sb); 309 struct nilfs_sb_info *sbi = NILFS_SB(sb);
316 struct the_nilfs *nilfs = sbi->s_nilfs; 310 struct the_nilfs *nilfs = sbi->s_nilfs;
317 311
312 lock_kernel();
313
314 if (sb->s_dirt)
315 nilfs_write_super(sb);
316
318 nilfs_detach_segment_constructor(sbi); 317 nilfs_detach_segment_constructor(sbi);
319 318
320 if (!(sb->s_flags & MS_RDONLY)) { 319 if (!(sb->s_flags & MS_RDONLY)) {
@@ -323,12 +322,18 @@ static void nilfs_put_super(struct super_block *sb)
323 nilfs_commit_super(sbi, 1); 322 nilfs_commit_super(sbi, 1);
324 up_write(&nilfs->ns_sem); 323 up_write(&nilfs->ns_sem);
325 } 324 }
325 down_write(&nilfs->ns_super_sem);
326 if (nilfs->ns_current == sbi)
327 nilfs->ns_current = NULL;
328 up_write(&nilfs->ns_super_sem);
326 329
327 nilfs_detach_checkpoint(sbi); 330 nilfs_detach_checkpoint(sbi);
328 put_nilfs(sbi->s_nilfs); 331 put_nilfs(sbi->s_nilfs);
329 sbi->s_super = NULL; 332 sbi->s_super = NULL;
330 sb->s_fs_info = NULL; 333 sb->s_fs_info = NULL;
331 kfree(sbi); 334 nilfs_put_sbinfo(sbi);
335
336 unlock_kernel();
332} 337}
333 338
334/** 339/**
@@ -383,6 +388,8 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
383{ 388{
384 int err = 0; 389 int err = 0;
385 390
391 nilfs_write_super(sb);
392
386 /* This function is called when super block should be written back */ 393 /* This function is called when super block should be written back */
387 if (wait) 394 if (wait)
388 err = nilfs_construct_segment(sb); 395 err = nilfs_construct_segment(sb);
@@ -396,9 +403,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
396 struct buffer_head *bh_cp; 403 struct buffer_head *bh_cp;
397 int err; 404 int err;
398 405
399 down_write(&nilfs->ns_sem); 406 down_write(&nilfs->ns_super_sem);
400 list_add(&sbi->s_list, &nilfs->ns_supers); 407 list_add(&sbi->s_list, &nilfs->ns_supers);
401 up_write(&nilfs->ns_sem); 408 up_write(&nilfs->ns_super_sem);
402 409
403 sbi->s_ifile = nilfs_mdt_new( 410 sbi->s_ifile = nilfs_mdt_new(
404 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); 411 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
@@ -409,8 +416,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
409 if (unlikely(err)) 416 if (unlikely(err))
410 goto failed; 417 goto failed;
411 418
419 down_read(&nilfs->ns_segctor_sem);
412 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, 420 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
413 &bh_cp); 421 &bh_cp);
422 up_read(&nilfs->ns_segctor_sem);
414 if (unlikely(err)) { 423 if (unlikely(err)) {
415 if (err == -ENOENT || err == -EINVAL) { 424 if (err == -ENOENT || err == -EINVAL) {
416 printk(KERN_ERR 425 printk(KERN_ERR
@@ -436,9 +445,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
436 nilfs_mdt_destroy(sbi->s_ifile); 445 nilfs_mdt_destroy(sbi->s_ifile);
437 sbi->s_ifile = NULL; 446 sbi->s_ifile = NULL;
438 447
439 down_write(&nilfs->ns_sem); 448 down_write(&nilfs->ns_super_sem);
440 list_del_init(&sbi->s_list); 449 list_del_init(&sbi->s_list);
441 up_write(&nilfs->ns_sem); 450 up_write(&nilfs->ns_super_sem);
442 451
443 return err; 452 return err;
444} 453}
@@ -450,9 +459,9 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
450 nilfs_mdt_clear(sbi->s_ifile); 459 nilfs_mdt_clear(sbi->s_ifile);
451 nilfs_mdt_destroy(sbi->s_ifile); 460 nilfs_mdt_destroy(sbi->s_ifile);
452 sbi->s_ifile = NULL; 461 sbi->s_ifile = NULL;
453 down_write(&nilfs->ns_sem); 462 down_write(&nilfs->ns_super_sem);
454 list_del_init(&sbi->s_list); 463 list_del_init(&sbi->s_list);
455 up_write(&nilfs->ns_sem); 464 up_write(&nilfs->ns_super_sem);
456} 465}
457 466
458static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi) 467static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
@@ -752,7 +761,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
752 * @silent: silent mode flag 761 * @silent: silent mode flag
753 * @nilfs: the_nilfs struct 762 * @nilfs: the_nilfs struct
754 * 763 *
755 * This function is called exclusively by bd_mount_mutex. 764 * This function is called exclusively under nilfs->ns_mount_mutex.
756 * So, the recovery process is protected from other simultaneous mounts. 765 * So, the recovery process is protected from other simultaneous mounts.
757 */ 766 */
758static int 767static int
@@ -773,6 +782,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
773 get_nilfs(nilfs); 782 get_nilfs(nilfs);
774 sbi->s_nilfs = nilfs; 783 sbi->s_nilfs = nilfs;
775 sbi->s_super = sb; 784 sbi->s_super = sb;
785 atomic_set(&sbi->s_count, 1);
776 786
777 err = init_nilfs(nilfs, sbi, (char *)data); 787 err = init_nilfs(nilfs, sbi, (char *)data);
778 if (err) 788 if (err)
@@ -870,6 +880,11 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
870 goto failed_root; 880 goto failed_root;
871 } 881 }
872 882
883 down_write(&nilfs->ns_super_sem);
884 if (!nilfs_test_opt(sbi, SNAPSHOT))
885 nilfs->ns_current = sbi;
886 up_write(&nilfs->ns_super_sem);
887
873 return 0; 888 return 0;
874 889
875 failed_root: 890 failed_root:
@@ -885,7 +900,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
885 failed_sbi: 900 failed_sbi:
886 put_nilfs(nilfs); 901 put_nilfs(nilfs);
887 sb->s_fs_info = NULL; 902 sb->s_fs_info = NULL;
888 kfree(sbi); 903 nilfs_put_sbinfo(sbi);
889 return err; 904 return err;
890} 905}
891 906
@@ -898,6 +913,9 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
898 struct nilfs_mount_options old_opts; 913 struct nilfs_mount_options old_opts;
899 int err; 914 int err;
900 915
916 lock_kernel();
917
918 down_write(&nilfs->ns_super_sem);
901 old_sb_flags = sb->s_flags; 919 old_sb_flags = sb->s_flags;
902 old_opts.mount_opt = sbi->s_mount_opt; 920 old_opts.mount_opt = sbi->s_mount_opt;
903 old_opts.snapshot_cno = sbi->s_snapshot_cno; 921 old_opts.snapshot_cno = sbi->s_snapshot_cno;
@@ -945,14 +963,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
945 * store the current valid flag. (It may have been changed 963 * store the current valid flag. (It may have been changed
946 * by fsck since we originally mounted the partition.) 964 * by fsck since we originally mounted the partition.)
947 */ 965 */
948 down(&sb->s_bdev->bd_mount_sem); 966 if (nilfs->ns_current && nilfs->ns_current != sbi) {
949 /* Check existing RW-mount */
950 if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) {
951 printk(KERN_WARNING "NILFS (device %s): couldn't " 967 printk(KERN_WARNING "NILFS (device %s): couldn't "
952 "remount because a RW-mount exists.\n", 968 "remount because an RW-mount exists.\n",
953 sb->s_id); 969 sb->s_id);
954 err = -EBUSY; 970 err = -EBUSY;
955 goto rw_remount_failed; 971 goto restore_opts;
956 } 972 }
957 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) { 973 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
958 printk(KERN_WARNING "NILFS (device %s): couldn't " 974 printk(KERN_WARNING "NILFS (device %s): couldn't "
@@ -960,7 +976,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
960 "the latest one.\n", 976 "the latest one.\n",
961 sb->s_id); 977 sb->s_id);
962 err = -EINVAL; 978 err = -EINVAL;
963 goto rw_remount_failed; 979 goto restore_opts;
964 } 980 }
965 sb->s_flags &= ~MS_RDONLY; 981 sb->s_flags &= ~MS_RDONLY;
966 nilfs_clear_opt(sbi, SNAPSHOT); 982 nilfs_clear_opt(sbi, SNAPSHOT);
@@ -968,28 +984,31 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
968 984
969 err = nilfs_attach_segment_constructor(sbi); 985 err = nilfs_attach_segment_constructor(sbi);
970 if (err) 986 if (err)
971 goto rw_remount_failed; 987 goto restore_opts;
972 988
973 down_write(&nilfs->ns_sem); 989 down_write(&nilfs->ns_sem);
974 nilfs_setup_super(sbi); 990 nilfs_setup_super(sbi);
975 up_write(&nilfs->ns_sem); 991 up_write(&nilfs->ns_sem);
976 992
977 up(&sb->s_bdev->bd_mount_sem); 993 nilfs->ns_current = sbi;
978 } 994 }
979 out: 995 out:
996 up_write(&nilfs->ns_super_sem);
997 unlock_kernel();
980 return 0; 998 return 0;
981 999
982 rw_remount_failed:
983 up(&sb->s_bdev->bd_mount_sem);
984 restore_opts: 1000 restore_opts:
985 sb->s_flags = old_sb_flags; 1001 sb->s_flags = old_sb_flags;
986 sbi->s_mount_opt = old_opts.mount_opt; 1002 sbi->s_mount_opt = old_opts.mount_opt;
987 sbi->s_snapshot_cno = old_opts.snapshot_cno; 1003 sbi->s_snapshot_cno = old_opts.snapshot_cno;
1004 up_write(&nilfs->ns_super_sem);
1005 unlock_kernel();
988 return err; 1006 return err;
989} 1007}
990 1008
991struct nilfs_super_data { 1009struct nilfs_super_data {
992 struct block_device *bdev; 1010 struct block_device *bdev;
1011 struct nilfs_sb_info *sbi;
993 __u64 cno; 1012 __u64 cno;
994 int flags; 1013 int flags;
995}; 1014};
@@ -1048,33 +1067,7 @@ static int nilfs_test_bdev_super(struct super_block *s, void *data)
1048{ 1067{
1049 struct nilfs_super_data *sd = data; 1068 struct nilfs_super_data *sd = data;
1050 1069
1051 return s->s_bdev == sd->bdev; 1070 return sd->sbi && s->s_fs_info == (void *)sd->sbi;
1052}
1053
1054static int nilfs_test_bdev_super2(struct super_block *s, void *data)
1055{
1056 struct nilfs_super_data *sd = data;
1057 int ret;
1058
1059 if (s->s_bdev != sd->bdev)
1060 return 0;
1061
1062 if (!((s->s_flags | sd->flags) & MS_RDONLY))
1063 return 1; /* Reuse an old R/W-mode super_block */
1064
1065 if (s->s_flags & sd->flags & MS_RDONLY) {
1066 if (down_read_trylock(&s->s_umount)) {
1067 ret = s->s_root &&
1068 (sd->cno == NILFS_SB(s)->s_snapshot_cno);
1069 up_read(&s->s_umount);
1070 /*
1071 * This path is locked with sb_lock by sget().
1072 * So, drop_super() causes deadlock.
1073 */
1074 return ret;
1075 }
1076 }
1077 return 0;
1078} 1071}
1079 1072
1080static int 1073static int
@@ -1082,8 +1075,8 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1082 const char *dev_name, void *data, struct vfsmount *mnt) 1075 const char *dev_name, void *data, struct vfsmount *mnt)
1083{ 1076{
1084 struct nilfs_super_data sd; 1077 struct nilfs_super_data sd;
1085 struct super_block *s, *s2; 1078 struct super_block *s;
1086 struct the_nilfs *nilfs = NULL; 1079 struct the_nilfs *nilfs;
1087 int err, need_to_close = 1; 1080 int err, need_to_close = 1;
1088 1081
1089 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); 1082 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
@@ -1095,7 +1088,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1095 * much more information than normal filesystems to identify mount 1088 * much more information than normal filesystems to identify mount
1096 * instance. For snapshot mounts, not only a mount type (ro-mount 1089 * instance. For snapshot mounts, not only a mount type (ro-mount
1097 * or rw-mount) but also a checkpoint number is required. 1090 * or rw-mount) but also a checkpoint number is required.
1098 * The results are passed in sget() using nilfs_super_data.
1099 */ 1091 */
1100 sd.cno = 0; 1092 sd.cno = 0;
1101 sd.flags = flags; 1093 sd.flags = flags;
@@ -1104,64 +1096,59 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1104 goto failed; 1096 goto failed;
1105 } 1097 }
1106 1098
1107 /* 1099 nilfs = find_or_create_nilfs(sd.bdev);
1108 * once the super is inserted into the list by sget, s_umount 1100 if (!nilfs) {
1109 * will protect the lockfs code from trying to start a snapshot 1101 err = -ENOMEM;
1110 * while we are mounting 1102 goto failed;
1111 */
1112 down(&sd.bdev->bd_mount_sem);
1113 if (!sd.cno &&
1114 (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) {
1115 err = (err < 0) ? : -EBUSY;
1116 goto failed_unlock;
1117 } 1103 }
1118 1104
1119 /* 1105 mutex_lock(&nilfs->ns_mount_mutex);
1120 * Phase-1: search any existent instance and get the_nilfs
1121 */
1122 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
1123 if (IS_ERR(s))
1124 goto error_s;
1125
1126 if (!s->s_root) {
1127 err = -ENOMEM;
1128 nilfs = alloc_nilfs(sd.bdev);
1129 if (!nilfs)
1130 goto cancel_new;
1131 } else {
1132 struct nilfs_sb_info *sbi = NILFS_SB(s);
1133 1106
1107 if (!sd.cno) {
1134 /* 1108 /*
1135 * s_umount protects super_block from unmount process; 1109 * Check if an exclusive mount exists or not.
1136 * It covers pointers of nilfs_sb_info and the_nilfs. 1110 * Snapshot mounts coexist with a current mount
1111 * (i.e. rw-mount or ro-mount), whereas rw-mount and
1112 * ro-mount are mutually exclusive.
1137 */ 1113 */
1138 nilfs = sbi->s_nilfs; 1114 down_read(&nilfs->ns_super_sem);
1139 get_nilfs(nilfs); 1115 if (nilfs->ns_current &&
1140 up_write(&s->s_umount); 1116 ((nilfs->ns_current->s_super->s_flags ^ flags)
1117 & MS_RDONLY)) {
1118 up_read(&nilfs->ns_super_sem);
1119 err = -EBUSY;
1120 goto failed_unlock;
1121 }
1122 up_read(&nilfs->ns_super_sem);
1123 }
1141 1124
1142 /* 1125 /*
1143 * Phase-2: search specified snapshot or R/W mode super_block 1126 * Find existing nilfs_sb_info struct
1144 */ 1127 */
1145 if (!sd.cno) 1128 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
1146 /* trying to get the latest checkpoint. */
1147 sd.cno = nilfs_last_cno(nilfs);
1148 1129
1149 s2 = sget(fs_type, nilfs_test_bdev_super2, 1130 if (!sd.cno)
1150 nilfs_set_bdev_super, &sd); 1131 /* trying to get the latest checkpoint. */
1151 deactivate_super(s); 1132 sd.cno = nilfs_last_cno(nilfs);
1152 /* 1133
1153 * Although deactivate_super() invokes close_bdev_exclusive() at 1134 /*
1154 * kill_block_super(). Here, s is an existent mount; we need 1135 * Get super block instance holding the nilfs_sb_info struct.
1155 * one more close_bdev_exclusive() call. 1136 * A new instance is allocated if no existing mount is present or
1156 */ 1137 * existing instance has been unmounted.
1157 s = s2; 1138 */
1158 if (IS_ERR(s)) 1139 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
1159 goto error_s; 1140 if (sd.sbi)
1141 nilfs_put_sbinfo(sd.sbi);
1142
1143 if (IS_ERR(s)) {
1144 err = PTR_ERR(s);
1145 goto failed_unlock;
1160 } 1146 }
1161 1147
1162 if (!s->s_root) { 1148 if (!s->s_root) {
1163 char b[BDEVNAME_SIZE]; 1149 char b[BDEVNAME_SIZE];
1164 1150
1151 /* New superblock instance created */
1165 s->s_flags = flags; 1152 s->s_flags = flags;
1166 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); 1153 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1167 sb_set_blocksize(s, block_size(sd.bdev)); 1154 sb_set_blocksize(s, block_size(sd.bdev));
@@ -1172,26 +1159,18 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1172 1159
1173 s->s_flags |= MS_ACTIVE; 1160 s->s_flags |= MS_ACTIVE;
1174 need_to_close = 0; 1161 need_to_close = 0;
1175 } else if (!(s->s_flags & MS_RDONLY)) {
1176 err = -EBUSY;
1177 } 1162 }
1178 1163
1179 up(&sd.bdev->bd_mount_sem); 1164 mutex_unlock(&nilfs->ns_mount_mutex);
1180 put_nilfs(nilfs); 1165 put_nilfs(nilfs);
1181 if (need_to_close) 1166 if (need_to_close)
1182 close_bdev_exclusive(sd.bdev, flags); 1167 close_bdev_exclusive(sd.bdev, flags);
1183 simple_set_mnt(mnt, s); 1168 simple_set_mnt(mnt, s);
1184 return 0; 1169 return 0;
1185 1170
1186 error_s:
1187 up(&sd.bdev->bd_mount_sem);
1188 if (nilfs)
1189 put_nilfs(nilfs);
1190 close_bdev_exclusive(sd.bdev, flags);
1191 return PTR_ERR(s);
1192
1193 failed_unlock: 1171 failed_unlock:
1194 up(&sd.bdev->bd_mount_sem); 1172 mutex_unlock(&nilfs->ns_mount_mutex);
1173 put_nilfs(nilfs);
1195 failed: 1174 failed:
1196 close_bdev_exclusive(sd.bdev, flags); 1175 close_bdev_exclusive(sd.bdev, flags);
1197 1176
@@ -1199,70 +1178,18 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1199 1178
1200 cancel_new: 1179 cancel_new:
1201 /* Abandoning the newly allocated superblock */ 1180 /* Abandoning the newly allocated superblock */
1202 up(&sd.bdev->bd_mount_sem); 1181 mutex_unlock(&nilfs->ns_mount_mutex);
1203 if (nilfs) 1182 put_nilfs(nilfs);
1204 put_nilfs(nilfs);
1205 up_write(&s->s_umount); 1183 up_write(&s->s_umount);
1206 deactivate_super(s); 1184 deactivate_super(s);
1207 /* 1185 /*
1208 * deactivate_super() invokes close_bdev_exclusive(). 1186 * deactivate_super() invokes close_bdev_exclusive().
1209 * We must finish all post-cleaning before this call; 1187 * We must finish all post-cleaning before this call;
1210 * put_nilfs() and unlocking bd_mount_sem need the block device. 1188 * put_nilfs() needs the block device.
1211 */ 1189 */
1212 return err; 1190 return err;
1213} 1191}
1214 1192
1215static int nilfs_test_bdev_super3(struct super_block *s, void *data)
1216{
1217 struct nilfs_super_data *sd = data;
1218 int ret;
1219
1220 if (s->s_bdev != sd->bdev)
1221 return 0;
1222 if (down_read_trylock(&s->s_umount)) {
1223 ret = (s->s_flags & MS_RDONLY) && s->s_root &&
1224 nilfs_test_opt(NILFS_SB(s), SNAPSHOT);
1225 up_read(&s->s_umount);
1226 if (ret)
1227 return 0; /* ignore snapshot mounts */
1228 }
1229 return !((sd->flags ^ s->s_flags) & MS_RDONLY);
1230}
1231
1232static int __false_bdev_super(struct super_block *s, void *data)
1233{
1234#if 0 /* XXX: workaround for lock debug. This is not good idea */
1235 up_write(&s->s_umount);
1236#endif
1237 return -EFAULT;
1238}
1239
1240/**
1241 * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not.
1242 * fs_type: filesystem type
1243 * bdev: block device
1244 * flag: 0 (check rw-mount) or MS_RDONLY (check ro-mount)
1245 * res: pointer to an integer to store result
1246 *
1247 * This function must be called within a section protected by bd_mount_mutex.
1248 */
1249static int test_exclusive_mount(struct file_system_type *fs_type,
1250 struct block_device *bdev, int flags)
1251{
1252 struct super_block *s;
1253 struct nilfs_super_data sd = { .flags = flags, .bdev = bdev };
1254
1255 s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd);
1256 if (IS_ERR(s)) {
1257 if (PTR_ERR(s) != -EFAULT)
1258 return PTR_ERR(s);
1259 return 0; /* Not found */
1260 }
1261 up_write(&s->s_umount);
1262 deactivate_super(s);
1263 return 1; /* Found */
1264}
1265
1266struct file_system_type nilfs_fs_type = { 1193struct file_system_type nilfs_fs_type = {
1267 .owner = THIS_MODULE, 1194 .owner = THIS_MODULE,
1268 .name = "nilfs2", 1195 .name = "nilfs2",
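
The net effect of the super.c rewrite is that sget() now matches super blocks by nilfs_sb_info identity rather than by block device, which is what makes the bespoke test callbacks deleted in these hunks unnecessary. Condensed from the new mount path above (error handling omitted; the comment states the assumed ownership rule):

	sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
	if (sd.sbi)
		/* a matched super block holds its own reference */
		nilfs_put_sbinfo(sd.sbi);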
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 7f65b3be4aa9..8b8889825716 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -32,9 +32,12 @@
32#include "cpfile.h" 32#include "cpfile.h"
33#include "sufile.h" 33#include "sufile.h"
34#include "dat.h" 34#include "dat.h"
35#include "seglist.h"
36#include "segbuf.h" 35#include "segbuf.h"
37 36
37
38static LIST_HEAD(nilfs_objects);
39static DEFINE_SPINLOCK(nilfs_lock);
40
38void nilfs_set_last_segment(struct the_nilfs *nilfs, 41void nilfs_set_last_segment(struct the_nilfs *nilfs,
39 sector_t start_blocknr, u64 seq, __u64 cno) 42 sector_t start_blocknr, u64 seq, __u64 cno)
40{ 43{
@@ -55,7 +58,7 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
55 * Return Value: On success, pointer to the_nilfs is returned. 58 * Return Value: On success, pointer to the_nilfs is returned.
56 * On error, NULL is returned. 59 * On error, NULL is returned.
57 */ 60 */
58struct the_nilfs *alloc_nilfs(struct block_device *bdev) 61static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
59{ 62{
60 struct the_nilfs *nilfs; 63 struct the_nilfs *nilfs;
61 64
@@ -68,7 +71,10 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
68 atomic_set(&nilfs->ns_writer_refcount, -1); 71 atomic_set(&nilfs->ns_writer_refcount, -1);
69 atomic_set(&nilfs->ns_ndirtyblks, 0); 72 atomic_set(&nilfs->ns_ndirtyblks, 0);
70 init_rwsem(&nilfs->ns_sem); 73 init_rwsem(&nilfs->ns_sem);
74 init_rwsem(&nilfs->ns_super_sem);
75 mutex_init(&nilfs->ns_mount_mutex);
71 mutex_init(&nilfs->ns_writer_mutex); 76 mutex_init(&nilfs->ns_writer_mutex);
77 INIT_LIST_HEAD(&nilfs->ns_list);
72 INIT_LIST_HEAD(&nilfs->ns_supers); 78 INIT_LIST_HEAD(&nilfs->ns_supers);
73 spin_lock_init(&nilfs->ns_last_segment_lock); 79 spin_lock_init(&nilfs->ns_last_segment_lock);
74 nilfs->ns_gc_inodes_h = NULL; 80 nilfs->ns_gc_inodes_h = NULL;
@@ -78,6 +84,45 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
78} 84}
79 85
80/** 86/**
87 * find_or_create_nilfs - find or create nilfs object
88 * @bdev: block device to which the_nilfs is related
89 *
90 * find_or_create_nilfs() looks up an existing nilfs object created on the
91 * device and takes a reference to the object. If no nilfs object
92 * is found on the device, a new nilfs object is allocated.
93 *
94 * Return Value: On success, pointer to the nilfs object is returned.
95 * On error, NULL is returned.
96 */
97struct the_nilfs *find_or_create_nilfs(struct block_device *bdev)
98{
99 struct the_nilfs *nilfs, *new = NULL;
100
101 retry:
102 spin_lock(&nilfs_lock);
103 list_for_each_entry(nilfs, &nilfs_objects, ns_list) {
104 if (nilfs->ns_bdev == bdev) {
105 get_nilfs(nilfs);
106 spin_unlock(&nilfs_lock);
107 if (new)
108 put_nilfs(new);
109 return nilfs; /* existing object */
110 }
111 }
112 if (new) {
113 list_add_tail(&new->ns_list, &nilfs_objects);
114 spin_unlock(&nilfs_lock);
115 return new; /* new object */
116 }
117 spin_unlock(&nilfs_lock);
118
119 new = alloc_nilfs(bdev);
120 if (new)
121 goto retry;
122 return NULL; /* insufficient memory */
123}
124
125/**
81 * put_nilfs - release a reference to the_nilfs 126 * put_nilfs - release a reference to the_nilfs
82 * @nilfs: the_nilfs structure to be released 127 * @nilfs: the_nilfs structure to be released
83 * 128 *
@@ -86,13 +131,20 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
86 */ 131 */
87void put_nilfs(struct the_nilfs *nilfs) 132void put_nilfs(struct the_nilfs *nilfs)
88{ 133{
89 if (!atomic_dec_and_test(&nilfs->ns_count)) 134 spin_lock(&nilfs_lock);
135 if (!atomic_dec_and_test(&nilfs->ns_count)) {
136 spin_unlock(&nilfs_lock);
90 return; 137 return;
138 }
139 list_del_init(&nilfs->ns_list);
140 spin_unlock(&nilfs_lock);
141
91 /* 142 /*
92 * Increment of ns_count never occur below because the caller 143 * Increment of ns_count never occurs below because the caller
93 * of get_nilfs() holds at least one reference to the_nilfs. 144 * of get_nilfs() holds at least one reference to the_nilfs.
94 * Thus its exclusion control is not required here. 145 * Thus its exclusion control is not required here.
95 */ 146 */
147
96 might_sleep(); 148 might_sleep();
97 if (nilfs_loaded(nilfs)) { 149 if (nilfs_loaded(nilfs)) {
98 nilfs_mdt_clear(nilfs->ns_sufile); 150 nilfs_mdt_clear(nilfs->ns_sufile);
@@ -515,7 +567,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
515 567
516 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); 568 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
517 if (sb->s_blocksize != blocksize) { 569 if (sb->s_blocksize != blocksize) {
518 int hw_blocksize = bdev_hardsect_size(sb->s_bdev); 570 int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
519 571
520 if (blocksize < hw_blocksize) { 572 if (blocksize < hw_blocksize) {
521 printk(KERN_ERR 573 printk(KERN_ERR
@@ -613,13 +665,63 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs)
613 return ret; 665 return ret;
614} 666}
615 667
668/**
669 * nilfs_find_sbinfo - find existing nilfs_sb_info structure
670 * @nilfs: nilfs object
671 * @rw_mount: mount type (non-zero value for read/write mount)
672 * @cno: checkpoint number (zero for read-only mount)
673 *
674 * nilfs_find_sbinfo() returns the nilfs_sb_info structure matching
675 * @rw_mount and, in the case of snapshots, @cno. If no instance is
676 * found, NULL is returned. Although the super block instance can
677 * be unmounted after this function returns, the nilfs_sb_info struct
678 * is kept in memory until nilfs_put_sbinfo() is called.
679 */
680struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
681 int rw_mount, __u64 cno)
682{
683 struct nilfs_sb_info *sbi;
684
685 down_read(&nilfs->ns_super_sem);
686 /*
687 * The SNAPSHOT flag and sb->s_flags are supposed to be
688 * protected with nilfs->ns_super_sem.
689 */
690 sbi = nilfs->ns_current;
691 if (rw_mount) {
692 if (sbi && !(sbi->s_super->s_flags & MS_RDONLY))
693 goto found; /* read/write mount */
694 else
695 goto out;
696 } else if (cno == 0) {
697 if (sbi && (sbi->s_super->s_flags & MS_RDONLY))
698 goto found; /* read-only mount */
699 else
700 goto out;
701 }
702
703 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
704 if (nilfs_test_opt(sbi, SNAPSHOT) &&
705 sbi->s_snapshot_cno == cno)
706 goto found; /* snapshot mount */
707 }
708 out:
709 up_read(&nilfs->ns_super_sem);
710 return NULL;
711
712 found:
713 atomic_inc(&sbi->s_count);
714 up_read(&nilfs->ns_super_sem);
715 return sbi;
716}
717
616int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, 718int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
617 int snapshot_mount) 719 int snapshot_mount)
618{ 720{
619 struct nilfs_sb_info *sbi; 721 struct nilfs_sb_info *sbi;
620 int ret = 0; 722 int ret = 0;
621 723
622 down_read(&nilfs->ns_sem); 724 down_read(&nilfs->ns_super_sem);
623 if (cno == 0 || cno > nilfs->ns_cno) 725 if (cno == 0 || cno > nilfs->ns_cno)
624 goto out_unlock; 726 goto out_unlock;
625 727
@@ -636,6 +738,6 @@ int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
636 ret++; 738 ret++;
637 739
638 out_unlock: 740 out_unlock:
639 up_read(&nilfs->ns_sem); 741 up_read(&nilfs->ns_super_sem);
640 return ret; 742 return ret;
641} 743}
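
find_or_create_nilfs() above follows the standard optimistic-allocation idiom: look up under the spinlock, and on a miss drop the lock, allocate (which may sleep), then retry so that a racing creator wins and the loser frees its copy. The same shape reduced to a skeleton (all names here are generic placeholders, not kernel API):

	struct obj *lookup_or_create(struct key *key)
	{
		struct obj *found, *new = NULL;
	retry:
		spin_lock(&table_lock);
		found = __lookup_locked(key);	/* placeholder lookup */
		if (!found && new) {
			__insert_locked(new);	/* publish under the lock */
			found = new;
			new = NULL;
		}
		spin_unlock(&table_lock);
		if (found) {
			if (new)
				free_obj(new);	/* lost the race */
			return found;
		}
		new = alloc_obj(key);		/* may sleep, hence unlocked */
		if (new)
			goto retry;
		return NULL;			/* out of memory */
	}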
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 30fe58778d05..1b9caafb8662 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -43,12 +43,16 @@ enum {
43 * struct the_nilfs - struct to supervise multiple nilfs mount points 43 * struct the_nilfs - struct to supervise multiple nilfs mount points
44 * @ns_flags: flags 44 * @ns_flags: flags
45 * @ns_count: reference count 45 * @ns_count: reference count
46 * @ns_list: list head for nilfs_list
46 * @ns_bdev: block device 47 * @ns_bdev: block device
47 * @ns_bdi: backing dev info 48 * @ns_bdi: backing dev info
48 * @ns_writer: back pointer to writable nilfs_sb_info 49 * @ns_writer: back pointer to writable nilfs_sb_info
49 * @ns_sem: semaphore for shared states 50 * @ns_sem: semaphore for shared states
51 * @ns_super_sem: semaphore for global operations across super block instances
52 * @ns_mount_mutex: mutex protecting mount process of nilfs
50 * @ns_writer_mutex: mutex protecting ns_writer attach/detach 53 * @ns_writer_mutex: mutex protecting ns_writer attach/detach
51 * @ns_writer_refcount: number of referrers on ns_writer 54 * @ns_writer_refcount: number of referrers on ns_writer
55 * @ns_current: back pointer to current mount
52 * @ns_sbh: buffer heads of on-disk super blocks 56 * @ns_sbh: buffer heads of on-disk super blocks
53 * @ns_sbp: pointers to super block data 57 * @ns_sbp: pointers to super block data
54 * @ns_sbwtime: previous write time of super blocks 58 * @ns_sbwtime: previous write time of super blocks
@@ -88,15 +92,24 @@ enum {
88struct the_nilfs { 92struct the_nilfs {
89 unsigned long ns_flags; 93 unsigned long ns_flags;
90 atomic_t ns_count; 94 atomic_t ns_count;
95 struct list_head ns_list;
91 96
92 struct block_device *ns_bdev; 97 struct block_device *ns_bdev;
93 struct backing_dev_info *ns_bdi; 98 struct backing_dev_info *ns_bdi;
94 struct nilfs_sb_info *ns_writer; 99 struct nilfs_sb_info *ns_writer;
95 struct rw_semaphore ns_sem; 100 struct rw_semaphore ns_sem;
101 struct rw_semaphore ns_super_sem;
102 struct mutex ns_mount_mutex;
96 struct mutex ns_writer_mutex; 103 struct mutex ns_writer_mutex;
97 atomic_t ns_writer_refcount; 104 atomic_t ns_writer_refcount;
98 105
99 /* 106 /*
107 * components protected by ns_super_sem
108 */
109 struct nilfs_sb_info *ns_current;
110 struct list_head ns_supers;
111
112 /*
100 * used for 113 * used for
101 * - loading the latest checkpoint exclusively. 114 * - loading the latest checkpoint exclusively.
102 * - allocating a new full segment. 115 * - allocating a new full segment.
@@ -108,7 +121,6 @@ struct the_nilfs {
108 time_t ns_sbwtime[2]; 121 time_t ns_sbwtime[2];
109 unsigned ns_sbsize; 122 unsigned ns_sbsize;
110 unsigned ns_mount_state; 123 unsigned ns_mount_state;
111 struct list_head ns_supers;
112 124
113 /* 125 /*
114 * Following fields are dedicated to a writable FS-instance. 126 * Following fields are dedicated to a writable FS-instance.
@@ -191,11 +203,12 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
191#define NILFS_ALTSB_FREQ 60 /* spare superblock */ 203#define NILFS_ALTSB_FREQ 60 /* spare superblock */
192 204
193void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 205void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
194struct the_nilfs *alloc_nilfs(struct block_device *); 206struct the_nilfs *find_or_create_nilfs(struct block_device *);
195void put_nilfs(struct the_nilfs *); 207void put_nilfs(struct the_nilfs *);
196int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 208int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
197int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 209int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
198int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 210int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
211struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
199int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); 212int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
200int nilfs_near_disk_full(struct the_nilfs *); 213int nilfs_near_disk_full(struct the_nilfs *);
201void nilfs_fall_back_super_block(struct the_nilfs *); 214void nilfs_fall_back_super_block(struct the_nilfs *);
@@ -238,6 +251,12 @@ nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
238 mutex_unlock(&nilfs->ns_writer_mutex); 251 mutex_unlock(&nilfs->ns_writer_mutex);
239} 252}
240 253
254static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
255{
256 if (atomic_dec_and_test(&sbi->s_count))
257 kfree(sbi);
258}
259
241static inline void 260static inline void
242nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, 261nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
243 sector_t *seg_start, sector_t *seg_end) 262 sector_t *seg_start, sector_t *seg_end)
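
nilfs_put_sbinfo() pairs with the atomic_set(&sbi->s_count, 1) in nilfs_fill_super() and the atomic_inc() in nilfs_find_sbinfo(), allowing a nilfs_sb_info to outlive its super_block. The intended discipline, sketched (variable setup is illustrative):

	struct nilfs_sb_info *sbi;

	sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), cno);
	if (sbi) {
		/* safe to use sbi here even if the mount goes away */
		nilfs_put_sbinfo(sbi);	/* kfree()d once s_count drops to 0 */
	}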
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 9b0efdad8910..477d37d83b31 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -15,6 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/kmod.h> 16#include <linux/kmod.h>
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <asm/byteorder.h>
18 19
19static struct nls_table default_table; 20static struct nls_table default_table;
20static struct nls_table *tables = &default_table; 21static struct nls_table *tables = &default_table;
@@ -43,10 +44,17 @@ static const struct utf8_table utf8_table[] =
43 {0, /* end of table */} 44 {0, /* end of table */}
44}; 45};
45 46
46int 47#define UNICODE_MAX 0x0010ffff
47utf8_mbtowc(wchar_t *p, const __u8 *s, int n) 48#define PLANE_SIZE 0x00010000
49
50#define SURROGATE_MASK 0xfffff800
51#define SURROGATE_PAIR 0x0000d800
52#define SURROGATE_LOW 0x00000400
53#define SURROGATE_BITS 0x000003ff
54
55int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
48{ 56{
49 long l; 57 unsigned long l;
50 int c0, c, nc; 58 int c0, c, nc;
51 const struct utf8_table *t; 59 const struct utf8_table *t;
52 60
@@ -57,12 +65,13 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
57 nc++; 65 nc++;
58 if ((c0 & t->cmask) == t->cval) { 66 if ((c0 & t->cmask) == t->cval) {
59 l &= t->lmask; 67 l &= t->lmask;
60 if (l < t->lval) 68 if (l < t->lval || l > UNICODE_MAX ||
69 (l & SURROGATE_MASK) == SURROGATE_PAIR)
61 return -1; 70 return -1;
62 *p = l; 71 *pu = (unicode_t) l;
63 return nc; 72 return nc;
64 } 73 }
65 if (n <= nc) 74 if (len <= nc)
66 return -1; 75 return -1;
67 s++; 76 s++;
68 c = (*s ^ 0x80) & 0xFF; 77 c = (*s ^ 0x80) & 0xFF;
@@ -72,90 +81,133 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
72 } 81 }
73 return -1; 82 return -1;
74} 83}
84EXPORT_SYMBOL(utf8_to_utf32);
75 85
76int 86int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
77utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n)
78{ 87{
79 __u16 *op; 88 unsigned long l;
80 const __u8 *ip;
81 int size;
82
83 op = pwcs;
84 ip = s;
85 while (*ip && n > 0) {
86 if (*ip & 0x80) {
87 size = utf8_mbtowc(op, ip, n);
88 if (size == -1) {
89 /* Ignore character and move on */
90 ip++;
91 n--;
92 } else {
93 op++;
94 ip += size;
95 n -= size;
96 }
97 } else {
98 *op++ = *ip++;
99 n--;
100 }
101 }
102 return (op - pwcs);
103}
104
105int
106utf8_wctomb(__u8 *s, wchar_t wc, int maxlen)
107{
108 long l;
109 int c, nc; 89 int c, nc;
110 const struct utf8_table *t; 90 const struct utf8_table *t;
111 91
112 if (!s) 92 if (!s)
113 return 0; 93 return 0;
114 94
115 l = wc; 95 l = u;
96 if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR)
97 return -1;
98
116 nc = 0; 99 nc = 0;
117 for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) { 100 for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) {
118 nc++; 101 nc++;
119 if (l <= t->lmask) { 102 if (l <= t->lmask) {
120 c = t->shift; 103 c = t->shift;
121 *s = t->cval | (l >> c); 104 *s = (u8) (t->cval | (l >> c));
122 while (c > 0) { 105 while (c > 0) {
123 c -= 6; 106 c -= 6;
124 s++; 107 s++;
125 *s = 0x80 | ((l >> c) & 0x3F); 108 *s = (u8) (0x80 | ((l >> c) & 0x3F));
126 } 109 }
127 return nc; 110 return nc;
128 } 111 }
129 } 112 }
130 return -1; 113 return -1;
131} 114}
115EXPORT_SYMBOL(utf32_to_utf8);
132 116
133int 117int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
134utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen)
135{ 118{
136 const __u16 *ip; 119 u16 *op;
137 __u8 *op;
138 int size; 120 int size;
121 unicode_t u;
122
123 op = pwcs;
124 while (*s && len > 0) {
125 if (*s & 0x80) {
126 size = utf8_to_utf32(s, len, &u);
127 if (size < 0) {
128 /* Ignore character and move on */
129 size = 1;
130 } else if (u >= PLANE_SIZE) {
131 u -= PLANE_SIZE;
132 *op++ = (wchar_t) (SURROGATE_PAIR |
133 ((u >> 10) & SURROGATE_BITS));
134 *op++ = (wchar_t) (SURROGATE_PAIR |
135 SURROGATE_LOW |
136 (u & SURROGATE_BITS));
137 } else {
138 *op++ = (wchar_t) u;
139 }
140 s += size;
141 len -= size;
142 } else {
143 *op++ = *s++;
144 len--;
145 }
146 }
147 return op - pwcs;
148}
149EXPORT_SYMBOL(utf8s_to_utf16s);
150
151static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian)
152{
153 switch (endian) {
154 default:
155 return c;
156 case UTF16_LITTLE_ENDIAN:
157 return __le16_to_cpu(c);
158 case UTF16_BIG_ENDIAN:
159 return __be16_to_cpu(c);
160 }
161}
162
163int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian,
164 u8 *s, int maxlen)
165{
166 u8 *op;
167 int size;
168 unsigned long u, v;
139 169
140 op = s; 170 op = s;
141 ip = pwcs; 171 while (len > 0 && maxlen > 0) {
142 while (*ip && maxlen > 0) { 172 u = get_utf16(*pwcs, endian);
143 if (*ip > 0x7f) { 173 if (!u)
144 size = utf8_wctomb(op, *ip, maxlen); 174 break;
175 pwcs++;
176 len--;
177 if (u > 0x7f) {
178 if ((u & SURROGATE_MASK) == SURROGATE_PAIR) {
179 if (u & SURROGATE_LOW) {
180 /* Ignore character and move on */
181 continue;
182 }
183 if (len <= 0)
184 break;
185 v = get_utf16(*pwcs, endian);
186 if ((v & SURROGATE_MASK) != SURROGATE_PAIR ||
187 !(v & SURROGATE_LOW)) {
188 /* Ignore character and move on */
189 continue;
190 }
191 u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10)
192 + (v & SURROGATE_BITS);
193 pwcs++;
194 len--;
195 }
196 size = utf32_to_utf8(u, op, maxlen);
145 if (size == -1) { 197 if (size == -1) {
146 /* Ignore character and move on */ 198 /* Ignore character and move on */
147 maxlen--;
148 } else { 199 } else {
149 op += size; 200 op += size;
150 maxlen -= size; 201 maxlen -= size;
151 } 202 }
152 } else { 203 } else {
153 *op++ = (__u8) *ip; 204 *op++ = (u8) u;
205 maxlen--;
154 } 206 }
155 ip++;
156 } 207 }
157 return (op - s); 208 return op - s;
158} 209}
210EXPORT_SYMBOL(utf16s_to_utf8s);
159 211
160int register_nls(struct nls_table * nls) 212int register_nls(struct nls_table * nls)
161{ 213{
@@ -467,9 +519,5 @@ EXPORT_SYMBOL(unregister_nls);
467EXPORT_SYMBOL(unload_nls); 519EXPORT_SYMBOL(unload_nls);
468EXPORT_SYMBOL(load_nls); 520EXPORT_SYMBOL(load_nls);
469EXPORT_SYMBOL(load_nls_default); 521EXPORT_SYMBOL(load_nls_default);
470EXPORT_SYMBOL(utf8_mbtowc);
471EXPORT_SYMBOL(utf8_mbstowcs);
472EXPORT_SYMBOL(utf8_wctomb);
473EXPORT_SYMBOL(utf8_wcstombs);
474 522
475MODULE_LICENSE("Dual BSD/GPL"); 523MODULE_LICENSE("Dual BSD/GPL");
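
As a concrete check of the surrogate arithmetic above: U+1F600 is F0 9F 98 80 in UTF-8; subtracting PLANE_SIZE leaves 0xF600, whose high ten bits (0x3D) give the lead surrogate 0xD800 | 0x3D = 0xD83D and whose low ten bits (0x200) give the trail surrogate 0xDC00 | 0x200 = 0xDE00. A small sketch exercising utf8s_to_utf16s() on that input (note this version takes no output-buffer bound, so the caller must size @pwcs for the worst case):

	static const u8 smiley[] = { 0xf0, 0x9f, 0x98, 0x80, 0x00 };
	wchar_t out[2];
	int n;

	n = utf8s_to_utf16s(smiley, 4, out);
	/* expect n == 2, out[0] == 0xd83d, out[1] == 0xde00 */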
diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c
index aa2c42fdd977..0d60a44acacd 100644
--- a/fs/nls/nls_utf8.c
+++ b/fs/nls/nls_utf8.c
@@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
15{ 15{
16 int n; 16 int n;
17 17
18 if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) { 18 if (boundlen <= 0)
19 return -ENAMETOOLONG;
20
21 n = utf32_to_utf8(uni, out, boundlen);
22 if (n < 0) {
19 *out = '?'; 23 *out = '?';
20 return -EINVAL; 24 return -EINVAL;
21 } 25 }
@@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
25static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) 29static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
26{ 30{
27 int n; 31 int n;
32 unicode_t u;
28 33
29 if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) { 34 n = utf8_to_utf32(rawstring, boundlen, &u);
35 if (n < 0 || u > MAX_WCHAR_T) {
30 *uni = 0x003f; /* ? */ 36 *uni = 0x003f; /* ? */
31 n = -EINVAL; 37 return -EINVAL;
32 } 38 }
39 *uni = (wchar_t) u;
33 return n; 40 return n;
34} 41}
35 42
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 50914d7303c6..dffbb0911d02 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,2 +1,5 @@
1config FSNOTIFY
2 def_bool n
3
1source "fs/notify/dnotify/Kconfig" 4source "fs/notify/dnotify/Kconfig"
2source "fs/notify/inotify/Kconfig" 5source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 5a95b6010ce7..0922cc826c46 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,2 +1,4 @@
1obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o
2
1obj-y += dnotify/ 3obj-y += dnotify/
2obj-y += inotify/ 4obj-y += inotify/
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
index 26adf5dfa646..f9c1ca139d8f 100644
--- a/fs/notify/dnotify/Kconfig
+++ b/fs/notify/dnotify/Kconfig
@@ -1,5 +1,6 @@
1config DNOTIFY 1config DNOTIFY
2 bool "Dnotify support" 2 bool "Dnotify support"
3 select FSNOTIFY
3 default y 4 default y
4 help 5 help
5 Dnotify is a directory-based per-fd file change notification system 6 Dnotify is a directory-based per-fd file change notification system
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index b0aa2cde80bd..828a889be909 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -3,6 +3,9 @@
3 * 3 *
4 * Copyright (C) 2000,2001,2002 Stephen Rothwell 4 * Copyright (C) 2000,2001,2002 Stephen Rothwell
5 * 5 *
6 * Copyright (C) 2009 Eric Paris <Red Hat Inc>
7 * dnotify was largely rewritten to use the new fsnotify infrastructure
8 *
6 * This program is free software; you can redistribute it and/or modify it 9 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the 10 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any 11 * Free Software Foundation; either version 2, or (at your option) any
@@ -21,24 +24,173 @@
21#include <linux/spinlock.h> 24#include <linux/spinlock.h>
22#include <linux/slab.h> 25#include <linux/slab.h>
23#include <linux/fdtable.h> 26#include <linux/fdtable.h>
27#include <linux/fsnotify_backend.h>
24 28
25int dir_notify_enable __read_mostly = 1; 29int dir_notify_enable __read_mostly = 1;
26 30
27static struct kmem_cache *dn_cache __read_mostly; 31static struct kmem_cache *dnotify_struct_cache __read_mostly;
32static struct kmem_cache *dnotify_mark_entry_cache __read_mostly;
33static struct fsnotify_group *dnotify_group __read_mostly;
34static DEFINE_MUTEX(dnotify_mark_mutex);
35
36/*
37 * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which
38 * is being watched by dnotify. If multiple userspace applications are watching
39 * the same directory with dnotify their information is chained in dn
40 */
41struct dnotify_mark_entry {
42 struct fsnotify_mark_entry fsn_entry;
43 struct dnotify_struct *dn;
44};
28 45
29static void redo_inode_mask(struct inode *inode) 46/*
47 * When a process starts or stops watching an inode the set of events which
48 * dnotify cares about for that inode may change. This function runs the
49 * list of everything receiving dnotify events about this directory and calculates
50 * the set of all those events. After it updates what dnotify is interested in
51 * it calls the fsnotify function so it can update the set of all events relevant
52 * to this inode.
53 */
54static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
30{ 55{
31 unsigned long new_mask; 56 __u32 new_mask, old_mask;
32 struct dnotify_struct *dn; 57 struct dnotify_struct *dn;
58 struct dnotify_mark_entry *dnentry = container_of(entry,
59 struct dnotify_mark_entry,
60 fsn_entry);
61
62 assert_spin_locked(&entry->lock);
33 63
64 old_mask = entry->mask;
34 new_mask = 0; 65 new_mask = 0;
35 for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next) 66 for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next)
36 new_mask |= dn->dn_mask & ~DN_MULTISHOT; 67 new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
37 inode->i_dnotify_mask = new_mask; 68 entry->mask = new_mask;
69
70 if (old_mask == new_mask)
71 return;
72
73 if (entry->inode)
74 fsnotify_recalc_inode_mask(entry->inode);
75}
76
77/*
78 * Main fsnotify call where events are delivered to dnotify.
79 * Find the dnotify mark on the relevant inode, run the list of dnotify structs
80 * on that mark and determine which of them has expressed interest in receiving
81 * events of this type. When found, send the signal to the correct process and
82 * destroy the dnotify struct if it was not registered to receive multiple
83 * events.
84 */
85static int dnotify_handle_event(struct fsnotify_group *group,
86 struct fsnotify_event *event)
87{
88 struct fsnotify_mark_entry *entry = NULL;
89 struct dnotify_mark_entry *dnentry;
90 struct inode *to_tell;
91 struct dnotify_struct *dn;
92 struct dnotify_struct **prev;
93 struct fown_struct *fown;
94
95 to_tell = event->to_tell;
96
97 spin_lock(&to_tell->i_lock);
98 entry = fsnotify_find_mark_entry(group, to_tell);
99 spin_unlock(&to_tell->i_lock);
100
101 /* unlikely since we already passed dnotify_should_send_event() */
102 if (unlikely(!entry))
103 return 0;
104 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
105
106 spin_lock(&entry->lock);
107 prev = &dnentry->dn;
108 while ((dn = *prev) != NULL) {
109 if ((dn->dn_mask & event->mask) == 0) {
110 prev = &dn->dn_next;
111 continue;
112 }
113 fown = &dn->dn_filp->f_owner;
114 send_sigio(fown, dn->dn_fd, POLL_MSG);
115 if (dn->dn_mask & FS_DN_MULTISHOT)
116 prev = &dn->dn_next;
117 else {
118 *prev = dn->dn_next;
119 kmem_cache_free(dnotify_struct_cache, dn);
120 dnotify_recalc_inode_mask(entry);
121 }
122 }
123
124 spin_unlock(&entry->lock);
125 fsnotify_put_mark(entry);
126
127 return 0;
128}
129
130/*
131 * Given an inode and mask determine if dnotify would be interested in sending
132 * userspace notification for that pair.
133 */
134static bool dnotify_should_send_event(struct fsnotify_group *group,
135 struct inode *inode, __u32 mask)
136{
137 struct fsnotify_mark_entry *entry;
138 bool send;
139
140 /* !dir_notify_enable should never get here, don't waste time checking
141 if (!dir_notify_enable)
142 return 0; */
143
144 /* not a dir, dnotify doesn't care */
145 if (!S_ISDIR(inode->i_mode))
146 return false;
147
148 spin_lock(&inode->i_lock);
149 entry = fsnotify_find_mark_entry(group, inode);
150 spin_unlock(&inode->i_lock);
151
152 /* no mark means no dnotify watch */
153 if (!entry)
154 return false;
155
156 mask = (mask & ~FS_EVENT_ON_CHILD);
157 send = (mask & entry->mask);
158
159 fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
160
161 return send;
162}
163
164static void dnotify_free_mark(struct fsnotify_mark_entry *entry)
165{
166 struct dnotify_mark_entry *dnentry = container_of(entry,
167 struct dnotify_mark_entry,
168 fsn_entry);
169
170 BUG_ON(dnentry->dn);
171
172 kmem_cache_free(dnotify_mark_entry_cache, dnentry);
38} 173}
39 174
175static struct fsnotify_ops dnotify_fsnotify_ops = {
176 .handle_event = dnotify_handle_event,
177 .should_send_event = dnotify_should_send_event,
178 .free_group_priv = NULL,
179 .freeing_mark = NULL,
180 .free_event_priv = NULL,
181};
182
183/*
184 * Called every time a file is closed. Looks first for a dnotify mark on the
185 * inode. If one is found run all of the ->dn entries attached to that
186 * mark for one relevant to this process closing the file and remove that
187 * dnotify_struct. If that was the last dnotify_struct also remove the
188 * fsnotify_mark_entry.
189 */
40void dnotify_flush(struct file *filp, fl_owner_t id) 190void dnotify_flush(struct file *filp, fl_owner_t id)
41{ 191{
192 struct fsnotify_mark_entry *entry;
193 struct dnotify_mark_entry *dnentry;
42 struct dnotify_struct *dn; 194 struct dnotify_struct *dn;
43 struct dnotify_struct **prev; 195 struct dnotify_struct **prev;
44 struct inode *inode; 196 struct inode *inode;
@@ -46,145 +198,243 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
46 inode = filp->f_path.dentry->d_inode; 198 inode = filp->f_path.dentry->d_inode;
47 if (!S_ISDIR(inode->i_mode)) 199 if (!S_ISDIR(inode->i_mode))
48 return; 200 return;
201
49 spin_lock(&inode->i_lock); 202 spin_lock(&inode->i_lock);
50 prev = &inode->i_dnotify; 203 entry = fsnotify_find_mark_entry(dnotify_group, inode);
204 spin_unlock(&inode->i_lock);
205 if (!entry)
206 return;
207 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
208
209 mutex_lock(&dnotify_mark_mutex);
210
211 spin_lock(&entry->lock);
212 prev = &dnentry->dn;
51 while ((dn = *prev) != NULL) { 213 while ((dn = *prev) != NULL) {
52 if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { 214 if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
53 *prev = dn->dn_next; 215 *prev = dn->dn_next;
54 redo_inode_mask(inode); 216 kmem_cache_free(dnotify_struct_cache, dn);
55 kmem_cache_free(dn_cache, dn); 217 dnotify_recalc_inode_mask(entry);
56 break; 218 break;
57 } 219 }
58 prev = &dn->dn_next; 220 prev = &dn->dn_next;
59 } 221 }
60 spin_unlock(&inode->i_lock); 222
223 spin_unlock(&entry->lock);
224
225 /* nothing else could have found us thanks to the dnotify_mark_mutex */
226 if (dnentry->dn == NULL)
227 fsnotify_destroy_mark_by_entry(entry);
228
229 fsnotify_recalc_group_mask(dnotify_group);
230
231 mutex_unlock(&dnotify_mark_mutex);
232
233 fsnotify_put_mark(entry);
234}
235
236/* this conversion is done only at watch creation */
237static __u32 convert_arg(unsigned long arg)
238{
239 __u32 new_mask = FS_EVENT_ON_CHILD;
240
241 if (arg & DN_MULTISHOT)
242 new_mask |= FS_DN_MULTISHOT;
243 if (arg & DN_DELETE)
244 new_mask |= (FS_DELETE | FS_MOVED_FROM);
245 if (arg & DN_MODIFY)
246 new_mask |= FS_MODIFY;
247 if (arg & DN_ACCESS)
248 new_mask |= FS_ACCESS;
249 if (arg & DN_ATTRIB)
250 new_mask |= FS_ATTRIB;
251 if (arg & DN_RENAME)
252 new_mask |= FS_DN_RENAME;
253 if (arg & DN_CREATE)
254 new_mask |= (FS_CREATE | FS_MOVED_TO);
255
256 return new_mask;
61} 257}
62 258
259/*
260 * If multiple processes watch the same inode with dnotify there is only one
261 * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct
262 * onto that mark. This function either attaches the new dnotify_struct onto
263 * that list, or it |= the mask onto an existing dnotify_struct.
264 */
265static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry,
266 fl_owner_t id, int fd, struct file *filp, __u32 mask)
267{
268 struct dnotify_struct *odn;
269
270 odn = dnentry->dn;
271 while (odn != NULL) {
272 /* adding more events to an existing dnotify_struct? */
273 if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
274 odn->dn_fd = fd;
275 odn->dn_mask |= mask;
276 return -EEXIST;
277 }
278 odn = odn->dn_next;
279 }
280
281 dn->dn_mask = mask;
282 dn->dn_fd = fd;
283 dn->dn_filp = filp;
284 dn->dn_owner = id;
285 dn->dn_next = dnentry->dn;
286 dnentry->dn = dn;
287
288 return 0;
289}
290
291/*
292 * When a process calls fcntl to attach a dnotify watch to a directory it ends
293 * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be
294 * attached to the fsnotify_mark.
295 */
63int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) 296int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
64{ 297{
298 struct dnotify_mark_entry *new_dnentry, *dnentry;
299 struct fsnotify_mark_entry *new_entry, *entry;
65 struct dnotify_struct *dn; 300 struct dnotify_struct *dn;
66 struct dnotify_struct *odn;
67 struct dnotify_struct **prev;
68 struct inode *inode; 301 struct inode *inode;
69 fl_owner_t id = current->files; 302 fl_owner_t id = current->files;
70 struct file *f; 303 struct file *f;
71 int error = 0; 304 int destroy = 0, error = 0;
305 __u32 mask;
306
307 /* we use these to tell if we need to kfree */
308 new_entry = NULL;
309 dn = NULL;
310
311 if (!dir_notify_enable) {
312 error = -EINVAL;
313 goto out_err;
314 }
72 315
316 /* a 0 mask means we are explicitly removing the watch */
73 if ((arg & ~DN_MULTISHOT) == 0) { 317 if ((arg & ~DN_MULTISHOT) == 0) {
74 dnotify_flush(filp, id); 318 dnotify_flush(filp, id);
75 return 0; 319 error = 0;
320 goto out_err;
76 } 321 }
77 if (!dir_notify_enable) 322
78 return -EINVAL; 323 /* dnotify only works on directories */
79 inode = filp->f_path.dentry->d_inode; 324 inode = filp->f_path.dentry->d_inode;
80 if (!S_ISDIR(inode->i_mode)) 325 if (!S_ISDIR(inode->i_mode)) {
81 return -ENOTDIR; 326 error = -ENOTDIR;
82 dn = kmem_cache_alloc(dn_cache, GFP_KERNEL); 327 goto out_err;
83 if (dn == NULL)
84 return -ENOMEM;
85 spin_lock(&inode->i_lock);
86 prev = &inode->i_dnotify;
87 while ((odn = *prev) != NULL) {
88 if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
89 odn->dn_fd = fd;
90 odn->dn_mask |= arg;
91 inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
92 goto out_free;
93 }
94 prev = &odn->dn_next;
95 } 328 }
96 329
97 rcu_read_lock(); 330 /* expect most fcntl to add new rather than augment old */
98 f = fcheck(fd); 331 dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL);
99 rcu_read_unlock(); 332 if (!dn) {
100 /* we'd lost the race with close(), sod off silently */ 333 error = -ENOMEM;
101 /* note that inode->i_lock prevents reordering problems 334 goto out_err;
102 * between accesses to descriptor table and ->i_dnotify */ 335 }
103 if (f != filp)
104 goto out_free;
105 336
106 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); 337 /* new fsnotify mark, we expect most fcntl calls to add a new mark */
107 if (error) 338 new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL);
108 goto out_free; 339 if (!new_dnentry) {
340 error = -ENOMEM;
341 goto out_err;
342 }
109 343
110 dn->dn_mask = arg; 344 /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */
111 dn->dn_fd = fd; 345 mask = convert_arg(arg);
112 dn->dn_filp = filp;
113 dn->dn_owner = id;
114 inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
115 dn->dn_next = inode->i_dnotify;
116 inode->i_dnotify = dn;
117 spin_unlock(&inode->i_lock);
118 return 0;
119 346
120out_free: 347 /* set up the new_entry and new_dnentry */
121 spin_unlock(&inode->i_lock); 348 new_entry = &new_dnentry->fsn_entry;
122 kmem_cache_free(dn_cache, dn); 349 fsnotify_init_mark(new_entry, dnotify_free_mark);
123 return error; 350 new_entry->mask = mask;
124} 351 new_dnentry->dn = NULL;
125 352
126void __inode_dir_notify(struct inode *inode, unsigned long event) 353 /* this is needed to prevent the fcntl/close race described below */
127{ 354 mutex_lock(&dnotify_mark_mutex);
128 struct dnotify_struct * dn;
129 struct dnotify_struct **prev;
130 struct fown_struct * fown;
131 int changed = 0;
132 355
356 /* add the new_entry or find an old one. */
133 spin_lock(&inode->i_lock); 357 spin_lock(&inode->i_lock);
134 prev = &inode->i_dnotify; 358 entry = fsnotify_find_mark_entry(dnotify_group, inode);
135 while ((dn = *prev) != NULL) {
136 if ((dn->dn_mask & event) == 0) {
137 prev = &dn->dn_next;
138 continue;
139 }
140 fown = &dn->dn_filp->f_owner;
141 send_sigio(fown, dn->dn_fd, POLL_MSG);
142 if (dn->dn_mask & DN_MULTISHOT)
143 prev = &dn->dn_next;
144 else {
145 *prev = dn->dn_next;
146 changed = 1;
147 kmem_cache_free(dn_cache, dn);
148 }
149 }
150 if (changed)
151 redo_inode_mask(inode);
152 spin_unlock(&inode->i_lock); 359 spin_unlock(&inode->i_lock);
153} 360 if (entry) {
154 361 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
155EXPORT_SYMBOL(__inode_dir_notify); 362 spin_lock(&entry->lock);
363 } else {
364 fsnotify_add_mark(new_entry, dnotify_group, inode);
365 spin_lock(&new_entry->lock);
366 entry = new_entry;
367 dnentry = new_dnentry;
368 /* we used new_entry, so don't free it */
369 new_entry = NULL;
370 }
156 371
157/* 372 rcu_read_lock();
158 * This is hopelessly wrong, but unfixable without API changes. At 373 f = fcheck(fd);
159 * least it doesn't oops the kernel... 374 rcu_read_unlock();
160 *
161 * To safely access ->d_parent we need to keep d_move away from it. Use the
162 * dentry's d_lock for this.
163 */
164void dnotify_parent(struct dentry *dentry, unsigned long event)
165{
166 struct dentry *parent;
167 375
168 if (!dir_notify_enable) 376 /* if (f != filp) means that we lost a race and another task/thread
169 return; 377 * actually closed the fd we are still playing with before we grabbed
378 * the dnotify_mark_mutex and entry->lock. Since closing the fd is the
379 * only time we clean up the mark entries, we need to get our mark off
380 * the list. */
381 if (f != filp) {
382 /* if we added ourselves, shoot ourselves, it's possible that
383 * the flush actually did shoot this entry. That's fine too
384 * since multiple calls to destroy_mark are perfectly safe, if
385 * we found a dnentry already attached to the inode, just sod
386 * off silently as the flush at close time dealt with it.
387 */
388 if (dnentry == new_dnentry)
389 destroy = 1;
390 goto out;
391 }
170 392
171 spin_lock(&dentry->d_lock); 393 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
172 parent = dentry->d_parent; 394 if (error) {
173 if (parent->d_inode->i_dnotify_mask & event) { 395 /* if we added, we must shoot */
174 dget(parent); 396 if (dnentry == new_dnentry)
175 spin_unlock(&dentry->d_lock); 397 destroy = 1;
176 __inode_dir_notify(parent->d_inode, event); 398 goto out;
177 dput(parent);
178 } else {
179 spin_unlock(&dentry->d_lock);
180 } 399 }
400
401 error = attach_dn(dn, dnentry, id, fd, filp, mask);
402 /* !error means that we attached the dn to the dnentry, so don't free it */
403 if (!error)
404 dn = NULL;
405 /* -EEXIST means that we didn't add this new dn and used an old one.
406 * that isn't an error (and the unused dn should be freed) */
407 else if (error == -EEXIST)
408 error = 0;
409
410 dnotify_recalc_inode_mask(entry);
411out:
412 spin_unlock(&entry->lock);
413
414 if (destroy)
415 fsnotify_destroy_mark_by_entry(entry);
416
417 fsnotify_recalc_group_mask(dnotify_group);
418
419 mutex_unlock(&dnotify_mark_mutex);
420 fsnotify_put_mark(entry);
421out_err:
422 if (new_entry)
423 fsnotify_put_mark(new_entry);
424 if (dn)
425 kmem_cache_free(dnotify_struct_cache, dn);
426 return error;
181} 427}
182EXPORT_SYMBOL_GPL(dnotify_parent);
183 428
184static int __init dnotify_init(void) 429static int __init dnotify_init(void)
185{ 430{
186 dn_cache = kmem_cache_create("dnotify_cache", 431 dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
187 sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL); 432 dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC);
433
434 dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM,
435 0, &dnotify_fsnotify_ops);
436 if (IS_ERR(dnotify_group))
437 panic("unable to allocate fsnotify group for dnotify\n");
188 return 0; 438 return 0;
189} 439}
190 440
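To see the path above end to end, here is a small userspace program that exercises fcntl_dirnotify() via the documented F_NOTIFY fcntl (see fcntl(2)); the DN_* flags are the same ones convert_arg() translates to FS_* bits:

#define _GNU_SOURCE		/* for F_NOTIFY and the DN_* flags */
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t hits;

static void on_sigio(int sig)
{
	hits++;			/* async-signal-safe: just count */
}

int main(void)
{
	int fd = open(".", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	signal(SIGIO, on_sigio);
	/* lands in fcntl_dirnotify(); DN_MULTISHOT keeps the watch alive */
	if (fcntl(fd, F_NOTIFY, DN_CREATE | DN_MODIFY | DN_MULTISHOT) < 0) {
		perror("fcntl(F_NOTIFY)");
		return 1;
	}
	for (;;) {
		pause();	/* woken by the SIGIO sent from dnotify_handle_event() */
		printf("directory changed (%d events so far)\n", (int)hits);
	}
}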
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
new file mode 100644
index 000000000000..037e878e03fc
--- /dev/null
+++ b/fs/notify/fsnotify.c
@@ -0,0 +1,188 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19#include <linux/dcache.h>
20#include <linux/fs.h>
21#include <linux/init.h>
22#include <linux/module.h>
23#include <linux/srcu.h>
24
25#include <linux/fsnotify_backend.h>
26#include "fsnotify.h"
27
28/*
29 * Clear all of the marks on an inode when it is being evicted from core
30 */
31void __fsnotify_inode_delete(struct inode *inode)
32{
33 fsnotify_clear_marks_by_inode(inode);
34}
35EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
36
37/*
38 * Given an inode, first check if we care what happens to our children. Inotify
39 * and dnotify both tell their parents about events. If we care about any event
40 * on a child we run all of our children and set a dentry flag saying that the
41 * parent cares. Thus when an event happens on a child it can quickly tell
42 * if there is a need to find a parent and send the event to the parent.
43 */
44void __fsnotify_update_child_dentry_flags(struct inode *inode)
45{
46 struct dentry *alias;
47 int watched;
48
49 if (!S_ISDIR(inode->i_mode))
50 return;
51
52 /* determine if the children should tell inode about their events */
53 watched = fsnotify_inode_watches_children(inode);
54
55 spin_lock(&dcache_lock);
56 /* run all of the dentries associated with this inode. Since this is a
57 * directory, there damn well better only be one item on this list */
58 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
59 struct dentry *child;
60
61 /* run all of the children of the original inode and fix their
62 * d_flags to indicate parental interest (their parent is the
63 * original inode) */
64 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
65 if (!child->d_inode)
66 continue;
67
68 spin_lock(&child->d_lock);
69 if (watched)
70 child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
71 else
72 child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
73 spin_unlock(&child->d_lock);
74 }
75 }
76 spin_unlock(&dcache_lock);
77}
78
79/* Notify this dentry's parent about a child's events. */
80void __fsnotify_parent(struct dentry *dentry, __u32 mask)
81{
82 struct dentry *parent;
83 struct inode *p_inode;
84 bool send = false;
85 bool should_update_children = false;
86
87 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
88 return;
89
90 spin_lock(&dentry->d_lock);
91 parent = dentry->d_parent;
92 p_inode = parent->d_inode;
93
94 if (fsnotify_inode_watches_children(p_inode)) {
95 if (p_inode->i_fsnotify_mask & mask) {
96 dget(parent);
97 send = true;
98 }
99 } else {
100 /*
101 * The parent doesn't care about events on its children but
102 * at least one child thought it did. We need to run all the
103 * children and update their d_flags to let them know p_inode
104 * doesn't care about them any more.
105 */
106 dget(parent);
107 should_update_children = true;
108 }
109
110 spin_unlock(&dentry->d_lock);
111
112 if (send) {
113 /* we are notifying a parent so come up with the new mask which
114 * specifies these are events which came from a child. */
115 mask |= FS_EVENT_ON_CHILD;
116
117 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
118 dentry->d_name.name, 0);
119 dput(parent);
120 }
121
122 if (unlikely(should_update_children)) {
123 __fsnotify_update_child_dentry_flags(p_inode);
124 dput(parent);
125 }
126}
127EXPORT_SYMBOL_GPL(__fsnotify_parent);
128
129/*
130 * This is the main call to fsnotify. The VFS calls into hook specific functions
131 * in linux/fsnotify.h. Those functions then in turn call here. Here will call
132 * out to all of the registered fsnotify_group. Those groups can then use the
133 * notification event in whatever means they feel necessary.
134 */
135void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie)
136{
137 struct fsnotify_group *group;
138 struct fsnotify_event *event = NULL;
139 int idx;
140 /* global tests shouldn't care about events on a child, only the specific event */
141 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
142
143 if (list_empty(&fsnotify_groups))
144 return;
145
146 if (!(test_mask & fsnotify_mask))
147 return;
148
149 if (!(test_mask & to_tell->i_fsnotify_mask))
150 return;
151 /*
152 * SRCU!! the groups list is very very much read only and the path is
153 * very hot. The VAST majority of events are not going to need to do
154 * anything other than walk the list so it's crazy to pre-allocate.
155 */
156 idx = srcu_read_lock(&fsnotify_grp_srcu);
157 list_for_each_entry_rcu(group, &fsnotify_groups, group_list) {
158 if (test_mask & group->mask) {
159 if (!group->ops->should_send_event(group, to_tell, mask))
160 continue;
161 if (!event) {
162 event = fsnotify_create_event(to_tell, mask, data,
163 data_is, file_name, cookie,
164 GFP_KERNEL);
165 /* shit, we OOM'd and now we can't tell, maybe
166 * someday someone else will want to do something
167 * here */
168 if (!event)
169 break;
170 }
171 group->ops->handle_event(group, event);
172 }
173 }
174 srcu_read_unlock(&fsnotify_grp_srcu, idx);
175 /*
176 * fsnotify_create_event() took a reference so the event can't be cleaned
177 * up while we are still trying to add it to lists, drop that one.
178 */
179 if (event)
180 fsnotify_put_event(event);
181}
182EXPORT_SYMBOL_GPL(fsnotify);
183
184static __init int fsnotify_init(void)
185{
186 return init_srcu_struct(&fsnotify_grp_srcu);
187}
188subsys_initcall(fsnotify_init);
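As a rough illustration of how the VFS reaches fsnotify(), a hook in include/linux/fsnotify.h boils down to something like the sketch below (hedged: simplified, and the legacy dnotify/inotify calls that the real header still carries are omitted):

/* simplified modify hook: first offer the event to a watching parent,
 * then to every group watching the inode itself */
static inline void fsnotify_modify_sketch(struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;
	__u32 mask = FS_MODIFY;

	__fsnotify_parent(dentry, mask);
	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
}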
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
new file mode 100644
index 000000000000..4dc240824b2d
--- /dev/null
+++ b/fs/notify/fsnotify.h
@@ -0,0 +1,34 @@
1#ifndef __FS_NOTIFY_FSNOTIFY_H_
2#define __FS_NOTIFY_FSNOTIFY_H_
3
4#include <linux/list.h>
5#include <linux/fsnotify.h>
6#include <linux/srcu.h>
7#include <linux/types.h>
8
9/* protects reads of fsnotify_groups */
10extern struct srcu_struct fsnotify_grp_srcu;
11/* all groups which receive fsnotify events */
12extern struct list_head fsnotify_groups;
13/* bitwise OR of all event types (FS_*) for all fsnotify_groups */
14extern __u32 fsnotify_mask;
15
16/* destroy all events sitting in this groups notification queue */
17extern void fsnotify_flush_notify(struct fsnotify_group *group);
18
19/* final kfree of a group */
20extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
21
22/* run the list of all marks associated with inode and flag them to be freed */
23extern void fsnotify_clear_marks_by_inode(struct inode *inode);
24/*
25 * update the dentry->d_flags of all of inode's children to indicate if inode cares
26 * about events that happen to its children.
27 */
28extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
29
30/* allocate and destroy an event holder to attach events to notification/access queues */
31extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
32extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder);
33
34#endif /* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/notify/group.c b/fs/notify/group.c
new file mode 100644
index 000000000000..0e1677144bc5
--- /dev/null
+++ b/fs/notify/group.c
@@ -0,0 +1,254 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19#include <linux/list.h>
20#include <linux/mutex.h>
21#include <linux/slab.h>
22#include <linux/srcu.h>
23#include <linux/rculist.h>
24#include <linux/wait.h>
25
26#include <linux/fsnotify_backend.h>
27#include "fsnotify.h"
28
29#include <asm/atomic.h>
30
31/* protects writes to fsnotify_groups and fsnotify_mask */
32static DEFINE_MUTEX(fsnotify_grp_mutex);
33/* protects reads while running the fsnotify_groups list */
34struct srcu_struct fsnotify_grp_srcu;
35/* all groups registered to receive filesystem notifications */
36LIST_HEAD(fsnotify_groups);
37/* bitwise OR of all events (FS_*) interesting to some group on this system */
38__u32 fsnotify_mask;
39
40/*
41 * When a new group registers or changes its set of interesting events
42 * this function updates the fsnotify_mask to contain all interesting events
43 */
44void fsnotify_recalc_global_mask(void)
45{
46 struct fsnotify_group *group;
47 __u32 mask = 0;
48 int idx;
49
50 idx = srcu_read_lock(&fsnotify_grp_srcu);
51 list_for_each_entry_rcu(group, &fsnotify_groups, group_list)
52 mask |= group->mask;
53 srcu_read_unlock(&fsnotify_grp_srcu, idx);
54 fsnotify_mask = mask;
55}
56
57/*
58 * Update the group->mask by running all of the marks associated with this
59 * group and finding the bitwise | of all of the mark->mask. If we change
60 * the group->mask we need to update the global mask of events interesting
61 * to the system.
62 */
63void fsnotify_recalc_group_mask(struct fsnotify_group *group)
64{
65 __u32 mask = 0;
66 __u32 old_mask = group->mask;
67 struct fsnotify_mark_entry *entry;
68
69 spin_lock(&group->mark_lock);
70 list_for_each_entry(entry, &group->mark_entries, g_list)
71 mask |= entry->mask;
72 spin_unlock(&group->mark_lock);
73
74 group->mask = mask;
75
76 if (old_mask != mask)
77 fsnotify_recalc_global_mask();
78}
79
80/*
81 * Take a reference to a group so things found under the fsnotify_grp_mutex
82 * can't get freed under us
83 */
84static void fsnotify_get_group(struct fsnotify_group *group)
85{
86 atomic_inc(&group->refcnt);
87}
88
89/*
90 * Final freeing of a group
91 */
92void fsnotify_final_destroy_group(struct fsnotify_group *group)
93{
94 /* clear the notification queue of all events */
95 fsnotify_flush_notify(group);
96
97 if (group->ops->free_group_priv)
98 group->ops->free_group_priv(group);
99
100 kfree(group);
101}
102
103/*
104 * Trying to get rid of a group. We need to first get rid of any outstanding
105 * allocations and then free the group. Remember that fsnotify_clear_marks_by_group
106 * could miss marks that are being freed by inode and those marks could still
107 * hold a reference to this group (via group->num_marks). If we get into that
108 * situation, the fsnotify_final_destroy_group will get called when that final
109 * mark is freed.
110 */
111static void fsnotify_destroy_group(struct fsnotify_group *group)
112{
113 /* clear all inode mark entries for this group */
114 fsnotify_clear_marks_by_group(group);
115
116 /* past the point of no return, matches the initial value of 1 */
117 if (atomic_dec_and_test(&group->num_marks))
118 fsnotify_final_destroy_group(group);
119}
120
121/*
122 * Remove this group from the global list of groups that will get events
123 * this can be done even if there are still references and things still using
124 * this group. This just stops the group from getting new events.
125 */
126static void __fsnotify_evict_group(struct fsnotify_group *group)
127{
128 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
129
130 if (group->on_group_list)
131 list_del_rcu(&group->group_list);
132 group->on_group_list = 0;
133}
134
135/*
136 * Called when a group is no longer interested in getting events. This can be
137 * used if a group is misbehaving or if for some reason a group should no longer
138 * get any filesystem events.
139 */
140void fsnotify_evict_group(struct fsnotify_group *group)
141{
142 mutex_lock(&fsnotify_grp_mutex);
143 __fsnotify_evict_group(group);
144 mutex_unlock(&fsnotify_grp_mutex);
145}
146
147/*
148 * Drop a reference to a group. Free it if it's through.
149 */
150void fsnotify_put_group(struct fsnotify_group *group)
151{
152 if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex))
153 return;
154
155 /*
156 * OK, now we know that there's no other users *and* we hold mutex,
157 * so no new references will appear
158 */
159 __fsnotify_evict_group(group);
160
161 /*
162 * now it's off the list, so the only thing we might care about is
163 * srcu access....
164 */
165 mutex_unlock(&fsnotify_grp_mutex);
166 synchronize_srcu(&fsnotify_grp_srcu);
167
168 /* and now it is really dead. _Nothing_ could be seeing it */
169 fsnotify_recalc_global_mask();
170 fsnotify_destroy_group(group);
171}
172
173/*
174 * Simply run the fsnotify_groups list and find a group which matches
175 * the given parameters. If a group is found we take a reference to that
176 * group.
177 */
178static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask,
179 const struct fsnotify_ops *ops)
180{
181 struct fsnotify_group *group_iter;
182 struct fsnotify_group *group = NULL;
183
184 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
185
186 list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) {
187 if (group_iter->group_num == group_num) {
188 if ((group_iter->mask == mask) &&
189 (group_iter->ops == ops)) {
190 fsnotify_get_group(group_iter);
191 group = group_iter;
192 } else
193 group = ERR_PTR(-EEXIST);
194 }
195 }
196 return group;
197}
198
199/*
200 * Either finds an existing group which matches the group_num, mask, and ops or
201 * creates a new group and adds it to the global group list. In either case we
202 * take a reference for the group returned.
203 */
204struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
205 const struct fsnotify_ops *ops)
206{
207 struct fsnotify_group *group, *tgroup;
208
209 /* very low use, simpler locking if we just always alloc */
210 group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
211 if (!group)
212 return ERR_PTR(-ENOMEM);
213
214 atomic_set(&group->refcnt, 1);
215
216 group->on_group_list = 0;
217 group->group_num = group_num;
218 group->mask = mask;
219
220 mutex_init(&group->notification_mutex);
221 INIT_LIST_HEAD(&group->notification_list);
222 init_waitqueue_head(&group->notification_waitq);
223 group->q_len = 0;
224 group->max_events = UINT_MAX;
225
226 spin_lock_init(&group->mark_lock);
227 atomic_set(&group->num_marks, 0);
228 INIT_LIST_HEAD(&group->mark_entries);
229
230 group->ops = ops;
231
232 mutex_lock(&fsnotify_grp_mutex);
233 tgroup = fsnotify_find_group(group_num, mask, ops);
234 if (tgroup) {
235 /* group already exists */
236 mutex_unlock(&fsnotify_grp_mutex);
237 /* destroy the new one we made */
238 fsnotify_put_group(group);
239 return tgroup;
240 }
241
242 /* group not found, add a new one */
243 list_add_rcu(&group->group_list, &fsnotify_groups);
244 group->on_group_list = 1;
245 /* being on the fsnotify_groups list holds one num_marks */
246 atomic_inc(&group->num_marks);
247
248 mutex_unlock(&fsnotify_grp_mutex);
249
250 if (mask)
251 fsnotify_recalc_global_mask();
252
253 return group;
254}
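A backend's use of this API mirrors dnotify_init() earlier in this series; the sketch below is hypothetical (the group number 42, my_fsnotify_ops, and the handlers it points at are made up) but follows the obtain/put pairing defined above:

/* assumed to exist elsewhere in the backend */
extern const struct fsnotify_ops my_fsnotify_ops;

static struct fsnotify_group *my_group;

static int __init my_backend_init(void)
{
	my_group = fsnotify_obtain_group(42, FS_CREATE | FS_DELETE,
					 &my_fsnotify_ops);
	if (IS_ERR(my_group))
		return PTR_ERR(my_group);
	return 0;
}

static void __exit my_backend_exit(void)
{
	/* drops the reference from fsnotify_obtain_group(); the group is
	 * evicted and freed once nothing else holds it */
	fsnotify_put_group(my_group);
}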
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
new file mode 100644
index 000000000000..c8a07c65482b
--- /dev/null
+++ b/fs/notify/inode_mark.c
@@ -0,0 +1,426 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19/*
20 * fsnotify inode mark locking/lifetime/and refcnting
21 *
22 * REFCNT:
23 * The mark->refcnt tells how many "things" in the kernel currently are
24 * referencing this object. The object typically will live inside the kernel
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
26 * which can find this object holding the appropriate locks, can take a reference
27 * and the object itself is guaranteed to survive until the reference is dropped.
28 *
29 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
31 * be taken in order as follows:
32 *
33 * entry->lock
34 * group->mark_lock
35 * inode->i_lock
36 *
37 * entry->lock protects 2 things, entry->group and entry->inode. You must hold
38 * that lock to dereference either of these things (they could be NULL even with
39 * the lock)
40 *
41 * group->mark_lock protects the mark_entries list anchored inside a given group
42 * and each entry is hooked via the g_list. It also sorta protects the
43 * free_g_list, which when used is anchored by a private list on the stack of the
44 * task which held the group->mark_lock.
45 *
46 * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a
47 * given inode and each entry is hooked via the i_list. (and sorta the
48 * free_i_list)
49 *
50 *
51 * LIFETIME:
52 * Inode marks survive between when they are added to an inode and when their
53 * refcnt==0.
54 *
55 * The inode mark can be cleared for a number of different reasons including:
56 * - The inode is unlinked for the last time. (fsnotify_inode_remove)
57 * - The inode is being evicted from cache. (fsnotify_inode_delete)
58 * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes)
59 * - Something explicitly requests that it be removed. (fsnotify_destroy_mark_by_entry)
60 * - The fsnotify_group associated with the mark is going away and all such marks
61 * need to be cleaned up. (fsnotify_clear_marks_by_group)
62 *
63 * Worst case we are given an inode and need to clean up all the marks on that
64 * inode. We take i_lock and walk the i_fsnotify_mark_entries safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us).
66 * We remove that mark from the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list. At this point we no
68 * longer fear anything finding the mark using the inode's list of marks.
69 *
70 * We can safely and locklessly run the private list on the stack of everything
71 * we just unattached from the original inode. For each mark on the private list
72 * we grab the mark->lock and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
76 * better be holding a ref. We drop the reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list.
80 *
81 * This has the very interesting property of being able to run concurrently with
82 * any (or all) other directions.
83 */
84
85#include <linux/fs.h>
86#include <linux/init.h>
87#include <linux/kernel.h>
88#include <linux/module.h>
89#include <linux/mutex.h>
90#include <linux/slab.h>
91#include <linux/spinlock.h>
92#include <linux/writeback.h> /* for inode_lock */
93
94#include <asm/atomic.h>
95
96#include <linux/fsnotify_backend.h>
97#include "fsnotify.h"
98
99void fsnotify_get_mark(struct fsnotify_mark_entry *entry)
100{
101 atomic_inc(&entry->refcnt);
102}
103
104void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
105{
106 if (atomic_dec_and_test(&entry->refcnt))
107 entry->free_mark(entry);
108}
109
110/*
111 * Recalculate the mask of events relevant to a given inode, with i_lock held.
112 */
113static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
114{
115 struct fsnotify_mark_entry *entry;
116 struct hlist_node *pos;
117 __u32 new_mask = 0;
118
119 assert_spin_locked(&inode->i_lock);
120
121 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list)
122 new_mask |= entry->mask;
123 inode->i_fsnotify_mask = new_mask;
124}
125
126/*
127 * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
128 * any notifier is interested in hearing for this inode.
129 */
130void fsnotify_recalc_inode_mask(struct inode *inode)
131{
132 spin_lock(&inode->i_lock);
133 fsnotify_recalc_inode_mask_locked(inode);
134 spin_unlock(&inode->i_lock);
135
136 __fsnotify_update_child_dentry_flags(inode);
137}
138
139/*
140 * Any time a mark is getting freed we end up here.
141 * The caller had better be holding a reference to this mark so we don't actually
142 * do the final put under the entry->lock
143 */
144void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
145{
146 struct fsnotify_group *group;
147 struct inode *inode;
148
149 spin_lock(&entry->lock);
150
151 group = entry->group;
152 inode = entry->inode;
153
154 BUG_ON(group && !inode);
155 BUG_ON(!group && inode);
156
157 /* if !group something else already marked this to die */
158 if (!group) {
159 spin_unlock(&entry->lock);
160 return;
161 }
162
163 /* 1 from caller and 1 for being on i_list/g_list */
164 BUG_ON(atomic_read(&entry->refcnt) < 2);
165
166 spin_lock(&group->mark_lock);
167 spin_lock(&inode->i_lock);
168
169 hlist_del_init(&entry->i_list);
170 entry->inode = NULL;
171
172 list_del_init(&entry->g_list);
173 entry->group = NULL;
174
175 fsnotify_put_mark(entry); /* for i_list and g_list */
176
177 /*
178 * this mark is now off the inode->i_fsnotify_mark_entries list and we
179 * hold the inode->i_lock, so this is the perfect time to update the
180 * inode->i_fsnotify_mask
181 */
182 fsnotify_recalc_inode_mask_locked(inode);
183
184 spin_unlock(&inode->i_lock);
185 spin_unlock(&group->mark_lock);
186 spin_unlock(&entry->lock);
187
188 /*
189 * Some groups like to know that marks are being freed. This is a
190 * callback to the group function to let it know that this entry
191 * is being freed.
192 */
193 if (group->ops->freeing_mark)
194 group->ops->freeing_mark(entry, group);
195
196 /*
197 * __fsnotify_update_child_dentry_flags(inode);
198 *
199 * I really want to call that, but we can't, we have no idea if the inode
200 * still exists the second we drop the entry->lock.
201 *
202 * The next time an event arrives at this inode from one of its children
203 * __fsnotify_parent will see that the inode doesn't care about its
204 * children and will update all of these flags then. So really this
205 * is just a lazy update (and could be a perf win...)
206 */
207
208
209 iput(inode);
210
211 /*
212 * it's possible that this group tried to destroy itself, but this
213 * mark was simultaneously being freed by the inode. If that's the
214 * case, we finish freeing the group here.
215 */
216 if (unlikely(atomic_dec_and_test(&group->num_marks)))
217 fsnotify_final_destroy_group(group);
218}
219
220/*
221 * Given a group, destroy all of the marks associated with that group.
222 */
223void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
224{
225 struct fsnotify_mark_entry *lentry, *entry;
226 LIST_HEAD(free_list);
227
228 spin_lock(&group->mark_lock);
229 list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
230 list_add(&entry->free_g_list, &free_list);
231 list_del_init(&entry->g_list);
232 fsnotify_get_mark(entry);
233 }
234 spin_unlock(&group->mark_lock);
235
236 list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
237 fsnotify_destroy_mark_by_entry(entry);
238 fsnotify_put_mark(entry);
239 }
240}
241
242/*
243 * Given an inode, destroy all of the marks associated with that inode.
244 */
245void fsnotify_clear_marks_by_inode(struct inode *inode)
246{
247 struct fsnotify_mark_entry *entry, *lentry;
248 struct hlist_node *pos, *n;
249 LIST_HEAD(free_list);
250
251 spin_lock(&inode->i_lock);
252 hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) {
253 list_add(&entry->free_i_list, &free_list);
254 hlist_del_init(&entry->i_list);
255 fsnotify_get_mark(entry);
256 }
257 spin_unlock(&inode->i_lock);
258
259 list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) {
260 fsnotify_destroy_mark_by_entry(entry);
261 fsnotify_put_mark(entry);
262 }
263}
264
265/*
266 * given a group and inode, find the mark associated with that combination.
267 * if found take a reference to that mark and return it, else return NULL
268 */
269struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group,
270 struct inode *inode)
271{
272 struct fsnotify_mark_entry *entry;
273 struct hlist_node *pos;
274
275 assert_spin_locked(&inode->i_lock);
276
277 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) {
278 if (entry->group == group) {
279 fsnotify_get_mark(entry);
280 return entry;
281 }
282 }
283 return NULL;
284}
285
286/*
287 * Nothing fancy, just initialize lists and locks and counters.
288 */
289void fsnotify_init_mark(struct fsnotify_mark_entry *entry,
290 void (*free_mark)(struct fsnotify_mark_entry *entry))
291
292{
293 spin_lock_init(&entry->lock);
294 atomic_set(&entry->refcnt, 1);
295 INIT_HLIST_NODE(&entry->i_list);
296 entry->group = NULL;
297 entry->mask = 0;
298 entry->inode = NULL;
299 entry->free_mark = free_mark;
300}
301
302/*
303 * Attach an initialized mark entry to a given group and inode.
304 * These marks may be used for the fsnotify backend to determine which
305 * event types should be delivered to which group and for which inodes.
306 */
307int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
308 struct fsnotify_group *group, struct inode *inode)
309{
310 struct fsnotify_mark_entry *lentry;
311 int ret = 0;
312
313 inode = igrab(inode);
314 if (unlikely(!inode))
315 return -EINVAL;
316
317 /*
318 * LOCKING ORDER!!!!
319 * entry->lock
320 * group->mark_lock
321 * inode->i_lock
322 */
323 spin_lock(&entry->lock);
324 spin_lock(&group->mark_lock);
325 spin_lock(&inode->i_lock);
326
327 entry->group = group;
328 entry->inode = inode;
329
330 lentry = fsnotify_find_mark_entry(group, inode);
331 if (!lentry) {
332 hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries);
333 list_add(&entry->g_list, &group->mark_entries);
334
335 fsnotify_get_mark(entry); /* for i_list and g_list */
336
337 atomic_inc(&group->num_marks);
338
339 fsnotify_recalc_inode_mask_locked(inode);
340 }
341
342 spin_unlock(&inode->i_lock);
343 spin_unlock(&group->mark_lock);
344 spin_unlock(&entry->lock);
345
346 if (lentry) {
347 ret = -EEXIST;
348 iput(inode);
349 fsnotify_put_mark(lentry);
350 } else {
351 __fsnotify_update_child_dentry_flags(inode);
352 }
353
354 return ret;
355}
356
357/**
358 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
359 * @list: list of inodes being unmounted (sb->s_inodes)
360 *
361 * Called with inode_lock held, protecting the unmounting super block's list
362 * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
363 * We temporarily drop inode_lock, however, and CAN block.
364 */
365void fsnotify_unmount_inodes(struct list_head *list)
366{
367 struct inode *inode, *next_i, *need_iput = NULL;
368
369 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
370 struct inode *need_iput_tmp;
371
372 /*
373 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
374 * I_WILL_FREE, or I_NEW which is fine because by that point
375 * the inode cannot have any associated watches.
376 */
377 if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
378 continue;
379
380 /*
381 * If i_count is zero, the inode cannot have any watches and
382 * doing an __iget/iput with MS_ACTIVE clear would actually
383 * evict all inodes with zero i_count from icache which is
384 * unnecessarily violent and may in fact be illegal to do.
385 */
386 if (!atomic_read(&inode->i_count))
387 continue;
388
389 need_iput_tmp = need_iput;
390 need_iput = NULL;
391
392 /* In case fsnotify_inode_delete() drops a reference. */
393 if (inode != need_iput_tmp)
394 __iget(inode);
395 else
396 need_iput_tmp = NULL;
397
398 /* In case the dropping of a reference would nuke next_i. */
399 if ((&next_i->i_sb_list != list) &&
400 atomic_read(&next_i->i_count) &&
401 !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) {
402 __iget(next_i);
403 need_iput = next_i;
404 }
405
406 /*
407 * We can safely drop inode_lock here because we hold
408 * references on both inode and next_i. Also no new inodes
409 * will be added since the umount has begun. Finally,
410 * iprune_mutex keeps shrink_icache_memory() away.
411 */
412 spin_unlock(&inode_lock);
413
414 if (need_iput_tmp)
415 iput(need_iput_tmp);
416
417 /* for each watch, send FS_UNMOUNT and then remove it */
418 fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
419
420 fsnotify_inode_delete(inode);
421
422 iput(inode);
423
424 spin_lock(&inode_lock);
425 }
426}
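Tying the mark helpers together, a backend's attach path looks roughly like the hedged sketch below; watch_inode() and my_free_mark() are hypothetical, but the init/add/put calls are the ones defined in this file:

static void my_free_mark(struct fsnotify_mark_entry *entry)
{
	kfree(entry);	/* a real backend frees its containing structure */
}

static int watch_inode(struct fsnotify_group *group, struct inode *inode)
{
	struct fsnotify_mark_entry *entry;
	int ret;

	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return -ENOMEM;

	fsnotify_init_mark(entry, my_free_mark);	/* refcnt starts at 1 */
	entry->mask = FS_MODIFY | FS_DELETE;

	/* takes the i_list/g_list references and an inode reference */
	ret = fsnotify_add_mark(entry, group, inode);
	if (ret)	/* -EEXIST: this group already has a mark here */
		fsnotify_put_mark(entry);	/* drops our ref, frees the mark */
	return ret;
}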
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 446792841023..3e56dbffe729 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -1,26 +1,30 @@
1config INOTIFY 1config INOTIFY
2 bool "Inotify file change notification support" 2 bool "Inotify file change notification support"
3 default y 3 default n
4 ---help--- 4 ---help---
5 Say Y here to enable inotify support. Inotify is a file change 5 Say Y here to enable legacy in-kernel inotify support. Inotify is a
6 notification system and a replacement for dnotify. Inotify fixes 6 file change notification system. It is a replacement for dnotify.
7 numerous shortcomings in dnotify and introduces several new features 7 This option only provides the legacy inotify in-kernel API. There
8 including multiple file events, one-shot support, and unmount 8 are no in-tree kernel users of this interface since it is deprecated.
9 notification. 9 You only need this if you are loading an out-of-tree kernel module
10 that uses inotify.
10 11
11 For more information, see <file:Documentation/filesystems/inotify.txt> 12 For more information, see <file:Documentation/filesystems/inotify.txt>
12 13
13 If unsure, say Y. 14 If unsure, say N.
14 15
15config INOTIFY_USER 16config INOTIFY_USER
16 bool "Inotify support for userspace" 17 bool "Inotify support for userspace"
17 depends on INOTIFY 18 select FSNOTIFY
18 default y 19 default y
19 ---help--- 20 ---help---
20 Say Y here to enable inotify support for userspace, including the 21 Say Y here to enable inotify support for userspace, including the
21 associated system calls. Inotify allows monitoring of both files and 22 associated system calls. Inotify allows monitoring of both files and
22 directories via a single open fd. Events are read from the file 23 directories via a single open fd. Events are read from the file
23 descriptor, which is also select()- and poll()-able. 24 descriptor, which is also select()- and poll()-able.
25 Inotify fixes numerous shortcomings in dnotify and introduces several
26 new features including multiple file events, one-shot support, and
27 unmount notification.
24 28
25 For more information, see <file:Documentation/filesystems/inotify.txt> 29 For more information, see <file:Documentation/filesystems/inotify.txt>
26 30
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
index e290f3bb9d8d..943828171362 100644
--- a/fs/notify/inotify/Makefile
+++ b/fs/notify/inotify/Makefile
@@ -1,2 +1,2 @@
1obj-$(CONFIG_INOTIFY) += inotify.o 1obj-$(CONFIG_INOTIFY) += inotify.o
2obj-$(CONFIG_INOTIFY_USER) += inotify_user.o 2obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 220c13f0d73d..40b1cf914ccb 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -32,6 +32,7 @@
32#include <linux/list.h> 32#include <linux/list.h>
33#include <linux/writeback.h> 33#include <linux/writeback.h>
34#include <linux/inotify.h> 34#include <linux/inotify.h>
35#include <linux/fsnotify_backend.h>
35 36
36static atomic_t inotify_cookie; 37static atomic_t inotify_cookie;
37 38
@@ -905,6 +906,25 @@ EXPORT_SYMBOL_GPL(inotify_rm_watch);
905 */ 906 */
906static int __init inotify_setup(void) 907static int __init inotify_setup(void)
907{ 908{
909 BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
910 BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
911 BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
912 BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
913 BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
914 BUILD_BUG_ON(IN_OPEN != FS_OPEN);
915 BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
916 BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
917 BUILD_BUG_ON(IN_CREATE != FS_CREATE);
918 BUILD_BUG_ON(IN_DELETE != FS_DELETE);
919 BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
920 BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
921 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
922
923 BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
924 BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
925 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
926 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
927
908 atomic_set(&inotify_cookie, 0); 928 atomic_set(&inotify_cookie, 0);
909 929
910 return 0; 930 return 0;
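The BUILD_BUG_ON() checks above cost nothing at runtime; they exist because inotify now passes IN_* bits straight through as FS_* bits. As a sketch of the mechanism (the real macro lives in linux/kernel.h), a true condition produces a negative array size and the compile fails:

/* roughly equivalent to the kernel's BUILD_BUG_ON() of this era */
#define MY_BUILD_BUG_ON(cond)	((void)sizeof(char[1 - 2 * !!(cond)]))

/* e.g. MY_BUILD_BUG_ON(IN_ACCESS != FS_ACCESS) compiles to nothing while
 * the values match, and breaks the build the moment they drift apart */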
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
new file mode 100644
index 000000000000..f234f3a4c8ca
--- /dev/null
+++ b/fs/notify/inotify/inotify.h
@@ -0,0 +1,22 @@
1#include <linux/fsnotify_backend.h>
2#include <linux/inotify.h>
3#include <linux/slab.h> /* struct kmem_cache */
4
5extern struct kmem_cache *event_priv_cachep;
6
7struct inotify_event_private_data {
8 struct fsnotify_event_private_data fsnotify_event_priv_data;
9 int wd;
10};
11
12struct inotify_inode_mark_entry {
13 /* fsnotify_mark_entry MUST be the first thing */
14 struct fsnotify_mark_entry fsn_entry;
15 int wd;
16};
17
18extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
19 struct fsnotify_group *group);
20extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
21
22extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
new file mode 100644
index 000000000000..c9ee67b442e1
--- /dev/null
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -0,0 +1,168 @@
1/*
2 * fs/notify/inotify/inotify_fsnotify.c - fsnotify support for inotify
3 *
4 * Authors:
5 * John McCutchan <ttb@tentacle.dhs.org>
6 * Robert Love <rml@novell.com>
7 *
8 * Copyright (C) 2005 John McCutchan
9 * Copyright 2006 Hewlett-Packard Development Company, L.P.
10 *
11 * Copyright (C) 2009 Eric Paris <Red Hat Inc>
12 * inotify was largely rewritten to make use of the fsnotify infrastructure
13 *
14 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License as published by the
16 * Free Software Foundation; either version 2, or (at your option) any
17 * later version.
18 *
19 * This program is distributed in the hope that it will be useful, but
20 * WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 * General Public License for more details.
23 */
24
25#include <linux/fs.h> /* struct inode */
26#include <linux/fsnotify_backend.h>
27#include <linux/inotify.h>
28#include <linux/path.h> /* struct path */
29#include <linux/slab.h> /* kmem_* */
30#include <linux/types.h>
31
32#include "inotify.h"
33
34static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
35{
36 struct fsnotify_mark_entry *entry;
37 struct inotify_inode_mark_entry *ientry;
38 struct inode *to_tell;
39 struct inotify_event_private_data *event_priv;
40 struct fsnotify_event_private_data *fsn_event_priv;
41 int wd, ret;
42
43 to_tell = event->to_tell;
44
45 spin_lock(&to_tell->i_lock);
46 entry = fsnotify_find_mark_entry(group, to_tell);
47 spin_unlock(&to_tell->i_lock);
48 /* race with watch removal? We already passed should_send */
49 if (unlikely(!entry))
50 return 0;
51 ientry = container_of(entry, struct inotify_inode_mark_entry,
52 fsn_entry);
53 wd = ientry->wd;
54
55 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
56 if (unlikely(!event_priv))
57 return -ENOMEM;
58
59 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
60
61 fsn_event_priv->group = group;
62 event_priv->wd = wd;
63
64 ret = fsnotify_add_notify_event(group, event, fsn_event_priv);
65 if (ret) {
66 inotify_free_event_priv(fsn_event_priv);
67 /* EEXIST says we tail matched, EOVERFLOW isn't something
68 * to report up the stack. */
69 if ((ret == -EEXIST) ||
70 (ret == -EOVERFLOW))
71 ret = 0;
72 }
73
74 /*
75 * If we hold the entry until after the event is on the queue
76 * IN_IGNORED won't be able to overtake this event in the queue
77 */
78 fsnotify_put_mark(entry);
79
80 return ret;
81}
82
83static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
84{
85 inotify_ignored_and_remove_idr(entry, group);
86}
87
88static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
89{
90 struct fsnotify_mark_entry *entry;
91 bool send;
92
93 spin_lock(&inode->i_lock);
94 entry = fsnotify_find_mark_entry(group, inode);
95 spin_unlock(&inode->i_lock);
96 if (!entry)
97 return false;
98
99 mask = (mask & ~FS_EVENT_ON_CHILD);
100 send = (entry->mask & mask);
101
102 /* find took a reference */
103 fsnotify_put_mark(entry);
104
105 return send;
106}
107
108/*
109 * This is NEVER supposed to be called. Inotify marks should either have been
110 * removed from the idr when the watch was removed or in the
111 * fsnotify_destroy_mark_by_group() call when the inotify instance was being
112 * torn down. This is only called if the idr is about to be freed but there
113 * are still marks in it.
114 */
115static int idr_callback(int id, void *p, void *data)
116{
117 struct fsnotify_mark_entry *entry;
118 struct inotify_inode_mark_entry *ientry;
119 static bool warned = false;
120
121 if (warned)
122 return 0;
123
124 warned = true;
125 entry = p;
126 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
127
128 WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in "
129 "idr. Probably leaking memory\n", id, p, data);
130
131 /*
132 * I'm taking the liberty of assuming that the mark in question is a
133 * valid address and I'm dereferencing it. This might help to figure
134 * out why we got here and the panic is no worse than the original
135 * BUG() that was here.
136 */
137 if (entry)
138 printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n",
139 entry->group, entry->inode, ientry->wd);
140 return 0;
141}
142
143static void inotify_free_group_priv(struct fsnotify_group *group)
144{
145 /* ideally the idr is empty and we won't hit the BUG in the callback */
146 idr_for_each(&group->inotify_data.idr, idr_callback, group);
147 idr_remove_all(&group->inotify_data.idr);
148 idr_destroy(&group->inotify_data.idr);
149}
150
151void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
152{
153 struct inotify_event_private_data *event_priv;
154
155
156 event_priv = container_of(fsn_event_priv, struct inotify_event_private_data,
157 fsnotify_event_priv_data);
158
159 kmem_cache_free(event_priv_cachep, event_priv);
160}
161
162const struct fsnotify_ops inotify_fsnotify_ops = {
163 .handle_event = inotify_handle_event,
164 .should_send_event = inotify_should_send_event,
165 .free_group_priv = inotify_free_group_priv,
166 .free_event_priv = inotify_free_event_priv,
167 .freeing_mark = inotify_freeing_mark,
168};
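For orientation, the fsnotify core invokes this ops table when an event fires. A rough, hypothetical sketch of that dispatch (the real logic lives in fs/notify/fsnotify.c and additionally walks the group list and handles locking):

static void send_to_group(struct fsnotify_group *group,
			  struct inode *to_tell,
			  struct fsnotify_event *event)
{
	/* ask the backend (here: inotify) whether it cares at all */
	if (!group->ops->should_send_event(group, to_tell, event->mask))
		return;

	/* hand the event over; the backend queues it or reports an error */
	group->ops->handle_event(group, event);
}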
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1634319e2404..0e781bc88d1e 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -8,6 +8,9 @@
8 * Copyright (C) 2005 John McCutchan 8 * Copyright (C) 2005 John McCutchan
9 * Copyright 2006 Hewlett-Packard Development Company, L.P. 9 * Copyright 2006 Hewlett-Packard Development Company, L.P.
10 * 10 *
11 * Copyright (C) 2009 Eric Paris <Red Hat Inc>
 12 * inotify was largely rewritten to make use of the fsnotify infrastructure
13 *
11 * This program is free software; you can redistribute it and/or modify it 14 * This program is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the 15 * under the terms of the GNU General Public License as published by the
13 * Free Software Foundation; either version 2, or (at your option) any 16 * Free Software Foundation; either version 2, or (at your option) any
@@ -19,94 +22,44 @@
19 * General Public License for more details. 22 * General Public License for more details.
20 */ 23 */
21 24
22#include <linux/kernel.h>
23#include <linux/sched.h>
24#include <linux/slab.h>
25#include <linux/fs.h>
26#include <linux/file.h> 25#include <linux/file.h>
27#include <linux/mount.h> 26#include <linux/fs.h> /* struct inode */
28#include <linux/namei.h> 27#include <linux/fsnotify_backend.h>
29#include <linux/poll.h> 28#include <linux/idr.h>
30#include <linux/init.h> 29#include <linux/init.h> /* module_init */
31#include <linux/list.h>
32#include <linux/inotify.h> 30#include <linux/inotify.h>
31#include <linux/kernel.h> /* roundup() */
32#include <linux/magic.h> /* superblock magic number */
33#include <linux/mount.h> /* mntget */
34#include <linux/namei.h> /* LOOKUP_FOLLOW */
35#include <linux/path.h> /* struct path */
36#include <linux/sched.h> /* struct user */
37#include <linux/slab.h> /* struct kmem_cache */
33#include <linux/syscalls.h> 38#include <linux/syscalls.h>
34#include <linux/magic.h> 39#include <linux/types.h>
40#include <linux/uaccess.h>
41#include <linux/poll.h>
42#include <linux/wait.h>
35 43
36#include <asm/ioctls.h> 44#include "inotify.h"
37 45
38static struct kmem_cache *watch_cachep __read_mostly; 46#include <asm/ioctls.h>
39static struct kmem_cache *event_cachep __read_mostly;
40 47
41static struct vfsmount *inotify_mnt __read_mostly; 48static struct vfsmount *inotify_mnt __read_mostly;
42 49
43/* these are configurable via /proc/sys/fs/inotify/ */ 50/* these are configurable via /proc/sys/fs/inotify/ */
44static int inotify_max_user_instances __read_mostly; 51static int inotify_max_user_instances __read_mostly;
45static int inotify_max_user_watches __read_mostly;
46static int inotify_max_queued_events __read_mostly; 52static int inotify_max_queued_events __read_mostly;
53int inotify_max_user_watches __read_mostly;
47 54
48/* 55static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
49 * Lock ordering: 56struct kmem_cache *event_priv_cachep __read_mostly;
50 *
51 * inotify_dev->up_mutex (ensures we don't re-add the same watch)
52 * inode->inotify_mutex (protects inode's watch list)
53 * inotify_handle->mutex (protects inotify_handle's watch list)
54 * inotify_dev->ev_mutex (protects device's event queue)
55 */
56
57/*
58 * Lifetimes of the main data structures:
59 *
60 * inotify_device: Lifetime is managed by reference count, from
61 * sys_inotify_init() until release. Additional references can bump the count
62 * via get_inotify_dev() and drop the count via put_inotify_dev().
63 *
64 * inotify_user_watch: Lifetime is from create_watch() to the receipt of an
65 * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
66 * first event, or to inotify_destroy().
67 */
68
69/*
70 * struct inotify_device - represents an inotify instance
71 *
72 * This structure is protected by the mutex 'mutex'.
73 */
74struct inotify_device {
75 wait_queue_head_t wq; /* wait queue for i/o */
76 struct mutex ev_mutex; /* protects event queue */
77 struct mutex up_mutex; /* synchronizes watch updates */
78 struct list_head events; /* list of queued events */
79 struct user_struct *user; /* user who opened this dev */
80 struct inotify_handle *ih; /* inotify handle */
81 struct fasync_struct *fa; /* async notification */
82 atomic_t count; /* reference count */
83 unsigned int queue_size; /* size of the queue (bytes) */
84 unsigned int event_count; /* number of pending events */
85 unsigned int max_events; /* maximum number of events */
86};
87
88/*
89 * struct inotify_kernel_event - An inotify event, originating from a watch and
90 * queued for user-space. A list of these is attached to each instance of the
91 * device. In read(), this list is walked and all events that can fit in the
92 * buffer are returned.
93 *
94 * Protected by dev->ev_mutex of the device in which we are queued.
95 */
96struct inotify_kernel_event {
97 struct inotify_event event; /* the user-space event */
98 struct list_head list; /* entry in inotify_device's list */
99 char *name; /* filename, if any */
100};
101 57
102/* 58/*
103 * struct inotify_user_watch - our version of an inotify_watch, we add 59 * When inotify registers a new group it increments this and uses that
104 * a reference to the associated inotify_device. 60 * value as an offset to set the fsnotify group "name" and priority.
105 */ 61 */
106struct inotify_user_watch { 62static atomic_t inotify_grp_num;
107 struct inotify_device *dev; /* associated device */
108 struct inotify_watch wdata; /* inotify watch data */
109};
110 63
111#ifdef CONFIG_SYSCTL 64#ifdef CONFIG_SYSCTL
112 65
@@ -149,280 +102,36 @@ ctl_table inotify_table[] = {
149}; 102};
150#endif /* CONFIG_SYSCTL */ 103#endif /* CONFIG_SYSCTL */
151 104
152static inline void get_inotify_dev(struct inotify_device *dev) 105static inline __u32 inotify_arg_to_mask(u32 arg)
153{
154 atomic_inc(&dev->count);
155}
156
157static inline void put_inotify_dev(struct inotify_device *dev)
158{
159 if (atomic_dec_and_test(&dev->count)) {
160 atomic_dec(&dev->user->inotify_devs);
161 free_uid(dev->user);
162 kfree(dev);
163 }
164}
165
166/*
167 * free_inotify_user_watch - cleans up the watch and its references
168 */
169static void free_inotify_user_watch(struct inotify_watch *w)
170{
171 struct inotify_user_watch *watch;
172 struct inotify_device *dev;
173
174 watch = container_of(w, struct inotify_user_watch, wdata);
175 dev = watch->dev;
176
177 atomic_dec(&dev->user->inotify_watches);
178 put_inotify_dev(dev);
179 kmem_cache_free(watch_cachep, watch);
180}
181
182/*
183 * kernel_event - create a new kernel event with the given parameters
184 *
185 * This function can sleep.
186 */
187static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
188 const char *name)
189{
190 struct inotify_kernel_event *kevent;
191
192 kevent = kmem_cache_alloc(event_cachep, GFP_NOFS);
193 if (unlikely(!kevent))
194 return NULL;
195
196 /* we hand this out to user-space, so zero it just in case */
197 memset(&kevent->event, 0, sizeof(struct inotify_event));
198
199 kevent->event.wd = wd;
200 kevent->event.mask = mask;
201 kevent->event.cookie = cookie;
202
203 INIT_LIST_HEAD(&kevent->list);
204
205 if (name) {
206 size_t len, rem, event_size = sizeof(struct inotify_event);
207
208 /*
209 * We need to pad the filename so as to properly align an
210 * array of inotify_event structures. Because the structure is
211 * small and the common case is a small filename, we just round
212 * up to the next multiple of the structure's sizeof. This is
213 * simple and safe for all architectures.
214 */
215 len = strlen(name) + 1;
216 rem = event_size - len;
217 if (len > event_size) {
218 rem = event_size - (len % event_size);
219 if (len % event_size == 0)
220 rem = 0;
221 }
222
223 kevent->name = kmalloc(len + rem, GFP_NOFS);
224 if (unlikely(!kevent->name)) {
225 kmem_cache_free(event_cachep, kevent);
226 return NULL;
227 }
228 memcpy(kevent->name, name, len);
229 if (rem)
230 memset(kevent->name + len, 0, rem);
231 kevent->event.len = len + rem;
232 } else {
233 kevent->event.len = 0;
234 kevent->name = NULL;
235 }
236
237 return kevent;
238}
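To make the old padding arithmetic concrete: with sizeof(struct inotify_event) == 16, the name "hello" gives len = 6 and rem = 10, so event.len = 16; a sixteen-character name gives len = 17 and rem = 16 - (17 % 16) = 15, so event.len = 32. In both cases len + rem is simply len rounded up to a multiple of 16, which is exactly what the replacement code below computes with roundup().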
239
240/*
241 * inotify_dev_get_event - return the next event in the given dev's queue
242 *
243 * Caller must hold dev->ev_mutex.
244 */
245static inline struct inotify_kernel_event *
246inotify_dev_get_event(struct inotify_device *dev)
247{
248 return list_entry(dev->events.next, struct inotify_kernel_event, list);
249}
250
251/*
252 * inotify_dev_get_last_event - return the last event in the given dev's queue
253 *
254 * Caller must hold dev->ev_mutex.
255 */
256static inline struct inotify_kernel_event *
257inotify_dev_get_last_event(struct inotify_device *dev)
258{
259 if (list_empty(&dev->events))
260 return NULL;
261 return list_entry(dev->events.prev, struct inotify_kernel_event, list);
262}
263
264/*
265 * inotify_dev_queue_event - event handler registered with core inotify, adds
266 * a new event to the given device
267 *
268 * Can sleep (calls kernel_event()).
269 */
270static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
271 u32 cookie, const char *name,
272 struct inode *ignored)
273{
274 struct inotify_user_watch *watch;
275 struct inotify_device *dev;
276 struct inotify_kernel_event *kevent, *last;
277
278 watch = container_of(w, struct inotify_user_watch, wdata);
279 dev = watch->dev;
280
281 mutex_lock(&dev->ev_mutex);
282
283 /* we can safely put the watch as we don't reference it while
284 * generating the event
285 */
286 if (mask & IN_IGNORED || w->mask & IN_ONESHOT)
287 put_inotify_watch(w); /* final put */
288
289 /* coalescing: drop this event if it is a dupe of the previous */
290 last = inotify_dev_get_last_event(dev);
291 if (last && last->event.mask == mask && last->event.wd == wd &&
292 last->event.cookie == cookie) {
293 const char *lastname = last->name;
294
295 if (!name && !lastname)
296 goto out;
297 if (name && lastname && !strcmp(lastname, name))
298 goto out;
299 }
300
301 /* the queue overflowed and we already sent the Q_OVERFLOW event */
302 if (unlikely(dev->event_count > dev->max_events))
303 goto out;
304
305 /* if the queue overflows, we need to notify user space */
306 if (unlikely(dev->event_count == dev->max_events))
307 kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
308 else
309 kevent = kernel_event(wd, mask, cookie, name);
310
311 if (unlikely(!kevent))
312 goto out;
313
314 /* queue the event and wake up anyone waiting */
315 dev->event_count++;
316 dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
317 list_add_tail(&kevent->list, &dev->events);
318 wake_up_interruptible(&dev->wq);
319 kill_fasync(&dev->fa, SIGIO, POLL_IN);
320
321out:
322 mutex_unlock(&dev->ev_mutex);
323}
324
325/*
326 * remove_kevent - cleans up the given kevent
327 *
328 * Caller must hold dev->ev_mutex.
329 */
330static void remove_kevent(struct inotify_device *dev,
331 struct inotify_kernel_event *kevent)
332{ 106{
333 list_del(&kevent->list); 107 __u32 mask;
334 108
335 dev->event_count--; 109 /* everything should accept their own ignored and cares about children */
336 dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; 110 mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD);
337}
338 111
339/* 112 /* mask off the flags used to open the fd */
340 * free_kevent - frees the given kevent. 113 mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT));
341 */
342static void free_kevent(struct inotify_kernel_event *kevent)
343{
344 kfree(kevent->name);
345 kmem_cache_free(event_cachep, kevent);
346}
347 114
348/* 115 return mask;
349 * inotify_dev_event_dequeue - destroy an event on the given device
350 *
351 * Caller must hold dev->ev_mutex.
352 */
353static void inotify_dev_event_dequeue(struct inotify_device *dev)
354{
355 if (!list_empty(&dev->events)) {
356 struct inotify_kernel_event *kevent;
357 kevent = inotify_dev_get_event(dev);
358 remove_kevent(dev, kevent);
359 free_kevent(kevent);
360 }
361} 116}
362 117
363/* 118static inline u32 inotify_mask_to_arg(__u32 mask)
364 * find_inode - resolve a user-given path to a specific inode
365 */
366static int find_inode(const char __user *dirname, struct path *path,
367 unsigned flags)
368{ 119{
369 int error; 120 return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED |
370 121 IN_Q_OVERFLOW);
371 error = user_path_at(AT_FDCWD, dirname, flags, path);
372 if (error)
373 return error;
374 /* you can only watch an inode if you have read permissions on it */
375 error = inode_permission(path->dentry->d_inode, MAY_READ);
376 if (error)
377 path_put(path);
378 return error;
379} 122}
380 123
381/* 124/* intofiy userspace file descriptor functions */
382 * create_watch - creates a watch on the given device.
383 *
384 * Callers must hold dev->up_mutex.
385 */
386static int create_watch(struct inotify_device *dev, struct inode *inode,
387 u32 mask)
388{
389 struct inotify_user_watch *watch;
390 int ret;
391
392 if (atomic_read(&dev->user->inotify_watches) >=
393 inotify_max_user_watches)
394 return -ENOSPC;
395
396 watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
397 if (unlikely(!watch))
398 return -ENOMEM;
399
400 /* save a reference to device and bump the count to make it official */
401 get_inotify_dev(dev);
402 watch->dev = dev;
403
404 atomic_inc(&dev->user->inotify_watches);
405
406 inotify_init_watch(&watch->wdata);
407 ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
408 if (ret < 0)
409 free_inotify_user_watch(&watch->wdata);
410
411 return ret;
412}
413
414/* Device Interface */
415
416static unsigned int inotify_poll(struct file *file, poll_table *wait) 125static unsigned int inotify_poll(struct file *file, poll_table *wait)
417{ 126{
418 struct inotify_device *dev = file->private_data; 127 struct fsnotify_group *group = file->private_data;
419 int ret = 0; 128 int ret = 0;
420 129
421 poll_wait(file, &dev->wq, wait); 130 poll_wait(file, &group->notification_waitq, wait);
422 mutex_lock(&dev->ev_mutex); 131 mutex_lock(&group->notification_mutex);
423 if (!list_empty(&dev->events)) 132 if (!fsnotify_notify_queue_is_empty(group))
424 ret = POLLIN | POLLRDNORM; 133 ret = POLLIN | POLLRDNORM;
425 mutex_unlock(&dev->ev_mutex); 134 mutex_unlock(&group->notification_mutex);
426 135
427 return ret; 136 return ret;
428} 137}
@@ -432,26 +141,29 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
432 * enough to fit in "count". Return an error pointer if 141 * enough to fit in "count". Return an error pointer if
433 * not large enough. 142 * not large enough.
434 * 143 *
435 * Called with the device ev_mutex held. 144 * Called with the group->notification_mutex held.
436 */ 145 */
437static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, 146static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
438 size_t count) 147 size_t count)
439{ 148{
440 size_t event_size = sizeof(struct inotify_event); 149 size_t event_size = sizeof(struct inotify_event);
441 struct inotify_kernel_event *kevent; 150 struct fsnotify_event *event;
442 151
443 if (list_empty(&dev->events)) 152 if (fsnotify_notify_queue_is_empty(group))
444 return NULL; 153 return NULL;
445 154
446 kevent = inotify_dev_get_event(dev); 155 event = fsnotify_peek_notify_event(group);
447 if (kevent->name) 156
448 event_size += kevent->event.len; 157 event_size += roundup(event->name_len, event_size);
449 158
450 if (event_size > count) 159 if (event_size > count)
451 return ERR_PTR(-EINVAL); 160 return ERR_PTR(-EINVAL);
452 161
453 remove_kevent(dev, kevent); 162 /* held the notification_mutex the whole time, so this is the
454 return kevent; 163 * same event we peeked above */
164 fsnotify_remove_notify_event(group);
165
166 return event;
455} 167}
456 168
457/* 169/*
@@ -460,51 +172,92 @@ static struct inotify_kernel_event *get_one_event(struct inotify_device *dev,
460 * We already checked that the event size is smaller than the 172 * We already checked that the event size is smaller than the
461 * buffer we had in "get_one_event()" above. 173 * buffer we had in "get_one_event()" above.
462 */ 174 */
463static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent, 175static ssize_t copy_event_to_user(struct fsnotify_group *group,
176 struct fsnotify_event *event,
464 char __user *buf) 177 char __user *buf)
465{ 178{
179 struct inotify_event inotify_event;
180 struct fsnotify_event_private_data *fsn_priv;
181 struct inotify_event_private_data *priv;
466 size_t event_size = sizeof(struct inotify_event); 182 size_t event_size = sizeof(struct inotify_event);
183 size_t name_len;
184
185 /* we get the inotify watch descriptor from the event private data */
186 spin_lock(&event->lock);
187 fsn_priv = fsnotify_remove_priv_from_event(group, event);
188 spin_unlock(&event->lock);
189
190 if (!fsn_priv)
191 inotify_event.wd = -1;
192 else {
193 priv = container_of(fsn_priv, struct inotify_event_private_data,
194 fsnotify_event_priv_data);
195 inotify_event.wd = priv->wd;
196 inotify_free_event_priv(fsn_priv);
197 }
198
 199	/* round up event->name_len plus an extra byte for the terminating
 200	 * '\0' so the result is a multiple of event_size.
201 */
202 name_len = roundup(event->name_len + 1, event_size);
203 inotify_event.len = name_len;
204
205 inotify_event.mask = inotify_mask_to_arg(event->mask);
206 inotify_event.cookie = event->sync_cookie;
467 207
468 if (copy_to_user(buf, &kevent->event, event_size)) 208 /* send the main event */
209 if (copy_to_user(buf, &inotify_event, event_size))
469 return -EFAULT; 210 return -EFAULT;
470 211
471 if (kevent->name) { 212 buf += event_size;
472 buf += event_size;
473 213
474 if (copy_to_user(buf, kevent->name, kevent->event.len)) 214 /*
215 * fsnotify only stores the pathname, so here we have to send the pathname
216 * and then pad that pathname out to a multiple of sizeof(inotify_event)
 217	 * with zeros. The zeros come from clear_user() below.
218 */
219 if (name_len) {
220 unsigned int len_to_zero = name_len - event->name_len;
221 /* copy the path name */
222 if (copy_to_user(buf, event->file_name, event->name_len))
475 return -EFAULT; 223 return -EFAULT;
224 buf += event->name_len;
476 225
477 event_size += kevent->event.len; 226 /* fill userspace with 0's */
227 if (clear_user(buf, len_to_zero))
228 return -EFAULT;
229 buf += len_to_zero;
230 event_size += name_len;
478 } 231 }
232
479 return event_size; 233 return event_size;
480} 234}
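A worked example of the rounding above: for the name "foo", event->name_len is 3, so name_len = roundup(4, 16) = 16. Userspace receives an inotify_event with len = 16, followed by the three name bytes and thirteen NUL bytes supplied by clear_user().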
481 235
482static ssize_t inotify_read(struct file *file, char __user *buf, 236static ssize_t inotify_read(struct file *file, char __user *buf,
483 size_t count, loff_t *pos) 237 size_t count, loff_t *pos)
484{ 238{
485 struct inotify_device *dev; 239 struct fsnotify_group *group;
240 struct fsnotify_event *kevent;
486 char __user *start; 241 char __user *start;
487 int ret; 242 int ret;
488 DEFINE_WAIT(wait); 243 DEFINE_WAIT(wait);
489 244
490 start = buf; 245 start = buf;
491 dev = file->private_data; 246 group = file->private_data;
492 247
493 while (1) { 248 while (1) {
494 struct inotify_kernel_event *kevent; 249 prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
495 250
496 prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); 251 mutex_lock(&group->notification_mutex);
497 252 kevent = get_one_event(group, count);
498 mutex_lock(&dev->ev_mutex); 253 mutex_unlock(&group->notification_mutex);
499 kevent = get_one_event(dev, count);
500 mutex_unlock(&dev->ev_mutex);
501 254
502 if (kevent) { 255 if (kevent) {
503 ret = PTR_ERR(kevent); 256 ret = PTR_ERR(kevent);
504 if (IS_ERR(kevent)) 257 if (IS_ERR(kevent))
505 break; 258 break;
506 ret = copy_event_to_user(kevent, buf); 259 ret = copy_event_to_user(group, kevent, buf);
507 free_kevent(kevent); 260 fsnotify_put_event(kevent);
508 if (ret < 0) 261 if (ret < 0)
509 break; 262 break;
510 buf += ret; 263 buf += ret;
@@ -525,7 +278,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
525 schedule(); 278 schedule();
526 } 279 }
527 280
528 finish_wait(&dev->wq, &wait); 281 finish_wait(&group->notification_waitq, &wait);
529 if (start != buf && ret != -EFAULT) 282 if (start != buf && ret != -EFAULT)
530 ret = buf - start; 283 ret = buf - start;
531 return ret; 284 return ret;
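The record format produced by copy_event_to_user() dictates how userspace walks the buffer this read() fills: each record is a struct inotify_event followed by len bytes of NUL-padded name. A minimal consumer sketch (userspace, modeled on the inotify(7) man page idiom):

#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

static void drain_events(int fd)
{
	/* align the buffer so casting to struct inotify_event is safe */
	char buf[4096] __attribute__((aligned(__alignof__(struct inotify_event))));
	ssize_t len = read(fd, buf, sizeof(buf));
	char *p = buf;

	while (len > 0 && p < buf + len) {
		struct inotify_event *ev = (struct inotify_event *)p;

		printf("wd=%d mask=%#x name=%s\n",
		       ev->wd, ev->mask, ev->len ? ev->name : "");
		/* ev->len is already padded, so this lands on the next record */
		p += sizeof(*ev) + ev->len;
	}
}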
@@ -533,25 +286,22 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
533 286
534static int inotify_fasync(int fd, struct file *file, int on) 287static int inotify_fasync(int fd, struct file *file, int on)
535{ 288{
536 struct inotify_device *dev = file->private_data; 289 struct fsnotify_group *group = file->private_data;
537 290
538 return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO; 291 return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 0 : -EIO;
539} 292}
540 293
541static int inotify_release(struct inode *ignored, struct file *file) 294static int inotify_release(struct inode *ignored, struct file *file)
542{ 295{
543 struct inotify_device *dev = file->private_data; 296 struct fsnotify_group *group = file->private_data;
297 struct user_struct *user = group->inotify_data.user;
544 298
545 inotify_destroy(dev->ih); 299 fsnotify_clear_marks_by_group(group);
546 300
547 /* destroy all of the events on this device */ 301 /* free this group, matching get was inotify_init->fsnotify_obtain_group */
548 mutex_lock(&dev->ev_mutex); 302 fsnotify_put_group(group);
549 while (!list_empty(&dev->events))
550 inotify_dev_event_dequeue(dev);
551 mutex_unlock(&dev->ev_mutex);
552 303
553 /* free this device: the put matching the get in inotify_init() */ 304 atomic_dec(&user->inotify_devs);
554 put_inotify_dev(dev);
555 305
556 return 0; 306 return 0;
557} 307}
@@ -559,16 +309,27 @@ static int inotify_release(struct inode *ignored, struct file *file)
559static long inotify_ioctl(struct file *file, unsigned int cmd, 309static long inotify_ioctl(struct file *file, unsigned int cmd,
560 unsigned long arg) 310 unsigned long arg)
561{ 311{
562 struct inotify_device *dev; 312 struct fsnotify_group *group;
313 struct fsnotify_event_holder *holder;
314 struct fsnotify_event *event;
563 void __user *p; 315 void __user *p;
564 int ret = -ENOTTY; 316 int ret = -ENOTTY;
317 size_t send_len = 0;
565 318
566 dev = file->private_data; 319 group = file->private_data;
567 p = (void __user *) arg; 320 p = (void __user *) arg;
568 321
569 switch (cmd) { 322 switch (cmd) {
570 case FIONREAD: 323 case FIONREAD:
571 ret = put_user(dev->queue_size, (int __user *) p); 324 mutex_lock(&group->notification_mutex);
325 list_for_each_entry(holder, &group->notification_list, event_list) {
326 event = holder->event;
327 send_len += sizeof(struct inotify_event);
328 send_len += roundup(event->name_len,
329 sizeof(struct inotify_event));
330 }
331 mutex_unlock(&group->notification_mutex);
332 ret = put_user(send_len, (int __user *) p);
572 break; 333 break;
573 } 334 }
574 335
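From userspace, the FIONREAD computation above is visible as the number of bytes a read() could return right now. A minimal sketch, assuming an already-initialized inotify fd:

#include <stdio.h>
#include <sys/ioctl.h>

/* returns the number of queued bytes, or -1 on error */
static int inotify_pending_bytes(int inotify_fd)
{
	int avail = 0;

	/* the kernel sums sizeof(struct inotify_event) plus the padded
	 * name length for every event on the notification list */
	if (ioctl(inotify_fd, FIONREAD, &avail) == -1) {
		perror("ioctl(FIONREAD)");
		return -1;
	}
	return avail;
}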
@@ -576,23 +337,311 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
576} 337}
577 338
578static const struct file_operations inotify_fops = { 339static const struct file_operations inotify_fops = {
579 .poll = inotify_poll, 340 .poll = inotify_poll,
580 .read = inotify_read, 341 .read = inotify_read,
581 .fasync = inotify_fasync, 342 .fasync = inotify_fasync,
582 .release = inotify_release, 343 .release = inotify_release,
583 .unlocked_ioctl = inotify_ioctl, 344 .unlocked_ioctl = inotify_ioctl,
584 .compat_ioctl = inotify_ioctl, 345 .compat_ioctl = inotify_ioctl,
585}; 346};
586 347
587static const struct inotify_operations inotify_user_ops = {
588 .handle_event = inotify_dev_queue_event,
589 .destroy_watch = free_inotify_user_watch,
590};
591 348
349/*
350 * find_inode - resolve a user-given path to a specific inode
351 */
352static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags)
353{
354 int error;
355
356 error = user_path_at(AT_FDCWD, dirname, flags, path);
357 if (error)
358 return error;
359 /* you can only watch an inode if you have read permissions on it */
360 error = inode_permission(path->dentry->d_inode, MAY_READ);
361 if (error)
362 path_put(path);
363 return error;
364}
365
366/*
367 * Remove the mark from the idr (if present) and drop the reference
368 * on the mark because it was in the idr.
369 */
370static void inotify_remove_from_idr(struct fsnotify_group *group,
371 struct inotify_inode_mark_entry *ientry)
372{
373 struct idr *idr;
374 struct fsnotify_mark_entry *entry;
375 struct inotify_inode_mark_entry *found_ientry;
376 int wd;
377
378 spin_lock(&group->inotify_data.idr_lock);
379 idr = &group->inotify_data.idr;
380 wd = ientry->wd;
381
382 if (wd == -1)
383 goto out;
384
385 entry = idr_find(&group->inotify_data.idr, wd);
386 if (unlikely(!entry))
387 goto out;
388
389 found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
390 if (unlikely(found_ientry != ientry)) {
391 /* We found an entry in the idr with the right wd, but it's
392 * not the entry we were told to remove. eparis seriously
393 * fucked up somewhere. */
394 WARN_ON(1);
395 ientry->wd = -1;
396 goto out;
397 }
398
399 /* One ref for being in the idr, one ref held by the caller */
400 BUG_ON(atomic_read(&entry->refcnt) < 2);
401
402 idr_remove(idr, wd);
403 ientry->wd = -1;
404
405 /* removed from the idr, drop that ref */
406 fsnotify_put_mark(entry);
407out:
408 spin_unlock(&group->inotify_data.idr_lock);
409}
410
411/*
412 * Send IN_IGNORED for this wd, remove this wd from the idr.
413 */
414void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
415 struct fsnotify_group *group)
416{
417 struct inotify_inode_mark_entry *ientry;
418 struct fsnotify_event *ignored_event;
419 struct inotify_event_private_data *event_priv;
420 struct fsnotify_event_private_data *fsn_event_priv;
421 int ret;
422
423 ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
424 FSNOTIFY_EVENT_NONE, NULL, 0,
425 GFP_NOFS);
426 if (!ignored_event)
427 return;
428
429 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
430
431 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
432 if (unlikely(!event_priv))
433 goto skip_send_ignore;
434
435 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
436
437 fsn_event_priv->group = group;
438 event_priv->wd = ientry->wd;
439
440 ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
441 if (ret)
442 inotify_free_event_priv(fsn_event_priv);
443
444skip_send_ignore:
445
446 /* matches the reference taken when the event was created */
447 fsnotify_put_event(ignored_event);
448
449 /* remove this entry from the idr */
450 inotify_remove_from_idr(group, ientry);
451
452 atomic_dec(&group->inotify_data.user->inotify_watches);
453}
454
455/* ding dong the mark is dead */
456static void inotify_free_mark(struct fsnotify_mark_entry *entry)
457{
458 struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry;
459
460 kmem_cache_free(inotify_inode_mark_cachep, ientry);
461}
462
463static int inotify_update_existing_watch(struct fsnotify_group *group,
464 struct inode *inode,
465 u32 arg)
466{
467 struct fsnotify_mark_entry *entry;
468 struct inotify_inode_mark_entry *ientry;
469 __u32 old_mask, new_mask;
470 __u32 mask;
471 int add = (arg & IN_MASK_ADD);
472 int ret;
473
474 /* don't allow invalid bits: we don't want flags set */
475 mask = inotify_arg_to_mask(arg);
476 if (unlikely(!mask))
477 return -EINVAL;
478
479 spin_lock(&inode->i_lock);
480 entry = fsnotify_find_mark_entry(group, inode);
481 spin_unlock(&inode->i_lock);
482 if (!entry)
483 return -ENOENT;
484
485 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
486
487 spin_lock(&entry->lock);
488
489 old_mask = entry->mask;
490 if (add) {
491 entry->mask |= mask;
492 new_mask = entry->mask;
493 } else {
494 entry->mask = mask;
495 new_mask = entry->mask;
496 }
497
498 spin_unlock(&entry->lock);
499
500 if (old_mask != new_mask) {
501 /* more bits in old than in new? */
502 int dropped = (old_mask & ~new_mask);
503 /* more bits in this entry than the inode's mask? */
504 int do_inode = (new_mask & ~inode->i_fsnotify_mask);
505 /* more bits in this entry than the group? */
506 int do_group = (new_mask & ~group->mask);
507
508 /* update the inode with this new entry */
509 if (dropped || do_inode)
510 fsnotify_recalc_inode_mask(inode);
511
512 /* update the group mask with the new mask */
513 if (dropped || do_group)
514 fsnotify_recalc_group_mask(group);
515 }
516
517 /* return the wd */
518 ret = ientry->wd;
519
520 /* match the get from fsnotify_find_mark_entry() */
521 fsnotify_put_mark(entry);
522
523 return ret;
524}
525
526static int inotify_new_watch(struct fsnotify_group *group,
527 struct inode *inode,
528 u32 arg)
529{
530 struct inotify_inode_mark_entry *tmp_ientry;
531 __u32 mask;
532 int ret;
533
534 /* don't allow invalid bits: we don't want flags set */
535 mask = inotify_arg_to_mask(arg);
536 if (unlikely(!mask))
537 return -EINVAL;
538
539 tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
540 if (unlikely(!tmp_ientry))
541 return -ENOMEM;
542
543 fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
544 tmp_ientry->fsn_entry.mask = mask;
545 tmp_ientry->wd = -1;
546
547 ret = -ENOSPC;
548 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
549 goto out_err;
550retry:
551 ret = -ENOMEM;
552 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
553 goto out_err;
554
555 spin_lock(&group->inotify_data.idr_lock);
556 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
557 group->inotify_data.last_wd,
558 &tmp_ientry->wd);
559 spin_unlock(&group->inotify_data.idr_lock);
560 if (ret) {
 561		/* idr was out of memory, allocate more and try again */
562 if (ret == -EAGAIN)
563 goto retry;
564 goto out_err;
565 }
566
567 /* we put the mark on the idr, take a reference */
568 fsnotify_get_mark(&tmp_ientry->fsn_entry);
569
570 /* we are on the idr, now get on the inode */
571 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
572 if (ret) {
573 /* we failed to get on the inode, get off the idr */
574 inotify_remove_from_idr(group, tmp_ientry);
575 goto out_err;
576 }
577
 578	/* update the idr hint; who cares about races, it's just a hint */
579 group->inotify_data.last_wd = tmp_ientry->wd;
580
581 /* increment the number of watches the user has */
582 atomic_inc(&group->inotify_data.user->inotify_watches);
583
584 /* return the watch descriptor for this new entry */
585 ret = tmp_ientry->wd;
586
 587	/* match the ref from fsnotify_init_mark() */
588 fsnotify_put_mark(&tmp_ientry->fsn_entry);
589
590out_err:
591 if (ret < 0)
592 kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
593
594 return ret;
595}
596
597static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
598{
599 int ret = 0;
600
601retry:
 602	/* try to update an existing watch with the new arg */
603 ret = inotify_update_existing_watch(group, inode, arg);
604 /* no mark present, try to add a new one */
605 if (ret == -ENOENT)
606 ret = inotify_new_watch(group, inode, arg);
607 /*
608 * inotify_new_watch could race with another thread which did an
609 * inotify_new_watch between the update_existing and the add watch
 610	 * here; go back and try to update an existing mark again.
611 */
612 if (ret == -EEXIST)
613 goto retry;
614
615 return ret;
616}
617
618static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events)
619{
620 struct fsnotify_group *group;
621 unsigned int grp_num;
622
 623	/* fsnotify_obtain_group took a reference to the group; we put it when the file is destroyed */
624 grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num));
625 group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops);
626 if (IS_ERR(group))
627 return group;
628
629 group->max_events = max_events;
630
631 spin_lock_init(&group->inotify_data.idr_lock);
632 idr_init(&group->inotify_data.idr);
633 group->inotify_data.last_wd = 1;
634 group->inotify_data.user = user;
635 group->inotify_data.fa = NULL;
636
637 return group;
638}
639
640
641/* inotify syscalls */
592SYSCALL_DEFINE1(inotify_init1, int, flags) 642SYSCALL_DEFINE1(inotify_init1, int, flags)
593{ 643{
594 struct inotify_device *dev; 644 struct fsnotify_group *group;
595 struct inotify_handle *ih;
596 struct user_struct *user; 645 struct user_struct *user;
597 struct file *filp; 646 struct file *filp;
598 int fd, ret; 647 int fd, ret;
@@ -621,45 +670,27 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
621 goto out_free_uid; 670 goto out_free_uid;
622 } 671 }
623 672
 624	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);	 673	/* fsnotify_obtain_group took a reference to the group; we put it when the file is destroyed */
625 if (unlikely(!dev)) { 674 group = inotify_new_group(user, inotify_max_queued_events);
626 ret = -ENOMEM; 675 if (IS_ERR(group)) {
676 ret = PTR_ERR(group);
627 goto out_free_uid; 677 goto out_free_uid;
628 } 678 }
629 679
630 ih = inotify_init(&inotify_user_ops);
631 if (IS_ERR(ih)) {
632 ret = PTR_ERR(ih);
633 goto out_free_dev;
634 }
635 dev->ih = ih;
636 dev->fa = NULL;
637
638 filp->f_op = &inotify_fops; 680 filp->f_op = &inotify_fops;
639 filp->f_path.mnt = mntget(inotify_mnt); 681 filp->f_path.mnt = mntget(inotify_mnt);
640 filp->f_path.dentry = dget(inotify_mnt->mnt_root); 682 filp->f_path.dentry = dget(inotify_mnt->mnt_root);
641 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; 683 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
642 filp->f_mode = FMODE_READ; 684 filp->f_mode = FMODE_READ;
643 filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); 685 filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
644 filp->private_data = dev; 686 filp->private_data = group;
645 687
646 INIT_LIST_HEAD(&dev->events);
647 init_waitqueue_head(&dev->wq);
648 mutex_init(&dev->ev_mutex);
649 mutex_init(&dev->up_mutex);
650 dev->event_count = 0;
651 dev->queue_size = 0;
652 dev->max_events = inotify_max_queued_events;
653 dev->user = user;
654 atomic_set(&dev->count, 0);
655
656 get_inotify_dev(dev);
657 atomic_inc(&user->inotify_devs); 688 atomic_inc(&user->inotify_devs);
689
658 fd_install(fd, filp); 690 fd_install(fd, filp);
659 691
660 return fd; 692 return fd;
661out_free_dev: 693
662 kfree(dev);
663out_free_uid: 694out_free_uid:
664 free_uid(user); 695 free_uid(user);
665 put_filp(filp); 696 put_filp(filp);
@@ -676,8 +707,8 @@ SYSCALL_DEFINE0(inotify_init)
676SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, 707SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
677 u32, mask) 708 u32, mask)
678{ 709{
710 struct fsnotify_group *group;
679 struct inode *inode; 711 struct inode *inode;
680 struct inotify_device *dev;
681 struct path path; 712 struct path path;
682 struct file *filp; 713 struct file *filp;
683 int ret, fput_needed; 714 int ret, fput_needed;
@@ -698,20 +729,20 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
698 if (mask & IN_ONLYDIR) 729 if (mask & IN_ONLYDIR)
699 flags |= LOOKUP_DIRECTORY; 730 flags |= LOOKUP_DIRECTORY;
700 731
701 ret = find_inode(pathname, &path, flags); 732 ret = inotify_find_inode(pathname, &path, flags);
702 if (unlikely(ret)) 733 if (ret)
703 goto fput_and_out; 734 goto fput_and_out;
704 735
705 /* inode held in place by reference to path; dev by fget on fd */ 736 /* inode held in place by reference to path; group by fget on fd */
706 inode = path.dentry->d_inode; 737 inode = path.dentry->d_inode;
707 dev = filp->private_data; 738 group = filp->private_data;
708 739
709 mutex_lock(&dev->up_mutex); 740 /* create/update an inode mark */
710 ret = inotify_find_update_watch(dev->ih, inode, mask); 741 ret = inotify_update_watch(group, inode, mask);
711 if (ret == -ENOENT) 742 if (unlikely(ret))
712 ret = create_watch(dev, inode, mask); 743 goto path_put_and_out;
713 mutex_unlock(&dev->up_mutex);
714 744
745path_put_and_out:
715 path_put(&path); 746 path_put(&path);
716fput_and_out: 747fput_and_out:
717 fput_light(filp, fput_needed); 748 fput_light(filp, fput_needed);
@@ -720,9 +751,10 @@ fput_and_out:
720 751
721SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) 752SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
722{ 753{
754 struct fsnotify_group *group;
755 struct fsnotify_mark_entry *entry;
723 struct file *filp; 756 struct file *filp;
724 struct inotify_device *dev; 757 int ret = 0, fput_needed;
725 int ret, fput_needed;
726 758
727 filp = fget_light(fd, &fput_needed); 759 filp = fget_light(fd, &fput_needed);
728 if (unlikely(!filp)) 760 if (unlikely(!filp))
@@ -734,10 +766,20 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
734 goto out; 766 goto out;
735 } 767 }
736 768
737 dev = filp->private_data; 769 group = filp->private_data;
770
771 spin_lock(&group->inotify_data.idr_lock);
772 entry = idr_find(&group->inotify_data.idr, wd);
773 if (unlikely(!entry)) {
774 spin_unlock(&group->inotify_data.idr_lock);
775 ret = -EINVAL;
776 goto out;
777 }
778 fsnotify_get_mark(entry);
779 spin_unlock(&group->inotify_data.idr_lock);
738 780
739 /* we free our watch data when we get IN_IGNORED */ 781 fsnotify_destroy_mark_by_entry(entry);
740 ret = inotify_rm_wd(dev->ih, wd); 782 fsnotify_put_mark(entry);
741 783
742out: 784out:
743 fput_light(filp, fput_needed); 785 fput_light(filp, fput_needed);
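Taken together, the three syscalls map onto the fsnotify objects like this: inotify_init1() creates the group, inotify_add_watch() creates or updates a mark and returns its wd, and inotify_rm_watch() destroys the mark, which queues the IN_IGNORED event seen earlier. A minimal userspace sketch (the path and mask are illustrative):

#include <sys/inotify.h>
#include <unistd.h>

int main(void)
{
	int fd = inotify_init1(0);
	if (fd < 0)
		return 1;

	/* wd comes back from inotify_update_watch() via the idr */
	int wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);
	if (wd < 0)
		return 1;

	/* ... read(fd, ...) events here ... */

	/* tears down the mark; an IN_IGNORED event for wd follows */
	inotify_rm_watch(fd, wd);
	close(fd);
	return 0;
}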
@@ -753,9 +795,9 @@ inotify_get_sb(struct file_system_type *fs_type, int flags,
753} 795}
754 796
755static struct file_system_type inotify_fs_type = { 797static struct file_system_type inotify_fs_type = {
756 .name = "inotifyfs", 798 .name = "inotifyfs",
757 .get_sb = inotify_get_sb, 799 .get_sb = inotify_get_sb,
758 .kill_sb = kill_anon_super, 800 .kill_sb = kill_anon_super,
759}; 801};
760 802
761/* 803/*
@@ -775,18 +817,13 @@ static int __init inotify_user_setup(void)
775 if (IS_ERR(inotify_mnt)) 817 if (IS_ERR(inotify_mnt))
776 panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); 818 panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
777 819
820 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
821 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
822
778 inotify_max_queued_events = 16384; 823 inotify_max_queued_events = 16384;
779 inotify_max_user_instances = 128; 824 inotify_max_user_instances = 128;
780 inotify_max_user_watches = 8192; 825 inotify_max_user_watches = 8192;
781 826
782 watch_cachep = kmem_cache_create("inotify_watch_cache",
783 sizeof(struct inotify_user_watch),
784 0, SLAB_PANIC, NULL);
785 event_cachep = kmem_cache_create("inotify_event_cache",
786 sizeof(struct inotify_kernel_event),
787 0, SLAB_PANIC, NULL);
788
789 return 0; 827 return 0;
790} 828}
791
792module_init(inotify_user_setup); 829module_init(inotify_user_setup);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
new file mode 100644
index 000000000000..3816d5750dd5
--- /dev/null
+++ b/fs/notify/notification.c
@@ -0,0 +1,421 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19/*
20 * Basic idea behind the notification queue: An fsnotify group (like inotify)
 21 * sends the userspace notification about events asynchronously some time after
22 * the event happened. When inotify gets an event it will need to add that
23 * event to the group notify queue. Since a single event might need to be on
 24 * multiple groups' notification queues, we can't add the event directly to each
25 * queue and instead add a small "event_holder" to each queue. This event_holder
26 * has a pointer back to the original event. Since the majority of events are
27 * going to end up on one, and only one, notification queue we embed one
28 * event_holder into each event. This means we have a single allocation instead
29 * of always needing two. If the embedded event_holder is already in use by
30 * another group a new event_holder (from fsnotify_event_holder_cachep) will be
31 * allocated and used.
32 */
33
34#include <linux/fs.h>
35#include <linux/init.h>
36#include <linux/kernel.h>
37#include <linux/list.h>
38#include <linux/module.h>
39#include <linux/mount.h>
40#include <linux/mutex.h>
41#include <linux/namei.h>
42#include <linux/path.h>
43#include <linux/slab.h>
44#include <linux/spinlock.h>
45
46#include <asm/atomic.h>
47
48#include <linux/fsnotify_backend.h>
49#include "fsnotify.h"
50
51static struct kmem_cache *fsnotify_event_cachep;
52static struct kmem_cache *fsnotify_event_holder_cachep;
53/*
54 * This is a magic event we send when the q is too full. Since it doesn't
55 * hold real event information we just keep one system wide and use it any time
56 * it is needed. It's refcnt is set 1 at kernel init time and will never
57 * get set to 0 so it will never get 'freed'
58 */
59static struct fsnotify_event q_overflow_event;
60static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
61
62/**
63 * fsnotify_get_cookie - return a unique cookie for use in synchronizing events.
64 * Called from fsnotify_move, which is inlined into filesystem modules.
65 */
66u32 fsnotify_get_cookie(void)
67{
68 return atomic_inc_return(&fsnotify_sync_cookie);
69}
70EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
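The only consumer of this cookie is userspace: a rename emits IN_MOVED_FROM and IN_MOVED_TO events carrying the same cookie, which is how a watcher stitches the two halves back together. A hypothetical pairing sketch (userspace, single in-flight move for brevity):

#include <limits.h>
#include <stdint.h>
#include <string.h>
#include <sys/inotify.h>

struct pending_move {
	uint32_t cookie;
	char from[NAME_MAX + 1];
};

static void note_move(struct pending_move *pm, const struct inotify_event *ev)
{
	if (ev->mask & IN_MOVED_FROM) {
		/* first half: remember the cookie and the old name */
		pm->cookie = ev->cookie;
		strncpy(pm->from, ev->name, NAME_MAX);
		pm->from[NAME_MAX] = '\0';
	} else if ((ev->mask & IN_MOVED_TO) && ev->cookie == pm->cookie) {
		/* second half: pm->from was renamed to ev->name */
	}
}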
71
72/* return true if the notify queue is empty, false otherwise */
73bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
74{
75 BUG_ON(!mutex_is_locked(&group->notification_mutex));
76 return list_empty(&group->notification_list) ? true : false;
77}
78
79void fsnotify_get_event(struct fsnotify_event *event)
80{
81 atomic_inc(&event->refcnt);
82}
83
84void fsnotify_put_event(struct fsnotify_event *event)
85{
86 if (!event)
87 return;
88
89 if (atomic_dec_and_test(&event->refcnt)) {
90 if (event->data_type == FSNOTIFY_EVENT_PATH)
91 path_put(&event->path);
92
93 BUG_ON(!list_empty(&event->private_data_list));
94
95 kfree(event->file_name);
96 kmem_cache_free(fsnotify_event_cachep, event);
97 }
98}
99
100struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
101{
102 return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL);
103}
104
105void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
106{
107 kmem_cache_free(fsnotify_event_holder_cachep, holder);
108}
109
110/*
111 * Find the private data that the group previously attached to this event when
112 * the group added the event to the notification queue (fsnotify_add_notify_event)
113 */
114struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event)
115{
116 struct fsnotify_event_private_data *lpriv;
117 struct fsnotify_event_private_data *priv = NULL;
118
119 assert_spin_locked(&event->lock);
120
121 list_for_each_entry(lpriv, &event->private_data_list, event_list) {
122 if (lpriv->group == group) {
123 priv = lpriv;
124 list_del(&priv->event_list);
125 break;
126 }
127 }
128 return priv;
129}
130
131/*
132 * Check if 2 events contain the same information. We do not compare private data
133 * but at this moment that isn't a problem for any know fsnotify listeners.
134 */
135static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
136{
137 if ((old->mask == new->mask) &&
138 (old->to_tell == new->to_tell) &&
139 (old->data_type == new->data_type) &&
140 (old->name_len == new->name_len)) {
141 switch (old->data_type) {
142 case (FSNOTIFY_EVENT_INODE):
143 /* remember, after old was put on the wait_q we aren't
 144			 * allowed to look at the inode any more; the only thing
 145			 * left to check is whether the file_name is the same */
146 if (old->name_len &&
147 !strcmp(old->file_name, new->file_name))
148 return true;
149 break;
150 case (FSNOTIFY_EVENT_PATH):
151 if ((old->path.mnt == new->path.mnt) &&
152 (old->path.dentry == new->path.dentry))
153 return true;
154 break;
155 case (FSNOTIFY_EVENT_NONE):
156 if (old->mask & FS_Q_OVERFLOW)
157 return true;
158 else if (old->mask & FS_IN_IGNORED)
159 return false;
160 return false;
161 };
162 }
163 return false;
164}
165
166/*
167 * Add an event to the group notification queue. The group can later pull this
168 * event off the queue to deal with. If the event is successfully added to the
169 * group's notification queue, a reference is taken on event.
170 */
171int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
172 struct fsnotify_event_private_data *priv)
173{
174 struct fsnotify_event_holder *holder = NULL;
175 struct list_head *list = &group->notification_list;
176 struct fsnotify_event_holder *last_holder;
177 struct fsnotify_event *last_event;
178 int ret = 0;
179
180 /*
181 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
182 * Check if we expect to be able to use that holder. If not alloc a new
183 * holder.
184 * For the overflow event it's possible that something will use the in
 185	 * event holder before we get the lock, so we may need to jump back and
 186	 * alloc a new holder; this can't happen for most events...
187 */
188 if (!list_empty(&event->holder.event_list)) {
189alloc_holder:
190 holder = fsnotify_alloc_event_holder();
191 if (!holder)
192 return -ENOMEM;
193 }
194
195 mutex_lock(&group->notification_mutex);
196
197 if (group->q_len >= group->max_events) {
198 event = &q_overflow_event;
199 ret = -EOVERFLOW;
200 /* sorry, no private data on the overflow event */
201 priv = NULL;
202 }
203
204 spin_lock(&event->lock);
205
206 if (list_empty(&event->holder.event_list)) {
207 if (unlikely(holder))
208 fsnotify_destroy_event_holder(holder);
209 holder = &event->holder;
210 } else if (unlikely(!holder)) {
211 /* between the time we checked above and got the lock the in
 212		 * event holder was used; go back and get a new one */
213 spin_unlock(&event->lock);
214 mutex_unlock(&group->notification_mutex);
215 goto alloc_holder;
216 }
217
218 if (!list_empty(list)) {
219 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
220 last_event = last_holder->event;
221 if (event_compare(last_event, event)) {
222 spin_unlock(&event->lock);
223 mutex_unlock(&group->notification_mutex);
224 if (holder != &event->holder)
225 fsnotify_destroy_event_holder(holder);
226 return -EEXIST;
227 }
228 }
229
230 group->q_len++;
231 holder->event = event;
232
233 fsnotify_get_event(event);
234 list_add_tail(&holder->event_list, list);
235 if (priv)
236 list_add_tail(&priv->event_list, &event->private_data_list);
237 spin_unlock(&event->lock);
238 mutex_unlock(&group->notification_mutex);
239
240 wake_up(&group->notification_waitq);
241 return ret;
242}
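Concretely, the tail-merge works like this: if the same watched directory produces two back-to-back IN_MODIFY events for the same child name, the second fsnotify_add_notify_event() call finds the first still at the tail of the list, event_compare() matches, and the caller gets -EEXIST, which inotify_handle_event() earlier treats as success rather than an error.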
243
244/*
245 * Remove and return the first event from the notification list. There is a
246 * reference held on this event since it was on the list. It is the responsibility
247 * of the caller to drop this reference.
248 */
249struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group)
250{
251 struct fsnotify_event *event;
252 struct fsnotify_event_holder *holder;
253
254 BUG_ON(!mutex_is_locked(&group->notification_mutex));
255
256 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
257
258 event = holder->event;
259
260 spin_lock(&event->lock);
261 holder->event = NULL;
262 list_del_init(&holder->event_list);
263 spin_unlock(&event->lock);
264
 265	/* holder == &event->holder means we are referenced through the in event holder */
266 if (holder != &event->holder)
267 fsnotify_destroy_event_holder(holder);
268
269 group->q_len--;
270
271 return event;
272}
273
274/*
 275 * This will not remove the event; that must be done with fsnotify_remove_notify_event()
276 */
277struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
278{
279 struct fsnotify_event *event;
280 struct fsnotify_event_holder *holder;
281
282 BUG_ON(!mutex_is_locked(&group->notification_mutex));
283
284 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
285 event = holder->event;
286
287 return event;
288}
289
290/*
291 * Called when a group is being torn down to clean up any outstanding
292 * event notifications.
293 */
294void fsnotify_flush_notify(struct fsnotify_group *group)
295{
296 struct fsnotify_event *event;
297 struct fsnotify_event_private_data *priv;
298
299 mutex_lock(&group->notification_mutex);
300 while (!fsnotify_notify_queue_is_empty(group)) {
301 event = fsnotify_remove_notify_event(group);
302 /* if they don't implement free_event_priv they better not have attached any */
303 if (group->ops->free_event_priv) {
304 spin_lock(&event->lock);
305 priv = fsnotify_remove_priv_from_event(group, event);
306 spin_unlock(&event->lock);
307 if (priv)
308 group->ops->free_event_priv(priv);
309 }
310 fsnotify_put_event(event); /* matches fsnotify_add_notify_event */
311 }
312 mutex_unlock(&group->notification_mutex);
313}
314
315static void initialize_event(struct fsnotify_event *event)
316{
317 event->holder.event = NULL;
318 INIT_LIST_HEAD(&event->holder.event_list);
319 atomic_set(&event->refcnt, 1);
320
321 spin_lock_init(&event->lock);
322
323 event->path.dentry = NULL;
324 event->path.mnt = NULL;
325 event->inode = NULL;
326 event->data_type = FSNOTIFY_EVENT_NONE;
327
328 INIT_LIST_HEAD(&event->private_data_list);
329
330 event->to_tell = NULL;
331
332 event->file_name = NULL;
333 event->name_len = 0;
334
335 event->sync_cookie = 0;
336}
337
338/*
339 * fsnotify_create_event - Allocate a new event which will be sent to each
340 * group's handle_event function if the group was interested in this
341 * particular event.
342 *
343 * @to_tell the inode which is supposed to receive the event (sometimes a
344 * parent of the inode to which the event happened.
345 * @mask what actually happened.
346 * @data pointer to the object which was actually affected
 347 * @data_type flag indicating whether the data is a file, path, inode, nothing...
348 * @name the filename, if available
349 */
350struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
351 int data_type, const char *name, u32 cookie,
352 gfp_t gfp)
353{
354 struct fsnotify_event *event;
355
356 event = kmem_cache_alloc(fsnotify_event_cachep, gfp);
357 if (!event)
358 return NULL;
359
360 initialize_event(event);
361
362 if (name) {
363 event->file_name = kstrdup(name, gfp);
364 if (!event->file_name) {
365 kmem_cache_free(fsnotify_event_cachep, event);
366 return NULL;
367 }
368 event->name_len = strlen(event->file_name);
369 }
370
371 event->sync_cookie = cookie;
372 event->to_tell = to_tell;
373
374 switch (data_type) {
375 case FSNOTIFY_EVENT_FILE: {
376 struct file *file = data;
377 struct path *path = &file->f_path;
378 event->path.dentry = path->dentry;
379 event->path.mnt = path->mnt;
380 path_get(&event->path);
381 event->data_type = FSNOTIFY_EVENT_PATH;
382 break;
383 }
384 case FSNOTIFY_EVENT_PATH: {
385 struct path *path = data;
386 event->path.dentry = path->dentry;
387 event->path.mnt = path->mnt;
388 path_get(&event->path);
389 event->data_type = FSNOTIFY_EVENT_PATH;
390 break;
391 }
392 case FSNOTIFY_EVENT_INODE:
393 event->inode = data;
394 event->data_type = FSNOTIFY_EVENT_INODE;
395 break;
396 case FSNOTIFY_EVENT_NONE:
397 event->inode = NULL;
398 event->path.dentry = NULL;
399 event->path.mnt = NULL;
400 break;
401 default:
402 BUG();
403 }
404
405 event->mask = mask;
406
407 return event;
408}
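A representative caller is inotify_ignored_and_remove_idr() earlier in this series; its shape, reduced to a sketch (hypothetical helper name, error handling elided):

static void queue_ignored(struct fsnotify_group *group)
{
	struct fsnotify_event *event;

	event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
				      FSNOTIFY_EVENT_NONE, NULL, 0,
				      GFP_NOFS);
	if (!event)
		return;

	/* on success the queue takes its own reference to the event... */
	fsnotify_add_notify_event(group, event, NULL);

	/* ...so we always drop the one fsnotify_create_event() gave us */
	fsnotify_put_event(event);
}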
409
410__init int fsnotify_notification_init(void)
411{
412 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
413 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
414
415 initialize_event(&q_overflow_event);
416 q_overflow_event.mask = FS_Q_OVERFLOW;
417
418 return 0;
419}
420subsys_initcall(fsnotify_notification_init);
421
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 82c5085559c6..9938034762cc 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -27,6 +27,7 @@
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/quotaops.h> 28#include <linux/quotaops.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/log2.h>
30 31
31#include "aops.h" 32#include "aops.h"
32#include "attrib.h" 33#include "attrib.h"
@@ -1570,7 +1571,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
1570 ntfs_debug("Index collation rule is 0x%x.", 1571 ntfs_debug("Index collation rule is 0x%x.",
1571 le32_to_cpu(ir->collation_rule)); 1572 le32_to_cpu(ir->collation_rule));
1572 ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); 1573 ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
1573 if (ni->itype.index.block_size & (ni->itype.index.block_size - 1)) { 1574 if (!is_power_of_2(ni->itype.index.block_size)) {
1574 ntfs_error(vi->i_sb, "Index block size (%u) is not a power of " 1575 ntfs_error(vi->i_sb, "Index block size (%u) is not a power of "
1575 "two.", ni->itype.index.block_size); 1576 "two.", ni->itype.index.block_size);
1576 goto unm_err_out; 1577 goto unm_err_out;
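is_power_of_2() from linux/log2.h is defined as (n != 0 && ((n & (n - 1)) == 0)): subtracting one from a power of two flips every bit below the single set bit, so the AND is zero exactly for powers of two (4096 & 4095 == 0, but 4097 & 4096 != 0). Besides reading better, it also rejects zero, which the open-coded test it replaces here (and in logfile.c below) would have let through.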
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index d7932e95b1fd..89b02985c054 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -26,6 +26,7 @@
26#include <linux/highmem.h> 26#include <linux/highmem.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/bitops.h> 28#include <linux/bitops.h>
29#include <linux/log2.h>
29 30
30#include "attrib.h" 31#include "attrib.h"
31#include "aops.h" 32#include "aops.h"
@@ -65,7 +66,7 @@ static bool ntfs_check_restart_page_header(struct inode *vi,
65 logfile_log_page_size < NTFS_BLOCK_SIZE || 66 logfile_log_page_size < NTFS_BLOCK_SIZE ||
66 logfile_system_page_size & 67 logfile_system_page_size &
67 (logfile_system_page_size - 1) || 68 (logfile_system_page_size - 1) ||
68 logfile_log_page_size & (logfile_log_page_size - 1)) { 69 !is_power_of_2(logfile_log_page_size)) {
69 ntfs_error(vi->i_sb, "$LogFile uses unsupported page size."); 70 ntfs_error(vi->i_sb, "$LogFile uses unsupported page size.");
70 return false; 71 return false;
71 } 72 }
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index f76951dcd4a6..abaaa1cbf8de 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -25,7 +25,7 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/spinlock.h> 27#include <linux/spinlock.h>
28#include <linux/blkdev.h> /* For bdev_hardsect_size(). */ 28#include <linux/blkdev.h> /* For bdev_logical_block_size(). */
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
@@ -443,6 +443,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
443 ntfs_volume *vol = NTFS_SB(sb); 443 ntfs_volume *vol = NTFS_SB(sb);
444 444
445 ntfs_debug("Entering with remount options string: %s", opt); 445 ntfs_debug("Entering with remount options string: %s", opt);
446
447 lock_kernel();
446#ifndef NTFS_RW 448#ifndef NTFS_RW
447 /* For read-only compiled driver, enforce read-only flag. */ 449 /* For read-only compiled driver, enforce read-only flag. */
448 *flags |= MS_RDONLY; 450 *flags |= MS_RDONLY;
@@ -466,15 +468,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 		if (NVolErrors(vol)) {
 			ntfs_error(sb, "Volume has errors and is read-only%s",
 					es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_IS_DIRTY) {
 			ntfs_error(sb, "Volume is dirty and read-only%s", es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
 			ntfs_error(sb, "Volume has been modified by chkdsk "
 					"and is read-only%s", es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
@@ -482,11 +487,13 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 					"(0x%x) and is read-only%s",
 					(unsigned)le16_to_cpu(vol->vol_flags),
 					es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
 			ntfs_error(sb, "Failed to set dirty bit in volume "
 					"information flags%s", es);
+			unlock_kernel();
 			return -EROFS;
 		}
 #if 0
@@ -506,18 +513,21 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 			ntfs_error(sb, "Failed to empty journal $LogFile%s",
 					es);
 			NVolSetErrors(vol);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (!ntfs_mark_quotas_out_of_date(vol)) {
 			ntfs_error(sb, "Failed to mark quotas out of date%s",
 					es);
 			NVolSetErrors(vol);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (!ntfs_stamp_usnjrnl(vol)) {
 			ntfs_error(sb, "Failed to stamp transation log "
 					"($UsnJrnl)%s", es);
 			NVolSetErrors(vol);
+			unlock_kernel();
 			return -EROFS;
 		}
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
@@ -533,8 +543,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 
 	// TODO: Deal with *flags.
 
-	if (!parse_options(vol, opt))
+	if (!parse_options(vol, opt)) {
+		unlock_kernel();
 		return -EINVAL;
+	}
+	unlock_kernel();
 	ntfs_debug("Done.");
 	return 0;
 }
@@ -2246,6 +2259,9 @@ static void ntfs_put_super(struct super_block *sb)
 	ntfs_volume *vol = NTFS_SB(sb);
 
 	ntfs_debug("Entering.");
+
+	lock_kernel();
+
 #ifdef NTFS_RW
 	/*
 	 * Commit all inodes while they are still open in case some of them
@@ -2373,39 +2389,12 @@ static void ntfs_put_super(struct super_block *sb)
 		vol->mftmirr_ino = NULL;
 	}
 	/*
-	 * If any dirty inodes are left, throw away all mft data page cache
-	 * pages to allow a clean umount.  This should never happen any more
-	 * due to mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
-	 * the underlying mft records are written out and cleaned.  If it does,
-	 * happen anyway, we want to know...
+	 * We should have no dirty inodes left, due to
+	 * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
+	 * the underlying mft records are written out and cleaned.
 	 */
 	ntfs_commit_inode(vol->mft_ino);
 	write_inode_now(vol->mft_ino, 1);
-	if (sb_has_dirty_inodes(sb)) {
-		const char *s1, *s2;
-
-		mutex_lock(&vol->mft_ino->i_mutex);
-		truncate_inode_pages(vol->mft_ino->i_mapping, 0);
-		mutex_unlock(&vol->mft_ino->i_mutex);
-		write_inode_now(vol->mft_ino, 1);
-		if (sb_has_dirty_inodes(sb)) {
-			static const char *_s1 = "inodes";
-			static const char *_s2 = "";
-			s1 = _s1;
-			s2 = _s2;
-		} else {
-			static const char *_s1 = "mft pages";
-			static const char *_s2 = "They have been thrown "
-					"away. ";
-			s1 = _s1;
-			s2 = _s2;
-		}
-		ntfs_error(sb, "Dirty %s found at umount time. %sYou should "
-				"run chkdsk. Please email "
-				"linux-ntfs-dev@lists.sourceforge.net and say "
-				"that you saw this message. Thank you.", s1,
-				s2);
-	}
 #endif /* NTFS_RW */
 
 	iput(vol->mft_ino);
@@ -2444,7 +2433,8 @@ static void ntfs_put_super(struct super_block *sb)
 	}
 	sb->s_fs_info = NULL;
 	kfree(vol);
-	return;
+
+	unlock_kernel();
 }
 
 /**
@@ -2785,13 +2775,13 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 		goto err_out_now;
 
 	/* We support sector sizes up to the PAGE_CACHE_SIZE. */
-	if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
+	if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
 		if (!silent)
 			ntfs_error(sb, "Device has unsupported sector size "
 					"(%i).  The maximum supported sector "
 					"size on this architecture is %lu "
 					"bytes.",
-					bdev_hardsect_size(sb->s_bdev),
+					bdev_logical_block_size(sb->s_bdev),
 					PAGE_CACHE_SIZE);
 		goto err_out_now;
 	}
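
Two independent changes run through the super.c hunks above: the tree-wide rename of bdev_hardsect_size() to bdev_logical_block_size(), and pushing the big kernel lock down into ntfs_remount()/ntfs_put_super() now that the VFS no longer takes it for the filesystem. The unlock-before-every-return shape added to ntfs_remount() is equivalent to the single-exit pattern sketched below (standalone userspace model; all names here are illustrative, not from the patch):

	#include <errno.h>
	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Models the reworked remount: take the lock once on entry and make
	 * every early error return funnel through one unlock site. */
	static int remount_like(int volume_dirty, int options_ok)
	{
		int err = 0;

		pthread_mutex_lock(&big_lock);
		if (volume_dirty) {
			err = -EROFS;
			goto out;
		}
		if (!options_ok) {
			err = -EINVAL;
			goto out;
		}
	out:
		pthread_mutex_unlock(&big_lock);
		return err;
	}

	int main(void)
	{
		/* prints 0 followed by two negative errno values */
		printf("%d %d %d\n", remount_like(0, 1), remount_like(1, 1),
		       remount_like(0, 0));
		return 0;
	}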
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 678a067d9251..ab513ddaeff2 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -475,6 +475,12 @@ struct ocfs2_path {
 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
 
+static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
+			   u32 cpos);
+static void ocfs2_adjust_rightmost_records(struct inode *inode,
+					   handle_t *handle,
+					   struct ocfs2_path *path,
+					   struct ocfs2_extent_rec *insert_rec);
 /*
  * Reset the actual path elements so that we can re-use the structure
  * to build another path. Generally, this involves freeing the buffer
@@ -1013,6 +1019,54 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
 }
 
 /*
+ * Change range of the branches in the right most path according to the leaf
+ * extent block's rightmost record.
+ */
+static int ocfs2_adjust_rightmost_branch(handle_t *handle,
+					 struct inode *inode,
+					 struct ocfs2_extent_tree *et)
+{
+	int status;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+
+	path = ocfs2_new_path_from_et(et);
+	if (!path) {
+		status = -ENOMEM;
+		return status;
+	}
+
+	status = ocfs2_find_path(inode, path, UINT_MAX);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_extend_trans(handle, path_num_items(path) +
+				    handle->h_buffer_credits);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_journal_access_path(inode, handle, path);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+	rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
+
+	ocfs2_adjust_rightmost_records(inode, handle, path, rec);
+
+out:
+	ocfs2_free_path(path);
+	return status;
+}
+
+/*
  * Add an entire tree branch to our inode. eb_bh is the extent block
  * to start at, if we don't want to start the branch at the dinode
  * structure.
@@ -1038,7 +1092,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *eb_el;
 	struct ocfs2_extent_list *el;
-	u32 new_cpos;
+	u32 new_cpos, root_end;
 
 	mlog_entry_void();
 
@@ -1055,6 +1109,27 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 
 	new_blocks = le16_to_cpu(el->l_tree_depth);
 
+	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
+	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
+	root_end = ocfs2_sum_rightmost_rec(et->et_root_el);
+
+	/*
+	 * If there is a gap before the root end and the real end
+	 * of the righmost leaf block, we need to remove the gap
+	 * between new_cpos and root_end first so that the tree
+	 * is consistent after we add a new branch(it will start
+	 * from new_cpos).
+	 */
+	if (root_end > new_cpos) {
+		mlog(0, "adjust the cluster end from %u to %u\n",
+		     root_end, new_cpos);
+		status = ocfs2_adjust_rightmost_branch(handle, inode, et);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
 	/* allocate the number of new eb blocks we need */
 	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
 			     GFP_KERNEL);
@@ -1071,9 +1146,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
-	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
-
 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
 	 * linked with the rest of the tree.
 	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
@@ -1842,7 +1914,8 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
 	 * immediately to their right.
 	 */
 	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
-	if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
+	if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
+		BUG_ON(right_child_el->l_tree_depth);
 		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
 		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
 	}
@@ -2404,15 +2477,37 @@ out_ret_path:
 	return ret;
 }
 
-static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
-				      struct ocfs2_path *path)
+static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
+				     int subtree_index, struct ocfs2_path *path)
 {
-	int i, idx;
+	int i, idx, ret;
 	struct ocfs2_extent_rec *rec;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_block *eb;
 	u32 range;
 
+	/*
+	 * In normal tree rotation process, we will never touch the
+	 * tree branch above subtree_index and ocfs2_extend_rotate_transaction
+	 * doesn't reserve the credits for them either.
+	 *
+	 * But we do have a special case here which will update the rightmost
+	 * records for all the bh in the path.
+	 * So we have to allocate extra credits and access them.
+	 */
+	ret = ocfs2_extend_trans(handle,
+				 handle->h_buffer_credits + subtree_index);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_path(inode, handle, path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
 	/* Path should always be rightmost. */
 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
 	BUG_ON(eb->h_next_leaf_blk != 0ULL);
@@ -2433,6 +2528,8 @@ static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
 
 		ocfs2_journal_dirty(handle, path->p_node[i].bh);
 	}
+out:
+	return ret;
 }
 
 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
@@ -2645,7 +2742,12 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 	if (del_right_subtree) {
 		ocfs2_unlink_subtree(inode, handle, left_path, right_path,
 				     subtree_index, dealloc);
-		ocfs2_update_edge_lengths(inode, handle, left_path);
+		ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+						left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
@@ -2962,7 +3064,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 
 	ocfs2_unlink_subtree(inode, handle, left_path, path,
 			     subtree_index, dealloc);
-	ocfs2_update_edge_lengths(inode, handle, left_path);
+	ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+					left_path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
 	eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
 	ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
@@ -6744,7 +6851,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 	}
 	status = 0;
 bail:
-
+	brelse(last_eb_bh);
 	mlog_exit(status);
 	return status;
 }
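
A theme in the alloc.c hunks: helpers that journal extra blocks (ocfs2_adjust_rightmost_branch(), the reworked ocfs2_update_edge_lengths()) first grow the transaction's credit reservation, then gain an int return so callers can propagate failure instead of continuing on a half-journaled tree. A standalone sketch of that convert-void-to-int shape (illustrative names, not the ocfs2 API):

	#include <stdio.h>

	/* Stand-ins for a journal handle and a tree path. */
	struct handle { int credits; };
	struct path  { int items;   };

	static int extend_trans(struct handle *h, int want)
	{
		h->credits = want;	/* pretend the journal granted them */
		return 0;		/* or a negative errno on failure */
	}

	/* Was "void": now reserves credits first and reports errors up. */
	static int update_edge_lengths(struct handle *h, struct path *p)
	{
		int ret = extend_trans(h, h->credits + p->items);
		if (ret)
			return ret;
		/* ... walk the path and update the rightmost records ... */
		return 0;
	}

	int main(void)
	{
		struct handle h = { .credits = 8 };
		struct path p = { .items = 3 };
		printf("%d (credits now %d)\n",
		       update_edge_lengths(&h, &p), h.credits);
		return 0;	/* prints: 0 (credits now 11) */
	}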
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b2c52b3a1484..b401654011a2 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -193,6 +193,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 		mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
 		dump_stack();
+		goto bail;
 	}
 
 	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
@@ -894,18 +895,17 @@ struct ocfs2_write_cluster_desc {
 	 */
 	unsigned c_new;
 	unsigned c_unwritten;
+	unsigned c_needs_zero;
 };
 
-static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
-{
-	return d->c_new || d->c_unwritten;
-}
-
 struct ocfs2_write_ctxt {
 	/* Logical cluster position / len of write */
 	u32 w_cpos;
 	u32 w_clen;
 
+	/* First cluster allocated in a nonsparse extend */
+	u32 w_first_new_cpos;
+
 	struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
 
 	/*
@@ -983,6 +983,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 		return -ENOMEM;
 
 	wc->w_cpos = pos >> osb->s_clustersize_bits;
+	wc->w_first_new_cpos = UINT_MAX;
 	cend = (pos + len - 1) >> osb->s_clustersize_bits;
 	wc->w_clen = cend - wc->w_cpos + 1;
 	get_bh(di_bh);
@@ -1217,20 +1218,18 @@ out:
  */
 static int ocfs2_write_cluster(struct address_space *mapping,
 			       u32 phys, unsigned int unwritten,
+			       unsigned int should_zero,
 			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct ocfs2_write_ctxt *wc, u32 cpos,
 			       loff_t user_pos, unsigned user_len)
 {
-	int ret, i, new, should_zero = 0;
+	int ret, i, new;
 	u64 v_blkno, p_blkno;
 	struct inode *inode = mapping->host;
 	struct ocfs2_extent_tree et;
 
 	new = phys == 0 ? 1 : 0;
-	if (new || unwritten)
-		should_zero = 1;
-
 	if (new) {
 		u32 tmp_pos;
 
@@ -1301,7 +1300,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 		if (tmpret) {
 			mlog_errno(tmpret);
 			if (ret == 0)
-				tmpret = ret;
+				ret = tmpret;
 		}
 	}
 
@@ -1341,7 +1340,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
 			local_len = osb->s_clustersize - cluster_off;
 
 		ret = ocfs2_write_cluster(mapping, desc->c_phys,
-					  desc->c_unwritten, data_ac, meta_ac,
+					  desc->c_unwritten,
+					  desc->c_needs_zero,
+					  data_ac, meta_ac,
 					  wc, desc->c_cpos, pos, local_len);
 		if (ret) {
 			mlog_errno(ret);
@@ -1391,14 +1392,14 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
 	 * newly allocated cluster.
 	 */
 	desc = &wc->w_desc[0];
-	if (ocfs2_should_zero_cluster(desc))
+	if (desc->c_needs_zero)
 		ocfs2_figure_cluster_boundaries(osb,
 						desc->c_cpos,
 						&wc->w_target_from,
 						NULL);
 
 	desc = &wc->w_desc[wc->w_clen - 1];
-	if (ocfs2_should_zero_cluster(desc))
+	if (desc->c_needs_zero)
 		ocfs2_figure_cluster_boundaries(osb,
 						desc->c_cpos,
 						NULL,
@@ -1466,13 +1467,28 @@ static int ocfs2_populate_write_desc(struct inode *inode,
 			phys++;
 		}
 
+		/*
+		 * If w_first_new_cpos is < UINT_MAX, we have a non-sparse
+		 * file that got extended.  w_first_new_cpos tells us
+		 * where the newly allocated clusters are so we can
+		 * zero them.
+		 */
+		if (desc->c_cpos >= wc->w_first_new_cpos) {
+			BUG_ON(phys == 0);
+			desc->c_needs_zero = 1;
+		}
+
 		desc->c_phys = phys;
 		if (phys == 0) {
 			desc->c_new = 1;
+			desc->c_needs_zero = 1;
 			*clusters_to_alloc = *clusters_to_alloc + 1;
 		}
-		if (ext_flags & OCFS2_EXT_UNWRITTEN)
+
+		if (ext_flags & OCFS2_EXT_UNWRITTEN) {
 			desc->c_unwritten = 1;
+			desc->c_needs_zero = 1;
+		}
 
 		num_clusters--;
 	}
@@ -1632,10 +1648,13 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
 	if (newsize <= i_size_read(inode))
 		return 0;
 
-	ret = ocfs2_extend_no_holes(inode, newsize, newsize - len);
+	ret = ocfs2_extend_no_holes(inode, newsize, pos);
 	if (ret)
 		mlog_errno(ret);
 
+	wc->w_first_new_cpos =
+		ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+
 	return ret;
 }
 
@@ -1644,7 +1663,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 			     struct page **pagep, void **fsdata,
 			     struct buffer_head *di_bh, struct page *mmap_page)
 {
-	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
+	int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
 	unsigned int clusters_to_alloc, extents_to_split;
 	struct ocfs2_write_ctxt *wc;
 	struct inode *inode = mapping->host;
@@ -1722,8 +1741,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 
 	}
 
-	ocfs2_set_target_boundaries(osb, wc, pos, len,
-				    clusters_to_alloc + extents_to_split);
+	/*
+	 * We have to zero sparse allocated clusters, unwritten extent clusters,
+	 * and non-sparse clusters we just extended.  For non-sparse writes,
+	 * we know zeros will only be needed in the first and/or last cluster.
+	 */
+	if (clusters_to_alloc || extents_to_split ||
+	    wc->w_desc[0].c_needs_zero ||
+	    wc->w_desc[wc->w_clen - 1].c_needs_zero)
+		cluster_of_pages = 1;
+	else
+		cluster_of_pages = 0;
+
+	ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages);
 
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
@@ -1756,8 +1786,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	 * extent.
 	 */
 	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
-					 clusters_to_alloc + extents_to_split,
-					 mmap_page);
+					 cluster_of_pages, mmap_page);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_quota;
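
The aops.c hunks fold three zeroing triggers into one c_needs_zero flag: freshly allocated clusters, unwritten extents, and clusters at or past w_first_new_cpos after a non-sparse extend. A standalone model of that decision (simplified; the real code also tracks physical allocation and page boundaries):

	#include <stdio.h>

	struct cluster_desc {
		unsigned c_new, c_unwritten, c_needs_zero;
	};

	/* Mirrors the flag logic in ocfs2_populate_write_desc() above. */
	static void classify(struct cluster_desc *d, unsigned cpos,
			     unsigned first_new_cpos, unsigned phys,
			     int unwritten)
	{
		if (cpos >= first_new_cpos)	/* non-sparse extend */
			d->c_needs_zero = 1;
		if (phys == 0) {		/* sparse hole: allocate + zero */
			d->c_new = 1;
			d->c_needs_zero = 1;
		}
		if (unwritten) {		/* unwritten extent: zero on write */
			d->c_unwritten = 1;
			d->c_needs_zero = 1;
		}
	}

	int main(void)
	{
		struct cluster_desc d = { 0 };
		classify(&d, 5, 3, 42, 0);	/* past old i_size, already allocated */
		printf("new=%u unwritten=%u needs_zero=%u\n",
		       d.c_new, d.c_unwritten, d.c_needs_zero);	/* 0 0 1 */
		return 0;
	}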
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index 2a947c44e594..a1163b8b417c 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -22,6 +22,9 @@
 #include <linux/crc32.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/fs.h>
 #include <asm/byteorder.h>
 
 #include <cluster/masklog.h>
@@ -222,6 +225,155 @@ void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
 	ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
 }
 
+
+/*
+ * Debugfs handling.
+ */
+
+#ifdef CONFIG_DEBUG_FS
+
+static int blockcheck_u64_get(void *data, u64 *val)
+{
+	*val = *(u64 *)data;
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n");
+
+static struct dentry *blockcheck_debugfs_create(const char *name,
+						struct dentry *parent,
+						u64 *value)
+{
+	return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value,
+				   &blockcheck_fops);
+}
+
+static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
+{
+	if (stats) {
+		debugfs_remove(stats->b_debug_check);
+		stats->b_debug_check = NULL;
+		debugfs_remove(stats->b_debug_failure);
+		stats->b_debug_failure = NULL;
+		debugfs_remove(stats->b_debug_recover);
+		stats->b_debug_recover = NULL;
+		debugfs_remove(stats->b_debug_dir);
+		stats->b_debug_dir = NULL;
+	}
+}
+
+static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+					  struct dentry *parent)
+{
+	int rc = -EINVAL;
+
+	if (!stats)
+		goto out;
+
+	stats->b_debug_dir = debugfs_create_dir("blockcheck", parent);
+	if (!stats->b_debug_dir)
+		goto out;
+
+	stats->b_debug_check =
+		blockcheck_debugfs_create("blocks_checked",
+					  stats->b_debug_dir,
+					  &stats->b_check_count);
+
+	stats->b_debug_failure =
+		blockcheck_debugfs_create("checksums_failed",
+					  stats->b_debug_dir,
+					  &stats->b_failure_count);
+
+	stats->b_debug_recover =
+		blockcheck_debugfs_create("ecc_recoveries",
+					  stats->b_debug_dir,
+					  &stats->b_recover_count);
+	if (stats->b_debug_check && stats->b_debug_failure &&
+	    stats->b_debug_recover)
+		rc = 0;
+
+out:
+	if (rc)
+		ocfs2_blockcheck_debug_remove(stats);
+	return rc;
+}
+#else
+static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+						 struct dentry *parent)
+{
+	return 0;
+}
+
+static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
+{
+}
+#endif  /* CONFIG_DEBUG_FS */
+
+/* Always-called wrappers for starting and stopping the debugfs files */
+int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+					   struct dentry *parent)
+{
+	return ocfs2_blockcheck_debug_install(stats, parent);
+}
+
+void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats)
+{
+	ocfs2_blockcheck_debug_remove(stats);
+}
+
+static void ocfs2_blockcheck_inc_check(struct ocfs2_blockcheck_stats *stats)
+{
+	u64 new_count;
+
+	if (!stats)
+		return;
+
+	spin_lock(&stats->b_lock);
+	stats->b_check_count++;
+	new_count = stats->b_check_count;
+	spin_unlock(&stats->b_lock);
+
+	if (!new_count)
+		mlog(ML_NOTICE, "Block check count has wrapped\n");
+}
+
+static void ocfs2_blockcheck_inc_failure(struct ocfs2_blockcheck_stats *stats)
+{
+	u64 new_count;
+
+	if (!stats)
+		return;
+
+	spin_lock(&stats->b_lock);
+	stats->b_failure_count++;
+	new_count = stats->b_failure_count;
+	spin_unlock(&stats->b_lock);
+
+	if (!new_count)
+		mlog(ML_NOTICE, "Checksum failure count has wrapped\n");
+}
+
+static void ocfs2_blockcheck_inc_recover(struct ocfs2_blockcheck_stats *stats)
+{
+	u64 new_count;
+
+	if (!stats)
+		return;
+
+	spin_lock(&stats->b_lock);
+	stats->b_recover_count++;
+	new_count = stats->b_recover_count;
+	spin_unlock(&stats->b_lock);
+
+	if (!new_count)
+		mlog(ML_NOTICE, "ECC recovery count has wrapped\n");
+}
+
+
+
+/*
+ * These are the low-level APIs for using the ocfs2_block_check structure.
+ */
+
 /*
  * This function generates check information for a block.
  * data is the block to be checked.  bc is a pointer to the
@@ -266,12 +418,15 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
  * Again, the data passed in should be the on-disk endian.
  */
 int ocfs2_block_check_validate(void *data, size_t blocksize,
-			       struct ocfs2_block_check *bc)
+			       struct ocfs2_block_check *bc,
+			       struct ocfs2_blockcheck_stats *stats)
 {
 	int rc = 0;
 	struct ocfs2_block_check check;
 	u32 crc, ecc;
 
+	ocfs2_blockcheck_inc_check(stats);
+
 	check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
 	check.bc_ecc = le16_to_cpu(bc->bc_ecc);
 
@@ -282,6 +437,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
 	if (crc == check.bc_crc32e)
 		goto out;
 
+	ocfs2_blockcheck_inc_failure(stats);
 	mlog(ML_ERROR,
 	     "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -292,8 +448,10 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
 
 	/* And check the crc32 again */
 	crc = crc32_le(~0, data, blocksize);
-	if (crc == check.bc_crc32e)
+	if (crc == check.bc_crc32e) {
+		ocfs2_blockcheck_inc_recover(stats);
 		goto out;
+	}
 
 	mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -366,7 +524,8 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
  * Again, the data passed in should be the on-disk endian.
  */
 int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
-				   struct ocfs2_block_check *bc)
+				   struct ocfs2_block_check *bc,
+				   struct ocfs2_blockcheck_stats *stats)
 {
 	int i, rc = 0;
 	struct ocfs2_block_check check;
@@ -377,6 +536,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
 	if (!nr)
 		return 0;
 
+	ocfs2_blockcheck_inc_check(stats);
+
 	check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
 	check.bc_ecc = le16_to_cpu(bc->bc_ecc);
 
@@ -388,6 +549,7 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
 	if (crc == check.bc_crc32e)
 		goto out;
 
+	ocfs2_blockcheck_inc_failure(stats);
 	mlog(ML_ERROR,
 	     "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -416,8 +578,10 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
 	/* And check the crc32 again */
 	for (i = 0, crc = ~0; i < nr; i++)
 		crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
-	if (crc == check.bc_crc32e)
+	if (crc == check.bc_crc32e) {
+		ocfs2_blockcheck_inc_recover(stats);
 		goto out;
+	}
 
 	mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -448,9 +612,11 @@ int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
 			    struct ocfs2_block_check *bc)
 {
 	int rc = 0;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
 
-	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
-		rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc);
+	if (ocfs2_meta_ecc(osb))
+		rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc,
+						&osb->osb_ecc_stats);
 
 	return rc;
 }
@@ -468,9 +634,11 @@ int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
 				struct ocfs2_block_check *bc)
 {
 	int rc = 0;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
 
-	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
-		rc = ocfs2_block_check_validate_bhs(bhs, nr, bc);
+	if (ocfs2_meta_ecc(osb))
+		rc = ocfs2_block_check_validate_bhs(bhs, nr, bc,
+						    &osb->osb_ecc_stats);
 
 	return rc;
 }
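
The counters above are exported through debugfs with DEFINE_SIMPLE_ATTRIBUTE, the stock kernel idiom for a read-only u64 file. A minimal self-contained module using the same idiom (a hedged sketch against the 2009-era debugfs API; the "demo" names are invented here, not from the patch):

	#include <linux/debugfs.h>
	#include <linux/fs.h>
	#include <linux/module.h>

	static u64 demo_count;
	static struct dentry *demo_dir;

	/* Read callback: copy the counter out; no write callback (read-only). */
	static int demo_u64_get(void *data, u64 *val)
	{
		*val = *(u64 *)data;
		return 0;
	}
	DEFINE_SIMPLE_ATTRIBUTE(demo_fops, demo_u64_get, NULL, "%llu\n");

	static int __init demo_init(void)
	{
		demo_dir = debugfs_create_dir("blockcheck_demo", NULL);
		if (!demo_dir)
			return -ENODEV;
		debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR,
				    demo_dir, &demo_count, &demo_fops);
		return 0;
	}

	static void __exit demo_exit(void)
	{
		debugfs_remove_recursive(demo_dir);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");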
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
index 70ec3feda32f..d4b69febf70a 100644
--- a/fs/ocfs2/blockcheck.h
+++ b/fs/ocfs2/blockcheck.h
@@ -21,6 +21,24 @@
 #define OCFS2_BLOCKCHECK_H
 
 
+/* Count errors and error correction from blockcheck.c */
+struct ocfs2_blockcheck_stats {
+	spinlock_t b_lock;
+	u64 b_check_count;	/* Number of blocks we've checked */
+	u64 b_failure_count;	/* Number of failed checksums */
+	u64 b_recover_count;	/* Number of blocks fixed by ecc */
+
+	/*
+	 * debugfs entries, used if this is passed to
+	 * ocfs2_blockcheck_stats_debugfs_install()
+	 */
+	struct dentry *b_debug_dir;	/* Parent of the debugfs files */
+	struct dentry *b_debug_check;	/* Exposes b_check_count */
+	struct dentry *b_debug_failure;	/* Exposes b_failure_count */
+	struct dentry *b_debug_recover;	/* Exposes b_recover_count */
+};
+
+
 /* High level block API */
 void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
 			    struct ocfs2_block_check *bc);
@@ -37,11 +55,18 @@ int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
 void ocfs2_block_check_compute(void *data, size_t blocksize,
 			       struct ocfs2_block_check *bc);
 int ocfs2_block_check_validate(void *data, size_t blocksize,
-			       struct ocfs2_block_check *bc);
+			       struct ocfs2_block_check *bc,
+			       struct ocfs2_blockcheck_stats *stats);
 void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
 				   struct ocfs2_block_check *bc);
 int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
-				   struct ocfs2_block_check *bc);
+				   struct ocfs2_block_check *bc,
+				   struct ocfs2_blockcheck_stats *stats);
+
+/* Debug Initialization */
+int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+					   struct dentry *parent);
+void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats);
 
 /*
  * Hamming code functions
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4f85eceab376..09cc25d04611 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1371,7 +1371,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 
 	bdevname(reg->hr_bdev, reg->hr_dev_name);
 
-	sectsize = bdev_hardsect_size(reg->hr_bdev);
+	sectsize = bdev_logical_block_size(reg->hr_bdev);
 	if (sectsize != reg->hr_block_bytes) {
 		mlog(ML_ERROR,
 		     "blocksize %u incorrect for device, expected %d",
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 7e72a81bc2d4..696c32e50716 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -48,34 +48,33 @@
  * only emit the appropriage printk() when the caller passes in a constant
  * mask, as is almost always the case.
  *
- * All this bitmask nonsense is hidden from the /proc interface so that Joel
- * doesn't have an aneurism.  Reading the file gives a straight forward
- * indication of which bits are on or off:
- *	ENTRY off
- *	EXIT off
+ * All this bitmask nonsense is managed from the files under
+ * /sys/fs/o2cb/logmask/.  Reading the files gives a straightforward
+ * indication of which bits are allowed (allow) or denied (off/deny).
+ *	ENTRY deny
+ *	EXIT deny
  *	TCP off
  *	MSG off
  *	SOCKET off
- *	ERROR off
- *	NOTICE on
+ *	ERROR allow
+ *	NOTICE allow
  *
  * Writing changes the state of a given bit and requires a strictly formatted
  * single write() call:
  *
- *	write(fd, "ENTRY on", 8);
+ *	write(fd, "allow", 5);
  *
- * would turn the entry bit on.  "1" is also accepted in the place of "on", and
- * "off" and "0" behave as expected.
+ * Echoing allow/deny/off string into the logmask files can flip the bits
+ * on or off as expected; here is the bash script for example:
  *
- * Some trivial shell can flip all the bits on or off:
+ *	log_mask="/sys/fs/o2cb/log_mask"
+ *	for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do
+ *		echo allow >"$log_mask"/"$node"
+ *	done
  *
- * log_mask="/proc/fs/ocfs2_nodemanager/log_mask"
- * cat $log_mask | (
- *	while read bit status; do
- *		# $1 is "on" or "off", say
- *		echo "$bit $1" > $log_mask
- *	done
- * )
+ * The debugfs.ocfs2 tool can also flip the bits with the -l option:
+ *
+ *	debugfs.ocfs2 -l TCP allow
  */
 
 /* for task_struct */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 9fbe849f6344..334f231a422c 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -974,7 +974,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
 int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 			   size_t caller_veclen, u8 target_node, int *status)
 {
-	int ret, error = 0;
+	int ret;
 	struct o2net_msg *msg = NULL;
 	size_t veclen, caller_bytes = 0;
 	struct kvec *vec = NULL;
@@ -1015,10 +1015,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 
 	o2net_set_nst_sock_time(&nst);
 
-	ret = wait_event_interruptible(nn->nn_sc_wq,
-				       o2net_tx_can_proceed(nn, &sc, &error));
-	if (!ret && error)
-		ret = error;
+	wait_event(nn->nn_sc_wq, o2net_tx_can_proceed(nn, &sc, &ret));
 	if (ret)
 		goto out;
 
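
The o2net change above drops the separate 'error' out-parameter: since the wait is no longer interruptible, the predicate o2net_tx_can_proceed() can deposit the final status straight into 'ret'. A standalone model of a predicate that both signals readiness and reports status through a pointer (illustrative only, not the o2net API):

	#include <errno.h>
	#include <stdio.h>

	/* Returns nonzero when the caller may stop waiting; the outcome of
	 * the wait (0 or a negative errno) comes back through *status. */
	static int tx_can_proceed(int socket_valid, int *status)
	{
		if (!socket_valid) {
			*status = -ENOTCONN;	/* done: fail immediately */
			return 1;
		}
		*status = 0;			/* done: ready to transmit */
		return 1;
	}

	int main(void)
	{
		int ret;

		/* wait_event(wq, tx_can_proceed(...)) reduces to this when
		 * the condition is already true on entry. */
		while (!tx_can_proceed(1, &ret))
			;
		printf("%d\n", ret);	/* prints: 0 */
		return 0;
	}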
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b574431a031d..2f28b7de2c8d 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -310,22 +310,19 @@ out_attach:
 	return ret;
 }
 
-static DEFINE_SPINLOCK(dentry_list_lock);
+DEFINE_SPINLOCK(dentry_list_lock);
 
 /* We limit the number of dentry locks to drop in one go. We have
  * this limit so that we don't starve other users of ocfs2_wq. */
 #define DL_INODE_DROP_COUNT 64
 
 /* Drop inode references from dentry locks */
-void ocfs2_drop_dl_inodes(struct work_struct *work)
+static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
 {
-	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
-					       dentry_lock_work);
 	struct ocfs2_dentry_lock *dl;
-	int drop_count = DL_INODE_DROP_COUNT;
 
 	spin_lock(&dentry_list_lock);
-	while (osb->dentry_lock_list && drop_count--) {
+	while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
 		dl = osb->dentry_lock_list;
 		osb->dentry_lock_list = dl->dl_next;
 		spin_unlock(&dentry_list_lock);
@@ -333,11 +330,32 @@ void ocfs2_drop_dl_inodes(struct work_struct *work)
 		kfree(dl);
 		spin_lock(&dentry_list_lock);
 	}
-	if (osb->dentry_lock_list)
+	spin_unlock(&dentry_list_lock);
+}
+
+void ocfs2_drop_dl_inodes(struct work_struct *work)
+{
+	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+					       dentry_lock_work);
+
+	__ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
+	/*
+	 * Don't queue dropping if umount is in progress. We flush the
+	 * list in ocfs2_dismount_volume
+	 */
+	spin_lock(&dentry_list_lock);
+	if (osb->dentry_lock_list &&
+	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
 		queue_work(ocfs2_wq, &osb->dentry_lock_work);
 	spin_unlock(&dentry_list_lock);
 }
 
+/* Flush the whole work queue */
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
+{
+	__ocfs2_drop_dl_inodes(osb, -1);
+}
+
 /*
  * ocfs2_dentry_iput() and friends.
  *
@@ -368,7 +386,8 @@ static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
 	/* We leave dropping of inode reference to ocfs2_wq as that can
 	 * possibly lead to inode deletion which gets tricky */
 	spin_lock(&dentry_list_lock);
-	if (!osb->dentry_lock_list)
+	if (!osb->dentry_lock_list &&
+	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
 		queue_work(ocfs2_wq, &osb->dentry_lock_work);
 	dl->dl_next = osb->dentry_lock_list;
 	osb->dentry_lock_list = dl;
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index faa12e75f98d..f5dd1789acf1 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -49,10 +49,13 @@ struct ocfs2_dentry_lock {
 int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
 			     u64 parent_blkno);
 
+extern spinlock_t dentry_list_lock;
+
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
 			   struct ocfs2_dentry_lock *dl);
 
 void ocfs2_drop_dl_inodes(struct work_struct *work);
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb);
 
 struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
 				      int skip_unhashed);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c5752305627c..b358f3bf896d 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2900,6 +2900,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	alloc = ocfs2_clusters_for_bytes(sb, bytes);
 	dx_alloc = 0;
 
+	down_write(&oi->ip_alloc_sem);
+
 	if (ocfs2_supports_indexed_dirs(osb)) {
 		credits += ocfs2_add_dir_index_credits(sb);
 
@@ -2940,8 +2942,6 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		goto out;
 	}
 
-	down_write(&oi->ip_alloc_sem);
-
 	/*
 	 * Prepare for worst case allocation scenario of two separate
 	 * extents in the unindexed tree.
@@ -2953,7 +2953,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		mlog_errno(ret);
-		goto out_sem;
+		goto out;
 	}
 
 	if (vfs_dq_alloc_space_nodirty(dir,
@@ -3172,10 +3172,8 @@ out_commit:
 
 	ocfs2_commit_trans(osb, handle);
 
-out_sem:
-	up_write(&oi->ip_alloc_sem);
-
 out:
+	up_write(&oi->ip_alloc_sem);
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 	if (meta_ac)
@@ -3322,11 +3320,15 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		brelse(new_bh);
 		new_bh = NULL;
 
+		down_write(&OCFS2_I(dir)->ip_alloc_sem);
+		drop_alloc_sem = 1;
 		dir_i_size = i_size_read(dir);
 		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
 		goto do_extend;
 	}
 
+	down_write(&OCFS2_I(dir)->ip_alloc_sem);
+	drop_alloc_sem = 1;
 	dir_i_size = i_size_read(dir);
 	mlog(0, "extending dir %llu (i_size = %lld)\n",
 	     (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size);
@@ -3370,9 +3372,6 @@ do_extend:
 		credits++; /* For attaching the new dirent block to the
 			    * dx_root */
 
-	down_write(&OCFS2_I(dir)->ip_alloc_sem);
-	drop_alloc_sem = 1;
-
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
@@ -3435,10 +3434,10 @@ bail_bh:
 	*new_de_bh = new_bh;
 	get_bh(*new_de_bh);
 bail:
-	if (drop_alloc_sem)
-		up_write(&OCFS2_I(dir)->ip_alloc_sem);
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
+	if (drop_alloc_sem)
+		up_write(&OCFS2_I(dir)->ip_alloc_sem);
 
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index d07ddbe4b283..81eff8e58322 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -103,7 +103,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 		     lock->ast_pending, lock->ml.type);
 		BUG();
 	}
-	BUG_ON(!list_empty(&lock->ast_list));
 	if (lock->ast_pending)
 		mlog(0, "lock has an ast getting flushed right now\n");
 
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index bcb9260c3735..43e6e3280569 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1118,7 +1118,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
 
 	mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n",
 	     dlm->name, res->lockname.len, res->lockname.name,
-	     orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery",
+	     orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery",
 	     send_to);
 
 	/* send it */
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index fcf879ed6930..756f5b0998e0 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -122,7 +122,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 	 * that still has AST's pending... */
 	in_use = !list_empty(&lock->ast_list);
 	spin_unlock(&dlm->ast_lock);
-	if (in_use) {
+	if (in_use && !(flags & LKM_CANCEL)) {
 		mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
 		     "while waiting for an ast!", res->lockname.len,
 		     res->lockname.name);
@@ -131,7 +131,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 
 	spin_lock(&res->spinlock);
 	if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
-		if (master_node) {
+		if (master_node && !(flags & LKM_CANCEL)) {
 			mlog(ML_ERROR, "lockres in progress!\n");
 			spin_unlock(&res->spinlock);
 			return DLM_FORWARD;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e15fc7d50827..110bb57c46ab 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -92,6 +92,9 @@ struct ocfs2_unblock_ctl {
 	enum ocfs2_unblock_action unblock_action;
 };
 
+/* Lockdep class keys */
+struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
+
 static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
 					int new_level);
 static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres);
@@ -248,6 +251,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
 	.flags		= 0,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
+	.flags		= LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
 static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
 	.get_osb	= ocfs2_get_dentry_osb,
 	.post_unlock	= ocfs2_dentry_post_unlock,
@@ -313,9 +320,16 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
 				u32 dlm_flags);
 static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
 						     int wanted);
-static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
-				 struct ocfs2_lock_res *lockres,
-				 int level);
+static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
+				   struct ocfs2_lock_res *lockres,
+				   int level, unsigned long caller_ip);
+static inline void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres,
+					int level)
+{
+	__ocfs2_cluster_unlock(osb, lockres, level, _RET_IP_);
+}
+
 static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
 static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
 static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
@@ -485,6 +499,13 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
 	ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
 
 	ocfs2_init_lock_stats(res);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (type != OCFS2_LOCK_TYPE_OPEN)
+		lockdep_init_map(&res->l_lockdep_map, ocfs2_lock_type_strings[type],
+				 &lockdep_keys[type], 0);
+	else
+		res->l_lockdep_map.key = NULL;
+#endif
 }
 
 void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
@@ -637,6 +658,15 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
 			   &ocfs2_nfs_sync_lops, osb);
 }
 
+static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
+					    struct ocfs2_super *osb)
+{
+	ocfs2_lock_res_init_once(res);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+				   &ocfs2_orphan_scan_lops, osb);
+}
+
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 			      struct ocfs2_file_private *fp)
 {
@@ -1239,11 +1269,13 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
 	return ret;
 }
 
-static int ocfs2_cluster_lock(struct ocfs2_super *osb,
-			      struct ocfs2_lock_res *lockres,
-			      int level,
-			      u32 lkm_flags,
-			      int arg_flags)
+static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres,
+				int level,
+				u32 lkm_flags,
+				int arg_flags,
+				int l_subclass,
+				unsigned long caller_ip)
 {
 	struct ocfs2_mask_waiter mw;
 	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
@@ -1386,13 +1418,37 @@ out:
 	}
 	ocfs2_update_lock_stats(lockres, level, &mw, ret);
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (!ret && lockres->l_lockdep_map.key != NULL) {
+		if (level == DLM_LOCK_PR)
+			rwsem_acquire_read(&lockres->l_lockdep_map, l_subclass,
+				!!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
+				caller_ip);
+		else
+			rwsem_acquire(&lockres->l_lockdep_map, l_subclass,
+				!!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
+				caller_ip);
+	}
+#endif
 	mlog_exit(ret);
 	return ret;
 }
 
-static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
-				 struct ocfs2_lock_res *lockres,
-				 int level)
+static inline int ocfs2_cluster_lock(struct ocfs2_super *osb,
+				     struct ocfs2_lock_res *lockres,
+				     int level,
+				     u32 lkm_flags,
+				     int arg_flags)
+{
+	return __ocfs2_cluster_lock(osb, lockres, level, lkm_flags, arg_flags,
+				    0, _RET_IP_);
+}
+
+
+static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
+				   struct ocfs2_lock_res *lockres,
+				   int level,
+				   unsigned long caller_ip)
 {
 	unsigned long flags;
 
@@ -1401,6 +1457,10 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1401 ocfs2_dec_holders(lockres, level); 1457 ocfs2_dec_holders(lockres, level);
1402 ocfs2_downconvert_on_unlock(osb, lockres); 1458 ocfs2_downconvert_on_unlock(osb, lockres);
1403 spin_unlock_irqrestore(&lockres->l_lock, flags); 1459 spin_unlock_irqrestore(&lockres->l_lock, flags);
1460#ifdef CONFIG_DEBUG_LOCK_ALLOC
1461 if (lockres->l_lockdep_map.key != NULL)
1462 rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
1463#endif
1404 mlog_exit_void(); 1464 mlog_exit_void();
1405} 1465}
1406 1466
@@ -1972,7 +2032,8 @@ static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
1972{ 2032{
1973 struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); 2033 struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
1974 2034
1975 if (lvb->lvb_version == OCFS2_LVB_VERSION 2035 if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)
2036 && lvb->lvb_version == OCFS2_LVB_VERSION
1976 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) 2037 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
1977 return 1; 2038 return 1;
1978 return 0; 2039 return 0;
@@ -2145,10 +2206,11 @@ static int ocfs2_assign_bh(struct inode *inode,
2145 * returns < 0 error if the callback will never be called, otherwise 2206 * returns < 0 error if the callback will never be called, otherwise
2146 * the result of the lock will be communicated via the callback. 2207 * the result of the lock will be communicated via the callback.
2147 */ 2208 */
2148int ocfs2_inode_lock_full(struct inode *inode, 2209int ocfs2_inode_lock_full_nested(struct inode *inode,
2149 struct buffer_head **ret_bh, 2210 struct buffer_head **ret_bh,
2150 int ex, 2211 int ex,
2151 int arg_flags) 2212 int arg_flags,
2213 int subclass)
2152{ 2214{
2153 int status, level, acquired; 2215 int status, level, acquired;
2154 u32 dlm_flags; 2216 u32 dlm_flags;
@@ -2186,7 +2248,8 @@ int ocfs2_inode_lock_full(struct inode *inode,
2186 if (arg_flags & OCFS2_META_LOCK_NOQUEUE) 2248 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
2187 dlm_flags |= DLM_LKF_NOQUEUE; 2249 dlm_flags |= DLM_LKF_NOQUEUE;
2188 2250
2189 status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags); 2251 status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
2252 arg_flags, subclass, _RET_IP_);
2190 if (status < 0) { 2253 if (status < 0) {
2191 if (status != -EAGAIN && status != -EIOCBRETRY) 2254 if (status != -EAGAIN && status != -EIOCBRETRY)
2192 mlog_errno(status); 2255 mlog_errno(status);
@@ -2352,6 +2415,47 @@ void ocfs2_inode_unlock(struct inode *inode,
2352 mlog_exit_void(); 2415 mlog_exit_void();
2353} 2416}
2354 2417
2418int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
2419{
2420 struct ocfs2_lock_res *lockres;
2421 struct ocfs2_orphan_scan_lvb *lvb;
2422 int status = 0;
2423
2424 if (ocfs2_is_hard_readonly(osb))
2425 return -EROFS;
2426
2427 if (ocfs2_mount_local(osb))
2428 return 0;
2429
2430 lockres = &osb->osb_orphan_scan.os_lockres;
2431 status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
2432 if (status < 0)
2433 return status;
2434
2435 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
2436 if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
2437 lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
2438 *seqno = be32_to_cpu(lvb->lvb_os_seqno);
2439 else
2440 *seqno = osb->osb_orphan_scan.os_seqno + 1;
2441
2442 return status;
2443}
2444
2445void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno)
2446{
2447 struct ocfs2_lock_res *lockres;
2448 struct ocfs2_orphan_scan_lvb *lvb;
2449
2450 if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) {
2451 lockres = &osb->osb_orphan_scan.os_lockres;
2452 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
2453 lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
2454 lvb->lvb_os_seqno = cpu_to_be32(seqno);
2455 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
2456 }
2457}
2458
2355int ocfs2_super_lock(struct ocfs2_super *osb, 2459int ocfs2_super_lock(struct ocfs2_super *osb,
2356 int ex) 2460 int ex)
2357{ 2461{
@@ -2842,6 +2946,7 @@ local:
2842 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); 2946 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2843 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); 2947 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2844 ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb); 2948 ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
2949 ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
2845 2950
2846 osb->cconn = conn; 2951 osb->cconn = conn;
2847 2952
@@ -2878,6 +2983,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
2878 ocfs2_lock_res_free(&osb->osb_super_lockres); 2983 ocfs2_lock_res_free(&osb->osb_super_lockres);
2879 ocfs2_lock_res_free(&osb->osb_rename_lockres); 2984 ocfs2_lock_res_free(&osb->osb_rename_lockres);
2880 ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres); 2985 ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
2986 ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
2881 2987
2882 ocfs2_cluster_disconnect(osb->cconn, hangup_pending); 2988 ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
2883 osb->cconn = NULL; 2989 osb->cconn = NULL;
@@ -3061,6 +3167,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
3061 ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres); 3167 ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
3062 ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres); 3168 ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
3063 ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres); 3169 ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
3170 ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
3064} 3171}
3065 3172
3066int ocfs2_drop_inode_locks(struct inode *inode) 3173int ocfs2_drop_inode_locks(struct inode *inode)
@@ -3576,7 +3683,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
3576 struct ocfs2_global_disk_dqinfo *gdinfo; 3683 struct ocfs2_global_disk_dqinfo *gdinfo;
3577 int status = 0; 3684 int status = 0;
3578 3685
3579 if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) { 3686 if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
3687 lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
3580 info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace); 3688 info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
3581 info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace); 3689 info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
3582 oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms); 3690 oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
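
The hunks above thread a caller_ip argument, captured as _RET_IP_ in the wrappers, into __ocfs2_cluster_lock() and __ocfs2_cluster_unlock() so the lockdep annotations point at the real call site rather than the wrapper. A minimal user-space model of that plumbing; it assumes GCC/Clang builtins, and every demo_* name is invented:

#include <stdio.h>

/* stand-in for the out-of-line lock worker */
static void __demo_cluster_lock(int level, unsigned long caller_ip)
{
        printf("level %d requested from caller ip %#lx\n", level, caller_ip);
}

/* like the wrapper above: record where we were called from and pass it
 * down, so the worker can report the real call site */
static void __attribute__((noinline)) demo_cluster_lock(int level)
{
        /* _RET_IP_ equivalent: the address this call returns to */
        __demo_cluster_lock(level,
                            (unsigned long)__builtin_return_address(0));
}

int main(void)
{
        demo_cluster_lock(5);   /* caller_ip lands inside main() */
        return 0;
}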
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index e1fd5721cd7f..7553836931de 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -62,6 +62,14 @@ struct ocfs2_qinfo_lvb {
62 __be32 lvb_free_entry; 62 __be32 lvb_free_entry;
63}; 63};
64 64
65#define OCFS2_ORPHAN_LVB_VERSION 1
66
67struct ocfs2_orphan_scan_lvb {
68 __u8 lvb_version;
69 __u8 lvb_reserved[3];
70 __be32 lvb_os_seqno;
71};
72
65/* ocfs2_inode_lock_full() 'arg_flags' flags */ 73/* ocfs2_inode_lock_full() 'arg_flags' flags */
66/* don't wait on recovery. */ 74/* don't wait on recovery. */
67#define OCFS2_META_LOCK_RECOVERY (0x01) 75#define OCFS2_META_LOCK_RECOVERY (0x01)
@@ -70,6 +78,14 @@ struct ocfs2_qinfo_lvb {
70/* don't block waiting for the downconvert thread, instead return -EAGAIN */ 78/* don't block waiting for the downconvert thread, instead return -EAGAIN */
71#define OCFS2_LOCK_NONBLOCK (0x04) 79#define OCFS2_LOCK_NONBLOCK (0x04)
72 80
81/* Locking subclasses of inode cluster lock */
82enum {
83 OI_LS_NORMAL = 0,
84 OI_LS_PARENT,
85 OI_LS_RENAME1,
86 OI_LS_RENAME2,
87};
88
73int ocfs2_dlm_init(struct ocfs2_super *osb); 89int ocfs2_dlm_init(struct ocfs2_super *osb);
74void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending); 90void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);
75void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); 91void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
@@ -96,23 +112,32 @@ void ocfs2_open_unlock(struct inode *inode);
96int ocfs2_inode_lock_atime(struct inode *inode, 112int ocfs2_inode_lock_atime(struct inode *inode,
97 struct vfsmount *vfsmnt, 113 struct vfsmount *vfsmnt,
98 int *level); 114 int *level);
99int ocfs2_inode_lock_full(struct inode *inode, 115int ocfs2_inode_lock_full_nested(struct inode *inode,
100 struct buffer_head **ret_bh, 116 struct buffer_head **ret_bh,
101 int ex, 117 int ex,
102 int arg_flags); 118 int arg_flags,
119 int subclass);
103int ocfs2_inode_lock_with_page(struct inode *inode, 120int ocfs2_inode_lock_with_page(struct inode *inode,
104 struct buffer_head **ret_bh, 121 struct buffer_head **ret_bh,
105 int ex, 122 int ex,
106 struct page *page); 123 struct page *page);
124/* Variants without special locking class or flags */
125#define ocfs2_inode_lock_full(i, r, e, f)\
126 ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL)
127#define ocfs2_inode_lock_nested(i, b, e, s)\
128 ocfs2_inode_lock_full_nested(i, b, e, 0, s)
107/* 99% of the time we don't want to supply any additional flags -- 129/* 99% of the time we don't want to supply any additional flags --
108 * those are for very specific cases only. */ 130 * those are for very specific cases only. */
109#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full(i, b, e, 0) 131#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL)
110void ocfs2_inode_unlock(struct inode *inode, 132void ocfs2_inode_unlock(struct inode *inode,
111 int ex); 133 int ex);
112int ocfs2_super_lock(struct ocfs2_super *osb, 134int ocfs2_super_lock(struct ocfs2_super *osb,
113 int ex); 135 int ex);
114void ocfs2_super_unlock(struct ocfs2_super *osb, 136void ocfs2_super_unlock(struct ocfs2_super *osb,
115 int ex); 137 int ex);
138int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno);
139void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno);
140
116int ocfs2_rename_lock(struct ocfs2_super *osb); 141int ocfs2_rename_lock(struct ocfs2_super *osb);
117void ocfs2_rename_unlock(struct ocfs2_super *osb); 142void ocfs2_rename_unlock(struct ocfs2_super *osb);
118int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex); 143int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
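
struct ocfs2_orphan_scan_lvb above packs a version byte, three reserved bytes, and a big-endian sequence number into 8 bytes of the lock value block. A user-space sketch of that layout, with __be32 modeled via htonl()/ntohl(); this is an illustration, not the kernel definition:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

#define DEMO_ORPHAN_LVB_VERSION 1

struct demo_orphan_scan_lvb {
        uint8_t  lvb_version;
        uint8_t  lvb_reserved[3];
        uint32_t lvb_os_seqno;          /* stored big-endian on disk/wire */
};

int main(void)
{
        struct demo_orphan_scan_lvb lvb;

        assert(sizeof(lvb) == 8);       /* fits easily inside a DLM LVB */
        lvb.lvb_version = DEMO_ORPHAN_LVB_VERSION;
        lvb.lvb_os_seqno = htonl(42);   /* cpu_to_be32() equivalent */
        printf("version %u, seqno %u\n", lvb.lvb_version,
               (unsigned)ntohl(lvb.lvb_os_seqno));
        return 0;
}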
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c2a87c885b73..aa501d3f93f1 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -187,6 +187,9 @@ static int ocfs2_sync_file(struct file *file,
187 if (err) 187 if (err)
188 goto bail; 188 goto bail;
189 189
190 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
191 goto bail;
192
190 journal = osb->journal->j_journal; 193 journal = osb->journal->j_journal;
191 err = jbd2_journal_force_commit(journal); 194 err = jbd2_journal_force_commit(journal);
192 195
@@ -894,9 +897,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
894 struct ocfs2_super *osb = OCFS2_SB(sb); 897 struct ocfs2_super *osb = OCFS2_SB(sb);
895 struct buffer_head *bh = NULL; 898 struct buffer_head *bh = NULL;
896 handle_t *handle = NULL; 899 handle_t *handle = NULL;
897 int locked[MAXQUOTAS] = {0, 0}; 900 int qtype;
898 int credits, qtype; 901 struct dquot *transfer_from[MAXQUOTAS] = { };
899 struct ocfs2_mem_dqinfo *oinfo; 902 struct dquot *transfer_to[MAXQUOTAS] = { };
900 903
901 mlog_entry("(0x%p, '%.*s')\n", dentry, 904 mlog_entry("(0x%p, '%.*s')\n", dentry,
902 dentry->d_name.len, dentry->d_name.name); 905 dentry->d_name.len, dentry->d_name.name);
@@ -969,30 +972,37 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
969 972
970 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 973 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
971 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 974 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
972 credits = OCFS2_INODE_UPDATE_CREDITS; 975 /*
 976 * Gather pointers to quota structures so that allocation and
 977 * freeing of quota structures happen here and not inside
 978 * vfs_dq_transfer(), where we have lock ordering problems
979 */
973 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid 980 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
974 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 981 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
975 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 982 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
976 oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv; 983 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
977 status = ocfs2_lock_global_qf(oinfo, 1); 984 USRQUOTA);
978 if (status < 0) 985 transfer_from[USRQUOTA] = dqget(sb, inode->i_uid,
986 USRQUOTA);
987 if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) {
988 status = -ESRCH;
979 goto bail_unlock; 989 goto bail_unlock;
980 credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) + 990 }
981 ocfs2_calc_qdel_credits(sb, USRQUOTA);
982 locked[USRQUOTA] = 1;
983 } 991 }
984 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid 992 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
985 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 993 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
986 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 994 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
987 oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv; 995 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
988 status = ocfs2_lock_global_qf(oinfo, 1); 996 GRPQUOTA);
989 if (status < 0) 997 transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid,
998 GRPQUOTA);
999 if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) {
1000 status = -ESRCH;
990 goto bail_unlock; 1001 goto bail_unlock;
991 credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) + 1002 }
992 ocfs2_calc_qdel_credits(sb, GRPQUOTA);
993 locked[GRPQUOTA] = 1;
994 } 1003 }
 995 handle = ocfs2_start_trans(osb, credits);
1004 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
1005 2 * ocfs2_quota_trans_credits(sb));
996 if (IS_ERR(handle)) { 1006 if (IS_ERR(handle)) {
997 status = PTR_ERR(handle); 1007 status = PTR_ERR(handle);
998 mlog_errno(status); 1008 mlog_errno(status);
@@ -1030,12 +1040,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1030bail_commit: 1040bail_commit:
1031 ocfs2_commit_trans(osb, handle); 1041 ocfs2_commit_trans(osb, handle);
1032bail_unlock: 1042bail_unlock:
1033 for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
1034 if (!locked[qtype])
1035 continue;
1036 oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
1037 ocfs2_unlock_global_qf(oinfo, 1);
1038 }
1039 ocfs2_inode_unlock(inode, 1); 1043 ocfs2_inode_unlock(inode, 1);
1040bail_unlock_rw: 1044bail_unlock_rw:
1041 if (size_change) 1045 if (size_change)
@@ -1043,6 +1047,12 @@ bail_unlock_rw:
1043bail: 1047bail:
1044 brelse(bh); 1048 brelse(bh);
1045 1049
1050 /* Release quota pointers in case we acquired them */
1051 for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
1052 dqput(transfer_to[qtype]);
1053 dqput(transfer_from[qtype]);
1054 }
1055
1046 if (!status && attr->ia_valid & ATTR_MODE) { 1056 if (!status && attr->ia_valid & ATTR_MODE) {
1047 status = ocfs2_acl_chmod(inode); 1057 status = ocfs2_acl_chmod(inode);
1048 if (status < 0) 1058 if (status < 0)
@@ -1841,6 +1851,7 @@ relock:
1841 if (ret) 1851 if (ret)
1842 goto out_dio; 1852 goto out_dio;
1843 1853
1854 count = ocount;
1844 ret = generic_write_checks(file, ppos, &count, 1855 ret = generic_write_checks(file, ppos, &count,
1845 S_ISBLK(inode->i_mode)); 1856 S_ISBLK(inode->i_mode));
1846 if (ret) 1857 if (ret)
@@ -1908,8 +1919,10 @@ out_sems:
1908 1919
1909 mutex_unlock(&inode->i_mutex); 1920 mutex_unlock(&inode->i_mutex);
1910 1921
1922 if (written)
1923 ret = written;
1911 mlog_exit(ret); 1924 mlog_exit(ret);
1912 return written ? written : ret; 1925 return ret;
1913} 1926}
1914 1927
1915static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, 1928static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
@@ -2016,7 +2029,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
2016 size_t len, 2029 size_t len,
2017 unsigned int flags) 2030 unsigned int flags)
2018{ 2031{
2019 int ret = 0; 2032 int ret = 0, lock_level = 0;
2020 struct inode *inode = in->f_path.dentry->d_inode; 2033 struct inode *inode = in->f_path.dentry->d_inode;
2021 2034
2022 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, 2035 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
@@ -2027,12 +2040,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
2027 /* 2040 /*
2028 * See the comment in ocfs2_file_aio_read() 2041 * See the comment in ocfs2_file_aio_read()
2029 */ 2042 */
2030 ret = ocfs2_inode_lock(inode, NULL, 0); 2043 ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level);
2031 if (ret < 0) { 2044 if (ret < 0) {
2032 mlog_errno(ret); 2045 mlog_errno(ret);
2033 goto bail; 2046 goto bail;
2034 } 2047 }
2035 ocfs2_inode_unlock(inode, 0); 2048 ocfs2_inode_unlock(inode, lock_level);
2036 2049
2037 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2050 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2038 2051
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 10e1fa87396a..4dc8890ba316 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -215,6 +215,8 @@ bail:
215static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) 215static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
216{ 216{
217 struct ocfs2_find_inode_args *args = opaque; 217 struct ocfs2_find_inode_args *args = opaque;
218 static struct lock_class_key ocfs2_quota_ip_alloc_sem_key,
219 ocfs2_file_ip_alloc_sem_key;
218 220
219 mlog_entry("inode = %p, opaque = %p\n", inode, opaque); 221 mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
220 222
@@ -223,6 +225,15 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
223 if (args->fi_sysfile_type != 0) 225 if (args->fi_sysfile_type != 0)
224 lockdep_set_class(&inode->i_mutex, 226 lockdep_set_class(&inode->i_mutex,
225 &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); 227 &ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
228 if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE ||
229 args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE ||
230 args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
231 args->fi_sysfile_type == LOCAL_GROUP_QUOTA_SYSTEM_INODE)
232 lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
233 &ocfs2_quota_ip_alloc_sem_key);
234 else
235 lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
236 &ocfs2_file_ip_alloc_sem_key);
226 237
227 mlog_exit(0); 238 mlog_exit(0);
228 return 0; 239 return 0;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 9fcd36dcc9a0..467b413bec21 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,7 +7,6 @@
7 7
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/smp_lock.h>
11 10
12#define MLOG_MASK_PREFIX ML_INODE 11#define MLOG_MASK_PREFIX ML_INODE
13#include <cluster/masklog.h> 12#include <cluster/masklog.h>
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a20a0f1e37fd..c48b93ac6b65 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -28,6 +28,8 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/kthread.h> 30#include <linux/kthread.h>
31#include <linux/time.h>
32#include <linux/random.h>
31 33
32#define MLOG_MASK_PREFIX ML_JOURNAL 34#define MLOG_MASK_PREFIX ML_JOURNAL
33#include <cluster/masklog.h> 35#include <cluster/masklog.h>
@@ -52,6 +54,8 @@
52 54
53DEFINE_SPINLOCK(trans_inc_lock); 55DEFINE_SPINLOCK(trans_inc_lock);
54 56
57#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
58
55static int ocfs2_force_read_journal(struct inode *inode); 59static int ocfs2_force_read_journal(struct inode *inode);
56static int ocfs2_recover_node(struct ocfs2_super *osb, 60static int ocfs2_recover_node(struct ocfs2_super *osb,
57 int node_num, int slot_num); 61 int node_num, int slot_num);
@@ -1841,6 +1845,134 @@ bail:
1841 return status; 1845 return status;
1842} 1846}
1843 1847
1848/*
1849 * The scan timer should fire every ORPHAN_SCAN_SCHEDULE_TIMEOUT ms. Add some
1850 * randomness to the timeout to minimize multiple nodes firing the timer at the
1851 * same time.
1852 */
1853static inline unsigned long ocfs2_orphan_scan_timeout(void)
1854{
1855 unsigned long time;
1856
1857 get_random_bytes(&time, sizeof(time));
1858 time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
1859 return msecs_to_jiffies(time);
1860}
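
/*
 * The same jitter arithmetic in plain user-space C: rand() stands in
 * for get_random_bytes() and the result stays in milliseconds rather
 * than being converted to jiffies. All names below are invented.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define DEMO_SCHEDULE_TIMEOUT 300000    /* ms, as in the hunk above */

static unsigned long demo_orphan_scan_timeout(void)
{
        unsigned long t = (unsigned long)rand();

        return DEMO_SCHEDULE_TIMEOUT + (t % 5000);  /* up to ~5 s jitter */
}

int main(void)
{
        srand((unsigned)time(NULL));
        for (int i = 0; i < 3; i++)
                printf("next scan in %lu ms\n", demo_orphan_scan_timeout());
        return 0;
}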
1861
1862/*
1863 * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
1864 * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
1865 * is done to catch any orphans that are left over in orphan directories.
1866 *
1867 * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
1868 * milliseconds. It gets an EX lock on os_lockres and checks the sequence
1869 * number stored in the LVB. If the sequence number has changed, some other
1870 * node has done the scan. This node skips the scan and tracks the
1871 * sequence number. If the sequence number didn't change, it means a scan
1872 * hasn't happened. The node queues a scan and increments the
1873 * sequence number in the LVB.
1874 */
1875void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1876{
1877 struct ocfs2_orphan_scan *os;
1878 int status, i;
1879 u32 seqno = 0;
1880
1881 os = &osb->osb_orphan_scan;
1882
1883 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
1884 goto out;
1885
1886 status = ocfs2_orphan_scan_lock(osb, &seqno);
1887 if (status < 0) {
1888 if (status != -EAGAIN)
1889 mlog_errno(status);
1890 goto out;
1891 }
1892
1893 /* Do not queue the tasks if the volume is being unmounted */
1894 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
1895 goto unlock;
1896
1897 if (os->os_seqno != seqno) {
1898 os->os_seqno = seqno;
1899 goto unlock;
1900 }
1901
1902 for (i = 0; i < osb->max_slots; i++)
1903 ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
1904 NULL);
1905 /*
1906 * We queued a recovery on the orphan slots; increment the sequence
1907 * number and update the LVB so other nodes will skip the scan for a while
1908 */
1909 seqno++;
1910 os->os_count++;
1911 os->os_scantime = CURRENT_TIME;
1912unlock:
1913 ocfs2_orphan_scan_unlock(osb, seqno);
1914out:
1915 return;
1916}
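
/*
 * The skip decision above, modeled with two simulated nodes: a node
 * scans only when the LVB sequence number still matches the one it
 * recorded last time; otherwise another node scanned first and this
 * node just records the new value. Self-contained sketch, invented names.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_node {
        const char *name;
        uint32_t    seen_seqno;         /* like os->os_seqno */
};

/* returns the (possibly bumped) cluster-wide seqno */
static uint32_t demo_try_scan(struct demo_node *n, uint32_t cluster_seqno)
{
        if (n->seen_seqno != cluster_seqno) {
                n->seen_seqno = cluster_seqno;  /* someone else scanned */
                printf("%s: skip, tracking seqno %u\n", n->name, cluster_seqno);
                return cluster_seqno;
        }
        printf("%s: scanning, bumping seqno to %u\n", n->name,
               cluster_seqno + 1);
        return cluster_seqno + 1;               /* written back via LVB */
}

int main(void)
{
        struct demo_node a = { "node-a", 0 }, b = { "node-b", 0 };
        uint32_t seqno = 0;

        seqno = demo_try_scan(&a, seqno);       /* node-a scans, seqno 1 */
        seqno = demo_try_scan(&b, seqno);       /* node-b skips */
        return 0;
}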
1917
1918/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT milliseconds */
1919void ocfs2_orphan_scan_work(struct work_struct *work)
1920{
1921 struct ocfs2_orphan_scan *os;
1922 struct ocfs2_super *osb;
1923
1924 os = container_of(work, struct ocfs2_orphan_scan,
1925 os_orphan_scan_work.work);
1926 osb = os->os_osb;
1927
1928 mutex_lock(&os->os_lock);
1929 ocfs2_queue_orphan_scan(osb);
1930 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
1931 schedule_delayed_work(&os->os_orphan_scan_work,
1932 ocfs2_orphan_scan_timeout());
1933 mutex_unlock(&os->os_lock);
1934}
1935
1936void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
1937{
1938 struct ocfs2_orphan_scan *os;
1939
1940 os = &osb->osb_orphan_scan;
1941 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) {
1942 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
1943 mutex_lock(&os->os_lock);
1944 cancel_delayed_work(&os->os_orphan_scan_work);
1945 mutex_unlock(&os->os_lock);
1946 }
1947}
1948
1949void ocfs2_orphan_scan_init(struct ocfs2_super *osb)
1950{
1951 struct ocfs2_orphan_scan *os;
1952
1953 os = &osb->osb_orphan_scan;
1954 os->os_osb = osb;
1955 os->os_count = 0;
1956 os->os_seqno = 0;
1957 mutex_init(&os->os_lock);
1958 INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work);
1959}
1960
1961void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
1962{
1963 struct ocfs2_orphan_scan *os;
1964
1965 os = &osb->osb_orphan_scan;
1966 os->os_scantime = CURRENT_TIME;
1967 if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
1968 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
1969 else {
1970 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
1971 schedule_delayed_work(&os->os_orphan_scan_work,
1972 ocfs2_orphan_scan_timeout());
1973 }
1974}
1975
1844struct ocfs2_orphan_filldir_priv { 1976struct ocfs2_orphan_filldir_priv {
1845 struct inode *head; 1977 struct inode *head;
1846 struct ocfs2_super *osb; 1978 struct ocfs2_super *osb;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index eb7b76331eb7..2c3222aec622 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -144,6 +144,11 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
144} 144}
145 145
146/* Exported only for the journal struct init code in super.c. Do not call. */ 146/* Exported only for the journal struct init code in super.c. Do not call. */
147void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
148void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
149void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
150void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
151
147void ocfs2_complete_recovery(struct work_struct *work); 152void ocfs2_complete_recovery(struct work_struct *work);
148void ocfs2_wait_for_recovery(struct ocfs2_super *osb); 153void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
149 154
@@ -325,20 +330,27 @@ int ocfs2_journal_dirty(handle_t *handle,
325/* extended attribute block update */ 330/* extended attribute block update */
326#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 331#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
327 332
333/* Update of a single quota block */
334#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1
335
328/* global quotafile inode update, data block */ 336/* global quotafile inode update, data block */
329#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) 337#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
338 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
330 339
340#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS
331/* 341/*
332 * The two writes below can accidentally see global info dirty due 342 * The two writes below can accidentally see global info dirty due
333 * to set_info() quotactl so make them prepared for the writes. 343 * to set_info() quotactl so make them prepared for the writes.
334 */ 344 */
335/* quota data block, global info */ 345/* quota data block, global info */
336/* Write to local quota file */ 346/* Write to local quota file */
337#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1) 347#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
348 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
338 349
339/* global quota data block, local quota data block, global quota inode, 350/* global quota data block, local quota data block, global quota inode,
340 * global quota info */ 351 * global quota info */
341#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3) 352#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
353 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
342 354
343static inline int ocfs2_quota_trans_credits(struct super_block *sb) 355static inline int ocfs2_quota_trans_credits(struct super_block *sb)
344{ 356{
@@ -351,11 +363,6 @@ static inline int ocfs2_quota_trans_credits(struct super_block *sb)
351 return credits; 363 return credits;
352} 364}
353 365
354/* Number of credits needed for removing quota structure from file */
355int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
356/* Number of credits needed for initialization of new quota structure */
357int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
358
359/* group extend. inode update and last group update. */ 366/* group extend. inode update and last group update. */
360#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) 367#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
361 368
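
The credit macros above now compose from OCFS2_QUOTA_BLOCK_UPDATE_CREDITS (1, per this hunk). A quick check of the arithmetic; OCFS2_INODE_UPDATE_CREDITS is not visible in this patch and is assumed to be 2 below purely for illustration:

#include <stdio.h>

#define INODE_UPDATE_CREDITS        2   /* assumed, not from this patch */
#define QUOTA_BLOCK_UPDATE_CREDITS  1   /* as defined in the hunk above */

#define QINFO_WRITE_CREDITS  (INODE_UPDATE_CREDITS + QUOTA_BLOCK_UPDATE_CREDITS)
#define QWRITE_CREDITS       (QINFO_WRITE_CREDITS + QUOTA_BLOCK_UPDATE_CREDITS)
#define QSYNC_CREDITS        (QINFO_WRITE_CREDITS + \
                              2 * QUOTA_BLOCK_UPDATE_CREDITS)

int main(void)
{
        printf("qinfo write: %d\n", QINFO_WRITE_CREDITS);   /* 3 */
        printf("qwrite:      %d\n", QWRITE_CREDITS);        /* 4 */
        printf("qsync:       %d\n", QSYNC_CREDITS);         /* 5 */
        return 0;
}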
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 33464c6b60a2..8601f934010b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -118,7 +118,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
118 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len, 118 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
119 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); 119 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
120 120
121 status = ocfs2_inode_lock(dir, NULL, 0); 121 status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT);
122 if (status < 0) { 122 if (status < 0) {
123 if (status != -ENOENT) 123 if (status != -ENOENT)
124 mlog_errno(status); 124 mlog_errno(status);
@@ -636,7 +636,7 @@ static int ocfs2_link(struct dentry *old_dentry,
636 if (S_ISDIR(inode->i_mode)) 636 if (S_ISDIR(inode->i_mode))
637 return -EPERM; 637 return -EPERM;
638 638
639 err = ocfs2_inode_lock(dir, &parent_fe_bh, 1); 639 err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
640 if (err < 0) { 640 if (err < 0) {
641 if (err != -ENOENT) 641 if (err != -ENOENT)
642 mlog_errno(err); 642 mlog_errno(err);
@@ -800,7 +800,8 @@ static int ocfs2_unlink(struct inode *dir,
800 return -EPERM; 800 return -EPERM;
801 } 801 }
802 802
803 status = ocfs2_inode_lock(dir, &parent_node_bh, 1); 803 status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1,
804 OI_LS_PARENT);
804 if (status < 0) { 805 if (status < 0) {
805 if (status != -ENOENT) 806 if (status != -ENOENT)
806 mlog_errno(status); 807 mlog_errno(status);
@@ -978,7 +979,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
978 inode1 = tmpinode; 979 inode1 = tmpinode;
979 } 980 }
980 /* lock id2 */ 981 /* lock id2 */
981 status = ocfs2_inode_lock(inode2, bh2, 1); 982 status = ocfs2_inode_lock_nested(inode2, bh2, 1,
983 OI_LS_RENAME1);
982 if (status < 0) { 984 if (status < 0) {
983 if (status != -ENOENT) 985 if (status != -ENOENT)
984 mlog_errno(status); 986 mlog_errno(status);
@@ -987,7 +989,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
987 } 989 }
988 990
989 /* lock id1 */ 991 /* lock id1 */
990 status = ocfs2_inode_lock(inode1, bh1, 1); 992 status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2);
991 if (status < 0) { 993 if (status < 0) {
992 /* 994 /*
993 * An error return must mean that no cluster locks 995 * An error return must mean that no cluster locks
@@ -1103,7 +1105,8 @@ static int ocfs2_rename(struct inode *old_dir,
1103 * won't have to concurrently downconvert the inode and the 1105 * won't have to concurrently downconvert the inode and the
1104 * dentry locks. 1106 * dentry locks.
1105 */ 1107 */
1106 status = ocfs2_inode_lock(old_inode, &old_inode_bh, 1); 1108 status = ocfs2_inode_lock_nested(old_inode, &old_inode_bh, 1,
1109 OI_LS_PARENT);
1107 if (status < 0) { 1110 if (status < 0) {
1108 if (status != -ENOENT) 1111 if (status != -ENOENT)
1109 mlog_errno(status); 1112 mlog_errno(status);
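
ocfs2_double_lock() above takes the two inode locks in one fixed order and tags the nested acquisitions with distinct subclasses (OI_LS_RENAME1, OI_LS_RENAME2) so the lock validator can tell them apart. A sketch of the ordering rule that prevents AB-BA deadlock; the comparison direction and all names are illustrative, not taken from this patch:

#include <stdio.h>

struct demo_inode { unsigned long long blkno; };

/* lock both inodes in one global order regardless of argument order */
static void demo_double_lock(struct demo_inode *a, struct demo_inode *b)
{
        struct demo_inode *first = a, *second = b;

        if (first->blkno < second->blkno) {     /* direction illustrative */
                struct demo_inode *t = first;
                first = second;
                second = t;
        }
        printf("lock %llu (RENAME1), then %llu (RENAME2)\n",
               first->blkno, second->blkno);
}

int main(void)
{
        struct demo_inode i1 = { 100 }, i2 = { 200 };

        demo_double_lock(&i1, &i2);     /* both calls lock 200 first */
        demo_double_lock(&i2, &i1);
        return 0;
}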
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1386281950db..39e1d5a39505 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -34,6 +34,7 @@
34#include <linux/workqueue.h> 34#include <linux/workqueue.h>
35#include <linux/kref.h> 35#include <linux/kref.h>
36#include <linux/mutex.h> 36#include <linux/mutex.h>
37#include <linux/lockdep.h>
37#ifndef CONFIG_OCFS2_COMPAT_JBD 38#ifndef CONFIG_OCFS2_COMPAT_JBD
38# include <linux/jbd2.h> 39# include <linux/jbd2.h>
39#else 40#else
@@ -47,6 +48,9 @@
47#include "ocfs2_fs.h" 48#include "ocfs2_fs.h"
48#include "ocfs2_lockid.h" 49#include "ocfs2_lockid.h"
49 50
51/* For struct ocfs2_blockcheck_stats */
52#include "blockcheck.h"
53
50/* Most user visible OCFS2 inodes will have very few pieces of 54/* Most user visible OCFS2 inodes will have very few pieces of
51 * metadata, but larger files (including bitmaps, etc) must be taken 55 * metadata, but larger files (including bitmaps, etc) must be taken
52 * into account when designing an access scheme. We allow a small 56 * into account when designing an access scheme. We allow a small
@@ -149,6 +153,25 @@ struct ocfs2_lock_res {
149 unsigned int l_lock_max_exmode; /* Max wait for EX */ 153 unsigned int l_lock_max_exmode; /* Max wait for EX */
150 unsigned int l_lock_refresh; /* Disk refreshes */ 154 unsigned int l_lock_refresh; /* Disk refreshes */
151#endif 155#endif
156#ifdef CONFIG_DEBUG_LOCK_ALLOC
157 struct lockdep_map l_lockdep_map;
158#endif
159};
160
161enum ocfs2_orphan_scan_state {
162 ORPHAN_SCAN_ACTIVE,
163 ORPHAN_SCAN_INACTIVE
164};
165
166struct ocfs2_orphan_scan {
167 struct mutex os_lock;
168 struct ocfs2_super *os_osb;
169 struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
170 struct delayed_work os_orphan_scan_work;
171 struct timespec os_scantime; /* time this node ran the scan */
172 u32 os_count; /* tracks node specific scans */
173 u32 os_seqno; /* tracks cluster wide scans */
174 atomic_t os_state; /* ACTIVE or INACTIVE */
152}; 175};
153 176
154struct ocfs2_dlm_debug { 177struct ocfs2_dlm_debug {
@@ -201,10 +224,12 @@ enum ocfs2_mount_options
201 OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ 224 OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
202}; 225};
203 226
204#define OCFS2_OSB_SOFT_RO 0x0001 227#define OCFS2_OSB_SOFT_RO 0x0001
205#define OCFS2_OSB_HARD_RO 0x0002 228#define OCFS2_OSB_HARD_RO 0x0002
206#define OCFS2_OSB_ERROR_FS 0x0004 229#define OCFS2_OSB_ERROR_FS 0x0004
207#define OCFS2_DEFAULT_ATIME_QUANTUM 60
230#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008
231
232#define OCFS2_DEFAULT_ATIME_QUANTUM 60
208 233
209struct ocfs2_journal; 234struct ocfs2_journal;
210struct ocfs2_slot_info; 235struct ocfs2_slot_info;
@@ -295,6 +320,7 @@ struct ocfs2_super
295 struct ocfs2_dinode *local_alloc_copy; 320 struct ocfs2_dinode *local_alloc_copy;
296 struct ocfs2_quota_recovery *quota_rec; 321 struct ocfs2_quota_recovery *quota_rec;
297 322
323 struct ocfs2_blockcheck_stats osb_ecc_stats;
298 struct ocfs2_alloc_stats alloc_stats; 324 struct ocfs2_alloc_stats alloc_stats;
299 char dev_str[20]; /* "major,minor" of the device */ 325 char dev_str[20]; /* "major,minor" of the device */
300 326
@@ -341,6 +367,8 @@ struct ocfs2_super
341 unsigned int *osb_orphan_wipes; 367 unsigned int *osb_orphan_wipes;
342 wait_queue_head_t osb_wipe_event; 368 wait_queue_head_t osb_wipe_event;
343 369
370 struct ocfs2_orphan_scan osb_orphan_scan;
371
344 /* used to protect metaecc calculation check of xattr. */ 372 /* used to protect metaecc calculation check of xattr. */
345 spinlock_t osb_xattr_lock; 373 spinlock_t osb_xattr_lock;
346 374
@@ -464,6 +492,18 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
464 spin_unlock(&osb->osb_lock); 492 spin_unlock(&osb->osb_lock);
465} 493}
466 494
495
496static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb,
497 unsigned long flag)
498{
499 unsigned long ret;
500
501 spin_lock(&osb->osb_lock);
502 ret = osb->osb_flags & flag;
503 spin_unlock(&osb->osb_lock);
504 return ret;
505}
506
467static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, 507static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
468 int hard) 508 int hard)
469{ 509{
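
ocfs2_test_osb_flag() added above samples osb_flags under osb_lock so the test is atomic with respect to concurrent set and clear. The same shape in portable pthread C, with invented demo_* names:

#include <pthread.h>
#include <stdio.h>

#define DEMO_DROP_DENTRY_LOCK_IMMED 0x0008

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long demo_flags;

static unsigned long demo_test_flag(unsigned long flag)
{
        unsigned long ret;

        pthread_mutex_lock(&demo_lock);         /* like spin_lock(osb_lock) */
        ret = demo_flags & flag;
        pthread_mutex_unlock(&demo_lock);
        return ret;
}

int main(void)
{
        pthread_mutex_lock(&demo_lock);
        demo_flags |= DEMO_DROP_DENTRY_LOCK_IMMED;
        pthread_mutex_unlock(&demo_lock);
        printf("flag set: %s\n", demo_test_flag(DEMO_DROP_DENTRY_LOCK_IMMED)
               ? "yes" : "no");
        return 0;
}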
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index a53ce87481bf..c212cf5a2bdf 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -48,6 +48,7 @@ enum ocfs2_lock_type {
48 OCFS2_LOCK_TYPE_FLOCK, 48 OCFS2_LOCK_TYPE_FLOCK,
49 OCFS2_LOCK_TYPE_QINFO, 49 OCFS2_LOCK_TYPE_QINFO,
50 OCFS2_LOCK_TYPE_NFS_SYNC, 50 OCFS2_LOCK_TYPE_NFS_SYNC,
51 OCFS2_LOCK_TYPE_ORPHAN_SCAN,
51 OCFS2_NUM_LOCK_TYPES 52 OCFS2_NUM_LOCK_TYPES
52}; 53};
53 54
@@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
85 case OCFS2_LOCK_TYPE_NFS_SYNC: 86 case OCFS2_LOCK_TYPE_NFS_SYNC:
86 c = 'Y'; 87 c = 'Y';
87 break; 88 break;
89 case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
90 c = 'P';
91 break;
88 default: 92 default:
89 c = '\0'; 93 c = '\0';
90 } 94 }
@@ -104,6 +108,8 @@ static char *ocfs2_lock_type_strings[] = {
104 [OCFS2_LOCK_TYPE_OPEN] = "Open", 108 [OCFS2_LOCK_TYPE_OPEN] = "Open",
105 [OCFS2_LOCK_TYPE_FLOCK] = "Flock", 109 [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
106 [OCFS2_LOCK_TYPE_QINFO] = "Quota", 110 [OCFS2_LOCK_TYPE_QINFO] = "Quota",
111 [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
112 [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
107}; 113};
108 114
109static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 115static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 7365e2e08706..3fb96fcd4c81 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -50,7 +50,6 @@ struct ocfs2_mem_dqinfo {
50 unsigned int dqi_chunks; /* Number of chunks in local quota file */ 50 unsigned int dqi_chunks; /* Number of chunks in local quota file */
51 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ 51 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
52 unsigned int dqi_syncms; /* How often should we sync with other nodes */ 52 unsigned int dqi_syncms; /* How often should we sync with other nodes */
53 unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */
54 struct list_head dqi_chunk; /* List of chunks */ 53 struct list_head dqi_chunk; /* List of chunks */
55 struct inode *dqi_gqinode; /* Global quota file inode */ 54 struct inode *dqi_gqinode; /* Global quota file inode */
56 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ 55 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 1ed0f7c86869..44f2a5e1d042 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -23,6 +23,7 @@
23#include "sysfile.h" 23#include "sysfile.h"
24#include "dlmglue.h" 24#include "dlmglue.h"
25#include "uptodate.h" 25#include "uptodate.h"
26#include "super.h"
26#include "quota.h" 27#include "quota.h"
27 28
28static struct workqueue_struct *ocfs2_quota_wq = NULL; 29static struct workqueue_struct *ocfs2_quota_wq = NULL;
@@ -69,6 +70,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
69 d->dqb_curspace = cpu_to_le64(m->dqb_curspace); 70 d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
70 d->dqb_btime = cpu_to_le64(m->dqb_btime); 71 d->dqb_btime = cpu_to_le64(m->dqb_btime);
71 d->dqb_itime = cpu_to_le64(m->dqb_itime); 72 d->dqb_itime = cpu_to_le64(m->dqb_itime);
73 d->dqb_pad1 = d->dqb_pad2 = 0;
72} 74}
73 75
74static int ocfs2_global_is_id(void *dp, struct dquot *dquot) 76static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
@@ -113,6 +115,15 @@ int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
113 int rc = 0; 115 int rc = 0;
114 struct buffer_head *tmp = *bh; 116 struct buffer_head *tmp = *bh;
115 117
118 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
119 ocfs2_error(inode->i_sb,
120 "Quota file %llu is probably corrupted! Requested "
121 "to read block %Lu but file has size only %Lu\n",
122 (unsigned long long)OCFS2_I(inode)->ip_blkno,
123 (unsigned long long)v_block,
124 (unsigned long long)i_size_read(inode));
125 return -EIO;
126 }
116 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, 127 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
117 ocfs2_validate_quota_block); 128 ocfs2_validate_quota_block);
118 if (rc) 129 if (rc)
@@ -211,14 +222,13 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
211 222
212 mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA); 223 mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
213 if (gqinode->i_size < off + len) { 224 if (gqinode->i_size < off + len) {
214 down_write(&OCFS2_I(gqinode)->ip_alloc_sem); 225 loff_t rounded_end =
215 err = ocfs2_extend_no_holes(gqinode, off + len, off); 226 ocfs2_align_bytes_to_blocks(sb, off + len);
216 up_write(&OCFS2_I(gqinode)->ip_alloc_sem); 227
217 if (err < 0) 228 /* Space is already allocated in ocfs2_global_read_dquot() */
218 goto out;
219 err = ocfs2_simple_size_update(gqinode, 229 err = ocfs2_simple_size_update(gqinode,
220 oinfo->dqi_gqi_bh, 230 oinfo->dqi_gqi_bh,
221 off + len); 231 rounded_end);
222 if (err < 0) 232 if (err < 0)
223 goto out; 233 goto out;
224 new = 1; 234 new = 1;
@@ -234,7 +244,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
234 } 244 }
235 if (err) { 245 if (err) {
236 mlog_errno(err); 246 mlog_errno(err);
237 return err; 247 goto out;
238 } 248 }
239 lock_buffer(bh); 249 lock_buffer(bh);
240 if (new) 250 if (new)
@@ -342,7 +352,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
342 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); 352 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
343 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); 353 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
344 oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms); 354 oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
345 oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
346 oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); 355 oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
347 oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); 356 oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
348 oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); 357 oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
@@ -352,7 +361,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
352 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); 361 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
353 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); 362 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
354 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, 363 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
355 oinfo->dqi_syncjiff); 364 msecs_to_jiffies(oinfo->dqi_syncms));
356 365
357out_err: 366out_err:
358 mlog_exit(status); 367 mlog_exit(status);
@@ -402,13 +411,36 @@ int ocfs2_global_write_info(struct super_block *sb, int type)
402 return err; 411 return err;
403} 412}
404 413
414static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
415{
416 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
417
418 /*
419 * We may need to allocate tree blocks and a leaf block but not the
420 * root block
421 */
422 return oinfo->dqi_gi.dqi_qtree_depth;
423}
424
425static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
426{
427 /* We modify all the allocated blocks, tree root, and info block */
428 return (ocfs2_global_qinit_alloc(sb, type) + 2) *
429 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS;
430}
431
405/* Read in information from global quota file and acquire a reference to it. 432/* Read in information from global quota file and acquire a reference to it.
406 * dquot_acquire() has already started the transaction and locked quota file */ 433 * dquot_acquire() has already started the transaction and locked quota file */
407int ocfs2_global_read_dquot(struct dquot *dquot) 434int ocfs2_global_read_dquot(struct dquot *dquot)
408{ 435{
409 int err, err2, ex = 0; 436 int err, err2, ex = 0;
410 struct ocfs2_mem_dqinfo *info = 437 struct super_block *sb = dquot->dq_sb;
411 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 438 int type = dquot->dq_type;
439 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
440 struct ocfs2_super *osb = OCFS2_SB(sb);
441 struct inode *gqinode = info->dqi_gqinode;
442 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
443 handle_t *handle = NULL;
412 444
413 err = ocfs2_qinfo_lock(info, 0); 445 err = ocfs2_qinfo_lock(info, 0);
414 if (err < 0) 446 if (err < 0)
@@ -419,13 +451,33 @@ int ocfs2_global_read_dquot(struct dquot *dquot)
419 OCFS2_DQUOT(dquot)->dq_use_count++; 451 OCFS2_DQUOT(dquot)->dq_use_count++;
420 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; 452 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
421 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; 453 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
454 ocfs2_qinfo_unlock(info, 0);
455
422 if (!dquot->dq_off) { /* No real quota entry? */ 456 if (!dquot->dq_off) { /* No real quota entry? */
423 /* Upgrade to exclusive lock for allocation */
424 err = ocfs2_qinfo_lock(info, 1);
425 if (err < 0)
426 goto out_qlock;
427 ex = 1; 457 ex = 1;
458 /*
 459 * Add blocks to the quota file before we start a transaction, since
 460 * taking allocator locks ranks above a transaction start in the lock order
461 */
462 WARN_ON(journal_current_handle());
463 down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
464 err = ocfs2_extend_no_holes(gqinode,
465 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
466 gqinode->i_size);
467 up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
468 if (err < 0)
469 goto out;
428 } 470 }
471
472 handle = ocfs2_start_trans(osb,
473 ocfs2_calc_global_qinit_credits(sb, type));
474 if (IS_ERR(handle)) {
475 err = PTR_ERR(handle);
476 goto out;
477 }
478 err = ocfs2_qinfo_lock(info, ex);
479 if (err < 0)
480 goto out_trans;
429 err = qtree_write_dquot(&info->dqi_gi, dquot); 481 err = qtree_write_dquot(&info->dqi_gi, dquot);
430 if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) { 482 if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
431 err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type); 483 err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
@@ -435,7 +487,11 @@ int ocfs2_global_read_dquot(struct dquot *dquot)
435out_qlock: 487out_qlock:
436 if (ex) 488 if (ex)
437 ocfs2_qinfo_unlock(info, 1); 489 ocfs2_qinfo_unlock(info, 1);
438 ocfs2_qinfo_unlock(info, 0); 490 else
491 ocfs2_qinfo_unlock(info, 0);
492out_trans:
493 if (handle)
494 ocfs2_commit_trans(osb, handle);
439out: 495out:
440 if (err < 0) 496 if (err < 0)
441 mlog_errno(err); 497 mlog_errno(err);
@@ -605,7 +661,7 @@ static void qsync_work_fn(struct work_struct *work)
605 661
606 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); 662 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
607 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, 663 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
608 oinfo->dqi_syncjiff); 664 msecs_to_jiffies(oinfo->dqi_syncms));
609} 665}
610 666
611/* 667/*
@@ -633,20 +689,18 @@ out:
633 return status; 689 return status;
634} 690}
635 691
636int ocfs2_calc_qdel_credits(struct super_block *sb, int type) 692static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
637{ 693{
638 struct ocfs2_mem_dqinfo *oinfo; 694 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
639 int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 695 /*
640 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; 696 * We modify tree, leaf block, global info, local chunk header,
641 697 * global and local inode; OCFS2_QINFO_WRITE_CREDITS already
642 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) 698 * accounts for inode update
643 return 0; 699 */
644 700 return (oinfo->dqi_gi.dqi_qtree_depth + 2) *
645 oinfo = sb_dqinfo(sb, type)->dqi_priv; 701 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
646 /* We modify tree, leaf block, global info, local chunk header, 702 OCFS2_QINFO_WRITE_CREDITS +
647 * global and local inode */ 703 OCFS2_INODE_UPDATE_CREDITS;
648 return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
649 2 * OCFS2_INODE_UPDATE_CREDITS;
650} 704}
651 705
652static int ocfs2_release_dquot(struct dquot *dquot) 706static int ocfs2_release_dquot(struct dquot *dquot)
@@ -678,33 +732,10 @@ out:
678 return status; 732 return status;
679} 733}
680 734
681int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
682{
683 struct ocfs2_mem_dqinfo *oinfo;
684 int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
685 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
686 struct ocfs2_dinode *lfe, *gfe;
687
688 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
689 return 0;
690
691 oinfo = sb_dqinfo(sb, type)->dqi_priv;
692 gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
693 lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
694 /* We can extend local file + global file. In local file we
695 * can modify info, chunk header block and dquot block. In
696 * global file we can modify info, tree and leaf block */
697 return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
698 ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
699 3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
700}
701
702static int ocfs2_acquire_dquot(struct dquot *dquot) 735static int ocfs2_acquire_dquot(struct dquot *dquot)
703{ 736{
704 handle_t *handle;
705 struct ocfs2_mem_dqinfo *oinfo = 737 struct ocfs2_mem_dqinfo *oinfo =
706 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 738 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
707 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
708 int status = 0; 739 int status = 0;
709 740
710 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 741 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
@@ -713,16 +744,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
713 status = ocfs2_lock_global_qf(oinfo, 1); 744 status = ocfs2_lock_global_qf(oinfo, 1);
714 if (status < 0) 745 if (status < 0)
715 goto out; 746 goto out;
716 handle = ocfs2_start_trans(osb,
717 ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
718 if (IS_ERR(handle)) {
719 status = PTR_ERR(handle);
720 mlog_errno(status);
721 goto out_ilock;
722 }
723 status = dquot_acquire(dquot); 747 status = dquot_acquire(dquot);
724 ocfs2_commit_trans(osb, handle);
725out_ilock:
726 ocfs2_unlock_global_qf(oinfo, 1); 748 ocfs2_unlock_global_qf(oinfo, 1);
727out: 749out:
728 mlog_exit(status); 750 mlog_exit(status);
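
The new bounds check in ocfs2_read_quota_block() treats a read past i_size as corruption and fails with -EIO. The predicate in isolation, assuming a 4 KiB block size for the demo:

#include <stdint.h>
#include <stdio.h>

static int demo_block_in_file(uint64_t i_size, unsigned blocksize_bits,
                              uint64_t v_block)
{
        /* valid only when the block lies below i_size; -EIO otherwise */
        return v_block < (i_size >> blocksize_bits);
}

int main(void)
{
        uint64_t size = 8192;           /* two 4 KiB blocks */

        printf("block 1: %s\n", demo_block_in_file(size, 12, 1) ? "ok" : "EIO");
        printf("block 2: %s\n", demo_block_in_file(size, 12, 2) ? "ok" : "EIO");
        return 0;
}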
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 07deec5e9721..bdb09cb6e1fe 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -20,6 +20,7 @@
20#include "sysfile.h" 20#include "sysfile.h"
21#include "dlmglue.h" 21#include "dlmglue.h"
22#include "quota.h" 22#include "quota.h"
23#include "uptodate.h"
23 24
24/* Number of local quota structures per block */ 25/* Number of local quota structures per block */
25static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) 26static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -100,7 +101,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
100 handle_t *handle; 101 handle_t *handle;
101 int status; 102 int status;
102 103
103 handle = ocfs2_start_trans(OCFS2_SB(sb), 1); 104 handle = ocfs2_start_trans(OCFS2_SB(sb),
105 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
104 if (IS_ERR(handle)) { 106 if (IS_ERR(handle)) {
105 status = PTR_ERR(handle); 107 status = PTR_ERR(handle);
106 mlog_errno(status); 108 mlog_errno(status);
@@ -444,10 +446,6 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
444 446
445 mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type); 447 mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
446 448
447 status = ocfs2_lock_global_qf(oinfo, 1);
448 if (status < 0)
449 goto out;
450
451 list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) { 449 list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
452 chunk = rchunk->rc_chunk; 450 chunk = rchunk->rc_chunk;
453 hbh = NULL; 451 hbh = NULL;
@@ -480,12 +478,18 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
480 type); 478 type);
481 goto out_put_bh; 479 goto out_put_bh;
482 } 480 }
481 status = ocfs2_lock_global_qf(oinfo, 1);
482 if (status < 0) {
483 mlog_errno(status);
484 goto out_put_dquot;
485 }
486
483 handle = ocfs2_start_trans(OCFS2_SB(sb), 487 handle = ocfs2_start_trans(OCFS2_SB(sb),
484 OCFS2_QSYNC_CREDITS); 488 OCFS2_QSYNC_CREDITS);
485 if (IS_ERR(handle)) { 489 if (IS_ERR(handle)) {
486 status = PTR_ERR(handle); 490 status = PTR_ERR(handle);
487 mlog_errno(status); 491 mlog_errno(status);
488 goto out_put_dquot; 492 goto out_drop_lock;
489 } 493 }
490 mutex_lock(&sb_dqopt(sb)->dqio_mutex); 494 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
491 spin_lock(&dq_data_lock); 495 spin_lock(&dq_data_lock);
@@ -523,6 +527,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
523out_commit: 527out_commit:
524 mutex_unlock(&sb_dqopt(sb)->dqio_mutex); 528 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
525 ocfs2_commit_trans(OCFS2_SB(sb), handle); 529 ocfs2_commit_trans(OCFS2_SB(sb), handle);
530out_drop_lock:
531 ocfs2_unlock_global_qf(oinfo, 1);
526out_put_dquot: 532out_put_dquot:
527 dqput(dquot); 533 dqput(dquot);
528out_put_bh: 534out_put_bh:
@@ -537,8 +543,6 @@ out_put_bh:
537 if (status < 0) 543 if (status < 0)
538 break; 544 break;
539 } 545 }
540 ocfs2_unlock_global_qf(oinfo, 1);
541out:
542 if (status < 0) 546 if (status < 0)
543 free_recovery_list(&(rec->r_list[type])); 547 free_recovery_list(&(rec->r_list[type]));
544 mlog_exit(status); 548 mlog_exit(status);
@@ -608,7 +612,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
608 goto out_bh; 612 goto out_bh;
609 /* Mark quota file as clean if we are recovering quota file of 613 /* Mark quota file as clean if we are recovering quota file of
610 * some other node. */ 614 * some other node. */
611 handle = ocfs2_start_trans(osb, 1); 615 handle = ocfs2_start_trans(osb,
616 OCFS2_LOCAL_QINFO_WRITE_CREDITS);
612 if (IS_ERR(handle)) { 617 if (IS_ERR(handle)) {
613 status = PTR_ERR(handle); 618 status = PTR_ERR(handle);
614 mlog_errno(status); 619 mlog_errno(status);
@@ -655,6 +660,9 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
655 struct ocfs2_quota_recovery *rec; 660 struct ocfs2_quota_recovery *rec;
656 int locked = 0; 661 int locked = 0;
657 662
 663 /* We don't need the lock here, and we have to acquire quota file
 664 * locks which will later depend on this lock */
665 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
658 info->dqi_maxblimit = 0x7fffffffffffffffLL; 666 info->dqi_maxblimit = 0x7fffffffffffffffLL;
659 info->dqi_maxilimit = 0x7fffffffffffffffLL; 667 info->dqi_maxilimit = 0x7fffffffffffffffLL;
660 oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); 668 oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
@@ -733,6 +741,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
733 goto out_err; 741 goto out_err;
734 } 742 }
735 743
744 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
736 return 0; 745 return 0;
737out_err: 746out_err:
738 if (oinfo) { 747 if (oinfo) {
@@ -746,6 +755,7 @@ out_err:
746 kfree(oinfo); 755 kfree(oinfo);
747 } 756 }
748 brelse(bh); 757 brelse(bh);
758 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
749 return -1; 759 return -1;
750} 760}
751 761
@@ -933,7 +943,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
933 struct ocfs2_local_disk_chunk *dchunk; 943 struct ocfs2_local_disk_chunk *dchunk;
934 int status; 944 int status;
935 handle_t *handle; 945 handle_t *handle;
936 struct buffer_head *bh = NULL; 946 struct buffer_head *bh = NULL, *dbh = NULL;
937 u64 p_blkno; 947 u64 p_blkno;
938 948
939 /* We are protected by dqio_sem so no locking needed */ 949 /* We are protected by dqio_sem so no locking needed */
@@ -957,32 +967,35 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
957 mlog_errno(status); 967 mlog_errno(status);
958 goto out; 968 goto out;
959 } 969 }
 970 /* Credits: the local quota info block and the two new blocks we initialize */
971 handle = ocfs2_start_trans(OCFS2_SB(sb),
972 OCFS2_LOCAL_QINFO_WRITE_CREDITS +
973 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
974 if (IS_ERR(handle)) {
975 status = PTR_ERR(handle);
976 mlog_errno(status);
977 goto out;
978 }
960 979
980 /* Initialize chunk header */
961 down_read(&OCFS2_I(lqinode)->ip_alloc_sem); 981 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
962 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 982 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
963 &p_blkno, NULL, NULL); 983 &p_blkno, NULL, NULL);
964 up_read(&OCFS2_I(lqinode)->ip_alloc_sem); 984 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
965 if (status < 0) { 985 if (status < 0) {
966 mlog_errno(status); 986 mlog_errno(status);
967 goto out; 987 goto out_trans;
968 } 988 }
969 bh = sb_getblk(sb, p_blkno); 989 bh = sb_getblk(sb, p_blkno);
970 if (!bh) { 990 if (!bh) {
971 status = -ENOMEM; 991 status = -ENOMEM;
972 mlog_errno(status); 992 mlog_errno(status);
973 goto out; 993 goto out_trans;
974 } 994 }
975 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; 995 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
976 996 ocfs2_set_new_buffer_uptodate(lqinode, bh);
977 handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
978 if (IS_ERR(handle)) {
979 status = PTR_ERR(handle);
980 mlog_errno(status);
981 goto out;
982 }
983
984 status = ocfs2_journal_access_dq(handle, lqinode, bh, 997 status = ocfs2_journal_access_dq(handle, lqinode, bh,
985 OCFS2_JOURNAL_ACCESS_WRITE); 998 OCFS2_JOURNAL_ACCESS_CREATE);
986 if (status < 0) { 999 if (status < 0) {
987 mlog_errno(status); 1000 mlog_errno(status);
988 goto out_trans; 1001 goto out_trans;
@@ -992,7 +1005,6 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
992 memset(dchunk->dqc_bitmap, 0, 1005 memset(dchunk->dqc_bitmap, 0,
993 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - 1006 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
994 OCFS2_QBLK_RESERVED_SPACE); 1007 OCFS2_QBLK_RESERVED_SPACE);
995 set_buffer_uptodate(bh);
996 unlock_buffer(bh); 1008 unlock_buffer(bh);
997 status = ocfs2_journal_dirty(handle, bh); 1009 status = ocfs2_journal_dirty(handle, bh);
998 if (status < 0) { 1010 if (status < 0) {
@@ -1000,6 +1012,38 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1000 goto out_trans; 1012 goto out_trans;
1001 } 1013 }
1002 1014
1015 /* Initialize new block with structures */
1016 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1017 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
1018 &p_blkno, NULL, NULL);
1019 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1020 if (status < 0) {
1021 mlog_errno(status);
1022 goto out_trans;
1023 }
1024 dbh = sb_getblk(sb, p_blkno);
1025 if (!dbh) {
1026 status = -ENOMEM;
1027 mlog_errno(status);
1028 goto out_trans;
1029 }
1030 ocfs2_set_new_buffer_uptodate(lqinode, dbh);
1031 status = ocfs2_journal_access_dq(handle, lqinode, dbh,
1032 OCFS2_JOURNAL_ACCESS_CREATE);
1033 if (status < 0) {
1034 mlog_errno(status);
1035 goto out_trans;
1036 }
1037 lock_buffer(dbh);
1038 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
1039 unlock_buffer(dbh);
1040 status = ocfs2_journal_dirty(handle, dbh);
1041 if (status < 0) {
1042 mlog_errno(status);
1043 goto out_trans;
1044 }
1045
1046 /* Update local quotafile info */
1003 oinfo->dqi_blocks += 2; 1047 oinfo->dqi_blocks += 2;
1004 oinfo->dqi_chunks++; 1048 oinfo->dqi_chunks++;
1005 status = ocfs2_local_write_info(sb, type); 1049 status = ocfs2_local_write_info(sb, type);
@@ -1024,6 +1068,7 @@ out_trans:
1024 ocfs2_commit_trans(OCFS2_SB(sb), handle); 1068 ocfs2_commit_trans(OCFS2_SB(sb), handle);
1025out: 1069out:
1026 brelse(bh); 1070 brelse(bh);
1071 brelse(dbh);
1027 kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); 1072 kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
1028 return ERR_PTR(status); 1073 return ERR_PTR(status);
1029} 1074}
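
The rewritten ocfs2_local_quota_add_chunk() now starts its transaction before touching any buffer and reserves credits for everything it will dirty: the local quota info plus both freshly allocated blocks, with OCFS2_JOURNAL_ACCESS_CREATE telling the journal the block contents are new. A hedged sketch of the same start-early, credit-everything pattern in plain jbd2 terms (generic API, not the ocfs2 wrappers):

    handle_t *handle = jbd2_journal_start(journal, 1 + 2); /* info + 2 new blocks */
    if (IS_ERR(handle))
            return PTR_ERR(handle);

    jbd2_journal_get_create_access(handle, bh); /* CREATE: no old contents to save */
    lock_buffer(bh);
    memset(bh->b_data, 0, bh->b_size);
    unlock_buffer(bh);
    jbd2_journal_dirty_metadata(handle, bh);

    jbd2_journal_stop(handle);
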
@@ -1041,6 +1086,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1041 struct ocfs2_local_disk_chunk *dchunk; 1086 struct ocfs2_local_disk_chunk *dchunk;
1042 int epb = ol_quota_entries_per_block(sb); 1087 int epb = ol_quota_entries_per_block(sb);
1043 unsigned int chunk_blocks; 1088 unsigned int chunk_blocks;
1089 struct buffer_head *bh;
1090 u64 p_blkno;
1044 int status; 1091 int status;
1045 handle_t *handle; 1092 handle_t *handle;
1046 1093
@@ -1068,12 +1115,49 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1068 mlog_errno(status); 1115 mlog_errno(status);
1069 goto out; 1116 goto out;
1070 } 1117 }
1071 handle = ocfs2_start_trans(OCFS2_SB(sb), 2); 1118
1119 /* Get buffer from the just added block */
1120 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1121 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
1122 &p_blkno, NULL, NULL);
1123 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1124 if (status < 0) {
1125 mlog_errno(status);
1126 goto out;
1127 }
1128 bh = sb_getblk(sb, p_blkno);
1129 if (!bh) {
1130 status = -ENOMEM;
1131 mlog_errno(status);
1132 goto out;
1133 }
1134 ocfs2_set_new_buffer_uptodate(lqinode, bh);
1135
1136 /* Local quota info, chunk header and the new block we initialize */
1137 handle = ocfs2_start_trans(OCFS2_SB(sb),
1138 OCFS2_LOCAL_QINFO_WRITE_CREDITS +
1139 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
1072 if (IS_ERR(handle)) { 1140 if (IS_ERR(handle)) {
1073 status = PTR_ERR(handle); 1141 status = PTR_ERR(handle);
1074 mlog_errno(status); 1142 mlog_errno(status);
1075 goto out; 1143 goto out;
1076 } 1144 }
1145 /* Zero created block */
1146 status = ocfs2_journal_access_dq(handle, lqinode, bh,
1147 OCFS2_JOURNAL_ACCESS_CREATE);
1148 if (status < 0) {
1149 mlog_errno(status);
1150 goto out_trans;
1151 }
1152 lock_buffer(bh);
1153 memset(bh->b_data, 0, sb->s_blocksize);
1154 unlock_buffer(bh);
1155 status = ocfs2_journal_dirty(handle, bh);
1156 if (status < 0) {
1157 mlog_errno(status);
1158 goto out_trans;
1159 }
1160 /* Update chunk header */
1077 status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, 1161 status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
1078 OCFS2_JOURNAL_ACCESS_WRITE); 1162 OCFS2_JOURNAL_ACCESS_WRITE);
1079 if (status < 0) { 1163 if (status < 0) {
@@ -1090,6 +1174,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1090 mlog_errno(status); 1174 mlog_errno(status);
1091 goto out_trans; 1175 goto out_trans;
1092 } 1176 }
1177 /* Update file header */
1093 oinfo->dqi_blocks++; 1178 oinfo->dqi_blocks++;
1094 status = ocfs2_local_write_info(sb, type); 1179 status = ocfs2_local_write_info(sb, type);
1095 if (status < 0) { 1180 if (status < 0) {
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index fcd120f1493a..e49c41050264 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -17,6 +17,7 @@
17 * General Public License for more details. 17 * General Public License for more details.
18 */ 18 */
19 19
20#include <linux/kernel.h>
20#include <linux/crc32.h> 21#include <linux/crc32.h>
21#include <linux/module.h> 22#include <linux/module.h>
22 23
@@ -153,7 +154,7 @@ static int status_map[] = {
153 154
154static int dlm_status_to_errno(enum dlm_status status) 155static int dlm_status_to_errno(enum dlm_status status)
155{ 156{
156 BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0]))); 157 BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map));
157 158
158 return status_map[status]; 159 return status_map[status];
159} 160}
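
The old assertion open-coded ARRAY_SIZE() and used '>', which rejects neither a negative status nor the one-past-the-end index. The corrected guard is the canonical one for any table lookup; a standalone illustration:

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    static const int status_map[8] = { /* ... */ };

    int lookup(int idx)
    {
            /* idx == 8 slipped past the old "idx > ARRAY_SIZE" test */
            if (idx < 0 || idx >= ARRAY_SIZE(status_map))
                    return -1;
            return status_map[idx];
    }
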
@@ -236,6 +237,16 @@ static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
236 return dlm_status_to_errno(lksb->lksb_o2dlm.status); 237 return dlm_status_to_errno(lksb->lksb_o2dlm.status);
237} 238}
238 239
240/*
241 * o2dlm always has a "valid" LVB. If the dlm loses track of the LVB
242 * contents, it will zero out the LVB. Thus the caller can always trust
243 * the contents.
244 */
245static int o2cb_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb)
246{
247 return 1;
248}
249
239static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb) 250static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb)
240{ 251{
241 return (void *)(lksb->lksb_o2dlm.lvb); 252 return (void *)(lksb->lksb_o2dlm.lvb);
@@ -354,6 +365,7 @@ static struct ocfs2_stack_operations o2cb_stack_ops = {
354 .dlm_lock = o2cb_dlm_lock, 365 .dlm_lock = o2cb_dlm_lock,
355 .dlm_unlock = o2cb_dlm_unlock, 366 .dlm_unlock = o2cb_dlm_unlock,
356 .lock_status = o2cb_dlm_lock_status, 367 .lock_status = o2cb_dlm_lock_status,
368 .lvb_valid = o2cb_dlm_lvb_valid,
357 .lock_lvb = o2cb_dlm_lvb, 369 .lock_lvb = o2cb_dlm_lvb,
358 .dump_lksb = o2cb_dump_lksb, 370 .dump_lksb = o2cb_dump_lksb,
359}; 371};
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 9b76d41a8ac6..ff4c798a5635 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -738,6 +738,13 @@ static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
738 return lksb->lksb_fsdlm.sb_status; 738 return lksb->lksb_fsdlm.sb_status;
739} 739}
740 740
741static int user_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb)
742{
743 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
744
745 return !invalid;
746}
747
741static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb) 748static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
742{ 749{
743 if (!lksb->lksb_fsdlm.sb_lvbptr) 750 if (!lksb->lksb_fsdlm.sb_lvbptr)
@@ -873,6 +880,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
873 .dlm_lock = user_dlm_lock, 880 .dlm_lock = user_dlm_lock,
874 .dlm_unlock = user_dlm_unlock, 881 .dlm_unlock = user_dlm_unlock,
875 .lock_status = user_dlm_lock_status, 882 .lock_status = user_dlm_lock_status,
883 .lvb_valid = user_dlm_lvb_valid,
876 .lock_lvb = user_dlm_lvb, 884 .lock_lvb = user_dlm_lvb,
877 .plock = user_plock, 885 .plock = user_plock,
878 .dump_lksb = user_dlm_dump_lksb, 886 .dump_lksb = user_dlm_dump_lksb,
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 68b668b0e60a..3f2f1c45b7b6 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -6,7 +6,7 @@
6 * Code which implements an OCFS2 specific interface to underlying 6 * Code which implements an OCFS2 specific interface to underlying
7 * cluster stacks. 7 * cluster stacks.
8 * 8 *
9 * Copyright (C) 2007 Oracle. All rights reserved. 9 * Copyright (C) 2007, 2009 Oracle. All rights reserved.
10 * 10 *
11 * This program is free software; you can redistribute it and/or 11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public 12 * modify it under the terms of the GNU General Public
@@ -271,11 +271,12 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
271} 271}
272EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); 272EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
273 273
274/* 274int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb)
275 * Why don't we cast to ocfs2_meta_lvb? The "clean" answer is that we 275{
276 * don't cast at the glue level. The real answer is that the header 276 return active_stack->sp_ops->lvb_valid(lksb);
277 * ordering is nigh impossible. 277}
278 */ 278EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
279
279void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb) 280void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
280{ 281{
281 return active_stack->sp_ops->lock_lvb(lksb); 282 return active_stack->sp_ops->lock_lvb(lksb);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index c571af375ef8..03a44d60eac9 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -186,6 +186,11 @@ struct ocfs2_stack_operations {
186 int (*lock_status)(union ocfs2_dlm_lksb *lksb); 186 int (*lock_status)(union ocfs2_dlm_lksb *lksb);
187 187
188 /* 188 /*
189 * Return non-zero if the LVB is valid.
190 */
191 int (*lvb_valid)(union ocfs2_dlm_lksb *lksb);
192
193 /*
189 * Pull the lvb pointer off of the stack-specific lksb. 194 * Pull the lvb pointer off of the stack-specific lksb.
190 */ 195 */
191 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); 196 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
@@ -252,6 +257,7 @@ int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
252 struct ocfs2_lock_res *astarg); 257 struct ocfs2_lock_res *astarg);
253 258
254int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); 259int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
260int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb);
255void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); 261void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
256void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); 262void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
257 263
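
With .lvb_valid in the stack operations, ocfs2 can ask whether the lock value block survived the last recovery before trusting it: o2dlm always says yes because it zeroes a lost LVB, while fs/dlm flags the loss with DLM_SBF_VALNOTVALID. A hedged sketch of a caller-side use of the new predicate (the helper names here are illustrative, not the actual ocfs2 call site):

    /* Sketch: only consume the cached metadata the stack vouches for. */
    if (ocfs2_dlm_lvb_valid(&lockres->l_lksb))
            refresh_inode_from_lvb(inode, ocfs2_dlm_lvb(&lockres->l_lksb));
    else
            refresh_inode_from_disk(inode);     /* hypothetical slow path */
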
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8439f6b324b9..73a16d4666dc 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -923,14 +923,23 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
923 int nr) 923 int nr)
924{ 924{
925 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 925 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
926 int ret;
926 927
927 if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) 928 if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
928 return 0; 929 return 0;
929 if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data) 930
931 if (!buffer_jbd(bg_bh))
930 return 1; 932 return 1;
931 933
934 jbd_lock_bh_state(bg_bh);
932 bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; 935 bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
933 return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); 936 if (bg)
937 ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
938 else
939 ret = 1;
940 jbd_unlock_bh_state(bg_bh);
941
942 return ret;
934} 943}
935 944
936static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, 945static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
@@ -1885,6 +1894,7 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1885 unsigned int tmp; 1894 unsigned int tmp;
1886 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; 1895 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1887 struct ocfs2_group_desc *undo_bg = NULL; 1896 struct ocfs2_group_desc *undo_bg = NULL;
1897 int cluster_bitmap = 0;
1888 1898
1889 mlog_entry_void(); 1899 mlog_entry_void();
1890 1900
@@ -1905,18 +1915,28 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1905 } 1915 }
1906 1916
1907 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1917 if (ocfs2_is_cluster_bitmap(alloc_inode))
1908 undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data; 1918 cluster_bitmap = 1;
1919
1920 if (cluster_bitmap) {
1921 jbd_lock_bh_state(group_bh);
1922 undo_bg = (struct ocfs2_group_desc *)
1923 bh2jh(group_bh)->b_committed_data;
1924 BUG_ON(!undo_bg);
1925 }
1909 1926
1910 tmp = num_bits; 1927 tmp = num_bits;
1911 while(tmp--) { 1928 while(tmp--) {
1912 ocfs2_clear_bit((bit_off + tmp), 1929 ocfs2_clear_bit((bit_off + tmp),
1913 (unsigned long *) bg->bg_bitmap); 1930 (unsigned long *) bg->bg_bitmap);
1914 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1931 if (cluster_bitmap)
1915 ocfs2_set_bit(bit_off + tmp, 1932 ocfs2_set_bit(bit_off + tmp,
1916 (unsigned long *) undo_bg->bg_bitmap); 1933 (unsigned long *) undo_bg->bg_bitmap);
1917 } 1934 }
1918 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 1935 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1919 1936
1937 if (cluster_bitmap)
1938 jbd_unlock_bh_state(group_bh);
1939
1920 status = ocfs2_journal_dirty(handle, group_bh); 1940 status = ocfs2_journal_dirty(handle, group_bh);
1921 if (status < 0) 1941 if (status < 0)
1922 mlog_errno(status); 1942 mlog_errno(status);
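
Both suballoc.c hunks close the same race: b_committed_data hangs off the buffer's journal_head and can be detached by the journal, so it must only be dereferenced under jbd_lock_bh_state(), and a NULL pointer now means "no committed copy, treat the bit as free". The safe access pattern, condensed:

    /* Sketch: touch b_committed_data only under the bh state lock. */
    jbd_lock_bh_state(bh);
    bg = (struct ocfs2_group_desc *)bh2jh(bh)->b_committed_data;
    ret = bg ? !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap) : 1;
    jbd_unlock_bh_state(bh);
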
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 79ff8d9d37e0..a3f8871d21fd 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -42,6 +42,7 @@
42#include <linux/mount.h> 42#include <linux/mount.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/quotaops.h> 44#include <linux/quotaops.h>
45#include <linux/smp_lock.h>
45 46
46#define MLOG_MASK_PREFIX ML_SUPER 47#define MLOG_MASK_PREFIX ML_SUPER
47#include <cluster/masklog.h> 48#include <cluster/masklog.h>
@@ -118,15 +119,16 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
118static int ocfs2_check_volume(struct ocfs2_super *osb); 119static int ocfs2_check_volume(struct ocfs2_super *osb);
119static int ocfs2_verify_volume(struct ocfs2_dinode *di, 120static int ocfs2_verify_volume(struct ocfs2_dinode *di,
120 struct buffer_head *bh, 121 struct buffer_head *bh,
121 u32 sectsize); 122 u32 sectsize,
123 struct ocfs2_blockcheck_stats *stats);
122static int ocfs2_initialize_super(struct super_block *sb, 124static int ocfs2_initialize_super(struct super_block *sb,
123 struct buffer_head *bh, 125 struct buffer_head *bh,
124 int sector_size); 126 int sector_size,
127 struct ocfs2_blockcheck_stats *stats);
125static int ocfs2_get_sector(struct super_block *sb, 128static int ocfs2_get_sector(struct super_block *sb,
126 struct buffer_head **bh, 129 struct buffer_head **bh,
127 int block, 130 int block,
128 int sect_size); 131 int sect_size);
129static void ocfs2_write_super(struct super_block *sb);
130static struct inode *ocfs2_alloc_inode(struct super_block *sb); 132static struct inode *ocfs2_alloc_inode(struct super_block *sb);
131static void ocfs2_destroy_inode(struct inode *inode); 133static void ocfs2_destroy_inode(struct inode *inode);
132static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); 134static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
@@ -141,7 +143,6 @@ static const struct super_operations ocfs2_sops = {
141 .clear_inode = ocfs2_clear_inode, 143 .clear_inode = ocfs2_clear_inode,
142 .delete_inode = ocfs2_delete_inode, 144 .delete_inode = ocfs2_delete_inode,
143 .sync_fs = ocfs2_sync_fs, 145 .sync_fs = ocfs2_sync_fs,
144 .write_super = ocfs2_write_super,
145 .put_super = ocfs2_put_super, 146 .put_super = ocfs2_put_super,
146 .remount_fs = ocfs2_remount, 147 .remount_fs = ocfs2_remount,
147 .show_options = ocfs2_show_options, 148 .show_options = ocfs2_show_options,
@@ -204,10 +205,10 @@ static const match_table_t tokens = {
204#ifdef CONFIG_DEBUG_FS 205#ifdef CONFIG_DEBUG_FS
205static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) 206static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
206{ 207{
207 int out = 0;
208 int i;
209 struct ocfs2_cluster_connection *cconn = osb->cconn; 208 struct ocfs2_cluster_connection *cconn = osb->cconn;
210 struct ocfs2_recovery_map *rm = osb->recovery_map; 209 struct ocfs2_recovery_map *rm = osb->recovery_map;
210 struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
211 int i, out = 0;
211 212
212 out += snprintf(buf + out, len - out, 213 out += snprintf(buf + out, len - out,
213 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", 214 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
@@ -232,20 +233,24 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
232 "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount", 233 "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
233 osb->s_mount_opt, osb->s_atime_quantum); 234 osb->s_mount_opt, osb->s_atime_quantum);
234 235
235 out += snprintf(buf + out, len - out, 236 if (cconn) {
236 "%10s => Stack: %s Name: %*s Version: %d.%d\n", 237 out += snprintf(buf + out, len - out,
237 "Cluster", 238 "%10s => Stack: %s Name: %*s "
238 (*osb->osb_cluster_stack == '\0' ? 239 "Version: %d.%d\n", "Cluster",
239 "o2cb" : osb->osb_cluster_stack), 240 (*osb->osb_cluster_stack == '\0' ?
240 cconn->cc_namelen, cconn->cc_name, 241 "o2cb" : osb->osb_cluster_stack),
241 cconn->cc_version.pv_major, cconn->cc_version.pv_minor); 242 cconn->cc_namelen, cconn->cc_name,
243 cconn->cc_version.pv_major,
244 cconn->cc_version.pv_minor);
245 }
242 246
243 spin_lock(&osb->dc_task_lock); 247 spin_lock(&osb->dc_task_lock);
244 out += snprintf(buf + out, len - out, 248 out += snprintf(buf + out, len - out,
245 "%10s => Pid: %d Count: %lu WakeSeq: %lu " 249 "%10s => Pid: %d Count: %lu WakeSeq: %lu "
246 "WorkSeq: %lu\n", "DownCnvt", 250 "WorkSeq: %lu\n", "DownCnvt",
247 task_pid_nr(osb->dc_task), osb->blocked_lock_count, 251 (osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
248 osb->dc_wake_sequence, osb->dc_work_sequence); 252 osb->blocked_lock_count, osb->dc_wake_sequence,
253 osb->dc_work_sequence);
249 spin_unlock(&osb->dc_task_lock); 254 spin_unlock(&osb->dc_task_lock);
250 255
251 spin_lock(&osb->osb_lock); 256 spin_lock(&osb->osb_lock);
@@ -265,14 +270,15 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
265 270
266 out += snprintf(buf + out, len - out, 271 out += snprintf(buf + out, len - out,
267 "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit", 272 "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit",
268 task_pid_nr(osb->commit_task), osb->osb_commit_interval, 273 (osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
274 osb->osb_commit_interval,
269 atomic_read(&osb->needs_checkpoint)); 275 atomic_read(&osb->needs_checkpoint));
270 276
271 out += snprintf(buf + out, len - out, 277 out += snprintf(buf + out, len - out,
272 "%10s => State: %d NumTxns: %d TxnId: %lu\n", 278 "%10s => State: %d TxnId: %lu NumTxns: %d\n",
273 "Journal", osb->journal->j_state, 279 "Journal", osb->journal->j_state,
274 atomic_read(&osb->journal->j_num_trans), 280 osb->journal->j_trans_id,
275 osb->journal->j_trans_id); 281 atomic_read(&osb->journal->j_num_trans));
276 282
277 out += snprintf(buf + out, len - out, 283 out += snprintf(buf + out, len - out,
278 "%10s => GlobalAllocs: %d LocalAllocs: %d " 284 "%10s => GlobalAllocs: %d LocalAllocs: %d "
@@ -298,9 +304,18 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
298 atomic_read(&osb->s_num_inodes_stolen)); 304 atomic_read(&osb->s_num_inodes_stolen));
299 spin_unlock(&osb->osb_lock); 305 spin_unlock(&osb->osb_lock);
300 306
307 out += snprintf(buf + out, len - out, "OrphanScan => ");
308 out += snprintf(buf + out, len - out, "Local: %u Global: %u ",
309 os->os_count, os->os_seqno);
310 out += snprintf(buf + out, len - out, " Last Scan: ");
311 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
312 out += snprintf(buf + out, len - out, "Disabled\n");
313 else
314 out += snprintf(buf + out, len - out, "%lu seconds ago\n",
315 (get_seconds() - os->os_scantime.tv_sec));
316
301 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", 317 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
302 "Slots", "Num", "RecoGen"); 318 "Slots", "Num", "RecoGen");
303
304 for (i = 0; i < osb->max_slots; ++i) { 319 for (i = 0; i < osb->max_slots; ++i) {
305 out += snprintf(buf + out, len - out, 320 out += snprintf(buf + out, len - out,
306 "%10s %c %3d %10d\n", 321 "%10s %c %3d %10d\n",
@@ -365,24 +380,12 @@ static struct file_operations ocfs2_osb_debug_fops = {
365 .llseek = generic_file_llseek, 380 .llseek = generic_file_llseek,
366}; 381};
367 382
368/*
369 * write_super and sync_fs ripped right out of ext3.
370 */
371static void ocfs2_write_super(struct super_block *sb)
372{
373 if (mutex_trylock(&sb->s_lock) != 0)
374 BUG();
375 sb->s_dirt = 0;
376}
377
378static int ocfs2_sync_fs(struct super_block *sb, int wait) 383static int ocfs2_sync_fs(struct super_block *sb, int wait)
379{ 384{
380 int status; 385 int status;
381 tid_t target; 386 tid_t target;
382 struct ocfs2_super *osb = OCFS2_SB(sb); 387 struct ocfs2_super *osb = OCFS2_SB(sb);
383 388
384 sb->s_dirt = 0;
385
386 if (ocfs2_is_hard_readonly(osb)) 389 if (ocfs2_is_hard_readonly(osb))
387 return -EROFS; 390 return -EROFS;
388 391
@@ -555,7 +558,7 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
555 */ 558 */
556 559
557#if BITS_PER_LONG == 32 560#if BITS_PER_LONG == 32
558# if defined(CONFIG_LBD) 561# if defined(CONFIG_LBDAF)
559 BUILD_BUG_ON(sizeof(sector_t) != 8); 562 BUILD_BUG_ON(sizeof(sector_t) != 8);
560 /* 563 /*
561 * We might be limited by page cache size. 564 * We might be limited by page cache size.
@@ -595,6 +598,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
595 struct mount_options parsed_options; 598 struct mount_options parsed_options;
596 struct ocfs2_super *osb = OCFS2_SB(sb); 599 struct ocfs2_super *osb = OCFS2_SB(sb);
597 600
601 lock_kernel();
602
598 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { 603 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
599 ret = -EINVAL; 604 ret = -EINVAL;
600 goto out; 605 goto out;
@@ -698,12 +703,14 @@ unlock_osb:
698 ocfs2_set_journal_params(osb); 703 ocfs2_set_journal_params(osb);
699 } 704 }
700out: 705out:
706 unlock_kernel();
701 return ret; 707 return ret;
702} 708}
703 709
704static int ocfs2_sb_probe(struct super_block *sb, 710static int ocfs2_sb_probe(struct super_block *sb,
705 struct buffer_head **bh, 711 struct buffer_head **bh,
706 int *sector_size) 712 int *sector_size,
713 struct ocfs2_blockcheck_stats *stats)
707{ 714{
708 int status, tmpstat; 715 int status, tmpstat;
709 struct ocfs1_vol_disk_hdr *hdr; 716 struct ocfs1_vol_disk_hdr *hdr;
@@ -713,7 +720,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
713 *bh = NULL; 720 *bh = NULL;
714 721
715 /* may be > 512 */ 722 /* may be > 512 */
716 *sector_size = bdev_hardsect_size(sb->s_bdev); 723 *sector_size = bdev_logical_block_size(sb->s_bdev);
717 if (*sector_size > OCFS2_MAX_BLOCKSIZE) { 724 if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
718 mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", 725 mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
719 *sector_size, OCFS2_MAX_BLOCKSIZE); 726 *sector_size, OCFS2_MAX_BLOCKSIZE);
@@ -769,7 +776,9 @@ static int ocfs2_sb_probe(struct super_block *sb,
769 goto bail; 776 goto bail;
770 } 777 }
771 di = (struct ocfs2_dinode *) (*bh)->b_data; 778 di = (struct ocfs2_dinode *) (*bh)->b_data;
772 status = ocfs2_verify_volume(di, *bh, blksize); 779 memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
780 spin_lock_init(&stats->b_lock);
781 status = ocfs2_verify_volume(di, *bh, blksize, stats);
773 if (status >= 0) 782 if (status >= 0)
774 goto bail; 783 goto bail;
775 brelse(*bh); 784 brelse(*bh);
@@ -975,6 +984,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
975 struct ocfs2_super *osb = NULL; 984 struct ocfs2_super *osb = NULL;
976 struct buffer_head *bh = NULL; 985 struct buffer_head *bh = NULL;
977 char nodestr[8]; 986 char nodestr[8];
987 struct ocfs2_blockcheck_stats stats;
978 988
979 mlog_entry("%p, %p, %i", sb, data, silent); 989 mlog_entry("%p, %p, %i", sb, data, silent);
980 990
@@ -984,13 +994,13 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
984 } 994 }
985 995
986 /* probe for superblock */ 996 /* probe for superblock */
987 status = ocfs2_sb_probe(sb, &bh, &sector_size); 997 status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats);
988 if (status < 0) { 998 if (status < 0) {
989 mlog(ML_ERROR, "superblock probe failed!\n"); 999 mlog(ML_ERROR, "superblock probe failed!\n");
990 goto read_super_error; 1000 goto read_super_error;
991 } 1001 }
992 1002
993 status = ocfs2_initialize_super(sb, bh, sector_size); 1003 status = ocfs2_initialize_super(sb, bh, sector_size, &stats);
994 osb = OCFS2_SB(sb); 1004 osb = OCFS2_SB(sb);
995 if (status < 0) { 1005 if (status < 0) {
996 mlog_errno(status); 1006 mlog_errno(status);
@@ -1100,6 +1110,18 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1100 goto read_super_error; 1110 goto read_super_error;
1101 } 1111 }
1102 1112
1113 if (ocfs2_meta_ecc(osb)) {
1114 status = ocfs2_blockcheck_stats_debugfs_install(
1115 &osb->osb_ecc_stats,
1116 osb->osb_debug_root);
1117 if (status) {
1118 mlog(ML_ERROR,
1119 "Unable to create blockcheck statistics "
1120 "files\n");
1121 goto read_super_error;
1122 }
1123 }
1124
1103 status = ocfs2_mount_volume(sb); 1125 status = ocfs2_mount_volume(sb);
1104 if (osb->root_inode) 1126 if (osb->root_inode)
1105 inode = igrab(osb->root_inode); 1127 inode = igrab(osb->root_inode);
@@ -1160,6 +1182,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1160 atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS); 1182 atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
1161 wake_up(&osb->osb_mount_event); 1183 wake_up(&osb->osb_mount_event);
1162 1184
1185 /* Start this when the mount is almost sure of being successful */
1186 ocfs2_orphan_scan_start(osb);
1187
1163 mlog_exit(status); 1188 mlog_exit(status);
1164 return status; 1189 return status;
1165 1190
@@ -1189,14 +1214,31 @@ static int ocfs2_get_sb(struct file_system_type *fs_type,
1189 mnt); 1214 mnt);
1190} 1215}
1191 1216
1217static void ocfs2_kill_sb(struct super_block *sb)
1218{
1219 struct ocfs2_super *osb = OCFS2_SB(sb);
1220
1221 /* Failed mount? */
1222 if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED)
1223 goto out;
1224
1225 /* Prevent further queueing of inode drop events */
1226 spin_lock(&dentry_list_lock);
1227 ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
1228 spin_unlock(&dentry_list_lock);
1229 /* Wait for work to finish and/or remove it */
1230 cancel_work_sync(&osb->dentry_lock_work);
1231out:
1232 kill_block_super(sb);
1233}
1234
1192static struct file_system_type ocfs2_fs_type = { 1235static struct file_system_type ocfs2_fs_type = {
1193 .owner = THIS_MODULE, 1236 .owner = THIS_MODULE,
1194 .name = "ocfs2", 1237 .name = "ocfs2",
1195 .get_sb = ocfs2_get_sb, /* is this called when we mount 1238 .get_sb = ocfs2_get_sb, /* is this called when we mount
1196 * the fs? */ 1239 * the fs? */
1197 .kill_sb = kill_block_super, /* set to the generic one 1240 .kill_sb = ocfs2_kill_sb,
1198 * right now, but do we 1241
1199 * need to change that? */
1200 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, 1242 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
1201 .next = NULL 1243 .next = NULL
1202}; 1244};
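
Swapping kill_block_super for a filesystem-specific kill_sb lets ocfs2 flush its dentry-lock work before the generic teardown tears the superblock out from under it. The wrapper shape generalizes; a sketch with hypothetical names:

    /* Sketch: fs-specific quiesce, then the stock block-device teardown. */
    static void myfs_kill_sb(struct super_block *sb)
    {
            struct myfs_sb_info *sbi = sb->s_fs_info;   /* hypothetical */

            if (sbi)
                    cancel_work_sync(&sbi->cleanup_work);
            kill_block_super(sb);
    }
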
@@ -1550,9 +1592,13 @@ static void ocfs2_put_super(struct super_block *sb)
1550{ 1592{
1551 mlog_entry("(0x%p)\n", sb); 1593 mlog_entry("(0x%p)\n", sb);
1552 1594
1595 lock_kernel();
1596
1553 ocfs2_sync_blockdev(sb); 1597 ocfs2_sync_blockdev(sb);
1554 ocfs2_dismount_volume(sb, 0); 1598 ocfs2_dismount_volume(sb, 0);
1555 1599
1600 unlock_kernel();
1601
1556 mlog_exit_void(); 1602 mlog_exit_void();
1557} 1603}
1558 1604
@@ -1766,13 +1812,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
1766 } 1812 }
1767 1813
1768 status = ocfs2_truncate_log_init(osb); 1814 status = ocfs2_truncate_log_init(osb);
1769 if (status < 0) { 1815 if (status < 0)
1770 mlog_errno(status); 1816 mlog_errno(status);
1771 goto leave;
1772 }
1773
1774 if (ocfs2_mount_local(osb))
1775 goto leave;
1776 1817
1777leave: 1818leave:
1778 if (unlock_super) 1819 if (unlock_super)
@@ -1796,6 +1837,15 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1796 1837
1797 debugfs_remove(osb->osb_ctxt); 1838 debugfs_remove(osb->osb_ctxt);
1798 1839
1840 /*
1841 * Flush inode dropping work queue so that deletes are
1842 * performed while the filesystem is still working
1843 */
1844 ocfs2_drop_all_dl_inodes(osb);
1845
1846 /* Orphan scan should be stopped as early as possible */
1847 ocfs2_orphan_scan_stop(osb);
1848
1799 ocfs2_disable_quotas(osb); 1849 ocfs2_disable_quotas(osb);
1800 1850
1801 ocfs2_shutdown_local_alloc(osb); 1851 ocfs2_shutdown_local_alloc(osb);
@@ -1839,6 +1889,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1839 if (osb->cconn) 1889 if (osb->cconn)
1840 ocfs2_dlm_shutdown(osb, hangup_needed); 1890 ocfs2_dlm_shutdown(osb, hangup_needed);
1841 1891
1892 ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
1842 debugfs_remove(osb->osb_debug_root); 1893 debugfs_remove(osb->osb_debug_root);
1843 1894
1844 if (hangup_needed) 1895 if (hangup_needed)
@@ -1886,7 +1937,8 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
1886 1937
1887static int ocfs2_initialize_super(struct super_block *sb, 1938static int ocfs2_initialize_super(struct super_block *sb,
1888 struct buffer_head *bh, 1939 struct buffer_head *bh,
1889 int sector_size) 1940 int sector_size,
1941 struct ocfs2_blockcheck_stats *stats)
1890{ 1942{
1891 int status; 1943 int status;
1892 int i, cbits, bbits; 1944 int i, cbits, bbits;
@@ -1945,11 +1997,16 @@ static int ocfs2_initialize_super(struct super_block *sb,
1945 atomic_set(&osb->alloc_stats.bg_allocs, 0); 1997 atomic_set(&osb->alloc_stats.bg_allocs, 0);
1946 atomic_set(&osb->alloc_stats.bg_extends, 0); 1998 atomic_set(&osb->alloc_stats.bg_extends, 0);
1947 1999
2000 /* Copy the blockcheck stats from the superblock probe */
2001 osb->osb_ecc_stats = *stats;
2002
1948 ocfs2_init_node_maps(osb); 2003 ocfs2_init_node_maps(osb);
1949 2004
1950 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 2005 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
1951 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 2006 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1952 2007
2008 ocfs2_orphan_scan_init(osb);
2009
1953 status = ocfs2_recovery_init(osb); 2010 status = ocfs2_recovery_init(osb);
1954 if (status) { 2011 if (status) {
1955 mlog(ML_ERROR, "Unable to initialize recovery state\n"); 2012 mlog(ML_ERROR, "Unable to initialize recovery state\n");
@@ -2175,7 +2232,8 @@ bail:
2175 */ 2232 */
2176static int ocfs2_verify_volume(struct ocfs2_dinode *di, 2233static int ocfs2_verify_volume(struct ocfs2_dinode *di,
2177 struct buffer_head *bh, 2234 struct buffer_head *bh,
2178 u32 blksz) 2235 u32 blksz,
2236 struct ocfs2_blockcheck_stats *stats)
2179{ 2237{
2180 int status = -EAGAIN; 2238 int status = -EAGAIN;
2181 2239
@@ -2188,7 +2246,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
2188 OCFS2_FEATURE_INCOMPAT_META_ECC) { 2246 OCFS2_FEATURE_INCOMPAT_META_ECC) {
2189 status = ocfs2_block_check_validate(bh->b_data, 2247 status = ocfs2_block_check_validate(bh->b_data,
2190 bh->b_size, 2248 bh->b_size,
2191 &di->i_check); 2249 &di->i_check,
2250 stats);
2192 if (status) 2251 if (status)
2193 goto out; 2252 goto out;
2194 } 2253 }
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index ab713ebdd546..40e53702948c 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -50,6 +50,10 @@ static inline int is_in_system_inode_array(struct ocfs2_super *osb,
50 int type, 50 int type,
51 u32 slot); 51 u32 slot);
52 52
53#ifdef CONFIG_DEBUG_LOCK_ALLOC
54static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
55#endif
56
53static inline int is_global_system_inode(int type) 57static inline int is_global_system_inode(int type)
54{ 58{
55 return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE && 59 return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE &&
@@ -118,6 +122,21 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
118 inode = NULL; 122 inode = NULL;
119 goto bail; 123 goto bail;
120 } 124 }
125#ifdef CONFIG_DEBUG_LOCK_ALLOC
126 if (type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
127 type == LOCAL_GROUP_QUOTA_SYSTEM_INODE ||
128 type == JOURNAL_SYSTEM_INODE) {
129 /* Ignore inode lock on these inodes as the lock does not
130 * really belong to any process and lockdep cannot handle
131 * that */
132 OCFS2_I(inode)->ip_inode_lockres.l_lockdep_map.key = NULL;
133 } else {
134 lockdep_init_map(&OCFS2_I(inode)->ip_inode_lockres.
135 l_lockdep_map,
136 ocfs2_system_inodes[type].si_name,
137 &ocfs2_sysfile_cluster_lock_key[type], 0);
138 }
139#endif
121bail: 140bail:
122 141
123 return inode; 142 return inode;
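
The sysfile.c hunk gives every system-inode type its own static lock_class_key, so lockdep can distinguish, say, the per-slot local alloc inode locks, while quota and journal inodes opt out because their cluster locks are held across process lifetimes, which lockdep cannot model. The per-type re-keying pattern in isolation (all names here are illustrative):

    #ifdef CONFIG_DEBUG_LOCK_ALLOC
    static struct lock_class_key my_keys[NUM_TYPES];    /* hypothetical */

    /* Re-key the lockdep map per object class rather than per object. */
    lockdep_init_map(&obj->lockres.l_lockdep_map,
                     type_names[type], &my_keys[type], 0);
    #endif
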
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 15631019dc63..d1a27cda984f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1052,7 +1052,8 @@ static int ocfs2_xattr_block_get(struct inode *inode,
1052 struct ocfs2_xattr_block *xb; 1052 struct ocfs2_xattr_block *xb;
1053 struct ocfs2_xattr_value_root *xv; 1053 struct ocfs2_xattr_value_root *xv;
1054 size_t size; 1054 size_t size;
1055 int ret = -ENODATA, name_offset, name_len, block_off, i; 1055 int ret = -ENODATA, name_offset, name_len, i;
1056 int uninitialized_var(block_off);
1056 1057
1057 xs->bucket = ocfs2_xattr_bucket_new(inode); 1058 xs->bucket = ocfs2_xattr_bucket_new(inode);
1058 if (!xs->bucket) { 1059 if (!xs->bucket) {
@@ -3154,7 +3155,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
3154 le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash)); 3155 le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
3155 if (func) { 3156 if (func) {
3156 ret = func(inode, bucket, para); 3157 ret = func(inode, bucket, para);
3157 if (ret) 3158 if (ret && ret != -ERANGE)
3158 mlog_errno(ret); 3159 mlog_errno(ret);
3159 /* Fall through to bucket_relse() */ 3160 /* Fall through to bucket_relse() */
3160 } 3161 }
@@ -3261,7 +3262,8 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
3261 ocfs2_list_xattr_bucket, 3262 ocfs2_list_xattr_bucket,
3262 &xl); 3263 &xl);
3263 if (ret) { 3264 if (ret) {
3264 mlog_errno(ret); 3265 if (ret != -ERANGE)
3266 mlog_errno(ret);
3265 goto out; 3267 goto out;
3266 } 3268 }
3267 3269
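
The xattr hunks stop logging -ERANGE because it is an expected answer rather than a fault: listxattr()-style callers routinely probe with a buffer that is too small. The standard userspace two-call pattern that produces it:

    #include <sys/xattr.h>
    #include <stdlib.h>

    /* Probe, allocate, retry: -1/ERANGE on a short buffer is normal. */
    ssize_t need = listxattr(path, NULL, 0);    /* size query */
    if (need < 0)
            return -1;
    char *names = malloc(need);
    if (!names)
            return -1;
    need = listxattr(path, names, need);        /* can still race to ERANGE */
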
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 834b2331f6b3..d17e774eaf45 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -11,21 +11,6 @@
11#include <linux/mpage.h> 11#include <linux/mpage.h>
12#include "omfs.h" 12#include "omfs.h"
13 13
14static int omfs_sync_file(struct file *file, struct dentry *dentry,
15 int datasync)
16{
17 struct inode *inode = dentry->d_inode;
18 int err;
19
20 err = sync_mapping_buffers(inode->i_mapping);
21 if (!(inode->i_state & I_DIRTY))
22 return err;
23 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
24 return err;
25 err |= omfs_sync_inode(inode);
26 return err ? -EIO : 0;
27}
28
29static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) 14static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset)
30{ 15{
31 return (sbi->s_sys_blocksize - offset - 16 return (sbi->s_sys_blocksize - offset -
@@ -344,7 +329,7 @@ struct file_operations omfs_file_operations = {
344 .aio_read = generic_file_aio_read, 329 .aio_read = generic_file_aio_read,
345 .aio_write = generic_file_aio_write, 330 .aio_write = generic_file_aio_write,
346 .mmap = generic_file_mmap, 331 .mmap = generic_file_mmap,
347 .fsync = omfs_sync_file, 332 .fsync = simple_fsync,
348 .splice_read = generic_file_splice_read, 333 .splice_read = generic_file_splice_read,
349}; 334};
350 335
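
omfs_sync_file() was an open-coded copy of what the VFS now provides as simple_fsync(), so the file_operations simply point at the shared helper. From the fs/libfs.c of this era (reproduced from memory, so treat as approximate), the helper does the same buffer flush plus an inode writeback:

    int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
    {
            struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL };
            struct inode *inode = dentry->d_inode;
            int err, ret;

            ret = sync_mapping_buffers(inode->i_mapping);
            if (!(inode->i_state & I_DIRTY))
                    return ret;
            if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
                    return ret;
            err = sync_inode(inode, &wbc);
            return ret ? ret : err;
    }
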
diff --git a/fs/open.c b/fs/open.c
index bdfbf03615a4..dd98e8076024 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -378,63 +378,63 @@ SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64);
378#endif 378#endif
379#endif /* BITS_PER_LONG == 32 */ 379#endif /* BITS_PER_LONG == 32 */
380 380
381SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) 381
382int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
382{ 383{
383 struct file *file; 384 struct inode *inode = file->f_path.dentry->d_inode;
384 struct inode *inode; 385 long ret;
385 long ret = -EINVAL;
386 386
387 if (offset < 0 || len <= 0) 387 if (offset < 0 || len <= 0)
388 goto out; 388 return -EINVAL;
389 389
390 /* Return error if mode is not supported */ 390 /* Return error if mode is not supported */
391 ret = -EOPNOTSUPP;
392 if (mode && !(mode & FALLOC_FL_KEEP_SIZE)) 391 if (mode && !(mode & FALLOC_FL_KEEP_SIZE))
393 goto out; 392 return -EOPNOTSUPP;
394 393
395 ret = -EBADF;
396 file = fget(fd);
397 if (!file)
398 goto out;
399 if (!(file->f_mode & FMODE_WRITE)) 394 if (!(file->f_mode & FMODE_WRITE))
400 goto out_fput; 395 return -EBADF;
401 /* 396 /*
402 * Revalidate the write permissions, in case security policy has 397 * Revalidate the write permissions, in case security policy has
403 * changed since the files were opened. 398 * changed since the files were opened.
404 */ 399 */
405 ret = security_file_permission(file, MAY_WRITE); 400 ret = security_file_permission(file, MAY_WRITE);
406 if (ret) 401 if (ret)
407 goto out_fput; 402 return ret;
408 403
409 inode = file->f_path.dentry->d_inode;
410
411 ret = -ESPIPE;
412 if (S_ISFIFO(inode->i_mode)) 404 if (S_ISFIFO(inode->i_mode))
413 goto out_fput; 405 return -ESPIPE;
414 406
415 ret = -ENODEV;
416 /* 407 /*
417 * Let individual file system decide if it supports preallocation 408 * Let individual file system decide if it supports preallocation
418 * for directories or not. 409 * for directories or not.
419 */ 410 */
420 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) 411 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
421 goto out_fput; 412 return -ENODEV;
422 413
423 ret = -EFBIG;
424 /* Check for wrap through zero too */ 414 /* Check for wrap through zero too */
425 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 415 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
426 goto out_fput; 416 return -EFBIG;
427 417
428 if (inode->i_op->fallocate) 418 if (!inode->i_op->fallocate)
429 ret = inode->i_op->fallocate(inode, mode, offset, len); 419 return -EOPNOTSUPP;
430 else
431 ret = -EOPNOTSUPP;
432 420
433out_fput: 421 return inode->i_op->fallocate(inode, mode, offset, len);
434 fput(file);
435out:
436 return ret;
437} 422}
423
424SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
425{
426 struct file *file;
427 int error = -EBADF;
428
429 file = fget(fd);
430 if (file) {
431 error = do_fallocate(file, mode, offset, len);
432 fput(file);
433 }
434
435 return error;
436}
437
438#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS 438#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
439asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len) 439asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len)
440{ 440{
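
Splitting the syscall body out as do_fallocate() gives in-kernel code a struct-file-based entry point that keeps every permission and range check of the fallocate(2) path. A hedged sketch of an in-kernel caller (illustrative; the real consumers arrived in later patches):

    /* Sketch: preallocate from kernel code once a struct file is in hand. */
    long ret = do_fallocate(filp, FALLOC_FL_KEEP_SIZE, pos, count);
    if (ret)
            printk(KERN_WARNING "preallocation failed: %ld\n", ret);
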
@@ -612,7 +612,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
612 612
613 audit_inode(NULL, dentry); 613 audit_inode(NULL, dentry);
614 614
615 err = mnt_want_write(file->f_path.mnt); 615 err = mnt_want_write_file(file);
616 if (err) 616 if (err)
617 goto out_putf; 617 goto out_putf;
618 mutex_lock(&inode->i_mutex); 618 mutex_lock(&inode->i_mutex);
@@ -761,7 +761,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
761 if (!file) 761 if (!file)
762 goto out; 762 goto out;
763 763
764 error = mnt_want_write(file->f_path.mnt); 764 error = mnt_want_write_file(file);
765 if (error) 765 if (error)
766 goto out_fput; 766 goto out_fput;
767 dentry = file->f_path.dentry; 767 dentry = file->f_path.dentry;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 99e33ef40be4..ea4e6cb29e13 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *dev,
219 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 219 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
220} 220}
221 221
222ssize_t part_alignment_offset_show(struct device *dev,
223 struct device_attribute *attr, char *buf)
224{
225 struct hd_struct *p = dev_to_part(dev);
226 return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
227}
228
222ssize_t part_stat_show(struct device *dev, 229ssize_t part_stat_show(struct device *dev,
223 struct device_attribute *attr, char *buf) 230 struct device_attribute *attr, char *buf)
224{ 231{
@@ -272,6 +279,7 @@ ssize_t part_fail_store(struct device *dev,
272static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); 279static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
273static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); 280static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
274static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 281static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
282static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
275static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 283static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
276#ifdef CONFIG_FAIL_MAKE_REQUEST 284#ifdef CONFIG_FAIL_MAKE_REQUEST
277static struct device_attribute dev_attr_fail = 285static struct device_attribute dev_attr_fail =
@@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = {
282 &dev_attr_partition.attr, 290 &dev_attr_partition.attr,
283 &dev_attr_start.attr, 291 &dev_attr_start.attr,
284 &dev_attr_size.attr, 292 &dev_attr_size.attr,
293 &dev_attr_alignment_offset.attr,
285 &dev_attr_stat.attr, 294 &dev_attr_stat.attr,
286#ifdef CONFIG_FAIL_MAKE_REQUEST 295#ifdef CONFIG_FAIL_MAKE_REQUEST
287 &dev_attr_fail.attr, 296 &dev_attr_fail.attr,
@@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
383 pdev = part_to_dev(p); 392 pdev = part_to_dev(p);
384 393
385 p->start_sect = start; 394 p->start_sect = start;
395 p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
386 p->nr_sects = len; 396 p->nr_sects = len;
387 p->partno = partno; 397 p->partno = partno;
388 p->policy = get_disk_ro(disk); 398 p->policy = get_disk_ro(disk);
@@ -426,7 +436,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
426 rcu_assign_pointer(ptbl->part[partno], p); 436 rcu_assign_pointer(ptbl->part[partno], p);
427 437
428 /* suppress uevent if the disk suppresses it */ 438 /* suppress uevent if the disk suppresses it */
429 if (!dev_get_uevent_suppress(pdev)) 439 if (!dev_get_uevent_suppress(ddev))
430 kobject_uevent(&pdev->kobj, KOBJ_ADD); 440 kobject_uevent(&pdev->kobj, KOBJ_ADD);
431 441
432 return p; 442 return p;
@@ -546,27 +556,49 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
546 556
547 /* add partitions */ 557 /* add partitions */
548 for (p = 1; p < state->limit; p++) { 558 for (p = 1; p < state->limit; p++) {
549 sector_t size = state->parts[p].size; 559 sector_t size, from;
550 sector_t from = state->parts[p].from; 560try_scan:
561 size = state->parts[p].size;
551 if (!size) 562 if (!size)
552 continue; 563 continue;
564
565 from = state->parts[p].from;
553 if (from >= get_capacity(disk)) { 566 if (from >= get_capacity(disk)) {
554 printk(KERN_WARNING 567 printk(KERN_WARNING
555 "%s: p%d ignored, start %llu is behind the end of the disk\n", 568 "%s: p%d ignored, start %llu is behind the end of the disk\n",
556 disk->disk_name, p, (unsigned long long) from); 569 disk->disk_name, p, (unsigned long long) from);
557 continue; 570 continue;
558 } 571 }
572
559 if (from + size > get_capacity(disk)) { 573 if (from + size > get_capacity(disk)) {
560 /* 574 struct block_device_operations *bdops = disk->fops;
561 * we can not ignore partitions of broken tables 575 unsigned long long capacity;
562 * created by for example camera firmware, but we 576
563 * limit them to the end of the disk to avoid
564 * creating invalid block devices
565 */
566 printk(KERN_WARNING 577 printk(KERN_WARNING
567 "%s: p%d size %llu limited to end of disk\n", 578 "%s: p%d size %llu exceeds device capacity, ",
568 disk->disk_name, p, (unsigned long long) size); 579 disk->disk_name, p, (unsigned long long) size);
569 size = get_capacity(disk) - from; 580
581 if (bdops->set_capacity &&
582 (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) {
583 printk(KERN_CONT "enabling native capacity\n");
584 capacity = bdops->set_capacity(disk, ~0ULL);
585 disk->flags |= GENHD_FL_NATIVE_CAPACITY;
586 if (capacity > get_capacity(disk)) {
587 set_capacity(disk, capacity);
588 check_disk_size_change(disk, bdev);
589 bdev->bd_invalidated = 0;
590 }
591 goto try_scan;
592 } else {
593 /*
594 * we can not ignore partitions of broken tables
595 * created by for example camera firmware, but
596 * we limit them to the end of the disk to avoid
597 * creating invalid block devices
598 */
599 printk(KERN_CONT "limited to end of disk\n");
600 size = get_capacity(disk) - from;
601 }
570 } 602 }
571 part = add_partition(disk, p, from, size, 603 part = add_partition(disk, p, from, size,
572 state->parts[p].flags); 604 state->parts[p].flags);
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 46297683cd34..fc71aab08460 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -76,7 +76,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
76 Sector sect; 76 Sector sect;
77 77
78 res = 0; 78 res = 0;
79 blocksize = bdev_hardsect_size(bdev); 79 blocksize = bdev_logical_block_size(bdev);
80 if (blocksize <= 0) 80 if (blocksize <= 0)
81 goto out_exit; 81 goto out_exit;
82 i_size = i_size_read(bdev->bd_inode); 82 i_size = i_size_read(bdev->bd_inode);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 796511886f28..0028d2ef0662 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -110,7 +110,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
110 Sector sect; 110 Sector sect;
111 unsigned char *data; 111 unsigned char *data;
112 u32 this_sector, this_size; 112 u32 this_sector, this_size;
113 int sector_size = bdev_hardsect_size(bdev) / 512; 113 int sector_size = bdev_logical_block_size(bdev) / 512;
114 int loopct = 0; /* number of links followed 114 int loopct = 0; /* number of links followed
115 without finding a data partition */ 115 without finding a data partition */
116 int i; 116 int i;
@@ -415,7 +415,7 @@ static struct {
415 415
416int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 416int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
417{ 417{
418 int sector_size = bdev_hardsect_size(bdev) / 512; 418 int sector_size = bdev_logical_block_size(bdev) / 512;
419 Sector sect; 419 Sector sect;
420 unsigned char *data; 420 unsigned char *data;
421 struct partition *p; 421 struct partition *p;
diff --git a/fs/pipe.c b/fs/pipe.c
index 13414ec45b8d..52c415114838 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -68,8 +68,8 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
68 pipe_lock_nested(pipe1, I_MUTEX_PARENT); 68 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
69 pipe_lock_nested(pipe2, I_MUTEX_CHILD); 69 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
70 } else { 70 } else {
71 pipe_lock_nested(pipe2, I_MUTEX_CHILD); 71 pipe_lock_nested(pipe2, I_MUTEX_PARENT);
72 pipe_lock_nested(pipe1, I_MUTEX_PARENT); 72 pipe_lock_nested(pipe1, I_MUTEX_CHILD);
73 } 73 }
74} 74}
75 75
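
The pipe_double_lock() change is a classic ABBA repair: the first lock taken must always carry the I_MUTEX_PARENT class and the second I_MUTEX_CHILD, otherwise lockdep sees the same two classes acquired in both orders. The same idea in its generic, address-ordered form:

    /* Sketch: impose one global order (here, by address) on a lock pair. */
    static void double_lock(struct mutex *a, struct mutex *b)
    {
            if (a < b) {
                    mutex_lock(a);
                    mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
            } else {
                    mutex_lock(b);
                    mutex_lock_nested(a, SINGLE_DEPTH_NESTING);
            }
    }
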
@@ -302,6 +302,20 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
302 return 0; 302 return 0;
303} 303}
304 304
305/**
306 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
307 * @pipe: the pipe that the buffer belongs to
308 * @buf: the buffer to put a reference to
309 *
310 * Description:
311 * This function releases a reference to @buf.
312 */
313void generic_pipe_buf_release(struct pipe_inode_info *pipe,
314 struct pipe_buffer *buf)
315{
316 page_cache_release(buf->page);
317}
318
305static const struct pipe_buf_operations anon_pipe_buf_ops = { 319static const struct pipe_buf_operations anon_pipe_buf_ops = {
306 .can_merge = 1, 320 .can_merge = 1,
307 .map = generic_pipe_buf_map, 321 .map = generic_pipe_buf_map,
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 63d965193b22..11a7b5c68153 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -18,6 +18,7 @@ proc-y += meminfo.o
18proc-y += stat.o 18proc-y += stat.o
19proc-y += uptime.o 19proc-y += uptime.o
20proc-y += version.o 20proc-y += version.o
21proc-y += softirqs.o
21proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 22proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
22proc-$(CONFIG_NET) += proc_net.o 23proc-$(CONFIG_NET) += proc_net.o
23proc-$(CONFIG_PROC_KCORE) += kcore.o 24proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3326bbf9ab95..6f742f6658a9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -234,23 +234,20 @@ static int check_mem_permission(struct task_struct *task)
234 234
235struct mm_struct *mm_for_maps(struct task_struct *task) 235struct mm_struct *mm_for_maps(struct task_struct *task)
236{ 236{
237 struct mm_struct *mm = get_task_mm(task); 237 struct mm_struct *mm;
238 if (!mm) 238
239 if (mutex_lock_killable(&task->cred_guard_mutex))
239 return NULL; 240 return NULL;
240 down_read(&mm->mmap_sem); 241
241 task_lock(task); 242 mm = get_task_mm(task);
242 if (task->mm != mm) 243 if (mm && mm != current->mm &&
243 goto out; 244 !ptrace_may_access(task, PTRACE_MODE_READ)) {
244 if (task->mm != current->mm && 245 mmput(mm);
245 __ptrace_may_access(task, PTRACE_MODE_READ) < 0) 246 mm = NULL;
246 goto out; 247 }
247 task_unlock(task); 248 mutex_unlock(&task->cred_guard_mutex);
249
248 return mm; 250 return mm;
249out:
250 task_unlock(task);
251 up_read(&mm->mmap_sem);
252 mmput(mm);
253 return NULL;
254} 251}
255 252
256static int proc_pid_cmdline(struct task_struct *task, char * buffer) 253static int proc_pid_cmdline(struct task_struct *task, char * buffer)
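
The rewritten mm_for_maps() serializes against exec through cred_guard_mutex and performs the ptrace check without task_lock or mmap_sem, so it now hands back a bare mm reference; callers take mmap_sem themselves. A hedged sketch of the resulting calling convention:

    /* Sketch: consuming mm_for_maps() after this change. */
    struct mm_struct *mm = mm_for_maps(task);
    if (!mm)
            return -ESRCH;          /* no mm, or permission denied */
    down_read(&mm->mmap_sem);
    /* ... walk mm->mmap ... */
    up_read(&mm->mmap_sem);
    mmput(mm);
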
@@ -2128,9 +2125,15 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
2128 if (copy_from_user(page, buf, count)) 2125 if (copy_from_user(page, buf, count))
2129 goto out_free; 2126 goto out_free;
2130 2127
2128 /* Guard against adverse ptrace interaction */
2129 length = mutex_lock_interruptible(&task->cred_guard_mutex);
2130 if (length < 0)
2131 goto out_free;
2132
2131 length = security_setprocattr(task, 2133 length = security_setprocattr(task,
2132 (char*)file->f_path.dentry->d_name.name, 2134 (char*)file->f_path.dentry->d_name.name,
2133 (void*)page, count); 2135 (void*)page, count);
2136 mutex_unlock(&task->cred_guard_mutex);
2134out_free: 2137out_free:
2135 free_page((unsigned long) page); 2138 free_page((unsigned long) page);
2136out: 2139out:
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index f6db9618a888..753ca37002c8 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -92,3 +92,28 @@ struct pde_opener {
92 struct list_head lh; 92 struct list_head lh;
93}; 93};
94void pde_users_dec(struct proc_dir_entry *pde); 94void pde_users_dec(struct proc_dir_entry *pde);
95
96extern spinlock_t proc_subdir_lock;
97
98struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
99int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
100unsigned long task_vsize(struct mm_struct *);
101int task_statm(struct mm_struct *, int *, int *, int *, int *);
102void task_mem(struct seq_file *, struct mm_struct *);
103
104struct proc_dir_entry *de_get(struct proc_dir_entry *de);
105void de_put(struct proc_dir_entry *de);
106
107extern struct vfsmount *proc_mnt;
108int proc_fill_super(struct super_block *);
109struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
110
111/*
112 * These are generic /proc routines that use the internal
113 * "struct proc_dir_entry" tree to traverse the filesystem.
114 *
115 * The /proc root directory has extended versions to take care
116 * of the /proc/<pid> subdirectories.
117 */
118int proc_readdir(struct file *, void *, filldir_t);
119struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 9bca39cf99ee..1afa4dd4cae2 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -12,20 +12,14 @@
12 12
13static int loadavg_proc_show(struct seq_file *m, void *v) 13static int loadavg_proc_show(struct seq_file *m, void *v)
14{ 14{
15 int a, b, c; 15 unsigned long avnrun[3];
16 unsigned long seq;
17 16
18 do { 17 get_avenrun(avnrun, FIXED_1/200, 0);
19 seq = read_seqbegin(&xtime_lock);
20 a = avenrun[0] + (FIXED_1/200);
21 b = avenrun[1] + (FIXED_1/200);
22 c = avenrun[2] + (FIXED_1/200);
23 } while (read_seqretry(&xtime_lock, seq));
24 18
25 seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n", 19 seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
26 LOAD_INT(a), LOAD_FRAC(a), 20 LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
27 LOAD_INT(b), LOAD_FRAC(b), 21 LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
28 LOAD_INT(c), LOAD_FRAC(c), 22 LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
29 nr_running(), nr_threads, 23 nr_running(), nr_threads,
30 task_active_pid_ns(current)->last_pid); 24 task_active_pid_ns(current)->last_pid);
31 return 0; 25 return 0;
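
The avenrun samples are 11-bit fixed point (FIXED_1 == 1 << 11 == 2048), and get_avenrun() is passed FIXED_1/200, half of one displayed hundredth, as a rounding offset. A standalone check of the arithmetic:

    #include <stdio.h>

    #define FSHIFT   11
    #define FIXED_1  (1UL << FSHIFT)
    #define LOAD_INT(x)  ((x) >> FSHIFT)
    #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

    int main(void)
    {
            unsigned long avnrun = 4915;    /* about 2.3999 in fixed point */

            avnrun += FIXED_1 / 200;        /* the offset get_avenrun() applies */
            printf("%lu.%02lu\n", LOAD_INT(avnrun), LOAD_FRAC(avnrun)); /* 2.40 */
            return 0;
    }
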
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index c6b0302af4c4..d5c410d47fae 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -64,10 +64,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
64 "Inactive(anon): %8lu kB\n" 64 "Inactive(anon): %8lu kB\n"
65 "Active(file): %8lu kB\n" 65 "Active(file): %8lu kB\n"
66 "Inactive(file): %8lu kB\n" 66 "Inactive(file): %8lu kB\n"
67#ifdef CONFIG_UNEVICTABLE_LRU
68 "Unevictable: %8lu kB\n" 67 "Unevictable: %8lu kB\n"
69 "Mlocked: %8lu kB\n" 68 "Mlocked: %8lu kB\n"
70#endif
71#ifdef CONFIG_HIGHMEM 69#ifdef CONFIG_HIGHMEM
72 "HighTotal: %8lu kB\n" 70 "HighTotal: %8lu kB\n"
73 "HighFree: %8lu kB\n" 71 "HighFree: %8lu kB\n"
@@ -109,10 +107,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
109 K(pages[LRU_INACTIVE_ANON]), 107 K(pages[LRU_INACTIVE_ANON]),
110 K(pages[LRU_ACTIVE_FILE]), 108 K(pages[LRU_ACTIVE_FILE]),
111 K(pages[LRU_INACTIVE_FILE]), 109 K(pages[LRU_INACTIVE_FILE]),
112#ifdef CONFIG_UNEVICTABLE_LRU
113 K(pages[LRU_UNEVICTABLE]), 110 K(pages[LRU_UNEVICTABLE]),
114 K(global_page_state(NR_MLOCK)), 111 K(global_page_state(NR_MLOCK)),
115#endif
116#ifdef CONFIG_HIGHMEM 112#ifdef CONFIG_HIGHMEM
117 K(i.totalhigh), 113 K(i.totalhigh),
118 K(i.freehigh), 114 K(i.freehigh),
diff --git a/fs/proc/page.c b/fs/proc/page.c
index e9983837d08d..2707c6c7a20f 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -6,11 +6,13 @@
6#include <linux/mmzone.h> 6#include <linux/mmzone.h>
7#include <linux/proc_fs.h> 7#include <linux/proc_fs.h>
8#include <linux/seq_file.h> 8#include <linux/seq_file.h>
9#include <linux/hugetlb.h>
9#include <asm/uaccess.h> 10#include <asm/uaccess.h>
10#include "internal.h" 11#include "internal.h"
11 12
12#define KPMSIZE sizeof(u64) 13#define KPMSIZE sizeof(u64)
13#define KPMMASK (KPMSIZE - 1) 14#define KPMMASK (KPMSIZE - 1)
15
14/* /proc/kpagecount - an array exposing page counts 16/* /proc/kpagecount - an array exposing page counts
15 * 17 *
16 * Each entry is a u64 representing the corresponding 18 * Each entry is a u64 representing the corresponding
@@ -32,20 +34,22 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
32 return -EINVAL; 34 return -EINVAL;
33 35
34 while (count > 0) { 36 while (count > 0) {
35 ppage = NULL;
36 if (pfn_valid(pfn)) 37 if (pfn_valid(pfn))
37 ppage = pfn_to_page(pfn); 38 ppage = pfn_to_page(pfn);
38 pfn++; 39 else
40 ppage = NULL;
39 if (!ppage) 41 if (!ppage)
40 pcount = 0; 42 pcount = 0;
41 else 43 else
42 pcount = page_mapcount(ppage); 44 pcount = page_mapcount(ppage);
43 45
44 if (put_user(pcount, out++)) { 46 if (put_user(pcount, out)) {
45 ret = -EFAULT; 47 ret = -EFAULT;
46 break; 48 break;
47 } 49 }
48 50
51 pfn++;
52 out++;
49 count -= KPMSIZE; 53 count -= KPMSIZE;
50 } 54 }
51 55
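
The reordering above is behavioral, not cosmetic: previously pfn and out advanced even when put_user() faulted, so the position accounting done after the loop (outside the quoted context) could over-report. A minimal model of the restored invariant, with emit() as a hypothetical stand-in for put_user():

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* hypothetical stand-in for put_user(): "faults" on record 3 */
static int emit(uint64_t v, size_t i)
{
    (void)v;
    return i == 3 ? -1 : 0;
}

int main(void)
{
    uint64_t records[8] = { 0 };
    size_t pos = 0;

    while (pos < 8) {
        if (emit(records[pos], pos))
            break;              /* the failed record is not counted */
        pos++;                  /* advance only after success */
    }
    printf("delivered %zu records\n", pos);     /* prints 3 */
    return 0;
}
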
@@ -68,19 +72,122 @@ static const struct file_operations proc_kpagecount_operations = {
68 72
69/* These macros are used to decouple internal flags from exported ones */ 73/* These macros are used to decouple internal flags from exported ones */
70 74
71#define KPF_LOCKED 0 75#define KPF_LOCKED 0
72#define KPF_ERROR 1 76#define KPF_ERROR 1
73#define KPF_REFERENCED 2 77#define KPF_REFERENCED 2
74#define KPF_UPTODATE 3 78#define KPF_UPTODATE 3
75#define KPF_DIRTY 4 79#define KPF_DIRTY 4
76#define KPF_LRU 5 80#define KPF_LRU 5
77#define KPF_ACTIVE 6 81#define KPF_ACTIVE 6
78#define KPF_SLAB 7 82#define KPF_SLAB 7
79#define KPF_WRITEBACK 8 83#define KPF_WRITEBACK 8
80#define KPF_RECLAIM 9 84#define KPF_RECLAIM 9
81#define KPF_BUDDY 10 85#define KPF_BUDDY 10
86
87/* 11-20: new additions in 2.6.31 */
88#define KPF_MMAP 11
89#define KPF_ANON 12
90#define KPF_SWAPCACHE 13
91#define KPF_SWAPBACKED 14
92#define KPF_COMPOUND_HEAD 15
93#define KPF_COMPOUND_TAIL 16
94#define KPF_HUGE 17
95#define KPF_UNEVICTABLE 18
96#define KPF_NOPAGE 20
97
 98/* kernel hacking assistance
99 * WARNING: subject to change, never rely on them!
100 */
101#define KPF_RESERVED 32
102#define KPF_MLOCKED 33
103#define KPF_MAPPEDTODISK 34
104#define KPF_PRIVATE 35
105#define KPF_PRIVATE_2 36
106#define KPF_OWNER_PRIVATE 37
107#define KPF_ARCH 38
108#define KPF_UNCACHED 39
109
110static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
111{
112 return ((kflags >> kbit) & 1) << ubit;
113}
82 114
83#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos) 115static u64 get_uflags(struct page *page)
116{
117 u64 k;
118 u64 u;
119
120 /*
121 * pseudo flag: KPF_NOPAGE
122 * it differentiates a memory hole from a page with no flags
123 */
124 if (!page)
125 return 1 << KPF_NOPAGE;
126
127 k = page->flags;
128 u = 0;
129
130 /*
131 * pseudo flags for the well known (anonymous) memory mapped pages
132 *
133 * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
134 * simple test in page_mapped() is not enough.
135 */
136 if (!PageSlab(page) && page_mapped(page))
137 u |= 1 << KPF_MMAP;
138 if (PageAnon(page))
139 u |= 1 << KPF_ANON;
140
141 /*
142 * compound pages: export both head/tail info
143 * they together define a compound page's start/end pos and order
144 */
145 if (PageHead(page))
146 u |= 1 << KPF_COMPOUND_HEAD;
147 if (PageTail(page))
148 u |= 1 << KPF_COMPOUND_TAIL;
149 if (PageHuge(page))
150 u |= 1 << KPF_HUGE;
151
152 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
153
154 /*
155 * Caveats on high order pages:
156 * PG_buddy will only be set on the head page; SLUB/SLQB do the same
157 * for PG_slab; SLOB won't set PG_slab at all on compound pages.
158 */
159 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
160 u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy);
161
162 u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
163 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
164 u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate);
165 u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback);
166
167 u |= kpf_copy_bit(k, KPF_LRU, PG_lru);
168 u |= kpf_copy_bit(k, KPF_REFERENCED, PG_referenced);
169 u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active);
170 u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
171
172 u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache);
173 u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
174
175 u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
176 u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked);
177
178#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
179 u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
180#endif
181
182 u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved);
183 u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk);
184 u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private);
185 u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
186 u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
187 u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
188
189 return u;
190};
84 191
85static ssize_t kpageflags_read(struct file *file, char __user *buf, 192static ssize_t kpageflags_read(struct file *file, char __user *buf,
86 size_t count, loff_t *ppos) 193 size_t count, loff_t *ppos)
@@ -90,7 +197,6 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
90 unsigned long src = *ppos; 197 unsigned long src = *ppos;
91 unsigned long pfn; 198 unsigned long pfn;
92 ssize_t ret = 0; 199 ssize_t ret = 0;
93 u64 kflags, uflags;
94 200
95 pfn = src / KPMSIZE; 201 pfn = src / KPMSIZE;
96 count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); 202 count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
@@ -98,32 +204,18 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
98 return -EINVAL; 204 return -EINVAL;
99 205
100 while (count > 0) { 206 while (count > 0) {
101 ppage = NULL;
102 if (pfn_valid(pfn)) 207 if (pfn_valid(pfn))
103 ppage = pfn_to_page(pfn); 208 ppage = pfn_to_page(pfn);
104 pfn++;
105 if (!ppage)
106 kflags = 0;
107 else 209 else
108 kflags = ppage->flags; 210 ppage = NULL;
109 211
110 uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) | 212 if (put_user(get_uflags(ppage), out)) {
111 kpf_copy_bit(kflags, KPF_ERROR, PG_error) |
112 kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) |
113 kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) |
114 kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) |
115 kpf_copy_bit(kflags, KPF_LRU, PG_lru) |
116 kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) |
117 kpf_copy_bit(kflags, KPF_SLAB, PG_slab) |
118 kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) |
119 kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) |
120 kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy);
121
122 if (put_user(uflags, out++)) {
123 ret = -EFAULT; 213 ret = -EFAULT;
124 break; 214 break;
125 } 215 }
126 216
217 pfn++;
218 out++;
127 count -= KPMSIZE; 219 count -= KPMSIZE;
128 } 220 }
129 221
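
kpageflags_read() now funnels everything through get_uflags(), including the KPF_NOPAGE pseudo flag for memory holes. A userspace sketch that decodes a few of the exported bits, assuming the KPF_* values listed above and the one-u64-per-pfn layout of /proc/kpageflags (needs root; pfn 0 is an arbitrary example):

#include <stdio.h>
#include <stdint.h>

#define KPF_LOCKED  0
#define KPF_LRU     5
#define KPF_SLAB    7
#define KPF_BUDDY   10
#define KPF_NOPAGE  20

int main(void)
{
    FILE *f = fopen("/proc/kpageflags", "rb");
    uint64_t flags;
    unsigned long pfn = 0;      /* arbitrary example pfn */

    if (!f) {
        perror("/proc/kpageflags");
        return 1;
    }
    /* one u64 of flags per pfn, so seek to pfn * 8 */
    if (fseek(f, pfn * sizeof(flags), SEEK_SET) ||
        fread(&flags, sizeof(flags), 1, f) != 1) {
        perror("read");
        fclose(f);
        return 1;
    }
    printf("pfn %lu: locked=%d lru=%d slab=%d buddy=%d nopage=%d\n", pfn,
           (int)((flags >> KPF_LOCKED) & 1), (int)((flags >> KPF_LRU) & 1),
           (int)((flags >> KPF_SLAB) & 1), (int)((flags >> KPF_BUDDY) & 1),
           (int)((flags >> KPF_NOPAGE) & 1));
    fclose(f);
    return 0;
}
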
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index de2bba5a3440..7ba79a54948c 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -11,6 +11,7 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <asm/prom.h> 12#include <asm/prom.h>
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14#include "internal.h"
14 15
15#ifndef HAVE_ARCH_DEVTREE_FIXUPS 16#ifndef HAVE_ARCH_DEVTREE_FIXUPS
16static inline void set_node_proc_entry(struct device_node *np, 17static inline void set_node_proc_entry(struct device_node *np,
@@ -194,20 +195,20 @@ void proc_device_tree_add_node(struct device_node *np,
194 p = fixup_name(np, de, p); 195 p = fixup_name(np, de, p);
195 196
196 ent = proc_mkdir(p, de); 197 ent = proc_mkdir(p, de);
197 if (ent == 0) 198 if (ent == NULL)
198 break; 199 break;
199 proc_device_tree_add_node(child, ent); 200 proc_device_tree_add_node(child, ent);
200 } 201 }
201 of_node_put(child); 202 of_node_put(child);
202 203
203 for (pp = np->properties; pp != 0; pp = pp->next) { 204 for (pp = np->properties; pp != NULL; pp = pp->next) {
204 p = pp->name; 205 p = pp->name;
205 206
206 if (duplicate_name(de, p)) 207 if (duplicate_name(de, p))
207 p = fixup_name(np, de, p); 208 p = fixup_name(np, de, p);
208 209
209 ent = __proc_device_tree_add_prop(de, pp, p); 210 ent = __proc_device_tree_add_prop(de, pp, p);
210 if (ent == 0) 211 if (ent == NULL)
211 break; 212 break;
212 } 213 }
213} 214}
@@ -220,10 +221,10 @@ void __init proc_device_tree_init(void)
220 struct device_node *root; 221 struct device_node *root;
221 222
222 proc_device_tree = proc_mkdir("device-tree", NULL); 223 proc_device_tree = proc_mkdir("device-tree", NULL);
223 if (proc_device_tree == 0) 224 if (proc_device_tree == NULL)
224 return; 225 return;
225 root = of_find_node_by_path("/"); 226 root = of_find_node_by_path("/");
226 if (root == 0) { 227 if (root == NULL) {
227 printk(KERN_ERR "/proc/device-tree: can't find root\n"); 228 printk(KERN_ERR "/proc/device-tree: can't find root\n");
228 return; 229 return;
229 } 230 }
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
new file mode 100644
index 000000000000..1807c2419f17
--- /dev/null
+++ b/fs/proc/softirqs.c
@@ -0,0 +1,44 @@
1#include <linux/init.h>
2#include <linux/kernel_stat.h>
3#include <linux/proc_fs.h>
4#include <linux/seq_file.h>
5
6/*
 7 * /proc/softirqs ... display the number of softirqs serviced on each CPU
8 */
9static int show_softirqs(struct seq_file *p, void *v)
10{
11 int i, j;
12
13 seq_printf(p, " ");
14 for_each_possible_cpu(i)
15 seq_printf(p, "CPU%-8d", i);
16 seq_printf(p, "\n");
17
18 for (i = 0; i < NR_SOFTIRQS; i++) {
19 seq_printf(p, "%8s:", softirq_to_name[i]);
20 for_each_possible_cpu(j)
21 seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
22 seq_printf(p, "\n");
23 }
24 return 0;
25}
26
27static int softirqs_open(struct inode *inode, struct file *file)
28{
29 return single_open(file, show_softirqs, NULL);
30}
31
32static const struct file_operations proc_softirqs_operations = {
33 .open = softirqs_open,
34 .read = seq_read,
35 .llseek = seq_lseek,
36 .release = single_release,
37};
38
39static int __init proc_softirqs_init(void)
40{
41 proc_create("softirqs", 0, NULL, &proc_softirqs_operations);
42 return 0;
43}
44module_init(proc_softirqs_init);
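
A quick way to exercise the new file is to sum the per-CPU columns back into per-softirq totals. This reader is a sketch that assumes the exact format show_softirqs() emits above (a header row of CPUn labels, then one "NAME: counts..." row per softirq):

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/softirqs", "r");
    char name[16];
    unsigned long long n, total;
    int c;

    if (!f) {
        perror("/proc/softirqs");
        return 1;
    }
    while ((c = fgetc(f)) != '\n' && c != EOF)  /* skip the CPU header */
        ;
    while (fscanf(f, " %15[^:]:", name) == 1) {
        total = 0;
        while (fscanf(f, " %llu", &n) == 1)     /* sum this row's columns */
            total += n;
        printf("%8s: %llu\n", name, total);
    }
    fclose(f);
    return 0;
}
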
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 81e4eb60972e..7cc726c6d70a 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -29,6 +29,8 @@ static int show_stat(struct seq_file *p, void *v)
29 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; 29 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
30 cputime64_t guest; 30 cputime64_t guest;
31 u64 sum = 0; 31 u64 sum = 0;
32 u64 sum_softirq = 0;
33 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
32 struct timespec boottime; 34 struct timespec boottime;
33 unsigned int per_irq_sum; 35 unsigned int per_irq_sum;
34 36
@@ -53,6 +55,13 @@ static int show_stat(struct seq_file *p, void *v)
53 sum += kstat_irqs_cpu(j, i); 55 sum += kstat_irqs_cpu(j, i);
54 } 56 }
55 sum += arch_irq_stat_cpu(i); 57 sum += arch_irq_stat_cpu(i);
58
59 for (j = 0; j < NR_SOFTIRQS; j++) {
60 unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
61
62 per_softirq_sums[j] += softirq_stat;
63 sum_softirq += softirq_stat;
64 }
56 } 65 }
57 sum += arch_irq_stat(); 66 sum += arch_irq_stat();
58 67
@@ -115,6 +124,12 @@ static int show_stat(struct seq_file *p, void *v)
115 nr_running(), 124 nr_running(),
116 nr_iowait()); 125 nr_iowait());
117 126
127 seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
128
129 for (i = 0; i < NR_SOFTIRQS; i++)
130 seq_printf(p, " %u", per_softirq_sums[i]);
131 seq_printf(p, "\n");
132
118 return 0; 133 return 0;
119} 134}
120 135
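
The new line appended to /proc/stat has the shape "softirq <total> <per-softirq counts>". A minimal reader for the total, assuming that format:

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/stat", "r");
    char line[512];
    unsigned long long total;

    if (!f) {
        perror("/proc/stat");
        return 1;
    }
    while (fgets(line, sizeof(line), f)) {
        if (sscanf(line, "softirq %llu", &total) == 1) {
            printf("softirq total: %llu\n", total); /* sum_softirq above */
            break;
        }
    }
    fclose(f);
    return 0;
}
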
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6f61b7cc32e0..9bd8be1d235c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -119,6 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
119 mm = mm_for_maps(priv->task); 119 mm = mm_for_maps(priv->task);
120 if (!mm) 120 if (!mm)
121 return NULL; 121 return NULL;
122 down_read(&mm->mmap_sem);
122 123
123 tail_vma = get_gate_vma(priv->task); 124 tail_vma = get_gate_vma(priv->task);
124 priv->tail_vma = tail_vma; 125 priv->tail_vma = tail_vma;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 64a72e2e7650..8f5c05d3dbd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -189,6 +189,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
189 priv->task = NULL; 189 priv->task = NULL;
190 return NULL; 190 return NULL;
191 } 191 }
192 down_read(&mm->mmap_sem);
192 193
193 /* start from the Nth VMA */ 194 /* start from the Nth VMA */
194 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) 195 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
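
Both m_start() hunks take mmap_sem for reading before the VMA walk begins; the matching up_read() lives in m_stop(), outside the quoted context. A userspace analogue of the reader side, using a pthread rwlock and a hypothetical walk_maps():

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t maps_lock = PTHREAD_RWLOCK_INITIALIZER;

/* hypothetical walk: visit each entry under the read lock */
static void walk_maps(void (*visit)(int), const int *items, int n)
{
    int i;

    pthread_rwlock_rdlock(&maps_lock); /* like down_read(&mm->mmap_sem) */
    for (i = 0; i < n; i++)
        visit(items[i]);
    pthread_rwlock_unlock(&maps_lock); /* like up_read() in m_stop() */
}

static void print_one(int v)
{
    printf("%d\n", v);
}

int main(void)
{
    int items[3] = { 1, 2, 3 };

    walk_maps(print_one, items, 3);
    return 0;
}
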
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 5edcc3f92ba7..0872afa58d39 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -166,12 +166,7 @@ static const struct file_operations proc_vmcore_operations = {
166 166
167static struct vmcore* __init get_new_element(void) 167static struct vmcore* __init get_new_element(void)
168{ 168{
169 struct vmcore *p; 169 return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
170
171 p = kmalloc(sizeof(*p), GFP_KERNEL);
172 if (p)
173 memset(p, 0, sizeof(*p));
174 return p;
175} 170}
176 171
177static u64 __init get_vmcore_size_elf64(char *elfptr) 172static u64 __init get_vmcore_size_elf64(char *elfptr)
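
The vmcore hunk is a pure simplification: kzalloc(size, flags) is exactly kmalloc() followed by a zero fill. The same cleanup in userspace terms, with a hypothetical vmcore_like stand-in type:

#include <stdlib.h>
#include <string.h>

struct vmcore_like { unsigned long start, size; }; /* hypothetical stand-in */

struct vmcore_like *element_old(void)   /* kmalloc + memset pattern */
{
    struct vmcore_like *p = malloc(sizeof(*p));

    if (p)
        memset(p, 0, sizeof(*p));
    return p;
}

struct vmcore_like *element_new(void)   /* kzalloc pattern */
{
    return calloc(1, sizeof(struct vmcore_like));
}

int main(void)
{
    free(element_old());
    free(element_new());
    return 0;
}
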
diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile
index 502d7fe98bab..e4d408cc5473 100644
--- a/fs/qnx4/Makefile
+++ b/fs/qnx4/Makefile
@@ -4,4 +4,4 @@
4 4
5obj-$(CONFIG_QNX4FS_FS) += qnx4.o 5obj-$(CONFIG_QNX4FS_FS) += qnx4.o
6 6
7qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o fsync.o 7qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 8425cf6e9624..e1cd061a25f7 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -13,14 +13,9 @@
13 * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) . 13 * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) .
14 */ 14 */
15 15
16#include <linux/time.h>
17#include <linux/fs.h>
18#include <linux/qnx4_fs.h>
19#include <linux/stat.h>
20#include <linux/kernel.h>
21#include <linux/string.h>
22#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
23#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include "qnx4.h"
24 19
25#if 0 20#if 0
26int qnx4_new_block(struct super_block *sb) 21int qnx4_new_block(struct super_block *sb)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index ea9ffefb48ad..003c68f3238b 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,14 +11,9 @@
11 * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support. 11 * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
12 */ 12 */
13 13
14#include <linux/string.h>
15#include <linux/errno.h>
16#include <linux/fs.h>
17#include <linux/qnx4_fs.h>
18#include <linux/stat.h>
19#include <linux/smp_lock.h> 14#include <linux/smp_lock.h>
20#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
21 16#include "qnx4.h"
22 17
23static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) 18static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
24{ 19{
@@ -84,7 +79,7 @@ const struct file_operations qnx4_dir_operations =
84{ 79{
85 .read = generic_read_dir, 80 .read = generic_read_dir,
86 .readdir = qnx4_readdir, 81 .readdir = qnx4_readdir,
87 .fsync = file_fsync, 82 .fsync = simple_fsync,
88}; 83};
89 84
90const struct inode_operations qnx4_dir_inode_operations = 85const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
index 867f42b02035..09b170ac936c 100644
--- a/fs/qnx4/file.c
+++ b/fs/qnx4/file.c
@@ -12,8 +12,7 @@
12 * 27-06-1998 by Frank Denis : file overwriting. 12 * 27-06-1998 by Frank Denis : file overwriting.
13 */ 13 */
14 14
15#include <linux/fs.h> 15#include "qnx4.h"
16#include <linux/qnx4_fs.h>
17 16
18/* 17/*
19 * We have mostly NULL's here: the current defaults are ok for 18 * We have mostly NULL's here: the current defaults are ok for
@@ -29,7 +28,7 @@ const struct file_operations qnx4_file_operations =
29#ifdef CONFIG_QNX4FS_RW 28#ifdef CONFIG_QNX4FS_RW
30 .write = do_sync_write, 29 .write = do_sync_write,
31 .aio_write = generic_file_aio_write, 30 .aio_write = generic_file_aio_write,
32 .fsync = qnx4_sync_file, 31 .fsync = simple_fsync,
33#endif 32#endif
34}; 33};
35 34
diff --git a/fs/qnx4/fsync.c b/fs/qnx4/fsync.c
deleted file mode 100644
index aa3b19544bee..000000000000
--- a/fs/qnx4/fsync.c
+++ /dev/null
@@ -1,169 +0,0 @@
1/*
2 * QNX4 file system, Linux implementation.
3 *
4 * Version : 0.1
5 *
6 * Using parts of the xiafs filesystem.
7 *
8 * History :
9 *
10 * 24-03-1998 by Richard Frowijn : first release.
11 */
12
13#include <linux/errno.h>
14#include <linux/time.h>
15#include <linux/stat.h>
16#include <linux/fcntl.h>
17#include <linux/smp_lock.h>
18#include <linux/buffer_head.h>
19
20#include <linux/fs.h>
21#include <linux/qnx4_fs.h>
22
23#include <asm/system.h>
24
25/*
26 * The functions for qnx4 fs file synchronization.
27 */
28
29#ifdef CONFIG_QNX4FS_RW
30
31static int sync_block(struct inode *inode, unsigned short *block, int wait)
32{
33 struct buffer_head *bh;
34 unsigned short tmp;
35
36 if (!*block)
37 return 0;
38 tmp = *block;
39 bh = sb_find_get_block(inode->i_sb, *block);
40 if (!bh)
41 return 0;
42 if (*block != tmp) {
43 brelse(bh);
44 return 1;
45 }
46 if (wait && buffer_req(bh) && !buffer_uptodate(bh)) {
47 brelse(bh);
48 return -1;
49 }
50 if (wait || !buffer_uptodate(bh) || !buffer_dirty(bh)) {
51 brelse(bh);
52 return 0;
53 }
54 ll_rw_block(WRITE, 1, &bh);
55 atomic_dec(&bh->b_count);
56 return 0;
57}
58
59#ifdef WTF
60static int sync_iblock(struct inode *inode, unsigned short *iblock,
61 struct buffer_head **bh, int wait)
62{
63 int rc;
64 unsigned short tmp;
65
66 *bh = NULL;
67 tmp = *iblock;
68 if (!tmp)
69 return 0;
70 rc = sync_block(inode, iblock, wait);
71 if (rc)
72 return rc;
73 *bh = sb_bread(inode->i_sb, tmp);
74 if (tmp != *iblock) {
75 brelse(*bh);
76 *bh = NULL;
77 return 1;
78 }
79 if (!*bh)
80 return -1;
81 return 0;
82}
83#endif
84
85static int sync_direct(struct inode *inode, int wait)
86{
87 int i;
88 int rc, err = 0;
89
90 for (i = 0; i < 7; i++) {
91 rc = sync_block(inode,
92 (unsigned short *) qnx4_raw_inode(inode)->di_first_xtnt.xtnt_blk + i, wait);
93 if (rc > 0)
94 break;
95 if (rc)
96 err = rc;
97 }
98 return err;
99}
100
101#ifdef WTF
102static int sync_indirect(struct inode *inode, unsigned short *iblock, int wait)
103{
104 int i;
105 struct buffer_head *ind_bh;
106 int rc, err = 0;
107
108 rc = sync_iblock(inode, iblock, &ind_bh, wait);
109 if (rc || !ind_bh)
110 return rc;
111
112 for (i = 0; i < 512; i++) {
113 rc = sync_block(inode,
114 ((unsigned short *) ind_bh->b_data) + i,
115 wait);
116 if (rc > 0)
117 break;
118 if (rc)
119 err = rc;
120 }
121 brelse(ind_bh);
122 return err;
123}
124
125static int sync_dindirect(struct inode *inode, unsigned short *diblock,
126 int wait)
127{
128 int i;
129 struct buffer_head *dind_bh;
130 int rc, err = 0;
131
132 rc = sync_iblock(inode, diblock, &dind_bh, wait);
133 if (rc || !dind_bh)
134 return rc;
135
136 for (i = 0; i < 512; i++) {
137 rc = sync_indirect(inode,
138 ((unsigned short *) dind_bh->b_data) + i,
139 wait);
140 if (rc > 0)
141 break;
142 if (rc)
143 err = rc;
144 }
145 brelse(dind_bh);
146 return err;
147}
148#endif
149
150int qnx4_sync_file(struct file *file, struct dentry *dentry, int unused)
151{
152 struct inode *inode = dentry->d_inode;
153 int wait, err = 0;
154
155 (void) file;
156 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
157 S_ISLNK(inode->i_mode)))
158 return -EINVAL;
159
160 lock_kernel();
161 for (wait = 0; wait <= 1; wait++) {
162 err |= sync_direct(inode, wait);
163 }
164 err |= qnx4_sync_inode(inode);
165 unlock_kernel();
166 return (err < 0) ? -EIO : 0;
167}
168
169#endif
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index fe1f0f31d11c..681df5fcd161 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -13,19 +13,15 @@
13 */ 13 */
14 14
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/string.h>
18#include <linux/errno.h>
19#include <linux/slab.h>
20#include <linux/fs.h>
21#include <linux/qnx4_fs.h>
22#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/slab.h>
23#include <linux/highuid.h> 18#include <linux/highuid.h>
24#include <linux/smp_lock.h> 19#include <linux/smp_lock.h>
25#include <linux/pagemap.h> 20#include <linux/pagemap.h>
26#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
27#include <linux/vfs.h> 22#include <linux/writeback.h>
28#include <asm/uaccess.h> 23#include <linux/statfs.h>
24#include "qnx4.h"
29 25
30#define QNX4_VERSION 4 26#define QNX4_VERSION 4
31#define QNX4_BMNAME ".bitmap" 27#define QNX4_BMNAME ".bitmap"
@@ -34,31 +30,6 @@ static const struct super_operations qnx4_sops;
34 30
35#ifdef CONFIG_QNX4FS_RW 31#ifdef CONFIG_QNX4FS_RW
36 32
37int qnx4_sync_inode(struct inode *inode)
38{
39 int err = 0;
40# if 0
41 struct buffer_head *bh;
42
43 bh = qnx4_update_inode(inode);
44 if (bh && buffer_dirty(bh))
45 {
46 sync_dirty_buffer(bh);
47 if (buffer_req(bh) && !buffer_uptodate(bh))
48 {
49 printk ("IO error syncing qnx4 inode [%s:%08lx]\n",
50 inode->i_sb->s_id, inode->i_ino);
51 err = -1;
52 }
53 brelse (bh);
54 } else if (!bh) {
55 err = -1;
56 }
57# endif
58
59 return err;
60}
61
62static void qnx4_delete_inode(struct inode *inode) 33static void qnx4_delete_inode(struct inode *inode)
63{ 34{
64 QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino)); 35 QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino));
@@ -70,15 +41,7 @@ static void qnx4_delete_inode(struct inode *inode)
70 unlock_kernel(); 41 unlock_kernel();
71} 42}
72 43
73static void qnx4_write_super(struct super_block *sb) 44static int qnx4_write_inode(struct inode *inode, int do_sync)
74{
75 lock_kernel();
76 QNX4DEBUG(("qnx4: write_super\n"));
77 sb->s_dirt = 0;
78 unlock_kernel();
79}
80
81static int qnx4_write_inode(struct inode *inode, int unused)
82{ 45{
83 struct qnx4_inode_entry *raw_inode; 46 struct qnx4_inode_entry *raw_inode;
84 int block, ino; 47 int block, ino;
@@ -115,6 +78,16 @@ static int qnx4_write_inode(struct inode *inode, int unused)
115 raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 78 raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
116 raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks); 79 raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks);
117 mark_buffer_dirty(bh); 80 mark_buffer_dirty(bh);
81 if (do_sync) {
82 sync_dirty_buffer(bh);
83 if (buffer_req(bh) && !buffer_uptodate(bh)) {
84 printk("qnx4: IO error syncing inode [%s:%08x]\n",
85 inode->i_sb->s_id, ino);
86 brelse(bh);
87 unlock_kernel();
88 return -EIO;
89 }
90 }
118 brelse(bh); 91 brelse(bh);
119 unlock_kernel(); 92 unlock_kernel();
120 return 0; 93 return 0;
@@ -138,7 +111,6 @@ static const struct super_operations qnx4_sops =
138#ifdef CONFIG_QNX4FS_RW 111#ifdef CONFIG_QNX4FS_RW
139 .write_inode = qnx4_write_inode, 112 .write_inode = qnx4_write_inode,
140 .delete_inode = qnx4_delete_inode, 113 .delete_inode = qnx4_delete_inode,
141 .write_super = qnx4_write_super,
142#endif 114#endif
143}; 115};
144 116
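
qnx4_write_inode() now honors its do_sync argument: the buffer is always dirtied, but only the synchronous path pays for sync_dirty_buffer() and surfaces -EIO. A userspace analogue of that shape, with write_record() as a hypothetical helper (write always buffered, flush only when asked):

#include <unistd.h>
#include <errno.h>

int write_record(int fd, const void *buf, size_t len, int do_sync)
{
    if (write(fd, buf, len) != (ssize_t)len)
        return -EIO;
    /* only the synchronous caller pays for the flush and sees I/O errors */
    if (do_sync && fsync(fd))
        return -EIO;
    return 0;
}

int main(void)
{
    /* async variant on stdout; pass 1 for the flush-on-write behavior */
    return write_record(1, "buffered\n", 9, 0) ? 1 : 0;
}
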
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 775eed3a4085..5972ed214937 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,16 +12,9 @@
12 * 04-07-1998 by Frank Denis : first step for rmdir/unlink. 12 * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
13 */ 13 */
14 14
15#include <linux/time.h>
16#include <linux/fs.h>
17#include <linux/qnx4_fs.h>
18#include <linux/kernel.h>
19#include <linux/string.h>
20#include <linux/stat.h>
21#include <linux/fcntl.h>
22#include <linux/errno.h>
23#include <linux/smp_lock.h> 15#include <linux/smp_lock.h>
24#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
17#include "qnx4.h"
25 18
26 19
27/* 20/*
@@ -187,7 +180,7 @@ int qnx4_rmdir(struct inode *dir, struct dentry *dentry)
187 de->di_status = 0; 180 de->di_status = 0;
188 memset(de->di_fname, 0, sizeof de->di_fname); 181 memset(de->di_fname, 0, sizeof de->di_fname);
189 de->di_mode = 0; 182 de->di_mode = 0;
190 mark_buffer_dirty(bh); 183 mark_buffer_dirty_inode(bh, dir);
191 clear_nlink(inode); 184 clear_nlink(inode);
192 mark_inode_dirty(inode); 185 mark_inode_dirty(inode);
193 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; 186 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
@@ -232,7 +225,7 @@ int qnx4_unlink(struct inode *dir, struct dentry *dentry)
232 de->di_status = 0; 225 de->di_status = 0;
233 memset(de->di_fname, 0, sizeof de->di_fname); 226 memset(de->di_fname, 0, sizeof de->di_fname);
234 de->di_mode = 0; 227 de->di_mode = 0;
235 mark_buffer_dirty(bh); 228 mark_buffer_dirty_inode(bh, dir);
236 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; 229 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
237 mark_inode_dirty(dir); 230 mark_inode_dirty(dir);
238 inode->i_ctime = dir->i_ctime; 231 inode->i_ctime = dir->i_ctime;
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
new file mode 100644
index 000000000000..9efc089454f6
--- /dev/null
+++ b/fs/qnx4/qnx4.h
@@ -0,0 +1,57 @@
1#include <linux/fs.h>
2#include <linux/qnx4_fs.h>
3
4#define QNX4_DEBUG 0
5
6#if QNX4_DEBUG
7#define QNX4DEBUG(X) printk X
8#else
9#define QNX4DEBUG(X) (void) 0
10#endif
11
12struct qnx4_sb_info {
13 struct buffer_head *sb_buf; /* superblock buffer */
14 struct qnx4_super_block *sb; /* our superblock */
15 unsigned int Version; /* may be useful */
16 struct qnx4_inode_entry *BitMap; /* useful */
17};
18
19struct qnx4_inode_info {
20 struct qnx4_inode_entry raw;
21 loff_t mmu_private;
22 struct inode vfs_inode;
23};
24
25extern struct inode *qnx4_iget(struct super_block *, unsigned long);
26extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd);
27extern unsigned long qnx4_count_free_blocks(struct super_block *sb);
28extern unsigned long qnx4_block_map(struct inode *inode, long iblock);
29
30extern struct buffer_head *qnx4_bread(struct inode *, int, int);
31
32extern const struct inode_operations qnx4_file_inode_operations;
33extern const struct inode_operations qnx4_dir_inode_operations;
34extern const struct file_operations qnx4_file_operations;
35extern const struct file_operations qnx4_dir_operations;
36extern int qnx4_is_free(struct super_block *sb, long block);
37extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy);
38extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd);
39extern void qnx4_truncate(struct inode *inode);
40extern void qnx4_free_inode(struct inode *inode);
41extern int qnx4_unlink(struct inode *dir, struct dentry *dentry);
42extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry);
43
44static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb)
45{
46 return sb->s_fs_info;
47}
48
49static inline struct qnx4_inode_info *qnx4_i(struct inode *inode)
50{
51 return container_of(inode, struct qnx4_inode_info, vfs_inode);
52}
53
54static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode)
55{
56 return &qnx4_i(inode)->raw;
57}
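
qnx4_i() above is the standard container_of() idiom: given a pointer to the embedded vfs_inode, recover the enclosing qnx4_inode_info. A self-contained userspace demo of the same idiom, with a made-up struct outer:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct outer { int a; int inner; };

int main(void)
{
    struct outer o = { 1, 2 };
    int *ip = &o.inner;
    struct outer *back = container_of(ip, struct outer, inner);

    printf("a=%d (round trip %s)\n", back->a,
           back == &o ? "ok" : "broken");
    return 0;
}
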
diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c
index 6437c1c3d1dd..d94d9ee241fe 100644
--- a/fs/qnx4/truncate.c
+++ b/fs/qnx4/truncate.c
@@ -10,12 +10,8 @@
10 * 30-06-1998 by Frank DENIS : ugly filler. 10 * 30-06-1998 by Frank DENIS : ugly filler.
11 */ 11 */
12 12
13#include <linux/types.h>
14#include <linux/errno.h>
15#include <linux/fs.h>
16#include <linux/qnx4_fs.h>
17#include <linux/smp_lock.h> 13#include <linux/smp_lock.h>
18#include <asm/uaccess.h> 14#include "qnx4.h"
19 15
20#ifdef CONFIG_QNX4FS_RW 16#ifdef CONFIG_QNX4FS_RW
21 17
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 607c579e5eca..38f7bd559f35 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2042,7 +2042,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
2042 * changes */ 2042 * changes */
2043 invalidate_bdev(sb->s_bdev); 2043 invalidate_bdev(sb->s_bdev);
2044 } 2044 }
2045 mutex_lock(&inode->i_mutex);
2046 mutex_lock(&dqopt->dqonoff_mutex); 2045 mutex_lock(&dqopt->dqonoff_mutex);
2047 if (sb_has_quota_loaded(sb, type)) { 2046 if (sb_has_quota_loaded(sb, type)) {
2048 error = -EBUSY; 2047 error = -EBUSY;
@@ -2054,9 +2053,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
2054 * possible) Also nobody should write to the file - we use 2053 * possible) Also nobody should write to the file - we use
2055 * special IO operations which ignore the immutable bit. */ 2054 * special IO operations which ignore the immutable bit. */
2056 down_write(&dqopt->dqptr_sem); 2055 down_write(&dqopt->dqptr_sem);
2056 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2057 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | 2057 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
2058 S_NOQUOTA); 2058 S_NOQUOTA);
2059 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; 2059 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
2060 mutex_unlock(&inode->i_mutex);
2060 up_write(&dqopt->dqptr_sem); 2061 up_write(&dqopt->dqptr_sem);
2061 sb->dq_op->drop(inode); 2062 sb->dq_op->drop(inode);
2062 } 2063 }
@@ -2080,7 +2081,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
2080 goto out_file_init; 2081 goto out_file_init;
2081 } 2082 }
2082 mutex_unlock(&dqopt->dqio_mutex); 2083 mutex_unlock(&dqopt->dqio_mutex);
2083 mutex_unlock(&inode->i_mutex);
2084 spin_lock(&dq_state_lock); 2084 spin_lock(&dq_state_lock);
2085 dqopt->flags |= dquot_state_flag(flags, type); 2085 dqopt->flags |= dquot_state_flag(flags, type);
2086 spin_unlock(&dq_state_lock); 2086 spin_unlock(&dq_state_lock);
@@ -2094,16 +2094,17 @@ out_file_init:
2094 dqopt->files[type] = NULL; 2094 dqopt->files[type] = NULL;
2095 iput(inode); 2095 iput(inode);
2096out_lock: 2096out_lock:
2097 mutex_unlock(&dqopt->dqonoff_mutex);
2098 if (oldflags != -1) { 2097 if (oldflags != -1) {
2099 down_write(&dqopt->dqptr_sem); 2098 down_write(&dqopt->dqptr_sem);
2099 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2100 /* Set the flags back (in the case of accidental quotaon() 2100 /* Set the flags back (in the case of accidental quotaon()
2101 * on a wrong file we don't want to mess up the flags) */ 2101 * on a wrong file we don't want to mess up the flags) */
2102 inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); 2102 inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
2103 inode->i_flags |= oldflags; 2103 inode->i_flags |= oldflags;
2104 mutex_unlock(&inode->i_mutex);
2104 up_write(&dqopt->dqptr_sem); 2105 up_write(&dqopt->dqptr_sem);
2105 } 2106 }
2106 mutex_unlock(&inode->i_mutex); 2107 mutex_unlock(&dqopt->dqonoff_mutex);
2107out_fmt: 2108out_fmt:
2108 put_quota_format(fmt); 2109 put_quota_format(fmt);
2109 2110
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b7f5a468f076..95c5b42384b2 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -159,10 +159,14 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd,
159 return error; 159 return error;
160} 160}
161 161
162static void quota_sync_sb(struct super_block *sb, int type) 162#ifdef CONFIG_QUOTA
163void sync_quota_sb(struct super_block *sb, int type)
163{ 164{
164 int cnt; 165 int cnt;
165 166
167 if (!sb->s_qcop->quota_sync)
168 return;
169
166 sb->s_qcop->quota_sync(sb, type); 170 sb->s_qcop->quota_sync(sb, type);
167 171
168 if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) 172 if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
@@ -191,17 +195,13 @@ static void quota_sync_sb(struct super_block *sb, int type)
191 } 195 }
192 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 196 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
193} 197}
198#endif
194 199
195void sync_dquots(struct super_block *sb, int type) 200static void sync_dquots(int type)
196{ 201{
202 struct super_block *sb;
197 int cnt; 203 int cnt;
198 204
199 if (sb) {
200 if (sb->s_qcop->quota_sync)
201 quota_sync_sb(sb, type);
202 return;
203 }
204
205 spin_lock(&sb_lock); 205 spin_lock(&sb_lock);
206restart: 206restart:
207 list_for_each_entry(sb, &super_blocks, s_list) { 207 list_for_each_entry(sb, &super_blocks, s_list) {
@@ -222,8 +222,8 @@ restart:
222 sb->s_count++; 222 sb->s_count++;
223 spin_unlock(&sb_lock); 223 spin_unlock(&sb_lock);
224 down_read(&sb->s_umount); 224 down_read(&sb->s_umount);
225 if (sb->s_root && sb->s_qcop->quota_sync) 225 if (sb->s_root)
226 quota_sync_sb(sb, type); 226 sync_quota_sb(sb, type);
227 up_read(&sb->s_umount); 227 up_read(&sb->s_umount);
228 spin_lock(&sb_lock); 228 spin_lock(&sb_lock);
229 if (__put_super_and_need_restart(sb)) 229 if (__put_super_and_need_restart(sb))
@@ -301,7 +301,10 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
301 return sb->s_qcop->set_dqblk(sb, type, id, &idq); 301 return sb->s_qcop->set_dqblk(sb, type, id, &idq);
302 } 302 }
303 case Q_SYNC: 303 case Q_SYNC:
304 sync_dquots(sb, type); 304 if (sb)
305 sync_quota_sb(sb, type);
306 else
307 sync_dquots(type);
305 return 0; 308 return 0;
306 309
307 case Q_XQUOTAON: 310 case Q_XQUOTAON:
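
The split is visible from userspace through quotactl(2): Q_SYNC with a device argument lands in sync_quota_sb() for that superblock, while a NULL device takes the sync_dquots() walk over all superblocks, assuming the usual sys_quotactl() dispatch. A sketch (may require privilege):

#include <stdio.h>
#include <sys/quota.h>

int main(void)
{
    /* NULL device: sync quota files on all filesystems (sync_dquots path) */
    if (quotactl(QCMD(Q_SYNC, USRQUOTA), NULL, 0, NULL))
        perror("quotactl(Q_SYNC)");
    return 0;
}
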
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index ebb2c417912c..11f0c06316de 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -20,6 +20,7 @@
20#include <linux/ramfs.h> 20#include <linux/ramfs.h>
21#include <linux/pagevec.h> 21#include <linux/pagevec.h>
22#include <linux/mman.h> 22#include <linux/mman.h>
23#include <linux/sched.h>
23 24
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25#include "internal.h" 26#include "internal.h"
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 3a6b193d8444..0ff7566c767c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -202,9 +202,12 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
202 return -EINVAL; 202 return -EINVAL;
203 opts->mode = option & S_IALLUGO; 203 opts->mode = option & S_IALLUGO;
204 break; 204 break;
205 default: 205 /*
206 printk(KERN_ERR "ramfs: bad mount option: %s\n", p); 206 * We might like to report bad mount options here;
207 return -EINVAL; 207 * but traditionally ramfs has ignored all mount options,
208 * and as it is used as a !CONFIG_SHMEM simple substitute
209 * for tmpfs, better continue to ignore other mount options.
210 */
208 } 211 }
209 } 212 }
210 213
diff --git a/fs/read_write.c b/fs/read_write.c
index 9d1e76bb9ee1..6c8c55dec2bc 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -805,12 +805,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
805 goto out; 805 goto out;
806 if (!(in_file->f_mode & FMODE_READ)) 806 if (!(in_file->f_mode & FMODE_READ))
807 goto fput_in; 807 goto fput_in;
808 retval = -EINVAL;
809 in_inode = in_file->f_path.dentry->d_inode;
810 if (!in_inode)
811 goto fput_in;
812 if (!in_file->f_op || !in_file->f_op->splice_read)
813 goto fput_in;
814 retval = -ESPIPE; 808 retval = -ESPIPE;
815 if (!ppos) 809 if (!ppos)
816 ppos = &in_file->f_pos; 810 ppos = &in_file->f_pos;
@@ -834,6 +828,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
834 retval = -EINVAL; 828 retval = -EINVAL;
835 if (!out_file->f_op || !out_file->f_op->sendpage) 829 if (!out_file->f_op || !out_file->f_op->sendpage)
836 goto fput_out; 830 goto fput_out;
831 in_inode = in_file->f_path.dentry->d_inode;
837 out_inode = out_file->f_path.dentry->d_inode; 832 out_inode = out_file->f_path.dentry->d_inode;
838 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); 833 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
839 if (retval < 0) 834 if (retval < 0)
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 45ee3d357c70..6d2668fdc384 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -44,13 +44,11 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
44static inline bool is_privroot_deh(struct dentry *dir, 44static inline bool is_privroot_deh(struct dentry *dir,
45 struct reiserfs_de_head *deh) 45 struct reiserfs_de_head *deh)
46{ 46{
47 int ret = 0;
48#ifdef CONFIG_REISERFS_FS_XATTR
49 struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; 47 struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
50 ret = (dir == dir->d_parent && privroot->d_inode && 48 if (reiserfs_expose_privroot(dir->d_sb))
51 deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); 49 return 0;
52#endif 50 return (dir == dir->d_parent && privroot->d_inode &&
53 return ret; 51 deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
54} 52}
55 53
56int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, 54int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 4beb964a2a3e..128d3f7c8aa5 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -1270,9 +1270,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1270 1270
1271 RFALSE(ih, "PAP-12210: ih must be 0"); 1271 RFALSE(ih, "PAP-12210: ih must be 0");
1272 1272
1273 if (is_direntry_le_ih 1273 aux_ih = B_N_PITEM_HEAD(tbS0, item_pos);
1274 (aux_ih = 1274 if (is_direntry_le_ih(aux_ih)) {
1275 B_N_PITEM_HEAD(tbS0, item_pos))) {
1276 /* we append to directory item */ 1275 /* we append to directory item */
1277 1276
1278 int entry_count; 1277 int entry_count;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 6fd0f47e45db..a14d6cd9eeda 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1131,8 +1131,6 @@ static void init_inode(struct inode *inode, struct treepath *path)
1131 REISERFS_I(inode)->i_trans_id = 0; 1131 REISERFS_I(inode)->i_trans_id = 0;
1132 REISERFS_I(inode)->i_jl = NULL; 1132 REISERFS_I(inode)->i_jl = NULL;
1133 mutex_init(&(REISERFS_I(inode)->i_mmap)); 1133 mutex_init(&(REISERFS_I(inode)->i_mmap));
1134 reiserfs_init_acl_access(inode);
1135 reiserfs_init_acl_default(inode);
1136 reiserfs_init_xattr_rwsem(inode); 1134 reiserfs_init_xattr_rwsem(inode);
1137 1135
1138 if (stat_data_v1(ih)) { 1136 if (stat_data_v1(ih)) {
@@ -1834,8 +1832,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1834 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; 1832 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1835 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); 1833 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
1836 mutex_init(&(REISERFS_I(inode)->i_mmap)); 1834 mutex_init(&(REISERFS_I(inode)->i_mmap));
1837 reiserfs_init_acl_access(inode);
1838 reiserfs_init_acl_default(inode);
1839 reiserfs_init_xattr_rwsem(inode); 1835 reiserfs_init_xattr_rwsem(inode);
1840 1836
1841 /* key to search for correct place for new stat data */ 1837 /* key to search for correct place for new stat data */
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 77f5bb746bf0..90622200b39c 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -997,7 +997,7 @@ static int reiserfs_async_progress_wait(struct super_block *s)
997 DEFINE_WAIT(wait); 997 DEFINE_WAIT(wait);
998 struct reiserfs_journal *j = SB_JOURNAL(s); 998 struct reiserfs_journal *j = SB_JOURNAL(s);
999 if (atomic_read(&j->j_async_throttle)) 999 if (atomic_read(&j->j_async_throttle))
1000 congestion_wait(WRITE, HZ / 10); 1000 congestion_wait(BLK_RW_ASYNC, HZ / 10);
1001 return 0; 1001 return 0;
1002} 1002}
1003 1003
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 381750a155f6..03d85cbf90bf 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -390,7 +390,8 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
390 390
391 if (last_first == FIRST_TO_LAST) { 391 if (last_first == FIRST_TO_LAST) {
392 /* if ( if item in position item_num in buffer SOURCE is directory item ) */ 392 /* if ( if item in position item_num in buffer SOURCE is directory item ) */
393 if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num))) 393 ih = B_N_PITEM_HEAD(src, item_num);
394 if (is_direntry_le_ih(ih))
394 leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 395 leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST,
395 item_num, 0, cpy_bytes); 396 item_num, 0, cpy_bytes);
396 else { 397 else {
@@ -418,7 +419,8 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
418 } 419 }
419 } else { 420 } else {
420 /* if ( if item in position item_num in buffer SOURCE is directory item ) */ 421 /* if ( if item in position item_num in buffer SOURCE is directory item ) */
421 if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num))) 422 ih = B_N_PITEM_HEAD(src, item_num);
423 if (is_direntry_le_ih(ih))
422 leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, 424 leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
423 item_num, 425 item_num,
424 I_ENTRY_COUNT(ih) - cpy_bytes, 426 I_ENTRY_COUNT(ih) - cpy_bytes,
@@ -774,8 +776,8 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
774 leaf_delete_items_entirely(cur_bi, first + 1, 776 leaf_delete_items_entirely(cur_bi, first + 1,
775 del_num - 1); 777 del_num - 1);
776 778
777 if (is_direntry_le_ih 779 ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1);
778 (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1))) 780 if (is_direntry_le_ih(ih))
779 /* the last item is directory */ 781 /* the last item is directory */
780 /* len = numbers of directory entries in this item */ 782 /* len = numbers of directory entries in this item */
781 len = ih_entry_count(ih); 783 len = ih_entry_count(ih);
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 238e9d9b31e0..18b315d3d104 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -82,7 +82,6 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
82 if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) { 82 if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) {
83 printk 83 printk
84 ("reiserfs_resize: unable to allocate memory for journal bitmaps\n"); 84 ("reiserfs_resize: unable to allocate memory for journal bitmaps\n");
85 unlock_super(s);
86 return -ENOMEM; 85 return -ENOMEM;
87 } 86 }
88 /* the new journal bitmaps are zero filled, now we copy in the bitmap 87 /* the new journal bitmaps are zero filled, now we copy in the bitmap
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3567fb9e3fb1..7adea74d6a8a 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -24,10 +24,10 @@
24#include <linux/exportfs.h> 24#include <linux/exportfs.h>
25#include <linux/quotaops.h> 25#include <linux/quotaops.h>
26#include <linux/vfs.h> 26#include <linux/vfs.h>
27#include <linux/mnt_namespace.h>
28#include <linux/mount.h> 27#include <linux/mount.h>
29#include <linux/namei.h> 28#include <linux/namei.h>
30#include <linux/crc32.h> 29#include <linux/crc32.h>
30#include <linux/smp_lock.h>
31 31
32struct file_system_type reiserfs_fs_type; 32struct file_system_type reiserfs_fs_type;
33 33
@@ -64,18 +64,15 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
64 64
65static int reiserfs_sync_fs(struct super_block *s, int wait) 65static int reiserfs_sync_fs(struct super_block *s, int wait)
66{ 66{
67 if (!(s->s_flags & MS_RDONLY)) { 67 struct reiserfs_transaction_handle th;
68 struct reiserfs_transaction_handle th; 68
69 reiserfs_write_lock(s); 69 reiserfs_write_lock(s);
70 if (!journal_begin(&th, s, 1)) 70 if (!journal_begin(&th, s, 1))
71 if (!journal_end_sync(&th, s, 1)) 71 if (!journal_end_sync(&th, s, 1))
72 reiserfs_flush_old_commits(s); 72 reiserfs_flush_old_commits(s);
73 s->s_dirt = 0; /* Even if it's not true. 73 s->s_dirt = 0; /* Even if it's not true.
74 * We'll loop forever in sync_supers otherwise */ 74 * We'll loop forever in sync_supers otherwise */
75 reiserfs_write_unlock(s); 75 reiserfs_write_unlock(s);
76 } else {
77 s->s_dirt = 0;
78 }
79 return 0; 76 return 0;
80} 77}
81 78
@@ -468,6 +465,11 @@ static void reiserfs_put_super(struct super_block *s)
468 struct reiserfs_transaction_handle th; 465 struct reiserfs_transaction_handle th;
469 th.t_trans_id = 0; 466 th.t_trans_id = 0;
470 467
468 lock_kernel();
469
470 if (s->s_dirt)
471 reiserfs_write_super(s);
472
471 /* change file system state to current state if it was mounted with read-write permissions */ 473 /* change file system state to current state if it was mounted with read-write permissions */
472 if (!(s->s_flags & MS_RDONLY)) { 474 if (!(s->s_flags & MS_RDONLY)) {
473 if (!journal_begin(&th, s, 10)) { 475 if (!journal_begin(&th, s, 10)) {
@@ -500,7 +502,7 @@ static void reiserfs_put_super(struct super_block *s)
500 kfree(s->s_fs_info); 502 kfree(s->s_fs_info);
501 s->s_fs_info = NULL; 503 s->s_fs_info = NULL;
502 504
503 return; 505 unlock_kernel();
504} 506}
505 507
506static struct kmem_cache *reiserfs_inode_cachep; 508static struct kmem_cache *reiserfs_inode_cachep;
@@ -526,10 +528,6 @@ static void init_once(void *foo)
526 528
527 INIT_LIST_HEAD(&ei->i_prealloc_list); 529 INIT_LIST_HEAD(&ei->i_prealloc_list);
528 inode_init_once(&ei->vfs_inode); 530 inode_init_once(&ei->vfs_inode);
529#ifdef CONFIG_REISERFS_FS_POSIX_ACL
530 ei->i_acl_access = NULL;
531 ei->i_acl_default = NULL;
532#endif
533} 531}
534 532
535static int init_inodecache(void) 533static int init_inodecache(void)
@@ -577,25 +575,6 @@ static void reiserfs_dirty_inode(struct inode *inode)
577 reiserfs_write_unlock(inode->i_sb); 575 reiserfs_write_unlock(inode->i_sb);
578} 576}
579 577
580#ifdef CONFIG_REISERFS_FS_POSIX_ACL
581static void reiserfs_clear_inode(struct inode *inode)
582{
583 struct posix_acl *acl;
584
585 acl = REISERFS_I(inode)->i_acl_access;
586 if (acl && !IS_ERR(acl))
587 posix_acl_release(acl);
588 REISERFS_I(inode)->i_acl_access = NULL;
589
590 acl = REISERFS_I(inode)->i_acl_default;
591 if (acl && !IS_ERR(acl))
592 posix_acl_release(acl);
593 REISERFS_I(inode)->i_acl_default = NULL;
594}
595#else
596#define reiserfs_clear_inode NULL
597#endif
598
599#ifdef CONFIG_QUOTA 578#ifdef CONFIG_QUOTA
600static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, 579static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
601 size_t, loff_t); 580 size_t, loff_t);
@@ -609,7 +588,6 @@ static const struct super_operations reiserfs_sops = {
609 .write_inode = reiserfs_write_inode, 588 .write_inode = reiserfs_write_inode,
610 .dirty_inode = reiserfs_dirty_inode, 589 .dirty_inode = reiserfs_dirty_inode,
611 .delete_inode = reiserfs_delete_inode, 590 .delete_inode = reiserfs_delete_inode,
612 .clear_inode = reiserfs_clear_inode,
613 .put_super = reiserfs_put_super, 591 .put_super = reiserfs_put_super,
614 .write_super = reiserfs_write_super, 592 .write_super = reiserfs_write_super,
615 .sync_fs = reiserfs_sync_fs, 593 .sync_fs = reiserfs_sync_fs,
@@ -898,6 +876,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
898 {"conv",.setmask = 1 << REISERFS_CONVERT}, 876 {"conv",.setmask = 1 << REISERFS_CONVERT},
899 {"attrs",.setmask = 1 << REISERFS_ATTRS}, 877 {"attrs",.setmask = 1 << REISERFS_ATTRS},
900 {"noattrs",.clrmask = 1 << REISERFS_ATTRS}, 878 {"noattrs",.clrmask = 1 << REISERFS_ATTRS},
879 {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT},
901#ifdef CONFIG_REISERFS_FS_XATTR 880#ifdef CONFIG_REISERFS_FS_XATTR
902 {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER}, 881 {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER},
903 {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER}, 882 {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER},
@@ -1193,6 +1172,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1193 memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); 1172 memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
1194#endif 1173#endif
1195 1174
1175 lock_kernel();
1196 rs = SB_DISK_SUPER_BLOCK(s); 1176 rs = SB_DISK_SUPER_BLOCK(s);
1197 1177
1198 if (!reiserfs_parse_options 1178 if (!reiserfs_parse_options
@@ -1315,10 +1295,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1315 1295
1316out_ok: 1296out_ok:
1317 replace_mount_options(s, new_opts); 1297 replace_mount_options(s, new_opts);
1298 unlock_kernel();
1318 return 0; 1299 return 0;
1319 1300
1320out_err: 1301out_err:
1321 kfree(new_opts); 1302 kfree(new_opts);
1303 unlock_kernel();
1322 return err; 1304 return err;
1323} 1305}
1324 1306
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8e7deb0e6964..6925b835a43b 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -46,7 +46,6 @@
46#include <linux/reiserfs_acl.h> 46#include <linux/reiserfs_acl.h>
47#include <asm/uaccess.h> 47#include <asm/uaccess.h>
48#include <net/checksum.h> 48#include <net/checksum.h>
49#include <linux/smp_lock.h>
50#include <linux/stat.h> 49#include <linux/stat.h>
51#include <linux/quotaops.h> 50#include <linux/quotaops.h>
52 51
@@ -981,7 +980,8 @@ int reiserfs_lookup_privroot(struct super_block *s)
981 strlen(PRIVROOT_NAME)); 980 strlen(PRIVROOT_NAME));
982 if (!IS_ERR(dentry)) { 981 if (!IS_ERR(dentry)) {
983 REISERFS_SB(s)->priv_root = dentry; 982 REISERFS_SB(s)->priv_root = dentry;
984 s->s_root->d_op = &xattr_lookup_poison_ops; 983 if (!reiserfs_expose_privroot(s))
984 s->s_root->d_op = &xattr_lookup_poison_ops;
985 if (dentry->d_inode) 985 if (dentry->d_inode)
986 dentry->d_inode->i_flags |= S_PRIVATE; 986 dentry->d_inode->i_flags |= S_PRIVATE;
987 } else 987 } else
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index c303c426fe2b..35d6e672a279 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -188,29 +188,6 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
188 return ERR_PTR(-EINVAL); 188 return ERR_PTR(-EINVAL);
189} 189}
190 190
191static inline void iset_acl(struct inode *inode, struct posix_acl **i_acl,
192 struct posix_acl *acl)
193{
194 spin_lock(&inode->i_lock);
195 if (*i_acl != ERR_PTR(-ENODATA))
196 posix_acl_release(*i_acl);
197 *i_acl = posix_acl_dup(acl);
198 spin_unlock(&inode->i_lock);
199}
200
201static inline struct posix_acl *iget_acl(struct inode *inode,
202 struct posix_acl **i_acl)
203{
204 struct posix_acl *acl = ERR_PTR(-ENODATA);
205
206 spin_lock(&inode->i_lock);
207 if (*i_acl != ERR_PTR(-ENODATA))
208 acl = posix_acl_dup(*i_acl);
209 spin_unlock(&inode->i_lock);
210
211 return acl;
212}
213
214/* 191/*
215 * Inode operation get_posix_acl(). 192 * Inode operation get_posix_acl().
216 * 193 *
@@ -220,34 +197,29 @@ static inline struct posix_acl *iget_acl(struct inode *inode,
220struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) 197struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
221{ 198{
222 char *name, *value; 199 char *name, *value;
223 struct posix_acl *acl, **p_acl; 200 struct posix_acl *acl;
224 int size; 201 int size;
225 int retval; 202 int retval;
226 struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); 203
204 acl = get_cached_acl(inode, type);
205 if (acl != ACL_NOT_CACHED)
206 return acl;
227 207
228 switch (type) { 208 switch (type) {
229 case ACL_TYPE_ACCESS: 209 case ACL_TYPE_ACCESS:
230 name = POSIX_ACL_XATTR_ACCESS; 210 name = POSIX_ACL_XATTR_ACCESS;
231 p_acl = &reiserfs_i->i_acl_access;
232 break; 211 break;
233 case ACL_TYPE_DEFAULT: 212 case ACL_TYPE_DEFAULT:
234 name = POSIX_ACL_XATTR_DEFAULT; 213 name = POSIX_ACL_XATTR_DEFAULT;
235 p_acl = &reiserfs_i->i_acl_default;
236 break; 214 break;
237 default: 215 default:
238 return ERR_PTR(-EINVAL); 216 BUG();
239 } 217 }
240 218
241 acl = iget_acl(inode, p_acl);
242 if (acl && !IS_ERR(acl))
243 return acl;
244 else if (PTR_ERR(acl) == -ENODATA)
245 return NULL;
246
247 size = reiserfs_xattr_get(inode, name, NULL, 0); 219 size = reiserfs_xattr_get(inode, name, NULL, 0);
248 if (size < 0) { 220 if (size < 0) {
249 if (size == -ENODATA || size == -ENOSYS) { 221 if (size == -ENODATA || size == -ENOSYS) {
250 *p_acl = ERR_PTR(-ENODATA); 222 set_cached_acl(inode, type, NULL);
251 return NULL; 223 return NULL;
252 } 224 }
253 return ERR_PTR(size); 225 return ERR_PTR(size);
@@ -262,14 +234,13 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
262 /* This shouldn't actually happen as it should have 234 /* This shouldn't actually happen as it should have
263 been caught above.. but just in case */ 235 been caught above.. but just in case */
264 acl = NULL; 236 acl = NULL;
265 *p_acl = ERR_PTR(-ENODATA);
266 } else if (retval < 0) { 237 } else if (retval < 0) {
267 acl = ERR_PTR(retval); 238 acl = ERR_PTR(retval);
268 } else { 239 } else {
269 acl = posix_acl_from_disk(value, retval); 240 acl = posix_acl_from_disk(value, retval);
270 if (!IS_ERR(acl))
271 iset_acl(inode, p_acl, acl);
272 } 241 }
242 if (!IS_ERR(acl))
243 set_cached_acl(inode, type, acl);
273 244
274 kfree(value); 245 kfree(value);
275 return acl; 246 return acl;
@@ -287,10 +258,8 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
287{ 258{
288 char *name; 259 char *name;
289 void *value = NULL; 260 void *value = NULL;
290 struct posix_acl **p_acl;
291 size_t size = 0; 261 size_t size = 0;
292 int error; 262 int error;
293 struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
294 263
295 if (S_ISLNK(inode->i_mode)) 264 if (S_ISLNK(inode->i_mode))
296 return -EOPNOTSUPP; 265 return -EOPNOTSUPP;
@@ -298,7 +267,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
298 switch (type) { 267 switch (type) {
299 case ACL_TYPE_ACCESS: 268 case ACL_TYPE_ACCESS:
300 name = POSIX_ACL_XATTR_ACCESS; 269 name = POSIX_ACL_XATTR_ACCESS;
301 p_acl = &reiserfs_i->i_acl_access;
302 if (acl) { 270 if (acl) {
303 mode_t mode = inode->i_mode; 271 mode_t mode = inode->i_mode;
304 error = posix_acl_equiv_mode(acl, &mode); 272 error = posix_acl_equiv_mode(acl, &mode);
@@ -313,7 +281,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
313 break; 281 break;
314 case ACL_TYPE_DEFAULT: 282 case ACL_TYPE_DEFAULT:
315 name = POSIX_ACL_XATTR_DEFAULT; 283 name = POSIX_ACL_XATTR_DEFAULT;
316 p_acl = &reiserfs_i->i_acl_default;
317 if (!S_ISDIR(inode->i_mode)) 284 if (!S_ISDIR(inode->i_mode))
318 return acl ? -EACCES : 0; 285 return acl ? -EACCES : 0;
319 break; 286 break;
@@ -346,7 +313,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
346 kfree(value); 313 kfree(value);
347 314
348 if (!error) 315 if (!error)
349 iset_acl(inode, p_acl, acl); 316 set_cached_acl(inode, type, acl);
350 317
351 return error; 318 return error;
352} 319}
@@ -379,11 +346,8 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
379 } 346 }
380 347
381 acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT); 348 acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT);
382 if (IS_ERR(acl)) { 349 if (IS_ERR(acl))
383 if (PTR_ERR(acl) == -ENODATA)
384 goto apply_umask;
385 return PTR_ERR(acl); 350 return PTR_ERR(acl);
386 }
387 351
388 if (acl) { 352 if (acl) {
389 struct posix_acl *acl_copy; 353 struct posix_acl *acl_copy;
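
The reiserfs-private i_acl_access/i_acl_default fields and their iget_acl()/iset_acl() helpers give way to the generic VFS cache via get_cached_acl()/set_cached_acl(). The general shape is cache-aside; the model below is a hypothetical single-threaded userspace illustration (the real helpers serialize with i_lock), with NOT_CACHED playing the role of ACL_NOT_CACHED:

#include <stdio.h>

#define NOT_CACHED ((void *)-1)         /* plays the role of ACL_NOT_CACHED */

static void *cache_slot = NOT_CACHED;   /* the per-inode pointer, modeled */

static void *load_from_store(void)      /* stand-in for the xattr read */
{
    static int stored = 42;
    return &stored;
}

static void *get_cached_or_load(void)
{
    void *v = cache_slot;

    if (v != NOT_CACHED)                /* fast path: cache hit */
        return v;
    v = load_from_store();              /* slow path: fetch from store... */
    cache_slot = v;                     /* ...then populate the cache */
    return v;
}

int main(void)
{
    printf("%d %d\n", *(int *)get_cached_or_load(),
           *(int *)get_cached_or_load());       /* second call hits cache */
    return 0;
}
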
diff --git a/fs/select.c b/fs/select.c
index 0fe0e1469df3..8084834e123e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -110,6 +110,7 @@ void poll_initwait(struct poll_wqueues *pwq)
 {
 	init_poll_funcptr(&pwq->pt, __pollwait);
 	pwq->polling_task = current;
+	pwq->triggered = 0;
 	pwq->error = 0;
 	pwq->table = NULL;
 	pwq->inline_index = 0;
@@ -168,7 +169,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
 	return table->entry++;
 }
 
-static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
 {
 	struct poll_wqueues *pwq = wait->private;
 	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
@@ -194,6 +195,16 @@ static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
 	return default_wake_function(&dummy_wait, mode, sync, key);
 }
 
+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct poll_table_entry *entry;
+
+	entry = container_of(wait, struct poll_table_entry, wait);
+	if (key && !((unsigned long)key & entry->key))
+		return 0;
+	return __pollwake(wait, mode, sync, key);
+}
+
 /* Add a new entry */
 static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 				poll_table *p)
@@ -205,6 +216,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 	get_file(filp);
 	entry->filp = filp;
 	entry->wait_address = wait_address;
+	entry->key = p->key;
 	init_waitqueue_func_entry(&entry->wait, pollwake);
 	entry->wait.private = pwq;
 	add_wait_queue(wait_address, &entry->wait);
@@ -362,6 +374,18 @@ get_max:
 #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
 #define POLLEX_SET (POLLPRI)
 
+static inline void wait_key_set(poll_table *wait, unsigned long in,
+				unsigned long out, unsigned long bit)
+{
+	if (wait) {
+		wait->key = POLLEX_SET;
+		if (in & bit)
+			wait->key |= POLLIN_SET;
+		if (out & bit)
+			wait->key |= POLLOUT_SET;
+	}
+}
+
 int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 {
 	ktime_t expire, *to = NULL;
@@ -418,20 +442,25 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 				if (file) {
 					f_op = file->f_op;
 					mask = DEFAULT_POLLMASK;
-					if (f_op && f_op->poll)
-						mask = (*f_op->poll)(file, retval ? NULL : wait);
+					if (f_op && f_op->poll) {
+						wait_key_set(wait, in, out, bit);
+						mask = (*f_op->poll)(file, wait);
+					}
 					fput_light(file, fput_needed);
 					if ((mask & POLLIN_SET) && (in & bit)) {
 						res_in |= bit;
 						retval++;
+						wait = NULL;
 					}
 					if ((mask & POLLOUT_SET) && (out & bit)) {
 						res_out |= bit;
 						retval++;
+						wait = NULL;
 					}
 					if ((mask & POLLEX_SET) && (ex & bit)) {
 						res_ex |= bit;
 						retval++;
+						wait = NULL;
 					}
 				}
 			}
@@ -685,8 +714,12 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 		mask = POLLNVAL;
 		if (file != NULL) {
 			mask = DEFAULT_POLLMASK;
-			if (file->f_op && file->f_op->poll)
+			if (file->f_op && file->f_op->poll) {
+				if (pwait)
+					pwait->key = pollfd->events |
+							POLLERR | POLLHUP;
 				mask = file->f_op->poll(file, pwait);
+			}
			/* Mask out unneeded events. */
 			mask &= pollfd->events | POLLERR | POLLHUP;
 			fput_light(file, fput_needed);
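
Taken together, the select.c hunks implement keyed wakeups for poll/select: __pollwait() records the event mask the waiter cares about in entry->key, and the new pollwake() wrapper silently drops any wakeup whose key has no bit in common with it, so a task selecting only for readability is no longer woken by writability traffic. A waker opts in by passing an event mask as the wake-up key. A hedged driver-side sketch (the mydev structure is illustrative, and wake_up_interruptible_poll() is assumed to be the keyed-wakeup helper from the same series):

	#include <linux/poll.h>
	#include <linux/wait.h>

	struct mydev {
		wait_queue_head_t read_wq;
		int have_data;
	};

	/* Deliver a keyed wakeup: only waiters whose poll key intersects
	 * POLLIN | POLLRDNORM run their callback; the rest skip pollwake(). */
	static void mydev_data_arrived(struct mydev *d)
	{
		d->have_data = 1;
		wake_up_interruptible_poll(&d->read_wq, POLLIN | POLLRDNORM);
	}
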
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 7f40f30c55c5..6c959275f2d0 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -640,6 +640,26 @@ int seq_puts(struct seq_file *m, const char *s)
 }
 EXPORT_SYMBOL(seq_puts);
 
+/**
+ * seq_write - write arbitrary data to buffer
+ * @seq: seq_file identifying the buffer to which data should be written
+ * @data: data address
+ * @len: number of bytes
+ *
+ * Return 0 on success, non-zero otherwise.
+ */
+int seq_write(struct seq_file *seq, const void *data, size_t len)
+{
+	if (seq->count + len < seq->size) {
+		memcpy(seq->buf + seq->count, data, len);
+		seq->count += len;
+		return 0;
+	}
+	seq->count = seq->size;
+	return -1;
+}
+EXPORT_SYMBOL(seq_write);
+
 struct list_head *seq_list_start(struct list_head *head, loff_t pos)
 {
 	struct list_head *lh;
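
seq_write() rounds out the seq_file output helpers: seq_printf() and seq_puts() both require NUL-terminated format/string data, while seq_write() copies an arbitrary byte range. A hedged usage sketch of a show() callback (my_blob and my_blob_len are hypothetical):

	/* Sketch: emit a raw, possibly non-NUL-terminated blob. */
	static int my_show(struct seq_file *m, void *v)
	{
		seq_puts(m, "blob: ");
		seq_write(m, my_blob, my_blob_len);	/* no format parsing */
		seq_putc(m, '\n');
		return 0;
	}
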
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index fc27fbfc5397..1402d2d54f52 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -474,6 +474,8 @@ smb_put_super(struct super_block *sb)
 {
 	struct smb_sb_info *server = SMB_SB(sb);
 
+	lock_kernel();
+
 	smb_lock_server(server);
 	server->state = CONN_INVALID;
 	smbiod_unregister_server(server);
@@ -489,6 +491,8 @@ smb_put_super(struct super_block *sb)
 	smb_unlock_server(server);
 	put_pid(server->conn_pid);
 	kfree(server);
+
+	unlock_kernel();
 }
 
 static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
diff --git a/fs/splice.c b/fs/splice.c
index 666953d59a35..73766d24f97b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -507,9 +507,131 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 
 	return ret;
 }
-
 EXPORT_SYMBOL(generic_file_splice_read);
 
+static const struct pipe_buf_operations default_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.confirm = generic_pipe_buf_confirm,
+	.release = generic_pipe_buf_release,
+	.steal = generic_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
+static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
+			    unsigned long vlen, loff_t offset)
+{
+	mm_segment_t old_fs;
+	loff_t pos = offset;
+	ssize_t res;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
+	set_fs(old_fs);
+
+	return res;
+}
+
+static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
+			    loff_t pos)
+{
+	mm_segment_t old_fs;
+	ssize_t res;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = vfs_write(file, (const char __user *)buf, count, &pos);
+	set_fs(old_fs);
+
+	return res;
+}
+
+ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
+				 struct pipe_inode_info *pipe, size_t len,
+				 unsigned int flags)
+{
+	unsigned int nr_pages;
+	unsigned int nr_freed;
+	size_t offset;
+	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
+	struct iovec vec[PIPE_BUFFERS];
+	pgoff_t index;
+	ssize_t res;
+	size_t this_len;
+	int error;
+	int i;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &default_pipe_buf_ops,
+		.spd_release = spd_release_page,
+	};
+
+	index = *ppos >> PAGE_CACHE_SHIFT;
+	offset = *ppos & ~PAGE_CACHE_MASK;
+	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
+		struct page *page;
+
+		page = alloc_page(GFP_USER);
+		error = -ENOMEM;
+		if (!page)
+			goto err;
+
+		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
+		vec[i].iov_base = (void __user *) page_address(page);
+		vec[i].iov_len = this_len;
+		pages[i] = page;
+		spd.nr_pages++;
+		len -= this_len;
+		offset = 0;
+	}
+
+	res = kernel_readv(in, vec, spd.nr_pages, *ppos);
+	if (res < 0) {
+		error = res;
+		goto err;
+	}
+
+	error = 0;
+	if (!res)
+		goto err;
+
+	nr_freed = 0;
+	for (i = 0; i < spd.nr_pages; i++) {
+		this_len = min_t(size_t, vec[i].iov_len, res);
+		partial[i].offset = 0;
+		partial[i].len = this_len;
+		if (!this_len) {
+			__free_page(pages[i]);
+			pages[i] = NULL;
+			nr_freed++;
+		}
+		res -= this_len;
+	}
+	spd.nr_pages -= nr_freed;
+
+	res = splice_to_pipe(pipe, &spd);
+	if (res > 0)
+		*ppos += res;
+
+	return res;
+
+err:
+	for (i = 0; i < spd.nr_pages; i++)
+		__free_page(pages[i]);
+
+	return error;
+}
+EXPORT_SYMBOL(default_file_splice_read);
+
 /*
  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
  * using sendpage(). Return the number of bytes sent.
@@ -881,6 +1003,36 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 EXPORT_SYMBOL(generic_file_splice_write);
 
+static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+			  struct splice_desc *sd)
+{
+	int ret;
+	void *data;
+
+	ret = buf->ops->confirm(pipe, buf);
+	if (ret)
+		return ret;
+
+	data = buf->ops->map(pipe, buf, 0);
+	ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
+	buf->ops->unmap(pipe, buf, data);
+
+	return ret;
+}
+
+static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
+					 struct file *out, loff_t *ppos,
+					 size_t len, unsigned int flags)
+{
+	ssize_t ret;
+
+	ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
+	if (ret > 0)
+		*ppos += ret;
+
+	return ret;
+}
+
 /**
  * generic_splice_sendpage - splice data from a pipe to a socket
  * @pipe: pipe to splice from
@@ -908,11 +1060,10 @@ EXPORT_SYMBOL(generic_splice_sendpage);
 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 			   loff_t *ppos, size_t len, unsigned int flags)
 {
+	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
+				loff_t *, size_t, unsigned int);
 	int ret;
 
-	if (unlikely(!out->f_op || !out->f_op->splice_write))
-		return -EINVAL;
-
 	if (unlikely(!(out->f_mode & FMODE_WRITE)))
 		return -EBADF;
 
@@ -923,7 +1074,11 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 	if (unlikely(ret < 0))
 		return ret;
 
-	return out->f_op->splice_write(pipe, out, ppos, len, flags);
+	splice_write = out->f_op->splice_write;
+	if (!splice_write)
+		splice_write = default_file_splice_write;
+
+	return splice_write(pipe, out, ppos, len, flags);
 }
 
 /*
@@ -933,11 +1088,10 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 			 struct pipe_inode_info *pipe, size_t len,
 			 unsigned int flags)
 {
+	ssize_t (*splice_read)(struct file *, loff_t *,
+			       struct pipe_inode_info *, size_t, unsigned int);
 	int ret;
 
-	if (unlikely(!in->f_op || !in->f_op->splice_read))
-		return -EINVAL;
-
 	if (unlikely(!(in->f_mode & FMODE_READ)))
 		return -EBADF;
 
@@ -945,7 +1099,11 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 	if (unlikely(ret < 0))
 		return ret;
 
-	return in->f_op->splice_read(in, ppos, pipe, len, flags);
+	splice_read = in->f_op->splice_read;
+	if (!splice_read)
+		splice_read = default_file_splice_read;
+
+	return splice_read(in, ppos, pipe, len, flags);
 }
 
 /**
@@ -1112,6 +1270,9 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 	return ret;
 }
 
+static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
+			       struct pipe_inode_info *opipe,
+			       size_t len, unsigned int flags);
 /*
  * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
  * location, so checking ->i_pipe is not enough to verify that this is a
@@ -1132,12 +1293,32 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		      struct file *out, loff_t __user *off_out,
 		      size_t len, unsigned int flags)
 {
-	struct pipe_inode_info *pipe;
+	struct pipe_inode_info *ipipe;
+	struct pipe_inode_info *opipe;
 	loff_t offset, *off;
 	long ret;
 
-	pipe = pipe_info(in->f_path.dentry->d_inode);
-	if (pipe) {
+	ipipe = pipe_info(in->f_path.dentry->d_inode);
+	opipe = pipe_info(out->f_path.dentry->d_inode);
+
+	if (ipipe && opipe) {
+		if (off_in || off_out)
+			return -ESPIPE;
+
+		if (!(in->f_mode & FMODE_READ))
+			return -EBADF;
+
+		if (!(out->f_mode & FMODE_WRITE))
+			return -EBADF;
+
+		/* Splicing to self would be fun, but... */
+		if (ipipe == opipe)
+			return -EINVAL;
+
+		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
+	}
+
+	if (ipipe) {
 		if (off_in)
 			return -ESPIPE;
 		if (off_out) {
@@ -1149,7 +1330,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		} else
 			off = &out->f_pos;
 
-		ret = do_splice_from(pipe, out, off, len, flags);
+		ret = do_splice_from(ipipe, out, off, len, flags);
 
 		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
 			ret = -EFAULT;
@@ -1157,8 +1338,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		return ret;
 	}
 
-	pipe = pipe_info(out->f_path.dentry->d_inode);
-	if (pipe) {
+	if (opipe) {
 		if (off_out)
 			return -ESPIPE;
 		if (off_in) {
@@ -1170,7 +1350,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		} else
 			off = &in->f_pos;
 
-		ret = do_splice_to(in, off, pipe, len, flags);
+		ret = do_splice_to(in, off, opipe, len, flags);
 
 		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
 			ret = -EFAULT;
@@ -1511,7 +1691,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
-static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 {
 	int ret;
 
@@ -1549,7 +1729,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 * Make sure there's writeable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
-static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 {
 	int ret;
 
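
The hunk that follows adds the pipe-to-pipe copy path. Its ABBA comment deserves unpacking: two tasks splicing in opposite directions (A to B and B to A) would each hold one pipe mutex while waiting for the other, so pipe_double_lock() has to impose a global acquisition order. A generic, hedged sketch of the order-by-address idiom it relies on (double_lock() here is illustrative, not the kernel's implementation; the a == b case is excluded because the caller rejects ipipe == opipe):

	/* Sketch: take two mutexes in a globally consistent (address) order
	 * so concurrent A->B and B->A users cannot deadlock. */
	static void double_lock(struct mutex *a, struct mutex *b)
	{
		if (a < b) {
			mutex_lock(a);
			mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
		} else {
			mutex_lock(b);
			mutex_lock_nested(a, SINGLE_DEPTH_NESTING);
		}
	}
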
@@ -1587,6 +1767,124 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 }
 
 /*
+ * Splice contents of ipipe to opipe.
+ */
+static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
+			       struct pipe_inode_info *opipe,
+			       size_t len, unsigned int flags)
+{
+	struct pipe_buffer *ibuf, *obuf;
+	int ret = 0, nbuf;
+	bool input_wakeup = false;
+
+
+retry:
+	ret = ipipe_prep(ipipe, flags);
+	if (ret)
+		return ret;
+
+	ret = opipe_prep(opipe, flags);
+	if (ret)
+		return ret;
+
+	/*
+	 * Potential ABBA deadlock, work around it by ordering lock
+	 * grabbing by pipe info address. Otherwise two different processes
+	 * could deadlock (one doing tee from A -> B, the other from B -> A).
+	 */
+	pipe_double_lock(ipipe, opipe);
+
+	do {
+		if (!opipe->readers) {
+			send_sig(SIGPIPE, current, 0);
+			if (!ret)
+				ret = -EPIPE;
+			break;
+		}
+
+		if (!ipipe->nrbufs && !ipipe->writers)
+			break;
+
+		/*
+		 * Cannot make any progress, because either the input
+		 * pipe is empty or the output pipe is full.
+		 */
+		if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
+			/* Already processed some buffers, break */
+			if (ret)
+				break;
+
+			if (flags & SPLICE_F_NONBLOCK) {
+				ret = -EAGAIN;
+				break;
+			}
+
+			/*
+			 * We raced with another reader/writer and haven't
+			 * managed to process any buffers. A zero return
+			 * value means EOF, so retry instead.
+			 */
+			pipe_unlock(ipipe);
+			pipe_unlock(opipe);
+			goto retry;
+		}
+
+		ibuf = ipipe->bufs + ipipe->curbuf;
+		nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
+		obuf = opipe->bufs + nbuf;
+
+		if (len >= ibuf->len) {
+			/*
+			 * Simply move the whole buffer from ipipe to opipe
+			 */
+			*obuf = *ibuf;
+			ibuf->ops = NULL;
+			opipe->nrbufs++;
+			ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
+			ipipe->nrbufs--;
+			input_wakeup = true;
+		} else {
+			/*
+			 * Get a reference to this pipe buffer,
+			 * so we can copy the contents over.
+			 */
+			ibuf->ops->get(ipipe, ibuf);
+			*obuf = *ibuf;
+
+			/*
+			 * Don't inherit the gift flag, we need to
+			 * prevent multiple steals of this page.
+			 */
+			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+
+			obuf->len = len;
+			opipe->nrbufs++;
+			ibuf->offset += obuf->len;
+			ibuf->len -= obuf->len;
+		}
+		ret += obuf->len;
+		len -= obuf->len;
+	} while (len);
+
+	pipe_unlock(ipipe);
+	pipe_unlock(opipe);
+
+	/*
+	 * If we put data in the output pipe, wakeup any potential readers.
+	 */
+	if (ret > 0) {
+		smp_mb();
+		if (waitqueue_active(&opipe->wait))
+			wake_up_interruptible(&opipe->wait);
+		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
+	}
+	if (input_wakeup)
+		wakeup_pipe_writers(ipipe);
+
+	return ret;
+}
+
+/*
 * Link contents of ipipe to opipe.
 */
 static int link_pipe(struct pipe_inode_info *ipipe,
@@ -1690,9 +1988,9 @@ static long do_tee(struct file *in, struct file *out, size_t len,
	 * Keep going, unless we encounter an error. The ipipe/opipe
	 * ordering doesn't really matter.
	 */
-	ret = link_ipipe_prep(ipipe, flags);
+	ret = ipipe_prep(ipipe, flags);
 	if (!ret) {
-		ret = link_opipe_prep(opipe, flags);
+		ret = opipe_prep(opipe, flags);
 		if (!ret)
 			ret = link_pipe(ipipe, opipe, len, flags);
 	}
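
Two user-visible changes fall out of the splice.c hunks: files that lack ->splice_read/->splice_write no longer fail with -EINVAL (they fall back to the new default_file_splice_read()/default_file_splice_write() paths), and splice(2) between two pipes now works without an intermediate file. A user-space sketch of the latter, which needs a kernel with these hunks applied:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int a[2], b[2];
		char buf[8];
		ssize_t n, m;

		if (pipe(a) || pipe(b))
			return 1;
		write(a[1], "hello", 5);
		/* Data moves pipe->pipe inside the kernel, no user copy. */
		n = splice(a[0], NULL, b[1], NULL, 5, 0);
		if (n < 0) {
			perror("splice");	/* EINVAL on older kernels */
			return 1;
		}
		m = read(b[0], buf, sizeof(buf));
		if (m > 0)
			printf("spliced %zd, read back %.*s\n", n, (int)m, buf);
		return 0;
	}
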
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 0adc624c956f..cb5fc57e370b 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -30,6 +30,7 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <linux/mutex.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
@@ -338,6 +339,8 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data)
 
 static void squashfs_put_super(struct super_block *sb)
 {
+	lock_kernel();
+
 	if (sb->s_fs_info) {
 		struct squashfs_sb_info *sbi = sb->s_fs_info;
 		squashfs_cache_delete(sbi->block_cache);
@@ -350,6 +353,8 @@ static void squashfs_put_super(struct super_block *sb)
 		kfree(sb->s_fs_info);
 		sb->s_fs_info = NULL;
 	}
+
+	unlock_kernel();
 }
 
 
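
The smbfs and squashfs hunks above are two instances of the same series-wide pattern: the VFS (see the generic_shutdown_super() hunk in fs/super.c below) stops taking the big kernel lock around ->put_super, so the few filesystems that still depend on the BKL must now take it themselves. The per-filesystem boilerplate reduces to a sketch like this (the teardown body is elided):

	#include <linux/smp_lock.h>

	/* Sketch: a ->put_super that still wants BKL protection must say so
	 * itself, now that the caller no longer holds it. */
	static void example_put_super(struct super_block *sb)
	{
		lock_kernel();
		/* ... free sb->s_fs_info and other private state ... */
		unlock_kernel();
	}
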
diff --git a/fs/super.c b/fs/super.c
index 1943fdf655fa..2761d3e22ed9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -28,7 +28,6 @@
 #include <linux/blkdev.h>
 #include <linux/quotaops.h>
 #include <linux/namei.h>
-#include <linux/buffer_head.h>		/* for fsync_super() */
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
@@ -38,7 +37,6 @@
 #include <linux/kobject.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
-#include <linux/async.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -72,7 +70,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
 	INIT_HLIST_HEAD(&s->s_anon);
 	INIT_LIST_HEAD(&s->s_inodes);
 	INIT_LIST_HEAD(&s->s_dentry_lru);
-	INIT_LIST_HEAD(&s->s_async_list);
 	init_rwsem(&s->s_umount);
 	mutex_init(&s->s_lock);
 	lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -285,38 +282,6 @@ void unlock_super(struct super_block * sb)
 EXPORT_SYMBOL(lock_super);
 EXPORT_SYMBOL(unlock_super);
 
-/*
- * Write out and wait upon all dirty data associated with this
- * superblock. Filesystem data as well as the underlying block
- * device. Takes the superblock lock. Requires a second blkdev
- * flush by the caller to complete the operation.
- */
-void __fsync_super(struct super_block *sb)
-{
-	sync_inodes_sb(sb, 0);
-	vfs_dq_sync(sb);
-	lock_super(sb);
-	if (sb->s_dirt && sb->s_op->write_super)
-		sb->s_op->write_super(sb);
-	unlock_super(sb);
-	if (sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, 1);
-	sync_blockdev(sb->s_bdev);
-	sync_inodes_sb(sb, 1);
-}
-
-/*
- * Write out and wait upon all dirty data associated with this
- * superblock. Filesystem data as well as the underlying block
- * device. Takes the superblock lock.
- */
-int fsync_super(struct super_block *sb)
-{
-	__fsync_super(sb);
-	return sync_blockdev(sb->s_bdev);
-}
-EXPORT_SYMBOL_GPL(fsync_super);
-
 /**
 * generic_shutdown_super - common helper for ->kill_sb()
 * @sb: superblock to kill
@@ -338,21 +303,13 @@ void generic_shutdown_super(struct super_block *sb)
 
 	if (sb->s_root) {
 		shrink_dcache_for_umount(sb);
-		fsync_super(sb);
-		lock_super(sb);
+		sync_filesystem(sb);
+		get_fs_excl();
 		sb->s_flags &= ~MS_ACTIVE;
 
-		/*
-		 * wait for asynchronous fs operations to finish before going further
-		 */
-		async_synchronize_full_domain(&sb->s_async_list);
-
 		/* bad name - it should be evict_inodes() */
 		invalidate_inodes(sb);
-		lock_kernel();
 
-		if (sop->write_super && sb->s_dirt)
-			sop->write_super(sb);
 		if (sop->put_super)
 			sop->put_super(sb);
 
@@ -362,9 +319,7 @@ void generic_shutdown_super(struct super_block *sb)
			   "Self-destruct in 5 seconds. Have a nice day...\n",
			   sb->s_id);
 		}
-
-		unlock_kernel();
-		unlock_super(sb);
+		put_fs_excl();
 	}
 	spin_lock(&sb_lock);
 	/* should be initialized for __put_super_and_need_restart() */
@@ -441,16 +396,14 @@ void drop_super(struct super_block *sb)
 
 EXPORT_SYMBOL(drop_super);
 
-static inline void write_super(struct super_block *sb)
-{
-	lock_super(sb);
-	if (sb->s_root && sb->s_dirt)
-		if (sb->s_op->write_super)
-			sb->s_op->write_super(sb);
-	unlock_super(sb);
-}
-
-/*
+/**
+ * sync_supers - helper for periodic superblock writeback
+ *
+ * Call the write_super method if present on all dirty superblocks in
+ * the system. This is for the periodic writeback used by most older
+ * filesystems. For data integrity superblock writeback use
+ * sync_filesystems() instead.
+ *
 * Note: check the dirty flag before waiting, so we don't
 * hold up the sync while mounting a device. (The newly
 * mounted device won't need syncing.)
@@ -462,12 +415,15 @@ void sync_supers(void)
 	spin_lock(&sb_lock);
restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (sb->s_dirt) {
+		if (sb->s_op->write_super && sb->s_dirt) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
+
 			down_read(&sb->s_umount);
-			write_super(sb);
+			if (sb->s_root && sb->s_dirt)
+				sb->s_op->write_super(sb);
 			up_read(&sb->s_umount);
+
 			spin_lock(&sb_lock);
 			if (__put_super_and_need_restart(sb))
 				goto restart;
@@ -476,60 +432,6 @@ restart:
 	spin_unlock(&sb_lock);
 }
 
-/*
- * Call the ->sync_fs super_op against all filesystems which are r/w and
- * which implement it.
- *
- * This operation is careful to avoid the livelock which could easily happen
- * if two or more filesystems are being continuously dirtied. s_need_sync_fs
- * is used only here. We set it against all filesystems and then clear it as
- * we sync them. So redirtied filesystems are skipped.
- *
- * But if process A is currently running sync_filesystems and then process B
- * calls sync_filesystems as well, process B will set all the s_need_sync_fs
- * flags again, which will cause process A to resync everything. Fix that with
- * a local mutex.
- *
- * (Fabian) Avoid sync_fs with clean fs & wait mode 0
- */
-void sync_filesystems(int wait)
-{
-	struct super_block *sb;
-	static DEFINE_MUTEX(mutex);
-
-	mutex_lock(&mutex);		/* Could be down_interruptible */
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_op->sync_fs)
-			continue;
-		if (sb->s_flags & MS_RDONLY)
-			continue;
-		sb->s_need_sync_fs = 1;
-	}
-
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_need_sync_fs)
-			continue;
-		sb->s_need_sync_fs = 0;
-		if (sb->s_flags & MS_RDONLY)
-			continue;	/* hm. Was remounted r/o meanwhile */
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		async_synchronize_full_domain(&sb->s_async_list);
-		if (sb->s_root && (wait || sb->s_dirt))
-			sb->s_op->sync_fs(sb, wait);
-		up_read(&sb->s_umount);
-		/* restart only when sb is no longer on the list */
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-	mutex_unlock(&mutex);
-}
-
 /**
 * get_super - get the superblock of a device
 * @bdev: device to get the superblock for
@@ -616,45 +518,6 @@ out:
 }
 
 /**
- * mark_files_ro - mark all files read-only
- * @sb: superblock in question
- *
- * All files are marked read-only. We don't care about pending
- * delete files so this should be used in 'force' mode only.
- */
-
-static void mark_files_ro(struct super_block *sb)
-{
-	struct file *f;
-
-retry:
-	file_list_lock();
-	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
-		struct vfsmount *mnt;
-		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
-			continue;
-		if (!file_count(f))
-			continue;
-		if (!(f->f_mode & FMODE_WRITE))
-			continue;
-		f->f_mode &= ~FMODE_WRITE;
-		if (file_check_writeable(f) != 0)
-			continue;
-		file_release_write(f);
-		mnt = mntget(f->f_path.mnt);
-		file_list_unlock();
-		/*
-		 * This can sleep, so we can't hold
-		 * the file_list_lock() spinlock.
-		 */
-		mnt_drop_write(mnt);
-		mntput(mnt);
-		goto retry;
-	}
-	file_list_unlock();
-}
-
-/**
 * do_remount_sb - asks filesystem to change mount options.
 * @sb: superblock in question
 * @flags: numeric part of options
@@ -675,7 +538,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	if (flags & MS_RDONLY)
 		acct_auto_close(sb);
 	shrink_dcache_sb(sb);
-	fsync_super(sb);
+	sync_filesystem(sb);
 
 	/* If we are remounting RDONLY and current sb is read/write,
	   make sure there are no rw files opened */
@@ -691,9 +554,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
 
 	if (sb->s_op->remount_fs) {
-		lock_super(sb);
 		retval = sb->s_op->remount_fs(sb, &flags, data);
-		unlock_super(sb);
 		if (retval)
 			return retval;
 	}
@@ -711,18 +572,17 @@ static void do_emergency_remount(struct work_struct *work)
 	list_for_each_entry(sb, &super_blocks, s_list) {
 		sb->s_count++;
 		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
+		down_write(&sb->s_umount);
 		if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
			/*
			 * ->remount_fs needs lock_kernel().
			 *
			 * What lock protects sb->s_flags??
			 */
-			lock_kernel();
 			do_remount_sb(sb, MS_RDONLY, NULL, 1);
-			unlock_kernel();
 		}
-		drop_super(sb);
+		up_write(&sb->s_umount);
+		put_super(sb);
 		spin_lock(&sb_lock);
 	}
 	spin_unlock(&sb_lock);
@@ -748,6 +608,7 @@ void emergency_remount(void)
 
 static DEFINE_IDA(unnamed_dev_ida);
 static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
+static int unnamed_dev_start = 0; /* don't bother trying below it */
 
 int set_anon_super(struct super_block *s, void *data)
 {
@@ -758,7 +619,9 @@ int set_anon_super(struct super_block *s, void *data)
 	if (ida_pre_get(&unnamed_dev_ida, GFP_ATOMIC) == 0)
 		return -ENOMEM;
 	spin_lock(&unnamed_dev_lock);
-	error = ida_get_new(&unnamed_dev_ida, &dev);
+	error = ida_get_new_above(&unnamed_dev_ida, unnamed_dev_start, &dev);
+	if (!error)
+		unnamed_dev_start = dev + 1;
 	spin_unlock(&unnamed_dev_lock);
 	if (error == -EAGAIN)
		/* We raced and lost with another CPU. */
@@ -769,6 +632,8 @@ int set_anon_super(struct super_block *s, void *data)
 	if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) {
 		spin_lock(&unnamed_dev_lock);
 		ida_remove(&unnamed_dev_ida, dev);
+		if (unnamed_dev_start > dev)
+			unnamed_dev_start = dev;
 		spin_unlock(&unnamed_dev_lock);
 		return -EMFILE;
 	}
@@ -785,6 +650,8 @@ void kill_anon_super(struct super_block *sb)
 	generic_shutdown_super(sb);
 	spin_lock(&unnamed_dev_lock);
 	ida_remove(&unnamed_dev_ida, slot);
+	if (slot < unnamed_dev_start)
+		unnamed_dev_start = slot;
 	spin_unlock(&unnamed_dev_lock);
 }
 
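
After the super.c hunks, the split of responsibilities is: sync_filesystem() (its new home is fs/sync.c, below) handles data-integrity writeback, while sync_supers() only services the periodic ->write_super callbacks of older filesystems, and it now skips superblocks without that method entirely. A hedged sketch of what the periodic contract leaves for a filesystem to do (example_flush_sb() is hypothetical):

	/* Sketch: ->write_super is now purely the periodic-writeback hook;
	 * it must clear s_dirt itself or sync_supers() keeps calling it. */
	static void example_write_super(struct super_block *sb)
	{
		lock_super(sb);
		example_flush_sb(sb);	/* hypothetical: dirty the metadata bhs */
		sb->s_dirt = 0;
		unlock_super(sb);
	}
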
diff --git a/fs/sync.c b/fs/sync.c
index 7abc65fbf21d..3422ba61d86d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -13,38 +13,128 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include "internal.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
			SYNC_FILE_RANGE_WAIT_AFTER)
 
 /*
- * sync everything. Start out by waking pdflush, because that writes back
- * all queues in parallel.
+ * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
+ * just dirties buffers with inodes so we have to submit IO for these buffers
+ * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
+ * case write_inode() functions do sync_dirty_buffer() and thus effectively
+ * write one block at a time.
 */
-static void do_sync(unsigned long wait)
+static int __sync_filesystem(struct super_block *sb, int wait)
 {
-	wakeup_pdflush(0);
-	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
-	vfs_dq_sync(NULL);
-	sync_supers();		/* Write the superblocks */
-	sync_filesystems(0);	/* Start syncing the filesystems */
-	sync_filesystems(wait);	/* Waitingly sync the filesystems */
-	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
+	/* Avoid doing twice syncing and cache pruning for quota sync */
 	if (!wait)
-		printk("Emergency Sync complete\n");
-	if (unlikely(laptop_mode))
-		laptop_sync_completion();
+		writeout_quota_sb(sb, -1);
+	else
+		sync_quota_sb(sb, -1);
+	sync_inodes_sb(sb, wait);
+	if (sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, wait);
+	return __sync_blockdev(sb->s_bdev, wait);
+}
+
+/*
+ * Write out and wait upon all dirty data associated with this
+ * superblock. Filesystem data as well as the underlying block
+ * device. Takes the superblock lock.
+ */
+int sync_filesystem(struct super_block *sb)
+{
+	int ret;
+
+	/*
+	 * We need to be protected against the filesystem going from
+	 * r/o to r/w or vice versa.
+	 */
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	/*
+	 * No point in syncing out anything if the filesystem is read-only.
+	 */
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	ret = __sync_filesystem(sb, 0);
+	if (ret < 0)
+		return ret;
+	return __sync_filesystem(sb, 1);
+}
+EXPORT_SYMBOL_GPL(sync_filesystem);
+
+/*
+ * Sync all the data for all the filesystems (called by sys_sync() and
+ * emergency sync)
+ *
+ * This operation is careful to avoid the livelock which could easily happen
+ * if two or more filesystems are being continuously dirtied. s_need_sync
+ * is used only here. We set it against all filesystems and then clear it as
+ * we sync them. So redirtied filesystems are skipped.
+ *
+ * But if process A is currently running sync_filesystems and then process B
+ * calls sync_filesystems as well, process B will set all the s_need_sync
+ * flags again, which will cause process A to resync everything. Fix that with
+ * a local mutex.
+ */
+static void sync_filesystems(int wait)
+{
+	struct super_block *sb;
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);		/* Could be down_interruptible */
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list)
+		sb->s_need_sync = 1;
+
+restart:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (!sb->s_need_sync)
+			continue;
+		sb->s_need_sync = 0;
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+
+		down_read(&sb->s_umount);
+		if (!(sb->s_flags & MS_RDONLY) && sb->s_root)
+			__sync_filesystem(sb, wait);
+		up_read(&sb->s_umount);
+
+		/* restart only when sb is no longer on the list */
+		spin_lock(&sb_lock);
+		if (__put_super_and_need_restart(sb))
+			goto restart;
+	}
+	spin_unlock(&sb_lock);
+	mutex_unlock(&mutex);
 }
 
+/*
+ * sync everything. Start out by waking pdflush, because that writes back
+ * all queues in parallel.
+ */
 SYSCALL_DEFINE0(sync)
 {
-	do_sync(1);
+	wakeup_pdflush(0);
+	sync_filesystems(0);
+	sync_filesystems(1);
+	if (unlikely(laptop_mode))
+		laptop_sync_completion();
 	return 0;
 }
 
 static void do_sync_work(struct work_struct *work)
 {
-	do_sync(0);
+	/*
+	 * Sync twice to reduce the possibility we skipped some inodes / pages
+	 * because they were temporarily locked
+	 */
+	sync_filesystems(0);
+	sync_filesystems(0);
+	printk("Emergency Sync complete\n");
 	kfree(work);
 }
 
@@ -75,10 +165,8 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 
 	/* sync the superblock to buffers */
 	sb = inode->i_sb;
-	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
 		sb->s_op->write_super(sb);
-	unlock_super(sb);
 
 	/* .. finally sync the buffers to disk */
 	err = sync_blockdev(sb->s_bdev);
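
The rewritten sync path funnels everything through __sync_filesystem(), and sync_filesystem() invokes it twice: wait == 0 to start writeback, then wait == 1 to wait on it, so ->sync_fs sees the same two-pass pattern. A hedged sketch of the contract an implementation should honour (the example_* helpers are hypothetical):

	/* Sketch: the two-pass ->sync_fs contract used by sync_filesystem(). */
	static int example_sync_fs(struct super_block *sb, int wait)
	{
		example_kick_writeback(sb);		/* hypothetical: queue IO */
		if (wait)
			example_wait_writeback(sb);	/* hypothetical: block */
		return 0;
	}
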
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 9345806c8853..2524714bece1 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -171,6 +171,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 	if (count > 0)
 		*off = offs + count;
 
+	kfree(temp);
 	return count;
 }
 
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index d88d0fac9fa5..14f2d71ea3ce 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -939,8 +939,10 @@ again:
	/* Remove from old parent's list and insert into new parent's list. */
 	sysfs_unlink_sibling(sd);
 	sysfs_get(new_parent_sd);
+	drop_nlink(old_parent->d_inode);
 	sysfs_put(sd->s_parent);
 	sd->s_parent = new_parent_sd;
+	inc_nlink(new_parent->d_inode);
 	sysfs_link_sibling(sd);
 
 out_unlock:
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index a3ba217fbe74..1d897ad808e0 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -192,8 +192,11 @@ static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	int error = -ENOMEM;
 	unsigned long page = get_zeroed_page(GFP_KERNEL);
-	if (page)
+	if (page) {
 		error = sysfs_getlink(dentry, (char *) page);
+		if (error < 0)
+			free_page((unsigned long)page);
+	}
 	nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
 	return NULL;
 }
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 56f655254bfe..4e50286a4cc3 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -15,16 +15,16 @@
 
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
 #include <linux/swap.h>
 #include "sysv.h"
 
 static int sysv_readdir(struct file *, void *, filldir_t);
 
 const struct file_operations sysv_dir_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= sysv_readdir,
-	.fsync		= sysv_sync_file,
+	.fsync		= simple_fsync,
 };
 
 static inline void dir_put_page(struct page *page)
@@ -74,8 +74,6 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 
-	lock_kernel();
-
 	pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
 	if (pos >= inode->i_size)
 		goto done;
@@ -113,7 +111,6 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
 
 done:
 	filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset;
-	unlock_kernel();
 	return 0;
 }
 
119 116
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 589be21d884e..96340c01f4a7 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= sysv_sync_file,
+	.fsync		= simple_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
@@ -34,18 +34,3 @@ const struct inode_operations sysv_file_inode_operations = {
 	.truncate	= sysv_truncate,
 	.getattr	= sysv_getattr,
 };
-
-int sysv_sync_file(struct file * file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-
-	err = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return err;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return err;
-
-	err |= sysv_sync_inode(inode);
-	return err ? -EIO : 0;
-}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index da20b48d350f..9824743832a7 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -21,7 +21,6 @@
 * the superblock.
 */
 
-#include <linux/smp_lock.h>
 #include <linux/highuid.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -31,15 +30,12 @@
 #include <asm/byteorder.h>
 #include "sysv.h"
 
-/* This is only called on sync() and umount(), when s_dirt=1. */
-static void sysv_write_super(struct super_block *sb)
+static int sysv_sync_fs(struct super_block *sb, int wait)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 	unsigned long time = get_seconds(), old_time;
 
-	lock_kernel();
-	if (sb->s_flags & MS_RDONLY)
-		goto clean;
+	lock_super(sb);
 
	/*
	 * If we are going to write out the super block,
@@ -53,18 +49,29 @@ static void sysv_write_super(struct super_block *sb)
 		*sbi->s_sb_time = cpu_to_fs32(sbi, time);
 		mark_buffer_dirty(sbi->s_bh2);
 	}
-clean:
-	sb->s_dirt = 0;
-	unlock_kernel();
+
+	unlock_super(sb);
+
+	return 0;
+}
+
+static void sysv_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		sysv_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 static int sysv_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
+	lock_super(sb);
 	if (sbi->s_forced_ro)
 		*flags |= MS_RDONLY;
 	if (!(*flags & MS_RDONLY))
 		sb->s_dirt = 1;
+	unlock_super(sb);
 	return 0;
 }
 
@@ -72,6 +79,9 @@ static void sysv_put_super(struct super_block *sb)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 
+	if (sb->s_dirt)
+		sysv_write_super(sb);
+
 	if (!(sb->s_flags & MS_RDONLY)) {
		/* XXX ext2 also updates the state here */
 		mark_buffer_dirty(sbi->s_bh1);
@@ -236,7 +246,7 @@ bad_inode:
 	return ERR_PTR(-EIO);
 }
 
-static struct buffer_head * sysv_update_inode(struct inode * inode)
+int sysv_write_inode(struct inode *inode, int wait)
 {
 	struct super_block * sb = inode->i_sb;
 	struct sysv_sb_info * sbi = SYSV_SB(sb);
@@ -244,17 +254,18 @@ static struct buffer_head * sysv_update_inode(struct inode * inode)
 	struct sysv_inode * raw_inode;
 	struct sysv_inode_info * si;
 	unsigned int ino, block;
+	int err = 0;
 
 	ino = inode->i_ino;
 	if (!ino || ino > sbi->s_ninodes) {
 		printk("Bad inode number on dev %s: %d is out of range\n",
		       inode->i_sb->s_id, ino);
-		return NULL;
+		return -EIO;
 	}
 	raw_inode = sysv_raw_inode(sb, ino, &bh);
 	if (!raw_inode) {
 		printk("unable to read i-node block\n");
-		return NULL;
+		return -EIO;
 	}
 
 	raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode);
@@ -273,37 +284,21 @@ static struct buffer_head * sysv_update_inode(struct inode * inode)
 		write3byte(sbi, (u8 *)&si->i_data[block],
			&raw_inode->i_data[3*block]);
 	mark_buffer_dirty(bh);
-	return bh;
-}
-
-int sysv_write_inode(struct inode * inode, int wait)
-{
-	struct buffer_head *bh;
-	lock_kernel();
-	bh = sysv_update_inode(inode);
+	if (wait) {
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh)) {
+			printk ("IO error syncing sysv inode [%s:%08x]\n",
+				sb->s_id, ino);
+			err = -EIO;
+		}
+	}
 	brelse(bh);
-	unlock_kernel();
 	return 0;
 }
 
-int sysv_sync_inode(struct inode * inode)
+int sysv_sync_inode(struct inode *inode)
 {
-	int err = 0;
-	struct buffer_head *bh;
-
-	bh = sysv_update_inode(inode);
-	if (bh && buffer_dirty(bh)) {
-		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh)) {
-			printk ("IO error syncing sysv inode [%s:%08lx]\n",
-				inode->i_sb->s_id, inode->i_ino);
-			err = -1;
-		}
-	}
-	else if (!bh)
-		err = -1;
-	brelse (bh);
-	return err;
+	return sysv_write_inode(inode, 1);
 }
 
 static void sysv_delete_inode(struct inode *inode)
@@ -311,9 +306,7 @@ static void sysv_delete_inode(struct inode *inode)
 	truncate_inode_pages(&inode->i_data, 0);
 	inode->i_size = 0;
 	sysv_truncate(inode);
-	lock_kernel();
 	sysv_free_inode(inode);
-	unlock_kernel();
 }
 
 static struct kmem_cache *sysv_inode_cachep;
@@ -347,6 +340,7 @@ const struct super_operations sysv_sops = {
 	.delete_inode	= sysv_delete_inode,
 	.put_super	= sysv_put_super,
 	.write_super	= sysv_write_super,
+	.sync_fs	= sysv_sync_fs,
 	.remount_fs	= sysv_remount,
 	.statfs		= sysv_statfs,
 };
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 5784a318c883..53786eb5cf60 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -144,7 +144,6 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
 extern struct inode *sysv_iget(struct super_block *, unsigned int);
 extern int sysv_write_inode(struct inode *, int);
 extern int sysv_sync_inode(struct inode *);
-extern int sysv_sync_file(struct file *, struct dentry *, int);
 extern void sysv_set_inode(struct inode *, dev_t);
 extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int sysv_init_icache(void);
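
The sysv hunks are a template for the series' fsync cleanup: once ->write_inode honours its wait argument, the hand-rolled sysv_sync_file() can be deleted and both the file and directory operations pointed at the generic helper. A hedged sketch of the resulting wiring (example_file_operations is illustrative):

	/* Sketch: with a wait-capable ->write_inode, a simple blockdev-backed
	 * filesystem can rely on the generic simple_fsync() helper. */
	const struct file_operations example_file_operations = {
		.llseek		= generic_file_llseek,
		.read		= do_sync_read,
		.aio_read	= generic_file_aio_read,
		.write		= do_sync_write,
		.aio_write	= generic_file_aio_write,
		.mmap		= generic_file_mmap,
		.fsync		= simple_fsync,	/* mapping buffers + ->write_inode */
	};
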
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index af1914462f02..eaf6d891d46f 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -91,7 +91,6 @@ static int shrink_liability(struct ubifs_info *c, int nr_to_write)
 	return nr_written;
 }
 
-
 /**
 * run_gc - run garbage collector.
 * @c: UBIFS file-system description object
@@ -628,7 +627,7 @@ void ubifs_convert_page_budget(struct ubifs_info *c)
 *
 * This function releases budget corresponding to a dirty inode. It is usually
 * called when after the inode has been written to the media and marked as
- * clean.
+ * clean. It also causes the "no space" flags to be cleared.
 */
 void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
				      struct ubifs_inode *ui)
@@ -636,6 +635,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
 	struct ubifs_budget_req req;
 
 	memset(&req, 0, sizeof(struct ubifs_budget_req));
+	/* The "no space" flags will be cleared because dd_growth is > 0 */
 	req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8);
 	ubifs_release_budget(c, &req);
 }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index f55d523c52bb..552fb0111fff 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -528,6 +528,25 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
		inode->i_nlink, dir->i_ino);
 	ubifs_assert(mutex_is_locked(&dir->i_mutex));
 	ubifs_assert(mutex_is_locked(&inode->i_mutex));
+
+	/*
+	 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
+	 * otherwise has the potential to corrupt the orphan inode list.
+	 *
+	 * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and
+	 * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not
+	 * lock 'dirA->i_mutex', so this is possible. Both of the functions
+	 * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes
+	 * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this
+	 * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA'
+	 * to the list of orphans. After this, 'vfs_link()' will link
+	 * 'dirB/fileB' to 'inodeA'. This is a problem because, for example,
+	 * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode
+	 * to the list of orphans.
+	 */
+	if (inode->i_nlink == 0)
+		return -ENOENT;
+
 	err = dbg_check_synced_i_size(inode);
 	if (err)
 		return err;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e8e632a1dcdf..762a7d6cec73 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -293,13 +293,15 @@
 *
 * This function is called when the write-buffer timer expires.
 */
-static void wbuf_timer_callback_nolock(unsigned long data)
+static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer)
 {
-	struct ubifs_wbuf *wbuf = (struct ubifs_wbuf *)data;
+	struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer);
 
+	dbg_io("jhead %d", wbuf->jhead);
 	wbuf->need_sync = 1;
 	wbuf->c->need_wbuf_sync = 1;
 	ubifs_wake_up_bgt(wbuf->c);
+	return HRTIMER_NORESTART;
 }
 
 /**
@@ -308,13 +310,16 @@ static void wbuf_timer_callback_nolock(unsigned long data)
 */
 static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
 {
-	ubifs_assert(!timer_pending(&wbuf->timer));
+	ubifs_assert(!hrtimer_active(&wbuf->timer));
 
-	if (!wbuf->timeout)
+	if (wbuf->no_timer)
 		return;
-
-	wbuf->timer.expires = jiffies + wbuf->timeout;
-	add_timer(&wbuf->timer);
+	dbg_io("set timer for jhead %d, %llu-%llu millisecs", wbuf->jhead,
+	       div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC),
+	       div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta,
+		       USEC_PER_SEC));
+	hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta,
+			       HRTIMER_MODE_REL);
 }
 
 /**
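
The io.c hunks convert the write-buffer flush timer from a jiffies-based timer_list to a range hrtimer: instead of a single wbuf->timeout, expiry may land anywhere in [softlimit, softlimit + delta], which lets the kernel batch it with other timers. The matching field setup happens outside the hunks shown here; a hedged sketch of what the init side presumably looks like (the concrete values are assumptions, not taken from this diff):

	/* Sketch of the init side of this conversion (not in this diff). */
	hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	wbuf->timer.function = wbuf_timer_callback_nolock;
	wbuf->softlimit = ktime_set(3, 0);	/* assumed: ~3 s soft limit */
	wbuf->delta = 1000000000UL;		/* assumed: 1 s of slack, in ns */
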
@@ -323,13 +328,10 @@ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
 */
 static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
 {
-	/*
-	 * If the syncer is waiting for the lock (from the background thread's
-	 * context) and another task is changing write-buffer then the syncing
-	 * should be canceled.
-	 */
+	if (wbuf->no_timer)
+		return;
 	wbuf->need_sync = 0;
-	del_timer(&wbuf->timer);
+	hrtimer_cancel(&wbuf->timer);
 }
 
 /**
@@ -349,8 +351,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
		/* Write-buffer is empty or not seeked */
 		return 0;
 
-	dbg_io("LEB %d:%d, %d bytes",
-	       wbuf->lnum, wbuf->offs, wbuf->used);
+	dbg_io("LEB %d:%d, %d bytes, jhead %d",
+	       wbuf->lnum, wbuf->offs, wbuf->used, wbuf->jhead);
 	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
 	ubifs_assert(!(wbuf->avail & 7));
 	ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
@@ -390,7 +392,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
 * @offs: logical eraseblock offset to seek to
 * @dtype: data type
 *
- * This function targets the write buffer to logical eraseblock @lnum:@offs.
+ * This function targets the write-buffer to logical eraseblock @lnum:@offs.
 * The write-buffer is synchronized if it is not empty. Returns zero in case of
 * success and a negative error code in case of failure.
 */
@@ -399,7 +401,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
 {
 	const struct ubifs_info *c = wbuf->c;
 
-	dbg_io("LEB %d:%d", lnum, offs);
+	dbg_io("LEB %d:%d, jhead %d", lnum, offs, wbuf->jhead);
 	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
 	ubifs_assert(offs >= 0 && offs <= c->leb_size);
 	ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
@@ -506,9 +508,9 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	struct ubifs_info *c = wbuf->c;
 	int err, written, n, aligned_len = ALIGN(len, 8), offs;
 
-	dbg_io("%d bytes (%s) to wbuf at LEB %d:%d", len,
-	       dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->lnum,
-	       wbuf->offs + wbuf->used);
+	dbg_io("%d bytes (%s) to jhead %d wbuf at LEB %d:%d", len,
+	       dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->jhead,
+	       wbuf->lnum, wbuf->offs + wbuf->used);
 	ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
 	ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
 	ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
@@ -533,8 +535,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	memcpy(wbuf->buf + wbuf->used, buf, len);
 
 	if (aligned_len == wbuf->avail) {
-		dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum,
+		dbg_io("flush jhead %d wbuf to LEB %d:%d",
537 wbuf->offs); 539 wbuf->jhead, wbuf->lnum, wbuf->offs);
538 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, 540 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
539 wbuf->offs, c->min_io_size, 541 wbuf->offs, c->min_io_size,
540 wbuf->dtype); 542 wbuf->dtype);
@@ -562,7 +564,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
562 * minimal I/O unit. We have to fill and flush write-buffer and switch 564 * minimal I/O unit. We have to fill and flush write-buffer and switch
563 * to the next min. I/O unit. 565 * to the next min. I/O unit.
564 */ 566 */
565 dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, wbuf->offs); 567 dbg_io("flush jhead %d wbuf to LEB %d:%d",
568 wbuf->jhead, wbuf->lnum, wbuf->offs);
566 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); 569 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
567 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, 570 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
568 c->min_io_size, wbuf->dtype); 571 c->min_io_size, wbuf->dtype);
@@ -695,7 +698,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
695 int err, rlen, overlap; 698 int err, rlen, overlap;
696 struct ubifs_ch *ch = buf; 699 struct ubifs_ch *ch = buf;
697 700
698 dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); 701 dbg_io("LEB %d:%d, %s, length %d, jhead %d", lnum, offs,
702 dbg_ntype(type), len, wbuf->jhead);
699 ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); 703 ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
700 ubifs_assert(!(offs & 7) && offs < c->leb_size); 704 ubifs_assert(!(offs & 7) && offs < c->leb_size);
701 ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); 705 ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
@@ -819,7 +823,7 @@ out:
819 * @c: UBIFS file-system description object 823 * @c: UBIFS file-system description object
820 * @wbuf: write-buffer to initialize 824 * @wbuf: write-buffer to initialize
821 * 825 *
822 * This function initializes write buffer. Returns zero in case of success 826 * This function initializes write-buffer. Returns zero in case of success
823 * %-ENOMEM in case of failure. 827 * %-ENOMEM in case of failure.
824 */ 828 */
825int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) 829int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
@@ -845,20 +849,21 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
845 wbuf->sync_callback = NULL; 849 wbuf->sync_callback = NULL;
846 mutex_init(&wbuf->io_mutex); 850 mutex_init(&wbuf->io_mutex);
847 spin_lock_init(&wbuf->lock); 851 spin_lock_init(&wbuf->lock);
848
849 wbuf->c = c; 852 wbuf->c = c;
850 init_timer(&wbuf->timer);
851 wbuf->timer.function = wbuf_timer_callback_nolock;
852 wbuf->timer.data = (unsigned long)wbuf;
853 wbuf->timeout = DEFAULT_WBUF_TIMEOUT;
854 wbuf->next_ino = 0; 853 wbuf->next_ino = 0;
855 854
855 hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
856 wbuf->timer.function = wbuf_timer_callback_nolock;
857 wbuf->softlimit = ktime_set(WBUF_TIMEOUT_SOFTLIMIT, 0);
858 wbuf->delta = WBUF_TIMEOUT_HARDLIMIT - WBUF_TIMEOUT_SOFTLIMIT;
859 wbuf->delta *= 1000000000ULL;
860 ubifs_assert(wbuf->delta <= ULONG_MAX);
856 return 0; 861 return 0;
857} 862}
858 863
859/** 864/**
860 * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array. 865 * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array.
861 * @wbuf: the write-buffer whereto add 866 * @wbuf: the write-buffer where to add
862 * @inum: the inode number 867 * @inum: the inode number
863 * 868 *
864 * This function adds an inode number to the inode array of the write-buffer. 869 * This function adds an inode number to the inode array of the write-buffer.
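The timer rework above is the standard timer_list-to-hrtimer recipe: the
callback now returns 'enum hrtimer_restart' and recovers its context via
'container_of()' instead of the old 'unsigned long data' cookie, and the
timer is armed with 'hrtimer_start_range_ns()' so expiry may land
anywhere in [@softlimit, @softlimit + @delta], letting the kernel
coalesce wakeups. A condensed sketch (illustrative 'example_' names, not
from the patch):

	static enum hrtimer_restart example_timer_cb(struct hrtimer *t)
	{
		struct ubifs_wbuf *wbuf = container_of(t, struct ubifs_wbuf,
						       timer);

		wbuf->need_sync = 1;		/* poke the background thread */
		return HRTIMER_NORESTART;	/* one-shot; re-armed on next write */
	}

	static void example_setup_timer(struct ubifs_wbuf *wbuf)
	{
		hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		wbuf->timer.function = example_timer_cb;
		/* fire no earlier than 3 s and no later than 5 s from now */
		hrtimer_start_range_ns(&wbuf->timer, ktime_set(3, 0),
				       2ULL * NSEC_PER_SEC, HRTIMER_MODE_REL);
	}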
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 6db7a6be6c97..8aacd64957a2 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -25,7 +25,6 @@
25/* This file implements EXT2-compatible extended attribute ioctl() calls */ 25/* This file implements EXT2-compatible extended attribute ioctl() calls */
26 26
27#include <linux/compat.h> 27#include <linux/compat.h>
28#include <linux/smp_lock.h>
29#include <linux/mount.h> 28#include <linux/mount.h>
30#include "ubifs.h" 29#include "ubifs.h"
31 30
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 10662975d2ef..e5f6cf8a1155 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -53,6 +53,25 @@ static int is_empty(void *buf, int len)
53} 53}
54 54
55/** 55/**
56 * first_non_ff - find offset of the first non-0xff byte.
57 * @buf: buffer to search in
58 * @len: length of buffer
59 *
60 * This function returns offset of the first non-0xff byte in @buf or %-1 if
61 * the buffer contains only 0xff bytes.
62 */
63static int first_non_ff(void *buf, int len)
64{
65 uint8_t *p = buf;
66 int i;
67
68 for (i = 0; i < len; i++)
69 if (*p++ != 0xff)
70 return i;
71 return -1;
72}
73
74/**
56 * get_master_node - get the last valid master node allowing for corruption. 75 * get_master_node - get the last valid master node allowing for corruption.
57 * @c: UBIFS file-system description object 76 * @c: UBIFS file-system description object
58 * @lnum: LEB number 77 * @lnum: LEB number
@@ -343,43 +362,21 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
343 * 362 *
344 * This function returns %1 if @offs was in the last write to the LEB whose data 363 * This function returns %1 if @offs was in the last write to the LEB whose data
345 * is in @buf, otherwise %0 is returned. The determination is made by checking 364 * is in @buf, otherwise %0 is returned. The determination is made by checking
346 * for subsequent empty space starting from the next min_io_size boundary (or a 365 * for subsequent empty space starting from the next @c->min_io_size boundary.
347 * bit less than the common header size if min_io_size is one).
348 */ 366 */
349static int is_last_write(const struct ubifs_info *c, void *buf, int offs) 367static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
350{ 368{
351 int empty_offs; 369 int empty_offs, check_len;
352 int check_len;
353 uint8_t *p; 370 uint8_t *p;
354 371
355 if (c->min_io_size == 1) {
356 check_len = c->leb_size - offs;
357 p = buf + check_len;
358 for (; check_len > 0; check_len--)
359 if (*--p != 0xff)
360 break;
361 /*
362 * 'check_len' is the size of the corruption which cannot be
363 * more than the size of 1 node if it was caused by an unclean
364 * unmount.
365 */
366 if (check_len > UBIFS_MAX_NODE_SZ)
367 return 0;
368 return 1;
369 }
370
371 /* 372 /*
372 * Round up to the next c->min_io_size boundary i.e. 'offs' is in the 373 * Round up to the next @c->min_io_size boundary i.e. @offs is in the
373 * last wbuf written. After that should be empty space. 374 * last wbuf written. After that should be empty space.
374 */ 375 */
375 empty_offs = ALIGN(offs + 1, c->min_io_size); 376 empty_offs = ALIGN(offs + 1, c->min_io_size);
376 check_len = c->leb_size - empty_offs; 377 check_len = c->leb_size - empty_offs;
377 p = buf + empty_offs - offs; 378 p = buf + empty_offs - offs;
378 379 return is_empty(p, check_len);
379 for (; check_len > 0; check_len--)
380 if (*p++ != 0xff)
381 return 0;
382 return 1;
383} 380}
384 381
385/** 382/**
@@ -392,7 +389,7 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
392 * 389 *
393 * This function pads up to the next min_io_size boundary (if there is one) and 390 * This function pads up to the next min_io_size boundary (if there is one) and
394 * sets empty space to all 0xff. @buf, @offs and @len are updated to the next 391 * sets empty space to all 0xff. @buf, @offs and @len are updated to the next
395 * min_io_size boundary (if there is one). 392 * @c->min_io_size boundary.
396 */ 393 */
397static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, 394static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
398 int *offs, int *len) 395 int *offs, int *len)
@@ -402,11 +399,6 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
402 lnum = lnum; 399 lnum = lnum;
403 dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs); 400 dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs);
404 401
405 if (c->min_io_size == 1) {
406 memset(*buf, 0xff, c->leb_size - *offs);
407 return;
408 }
409
410 ubifs_assert(!(*offs & 7)); 402 ubifs_assert(!(*offs & 7));
411 empty_offs = ALIGN(*offs, c->min_io_size); 403 empty_offs = ALIGN(*offs, c->min_io_size);
412 pad_len = empty_offs - *offs; 404 pad_len = empty_offs - *offs;
@@ -566,8 +558,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
566 * 558 *
567 * This function does a scan of a LEB, but caters for errors that might have 559 * This function does a scan of a LEB, but caters for errors that might have
568 * been caused by the unclean unmount from which we are attempting to recover. 560 * been caused by the unclean unmount from which we are attempting to recover.
569 * 561 * Returns %0 in case of success, %-EUCLEAN if an unrecoverable corruption is
570 * This function returns %0 on success and a negative error code on failure. 562 * found, and a negative error code in case of failure.
571 */ 563 */
572struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, 564struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
573 int offs, void *sbuf, int grouped) 565 int offs, void *sbuf, int grouped)
@@ -666,7 +658,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
666 goto corrupted; 658 goto corrupted;
667 default: 659 default:
668 dbg_err("unknown"); 660 dbg_err("unknown");
669 goto corrupted; 661 err = -EINVAL;
662 goto error;
670 } 663 }
671 } 664 }
672 665
@@ -675,8 +668,13 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
675 clean_buf(c, &buf, lnum, &offs, &len); 668 clean_buf(c, &buf, lnum, &offs, &len);
676 need_clean = 1; 669 need_clean = 1;
677 } else { 670 } else {
678 ubifs_err("corrupt empty space at LEB %d:%d", 671 int corruption = first_non_ff(buf, len);
679 lnum, offs); 672
673 ubifs_err("corrupt empty space LEB %d:%d, corruption "
674 "starts at %d", lnum, offs, corruption);
675 /* Make sure we dump interesting non-0xFF data */
676 offs = corruption;
677 buf += corruption;
680 goto corrupted; 678 goto corrupted;
681 } 679 }
682 } 680 }
@@ -836,7 +834,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
836static int recover_head(const struct ubifs_info *c, int lnum, int offs, 834static int recover_head(const struct ubifs_info *c, int lnum, int offs,
837 void *sbuf) 835 void *sbuf)
838{ 836{
839 int len, err, need_clean = 0; 837 int len, err;
840 838
841 if (c->min_io_size > 1) 839 if (c->min_io_size > 1)
842 len = c->min_io_size; 840 len = c->min_io_size;
@@ -850,19 +848,7 @@ static int recover_head(const struct ubifs_info *c, int lnum, int offs,
850 848
851 /* Read at the head location and check it is empty flash */ 849 /* Read at the head location and check it is empty flash */
852 err = ubi_read(c->ubi, lnum, sbuf, offs, len); 850 err = ubi_read(c->ubi, lnum, sbuf, offs, len);
853 if (err) 851 if (err || !is_empty(sbuf, len)) {
854 need_clean = 1;
855 else {
856 uint8_t *p = sbuf;
857
858 while (len--)
859 if (*p++ != 0xff) {
860 need_clean = 1;
861 break;
862 }
863 }
864
865 if (need_clean) {
866 dbg_rcvry("cleaning head at %d:%d", lnum, offs); 852 dbg_rcvry("cleaning head at %d:%d", lnum, offs);
867 if (offs == 0) 853 if (offs == 0)
868 return ubifs_leb_unmap(c, lnum); 854 return ubifs_leb_unmap(c, lnum);
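Both recovery.c simplifications above route the former open-coded 0xFF
loops through the file's pre-existing 'is_empty()' helper (defined just
before the new 'first_non_ff()', outside this hunk), which is roughly
the dual of 'first_non_ff()':

	/* Erased flash reads back as all 0xff; any other byte is data */
	static int is_empty(void *buf, int len)
	{
		uint8_t *p = buf;
		int i;

		for (i = 0; i < len; i++)
			if (*p++ != 0xff)
				return 0;
		return 1;
	}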
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 11cc80125a49..2970500f32df 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -837,9 +837,10 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
837 837
838 dbg_mnt("replay log LEB %d:%d", lnum, offs); 838 dbg_mnt("replay log LEB %d:%d", lnum, offs);
839 sleb = ubifs_scan(c, lnum, offs, sbuf); 839 sleb = ubifs_scan(c, lnum, offs, sbuf);
840 if (IS_ERR(sleb)) { 840 if (IS_ERR(sleb)) {
841 if (c->need_recovery) 841 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
842 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); 842 return PTR_ERR(sleb);
843 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
843 if (IS_ERR(sleb)) 844 if (IS_ERR(sleb))
844 return PTR_ERR(sleb); 845 return PTR_ERR(sleb);
845 } 846 }
@@ -957,7 +958,7 @@ out:
957 return err; 958 return err;
958 959
959out_dump: 960out_dump:
960 ubifs_err("log error detected while replying the log at LEB %d:%d", 961 ubifs_err("log error detected while replaying the log at LEB %d:%d",
961 lnum, offs + snod->offs); 962 lnum, offs + snod->offs);
962 dbg_dump_node(c, snod->node); 963 dbg_dump_node(c, snod->node);
963 ubifs_scan_destroy(sleb); 964 ubifs_scan_destroy(sleb);
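The reworked fallback relies on the kernel's ERR_PTR convention:
'ubifs_scan()' hands back either a valid pointer or a small negative
errno encoded in the pointer value, and only a -EUCLEAN failure on a
filesystem already marked for recovery justifies trying
'ubifs_recover_log_leb()'. Annotated restatement of the new dispatch
(same logic as the hunk above):

	sleb = ubifs_scan(c, lnum, offs, sbuf);
	if (IS_ERR(sleb)) {			/* errno encoded in pointer */
		if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
			return PTR_ERR(sleb);	/* not recoverable, bail out */
		sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
		if (IS_ERR(sleb))		/* recovery itself failed */
			return PTR_ERR(sleb);
	}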
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 0ed82479b44b..892ebfee4fe5 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -238,12 +238,12 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
238{ 238{
239 int len; 239 int len;
240 240
241 ubifs_err("corrupted data at LEB %d:%d", lnum, offs); 241 ubifs_err("corruption at LEB %d:%d", lnum, offs);
242 if (dbg_failure_mode) 242 if (dbg_failure_mode)
243 return; 243 return;
244 len = c->leb_size - offs; 244 len = c->leb_size - offs;
245 if (len > 4096) 245 if (len > 8192)
246 len = 4096; 246 len = 8192;
247 dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs); 247 dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs);
248 print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1); 248 print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1);
249} 249}
@@ -256,7 +256,9 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
256 * @sbuf: scan buffer (must be c->leb_size) 256 * @sbuf: scan buffer (must be c->leb_size)
257 * 257 *
258 * This function scans LEB number @lnum and returns complete information about 258 * This function scans LEB number @lnum and returns complete information about
259 * its contents. Returns an error code in case of failure. 259 * its contents. Returns the scaned information in case of success and,
260 * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case
261 * of failure.
260 */ 262 */
261struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, 263struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
262 int offs, void *sbuf) 264 int offs, void *sbuf)
@@ -279,7 +281,6 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
279 cond_resched(); 281 cond_resched();
280 282
281 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); 283 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
282
283 if (ret > 0) { 284 if (ret > 0) {
284 /* Padding bytes or a valid padding node */ 285 /* Padding bytes or a valid padding node */
285 offs += ret; 286 offs += ret;
@@ -304,7 +305,8 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
304 goto corrupted; 305 goto corrupted;
305 default: 306 default:
306 dbg_err("unknown"); 307 dbg_err("unknown");
307 goto corrupted; 308 err = -EINVAL;
309 goto error;
308 } 310 }
309 311
310 err = ubifs_add_snod(c, sleb, buf, offs); 312 err = ubifs_add_snod(c, sleb, buf, offs);
@@ -317,8 +319,10 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
317 len -= node_len; 319 len -= node_len;
318 } 320 }
319 321
320 if (offs % c->min_io_size) 322 if (offs % c->min_io_size) {
321 goto corrupted; 323 ubifs_err("empty space starts at non-aligned offset %d", offs);
324 goto corrupted;
325 }
322 326
323 ubifs_end_scan(c, sleb, lnum, offs); 327 ubifs_end_scan(c, sleb, lnum, offs);
324 328
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index e9f7a754c4f7..26d2e0d80465 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -36,6 +36,7 @@
36#include <linux/mount.h> 36#include <linux/mount.h>
37#include <linux/math64.h> 37#include <linux/math64.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/smp_lock.h>
39#include "ubifs.h" 40#include "ubifs.h"
40 41
41/* 42/*
@@ -360,6 +361,11 @@ static void ubifs_delete_inode(struct inode *inode)
360out: 361out:
361 if (ui->dirty) 362 if (ui->dirty)
362 ubifs_release_dirty_inode_budget(c, ui); 363 ubifs_release_dirty_inode_budget(c, ui);
364 else {
365 /* We've deleted something - clean the "no space" flags */
366 c->nospace = c->nospace_rp = 0;
367 smp_wmb();
368 }
363 clear_inode(inode); 369 clear_inode(inode);
364} 370}
365 371
@@ -447,9 +453,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
447 if (!wait) 453 if (!wait)
448 return 0; 454 return 0;
449 455
450 if (sb->s_flags & MS_RDONLY)
451 return 0;
452
453 /* 456 /*
454 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and 457 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
455 * pages, so synchronize them first, then commit the journal. Strictly 458 * pages, so synchronize them first, then commit the journal. Strictly
@@ -794,7 +797,7 @@ static int alloc_wbufs(struct ubifs_info *c)
794 * does not need to be synchronized by timer. 797 * does not need to be synchronized by timer.
795 */ 798 */
796 c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; 799 c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
797 c->jheads[GCHD].wbuf.timeout = 0; 800 c->jheads[GCHD].wbuf.no_timer = 1;
798 801
799 return 0; 802 return 0;
800} 803}
@@ -935,6 +938,27 @@ static const match_table_t tokens = {
935}; 938};
936 939
937/** 940/**
941 * parse_standard_option - parse a standard mount option.
942 * @option: the option to parse
943 *
944 * Normally, standard mount options like "sync" are passed to file-systems as
945 * flags. However, when a "rootflags=" kernel boot parameter is used, they may
946 * be present in the options string. This function tries to deal with this
947 * situation and parse standard options. Returns 0 if the option was not
948 * recognized, and the corresponding integer flag if it was.
949 *
950 * UBIFS is only interested in the "sync" option, so do not check for anything
951 * else.
952 */
953static int parse_standard_option(const char *option)
954{
955 ubifs_msg("parse %s", option);
956 if (!strcmp(option, "sync"))
957 return MS_SYNCHRONOUS;
958 return 0;
959}
960
961/**
938 * ubifs_parse_options - parse mount parameters. 962 * ubifs_parse_options - parse mount parameters.
939 * @c: UBIFS file-system description object 963 * @c: UBIFS file-system description object
940 * @options: parameters to parse 964 * @options: parameters to parse
@@ -962,7 +986,7 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
962 switch (token) { 986 switch (token) {
963 /* 987 /*
964 * %Opt_fast_unmount and %Opt_norm_unmount options are ignored. 988 * %Opt_fast_unmount and %Opt_norm_unmount options are ignored.
965 * We accepte them in order to be backware-compatible. But this 989 * We accept them in order to be backward-compatible. But this
966 * should be removed at some point. 990 * should be removed at some point.
967 */ 991 */
968 case Opt_fast_unmount: 992 case Opt_fast_unmount:
@@ -1010,9 +1034,19 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
1010 break; 1034 break;
1011 } 1035 }
1012 default: 1036 default:
1013 ubifs_err("unrecognized mount option \"%s\" " 1037 {
1014 "or missing value", p); 1038 unsigned long flag;
1015 return -EINVAL; 1039 struct super_block *sb = c->vfs_sb;
1040
1041 flag = parse_standard_option(p);
1042 if (!flag) {
1043 ubifs_err("unrecognized mount option \"%s\" "
1044 "or missing value", p);
1045 return -EINVAL;
1046 }
1047 sb->s_flags |= flag;
1048 break;
1049 }
1016 } 1050 }
1017 } 1051 }
1018 1052
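Concretely, 'parse_standard_option()' covers the boot-time case where
mount flags arrive as text rather than as MS_* bits. With an UBIFS root
volume, the kernel command line might carry (illustrative values):

	ubi.mtd=0 root=ubi0:rootfs rootflags=sync

The literal token "sync" then reaches 'ubifs_parse_options()' unmatched
and is now mapped to MS_SYNCHRONOUS instead of failing the mount with
"unrecognized mount option".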
@@ -1182,6 +1216,7 @@ static int mount_ubifs(struct ubifs_info *c)
1182 if (!ubifs_compr_present(c->default_compr)) { 1216 if (!ubifs_compr_present(c->default_compr)) {
1183 ubifs_err("compressor \"%s\" is not compiled in", 1217 ubifs_err("compressor \"%s\" is not compiled in",
1184 ubifs_compr_name(c->default_compr)); 1218 ubifs_compr_name(c->default_compr));
1219 err = -ENOTSUPP;
1185 goto out_free; 1220 goto out_free;
1186 } 1221 }
1187 1222
@@ -1252,6 +1287,9 @@ static int mount_ubifs(struct ubifs_info *c)
1252 if (err) 1287 if (err)
1253 goto out_journal; 1288 goto out_journal;
1254 1289
1290 /* Calculate 'min_idx_lebs' after journal replay */
1291 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
1292
1255 err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only); 1293 err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only);
1256 if (err) 1294 if (err)
1257 goto out_orphans; 1295 goto out_orphans;
@@ -1658,7 +1696,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1658 1696
1659 for (i = 0; i < c->jhead_cnt; i++) { 1697 for (i = 0; i < c->jhead_cnt; i++) {
1660 ubifs_wbuf_sync(&c->jheads[i].wbuf); 1698 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1661 del_timer_sync(&c->jheads[i].wbuf.timer); 1699 hrtimer_cancel(&c->jheads[i].wbuf.timer);
1662 } 1700 }
1663 1701
1664 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1702 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
@@ -1687,6 +1725,9 @@ static void ubifs_put_super(struct super_block *sb)
1687 1725
1688 ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, 1726 ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
1689 c->vi.vol_id); 1727 c->vi.vol_id);
1728
1729 lock_kernel();
1730
1690 /* 1731 /*
1691 * The following asserts are only valid if there has not been a failure 1732 * The following asserts are only valid if there has not been a failure
1692 * of the media. For example, there will be dirty inodes if we failed 1733 * of the media. For example, there will be dirty inodes if we failed
@@ -1716,10 +1757,8 @@ static void ubifs_put_super(struct super_block *sb)
1716 1757
1717 /* Synchronize write-buffers */ 1758 /* Synchronize write-buffers */
1718 if (c->jheads) 1759 if (c->jheads)
1719 for (i = 0; i < c->jhead_cnt; i++) { 1760 for (i = 0; i < c->jhead_cnt; i++)
1720 ubifs_wbuf_sync(&c->jheads[i].wbuf); 1761 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1721 del_timer_sync(&c->jheads[i].wbuf.timer);
1722 }
1723 1762
1724 /* 1763 /*
1725 * On fatal errors c->ro_media is set to 1, in which case we do 1764 * On fatal errors c->ro_media is set to 1, in which case we do
@@ -1753,6 +1792,8 @@ static void ubifs_put_super(struct super_block *sb)
1753 ubi_close_volume(c->ubi); 1792 ubi_close_volume(c->ubi);
1754 mutex_unlock(&c->umount_mutex); 1793 mutex_unlock(&c->umount_mutex);
1755 kfree(c); 1794 kfree(c);
1795
1796 unlock_kernel();
1756} 1797}
1757 1798
1758static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) 1799static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
@@ -1768,17 +1809,22 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1768 return err; 1809 return err;
1769 } 1810 }
1770 1811
1812 lock_kernel();
1771 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 1813 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
1772 if (c->ro_media) { 1814 if (c->ro_media) {
1773 ubifs_msg("cannot re-mount due to prior errors"); 1815 ubifs_msg("cannot re-mount due to prior errors");
1816 unlock_kernel();
1774 return -EROFS; 1817 return -EROFS;
1775 } 1818 }
1776 err = ubifs_remount_rw(c); 1819 err = ubifs_remount_rw(c);
1777 if (err) 1820 if (err) {
1821 unlock_kernel();
1778 return err; 1822 return err;
1823 }
1779 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 1824 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
1780 if (c->ro_media) { 1825 if (c->ro_media) {
1781 ubifs_msg("cannot re-mount due to prior errors"); 1826 ubifs_msg("cannot re-mount due to prior errors");
1827 unlock_kernel();
1782 return -EROFS; 1828 return -EROFS;
1783 } 1829 }
1784 ubifs_remount_ro(c); 1830 ubifs_remount_ro(c);
@@ -1793,6 +1839,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1793 } 1839 }
1794 1840
1795 ubifs_assert(c->lst.taken_empty_lebs > 0); 1841 ubifs_assert(c->lst.taken_empty_lebs > 0);
1842 unlock_kernel();
1796 return 0; 1843 return 0;
1797} 1844}
1798 1845
@@ -1902,6 +1949,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1902 INIT_LIST_HEAD(&c->orph_list); 1949 INIT_LIST_HEAD(&c->orph_list);
1903 INIT_LIST_HEAD(&c->orph_new); 1950 INIT_LIST_HEAD(&c->orph_new);
1904 1951
1952 c->vfs_sb = sb;
1905 c->highest_inum = UBIFS_FIRST_INO; 1953 c->highest_inum = UBIFS_FIRST_INO;
1906 c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; 1954 c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;
1907 1955
@@ -1928,18 +1976,19 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1928 err = bdi_init(&c->bdi); 1976 err = bdi_init(&c->bdi);
1929 if (err) 1977 if (err)
1930 goto out_close; 1978 goto out_close;
1979 err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d",
1980 c->vi.ubi_num, c->vi.vol_id);
1981 if (err)
1982 goto out_bdi;
1931 1983
1932 err = ubifs_parse_options(c, data, 0); 1984 err = ubifs_parse_options(c, data, 0);
1933 if (err) 1985 if (err)
1934 goto out_bdi; 1986 goto out_bdi;
1935 1987
1936 c->vfs_sb = sb;
1937
1938 sb->s_fs_info = c; 1988 sb->s_fs_info = c;
1939 sb->s_magic = UBIFS_SUPER_MAGIC; 1989 sb->s_magic = UBIFS_SUPER_MAGIC;
1940 sb->s_blocksize = UBIFS_BLOCK_SIZE; 1990 sb->s_blocksize = UBIFS_BLOCK_SIZE;
1941 sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT; 1991 sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT;
1942 sb->s_dev = c->vi.cdev;
1943 sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c); 1992 sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c);
1944 if (c->max_inode_sz > MAX_LFS_FILESIZE) 1993 if (c->max_inode_sz > MAX_LFS_FILESIZE)
1945 sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; 1994 sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
@@ -1984,16 +2033,9 @@ out_free:
1984static int sb_test(struct super_block *sb, void *data) 2033static int sb_test(struct super_block *sb, void *data)
1985{ 2034{
1986 dev_t *dev = data; 2035 dev_t *dev = data;
2036 struct ubifs_info *c = sb->s_fs_info;
1987 2037
1988 return sb->s_dev == *dev; 2038 return c->vi.cdev == *dev;
1989}
1990
1991static int sb_set(struct super_block *sb, void *data)
1992{
1993 dev_t *dev = data;
1994
1995 sb->s_dev = *dev;
1996 return 0;
1997} 2039}
1998 2040
1999static int ubifs_get_sb(struct file_system_type *fs_type, int flags, 2041static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
@@ -2021,7 +2063,7 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2021 2063
2022 dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id); 2064 dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id);
2023 2065
2024 sb = sget(fs_type, &sb_test, &sb_set, &vi.cdev); 2066 sb = sget(fs_type, &sb_test, &set_anon_super, &vi.cdev);
2025 if (IS_ERR(sb)) { 2067 if (IS_ERR(sb)) {
2026 err = PTR_ERR(sb); 2068 err = PTR_ERR(sb);
2027 goto out_close; 2069 goto out_close;
@@ -2061,16 +2103,11 @@ out_close:
2061 return err; 2103 return err;
2062} 2104}
2063 2105
2064static void ubifs_kill_sb(struct super_block *sb)
2065{
2066 generic_shutdown_super(sb);
2067}
2068
2069static struct file_system_type ubifs_fs_type = { 2106static struct file_system_type ubifs_fs_type = {
2070 .name = "ubifs", 2107 .name = "ubifs",
2071 .owner = THIS_MODULE, 2108 .owner = THIS_MODULE,
2072 .get_sb = ubifs_get_sb, 2109 .get_sb = ubifs_get_sb,
2073 .kill_sb = ubifs_kill_sb 2110 .kill_sb = kill_anon_super,
2074}; 2111};
2075 2112
2076/* 2113/*
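The superblock plumbing change swaps the hand-rolled 'sb_set()' and
'ubifs_kill_sb()' for the stock 'set_anon_super()'/'kill_anon_super()'
helpers, so a UBIFS superblock now carries an anonymous device number
like other non-block filesystems, and the UBI volume's character device
number survives only as the 'sget()' identity key, compared against
'c->vi.cdev' instead of 'sb->s_dev'. Condensed from the hunks above:

	static int example_sb_test(struct super_block *sb, void *data)
	{
		dev_t *dev = data;
		struct ubifs_info *c = sb->s_fs_info;

		/* identity lives in the per-fs info, not in sb->s_dev */
		return c->vi.cdev == *dev;
	}

	/* in ->get_sb(): the anonymous dev_t comes from set_anon_super() */
	sb = sget(fs_type, example_sb_test, set_anon_super, &vi.cdev);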
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 0a8341e14088..a29349094422 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -95,8 +95,9 @@
95 */ 95 */
96#define BGT_NAME_PATTERN "ubifs_bgt%d_%d" 96#define BGT_NAME_PATTERN "ubifs_bgt%d_%d"
97 97
98/* Default write-buffer synchronization timeout (5 secs) */ 98/* Write-buffer synchronization timeout interval in seconds */
99#define DEFAULT_WBUF_TIMEOUT (5 * HZ) 99#define WBUF_TIMEOUT_SOFTLIMIT 3
100#define WBUF_TIMEOUT_HARDLIMIT 5
100 101
101/* Maximum possible inode number (only 32-bit inodes are supported now) */ 102/* Maximum possible inode number (only 32-bit inodes are supported now) */
102#define MAX_INUM 0xFFFFFFFF 103#define MAX_INUM 0xFFFFFFFF
@@ -650,9 +651,12 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
650 * @io_mutex: serializes write-buffer I/O 651 * @io_mutex: serializes write-buffer I/O
651 * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes 652 * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes
652 * fields 653 * fields
654 * @softlimit: soft write-buffer timeout interval
655 * @delta: hard and soft timeouts delta (the timer expire inteval is @softlimit
656 * and @softlimit + @delta)
653 * @timer: write-buffer timer 657 * @timer: write-buffer timer
654 * @timeout: timer expire interval in jiffies 658 * @no_timer: non-zero if this write-buffer does not have a timer
655 * @need_sync: it is set if its timer expired and needs sync 659 * @need_sync: non-zero if the timer expired and the wbuf needs sync'ing
656 * @next_ino: points to the next position of the following inode number 660 * @next_ino: points to the next position of the following inode number
657 * @inodes: stores the inode numbers of the nodes which are in wbuf 661 * @inodes: stores the inode numbers of the nodes which are in wbuf
658 * 662 *
@@ -678,9 +682,11 @@ struct ubifs_wbuf {
678 int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); 682 int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
679 struct mutex io_mutex; 683 struct mutex io_mutex;
680 spinlock_t lock; 684 spinlock_t lock;
681 struct timer_list timer; 685 ktime_t softlimit;
682 int timeout; 686 unsigned long long delta;
683 int need_sync; 687 struct hrtimer timer;
688 unsigned int no_timer:1;
689 unsigned int need_sync:1;
684 int next_ino; 690 int next_ino;
685 ino_t *inodes; 691 ino_t *inodes;
686}; 692};
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index cfd31e229c89..adafcf556531 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -55,9 +55,9 @@
55 * ACL support is not implemented. 55 * ACL support is not implemented.
56 */ 56 */
57 57
58#include "ubifs.h"
58#include <linux/xattr.h> 59#include <linux/xattr.h>
59#include <linux/posix_acl_xattr.h> 60#include <linux/posix_acl_xattr.h>
60#include "ubifs.h"
61 61
62/* 62/*
63 * Limit the number of extended attributes per inode so that the total size 63 * Limit the number of extended attributes per inode so that the total size
diff --git a/fs/udf/Makefile b/fs/udf/Makefile
index 0d4503f7446d..eb880f66c23a 100644
--- a/fs/udf/Makefile
+++ b/fs/udf/Makefile
@@ -5,5 +5,5 @@
5obj-$(CONFIG_UDF_FS) += udf.o 5obj-$(CONFIG_UDF_FS) += udf.o
6 6
7udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \ 7udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \
8 partition.o super.o truncate.o symlink.o fsync.o \ 8 partition.o super.o truncate.o symlink.o \
9 directory.o misc.o udftime.o unicode.o 9 directory.o misc.o udftime.o unicode.o
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index e48e9a3af763..1e068535b58b 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -238,7 +238,7 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
238 238
239 mutex_lock(&sbi->s_alloc_mutex); 239 mutex_lock(&sbi->s_alloc_mutex);
240 part_len = sbi->s_partmaps[partition].s_partition_len; 240 part_len = sbi->s_partmaps[partition].s_partition_len;
241 if (first_block < 0 || first_block >= part_len) 241 if (first_block >= part_len)
242 goto out; 242 goto out;
243 243
244 if (first_block + block_count > part_len) 244 if (first_block + block_count > part_len)
@@ -297,7 +297,7 @@ static int udf_bitmap_new_block(struct super_block *sb,
297 mutex_lock(&sbi->s_alloc_mutex); 297 mutex_lock(&sbi->s_alloc_mutex);
298 298
299repeat: 299repeat:
300 if (goal < 0 || goal >= sbi->s_partmaps[partition].s_partition_len) 300 if (goal >= sbi->s_partmaps[partition].s_partition_len)
301 goal = 0; 301 goal = 0;
302 302
303 nr_groups = bitmap->s_nr_groups; 303 nr_groups = bitmap->s_nr_groups;
@@ -666,8 +666,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
666 int8_t etype = -1; 666 int8_t etype = -1;
667 struct udf_inode_info *iinfo; 667 struct udf_inode_info *iinfo;
668 668
669 if (first_block < 0 || 669 if (first_block >= sbi->s_partmaps[partition].s_partition_len)
670 first_block >= sbi->s_partmaps[partition].s_partition_len)
671 return 0; 670 return 0;
672 671
673 iinfo = UDF_I(table); 672 iinfo = UDF_I(table);
@@ -743,7 +742,7 @@ static int udf_table_new_block(struct super_block *sb,
743 return newblock; 742 return newblock;
744 743
745 mutex_lock(&sbi->s_alloc_mutex); 744 mutex_lock(&sbi->s_alloc_mutex);
746 if (goal < 0 || goal >= sbi->s_partmaps[partition].s_partition_len) 745 if (goal >= sbi->s_partmaps[partition].s_partition_len)
747 goal = 0; 746 goal = 0;
748 747
749 /* We search for the closest matching block to goal. If we find 748 /* We search for the closest matching block to goal. If we find
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 2efd4d5291b6..61d9a76a3a69 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -210,5 +210,5 @@ const struct file_operations udf_dir_operations = {
210 .read = generic_read_dir, 210 .read = generic_read_dir,
211 .readdir = udf_readdir, 211 .readdir = udf_readdir,
212 .ioctl = udf_ioctl, 212 .ioctl = udf_ioctl,
213 .fsync = udf_fsync_file, 213 .fsync = simple_fsync,
214}; 214};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index eb91f3b70320..7464305382b5 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -209,7 +209,7 @@ const struct file_operations udf_file_operations = {
209 .write = do_sync_write, 209 .write = do_sync_write,
210 .aio_write = udf_file_aio_write, 210 .aio_write = udf_file_aio_write,
211 .release = udf_release_file, 211 .release = udf_release_file,
212 .fsync = udf_fsync_file, 212 .fsync = simple_fsync,
213 .splice_read = generic_file_splice_read, 213 .splice_read = generic_file_splice_read,
214 .llseek = generic_file_llseek, 214 .llseek = generic_file_llseek,
215}; 215};
diff --git a/fs/udf/fsync.c b/fs/udf/fsync.c
deleted file mode 100644
index b2c472b733b8..000000000000
--- a/fs/udf/fsync.c
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * fsync.c
3 *
4 * PURPOSE
5 * Fsync handling routines for the OSTA-UDF(tm) filesystem.
6 *
7 * COPYRIGHT
8 * This file is distributed under the terms of the GNU General Public
9 * License (GPL). Copies of the GPL can be obtained from:
10 * ftp://prep.ai.mit.edu/pub/gnu/GPL
11 * Each contributing author retains all rights to their own work.
12 *
13 * (C) 1999-2001 Ben Fennema
14 * (C) 1999-2000 Stelias Computing Inc
15 *
16 * HISTORY
17 *
18 * 05/22/99 blf Created.
19 */
20
21#include "udfdecl.h"
22
23#include <linux/fs.h>
24
25static int udf_fsync_inode(struct inode *, int);
26
27/*
28 * File may be NULL when we are called. Perhaps we shouldn't
29 * even pass file to fsync ?
30 */
31
32int udf_fsync_file(struct file *file, struct dentry *dentry, int datasync)
33{
34 struct inode *inode = dentry->d_inode;
35
36 return udf_fsync_inode(inode, datasync);
37}
38
39static int udf_fsync_inode(struct inode *inode, int datasync)
40{
41 int err;
42
43 err = sync_mapping_buffers(inode->i_mapping);
44 if (!(inode->i_state & I_DIRTY))
45 return err;
46 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
47 return err;
48
49 err |= udf_sync_inode(inode);
50
51 return err ? -EIO : 0;
52}
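The deleted 'udf_fsync_file()'/'udf_fsync_inode()' pair is essentially
the generic buffer-backed fsync, which is why it can be replaced
wholesale by the new library helper. At this point in the tree,
'simple_fsync()' in fs/libfs.c looks roughly like the following (a
paraphrase, not part of this diff; verify against the tree before
relying on details):

	int simple_fsync(struct file *file, struct dentry *dentry,
			 int datasync)
	{
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_ALL,
			.nr_to_write = 0,	/* metadata-only */
		};
		struct inode *inode = dentry->d_inode;
		int err, ret;

		ret = sync_mapping_buffers(inode->i_mapping);	/* data */
		if (!(inode->i_state & I_DIRTY))
			return ret;
		if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
			return ret;

		err = sync_inode(inode, &wbc);		/* inode metadata */
		if (ret == 0)
			ret = err;
		return ret;
	}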
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 703843f30ffd..1b88fd5df05d 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -56,7 +56,12 @@ unsigned long udf_get_last_block(struct super_block *sb)
56 struct block_device *bdev = sb->s_bdev; 56 struct block_device *bdev = sb->s_bdev;
57 unsigned long lblock = 0; 57 unsigned long lblock = 0;
58 58
59 if (ioctl_by_bdev(bdev, CDROM_LAST_WRITTEN, (unsigned long) &lblock)) 59 /*
60 * ioctl failed or returned obviously bogus value?
61 * Try using the device size...
62 */
63 if (ioctl_by_bdev(bdev, CDROM_LAST_WRITTEN, (unsigned long) &lblock) ||
64 lblock == 0)
60 lblock = bdev->bd_inode->i_size >> sb->s_blocksize_bits; 65 lblock = bdev->bd_inode->i_size >> sb->s_blocksize_bits;
61 66
62 if (lblock) 67 if (lblock)
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 72348cc855a4..9d1b8c2e6c45 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -568,6 +568,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
568 if (!udf_parse_options(options, &uopt, true)) 568 if (!udf_parse_options(options, &uopt, true))
569 return -EINVAL; 569 return -EINVAL;
570 570
571 lock_kernel();
571 sbi->s_flags = uopt.flags; 572 sbi->s_flags = uopt.flags;
572 sbi->s_uid = uopt.uid; 573 sbi->s_uid = uopt.uid;
573 sbi->s_gid = uopt.gid; 574 sbi->s_gid = uopt.gid;
@@ -581,13 +582,16 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
581 *flags |= MS_RDONLY; 582 *flags |= MS_RDONLY;
582 } 583 }
583 584
584 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 585 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
586 unlock_kernel();
585 return 0; 587 return 0;
588 }
586 if (*flags & MS_RDONLY) 589 if (*flags & MS_RDONLY)
587 udf_close_lvid(sb); 590 udf_close_lvid(sb);
588 else 591 else
589 udf_open_lvid(sb); 592 udf_open_lvid(sb);
590 593
594 unlock_kernel();
591 return 0; 595 return 0;
592} 596}
593 597
@@ -1083,11 +1087,23 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1083 struct udf_inode_info *vati; 1087 struct udf_inode_info *vati;
1084 uint32_t pos; 1088 uint32_t pos;
1085 struct virtualAllocationTable20 *vat20; 1089 struct virtualAllocationTable20 *vat20;
1090 sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
1086 1091
1087 /* VAT file entry is in the last recorded block */ 1092 /* VAT file entry is in the last recorded block */
1088 ino.partitionReferenceNum = type1_index; 1093 ino.partitionReferenceNum = type1_index;
1089 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; 1094 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
1090 sbi->s_vat_inode = udf_iget(sb, &ino); 1095 sbi->s_vat_inode = udf_iget(sb, &ino);
1096 if (!sbi->s_vat_inode &&
1097 sbi->s_last_block != blocks - 1) {
1098 printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the"
1099 " last recorded block (%lu), retrying with the last "
1100 "block of the device (%lu).\n",
1101 (unsigned long)sbi->s_last_block,
1102 (unsigned long)blocks - 1);
1103 ino.partitionReferenceNum = type1_index;
1104 ino.logicalBlockNum = blocks - 1 - map->s_partition_root;
1105 sbi->s_vat_inode = udf_iget(sb, &ino);
1106 }
1091 if (!sbi->s_vat_inode) 1107 if (!sbi->s_vat_inode)
1092 return 1; 1108 return 1;
1093 1109
@@ -1915,7 +1931,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1915 if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) { 1931 if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
1916 ret = udf_load_vrs(sb, &uopt, silent, &fileset); 1932 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1917 } else { 1933 } else {
1918 uopt.blocksize = bdev_hardsect_size(sb->s_bdev); 1934 uopt.blocksize = bdev_logical_block_size(sb->s_bdev);
1919 ret = udf_load_vrs(sb, &uopt, silent, &fileset); 1935 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1920 if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { 1936 if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
1921 if (!silent) 1937 if (!silent)
@@ -2062,6 +2078,9 @@ static void udf_put_super(struct super_block *sb)
2062 struct udf_sb_info *sbi; 2078 struct udf_sb_info *sbi;
2063 2079
2064 sbi = UDF_SB(sb); 2080 sbi = UDF_SB(sb);
2081
2082 lock_kernel();
2083
2065 if (sbi->s_vat_inode) 2084 if (sbi->s_vat_inode)
2066 iput(sbi->s_vat_inode); 2085 iput(sbi->s_vat_inode);
2067 if (sbi->s_partitions) 2086 if (sbi->s_partitions)
@@ -2077,6 +2096,8 @@ static void udf_put_super(struct super_block *sb)
2077 kfree(sbi->s_partmaps); 2096 kfree(sbi->s_partmaps);
2078 kfree(sb->s_fs_info); 2097 kfree(sb->s_fs_info);
2079 sb->s_fs_info = NULL; 2098 sb->s_fs_info = NULL;
2099
2100 unlock_kernel();
2080} 2101}
2081 2102
2082static int udf_sync_fs(struct super_block *sb, int wait) 2103static int udf_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index cac51b77a5d1..8d46f4294ee7 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -223,9 +223,6 @@ extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
223extern int udf_new_block(struct super_block *, struct inode *, uint16_t, 223extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
224 uint32_t, int *); 224 uint32_t, int *);
225 225
226/* fsync.c */
227extern int udf_fsync_file(struct file *, struct dentry *, int);
228
229/* directory.c */ 226/* directory.c */
230extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, 227extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
231 struct udf_fileident_bh *, 228 struct udf_fileident_bh *,
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 6321b797061b..6f671f1ac271 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
666const struct file_operations ufs_dir_operations = { 666const struct file_operations ufs_dir_operations = {
667 .read = generic_read_dir, 667 .read = generic_read_dir,
668 .readdir = ufs_readdir, 668 .readdir = ufs_readdir,
669 .fsync = ufs_sync_file, 669 .fsync = simple_fsync,
670 .llseek = generic_file_llseek, 670 .llseek = generic_file_llseek,
671}; 671};
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 2bd3a1615714..73655c61240a 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,31 +24,10 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/buffer_head.h> /* for sync_mapping_buffers() */
28 27
29#include "ufs_fs.h" 28#include "ufs_fs.h"
30#include "ufs.h" 29#include "ufs.h"
31 30
32
33int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
34{
35 struct inode *inode = dentry->d_inode;
36 int err;
37 int ret;
38
39 ret = sync_mapping_buffers(inode->i_mapping);
40 if (!(inode->i_state & I_DIRTY))
41 return ret;
42 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
43 return ret;
44
45 err = ufs_sync_inode(inode);
46 if (ret == 0)
47 ret = err;
48 return ret;
49}
50
51
52/* 31/*
53 * We have mostly NULL's here: the current defaults are ok for 32 * We have mostly NULL's here: the current defaults are ok for
54 * the ufs filesystem. 33 * the ufs filesystem.
@@ -62,6 +41,6 @@ const struct file_operations ufs_file_operations = {
62 .aio_write = generic_file_aio_write, 41 .aio_write = generic_file_aio_write,
63 .mmap = generic_file_mmap, 42 .mmap = generic_file_mmap,
64 .open = generic_file_open, 43 .open = generic_file_open,
65 .fsync = ufs_sync_file, 44 .fsync = simple_fsync,
66 .splice_read = generic_file_splice_read, 45 .splice_read = generic_file_splice_read,
67}; 46};
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 3d2512c21f05..7cf33379fd46 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -56,9 +56,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
56 56
57 57
58 UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks); 58 UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks);
59 if (i_block < 0) { 59 if (i_block < direct_blocks) {
60 ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0");
61 } else if (i_block < direct_blocks) {
62 offsets[n++] = i_block; 60 offsets[n++] = i_block;
63 } else if ((i_block -= direct_blocks) < indirect_blocks) { 61 } else if ((i_block -= direct_blocks) < indirect_blocks) {
64 offsets[n++] = UFS_IND_BLOCK; 62 offsets[n++] = UFS_IND_BLOCK;
@@ -440,8 +438,6 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head
440 lock_kernel(); 438 lock_kernel();
441 439
442 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); 440 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
443 if (fragment < 0)
444 goto abort_negative;
445 if (fragment > 441 if (fragment >
446 ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb) 442 ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb)
447 << uspi->s_fpbshift)) 443 << uspi->s_fpbshift))
@@ -504,10 +500,6 @@ abort:
504 unlock_kernel(); 500 unlock_kernel();
505 return err; 501 return err;
506 502
507abort_negative:
508 ufs_warning(sb, "ufs_get_block", "block < 0");
509 goto abort;
510
511abort_too_big: 503abort_too_big:
512 ufs_warning(sb, "ufs_get_block", "block > big"); 504 ufs_warning(sb, "ufs_get_block", "block > big");
513 goto abort; 505 goto abort;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 60359291761f..5faed7954d0a 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -263,6 +263,7 @@ void ufs_panic (struct super_block * sb, const char * function,
263 struct ufs_super_block_first * usb1; 263 struct ufs_super_block_first * usb1;
264 va_list args; 264 va_list args;
265 265
266 lock_kernel();
266 uspi = UFS_SB(sb)->s_uspi; 267 uspi = UFS_SB(sb)->s_uspi;
267 usb1 = ubh_get_usb_first(uspi); 268 usb1 = ubh_get_usb_first(uspi);
268 269
@@ -594,6 +595,9 @@ static void ufs_put_super_internal(struct super_block *sb)
594 595
595 596
596 UFSD("ENTER\n"); 597 UFSD("ENTER\n");
598
599 lock_kernel();
600
597 ufs_put_cstotal(sb); 601 ufs_put_cstotal(sb);
598 size = uspi->s_cssize; 602 size = uspi->s_cssize;
599 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 603 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -621,6 +625,9 @@ static void ufs_put_super_internal(struct super_block *sb)
621 brelse (sbi->s_ucg[i]); 625 brelse (sbi->s_ucg[i]);
622 kfree (sbi->s_ucg); 626 kfree (sbi->s_ucg);
623 kfree (base); 627 kfree (base);
628
629 unlock_kernel();
630
624 UFSD("EXIT\n"); 631 UFSD("EXIT\n");
625} 632}
626 633
@@ -1118,32 +1125,45 @@ failed_nomem:
1118 return -ENOMEM; 1125 return -ENOMEM;
1119} 1126}
1120 1127
1121static void ufs_write_super(struct super_block *sb) 1128static int ufs_sync_fs(struct super_block *sb, int wait)
1122{ 1129{
1123 struct ufs_sb_private_info * uspi; 1130 struct ufs_sb_private_info * uspi;
1124 struct ufs_super_block_first * usb1; 1131 struct ufs_super_block_first * usb1;
1125 struct ufs_super_block_third * usb3; 1132 struct ufs_super_block_third * usb3;
1126 unsigned flags; 1133 unsigned flags;
1127 1134
1135 lock_super(sb);
1128 lock_kernel(); 1136 lock_kernel();
1137
1129 UFSD("ENTER\n"); 1138 UFSD("ENTER\n");
1139
1130 flags = UFS_SB(sb)->s_flags; 1140 flags = UFS_SB(sb)->s_flags;
1131 uspi = UFS_SB(sb)->s_uspi; 1141 uspi = UFS_SB(sb)->s_uspi;
1132 usb1 = ubh_get_usb_first(uspi); 1142 usb1 = ubh_get_usb_first(uspi);
1133 usb3 = ubh_get_usb_third(uspi); 1143 usb3 = ubh_get_usb_third(uspi);
1134 1144
1135 if (!(sb->s_flags & MS_RDONLY)) { 1145 usb1->fs_time = cpu_to_fs32(sb, get_seconds());
1136 usb1->fs_time = cpu_to_fs32(sb, get_seconds()); 1146 if ((flags & UFS_ST_MASK) == UFS_ST_SUN ||
1137 if ((flags & UFS_ST_MASK) == UFS_ST_SUN 1147 (flags & UFS_ST_MASK) == UFS_ST_SUNOS ||
1138 || (flags & UFS_ST_MASK) == UFS_ST_SUNOS 1148 (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
1139 || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) 1149 ufs_set_fs_state(sb, usb1, usb3,
1140 ufs_set_fs_state(sb, usb1, usb3, 1150 UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
1141 UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); 1151 ufs_put_cstotal(sb);
1142 ufs_put_cstotal(sb);
1143 }
1144 sb->s_dirt = 0; 1152 sb->s_dirt = 0;
1153
1145 UFSD("EXIT\n"); 1154 UFSD("EXIT\n");
1146 unlock_kernel(); 1155 unlock_kernel();
1156 unlock_super(sb);
1157
1158 return 0;
1159}
1160
1161static void ufs_write_super(struct super_block *sb)
1162{
1163 if (!(sb->s_flags & MS_RDONLY))
1164 ufs_sync_fs(sb, 1);
1165 else
1166 sb->s_dirt = 0;
1147} 1167}
1148 1168
1149static void ufs_put_super(struct super_block *sb) 1169static void ufs_put_super(struct super_block *sb)
@@ -1152,6 +1172,9 @@ static void ufs_put_super(struct super_block *sb)
1152 1172
1153 UFSD("ENTER\n"); 1173 UFSD("ENTER\n");
1154 1174
1175 if (sb->s_dirt)
1176 ufs_write_super(sb);
1177
1155 if (!(sb->s_flags & MS_RDONLY)) 1178 if (!(sb->s_flags & MS_RDONLY))
1156 ufs_put_super_internal(sb); 1179 ufs_put_super_internal(sb);
1157 1180
@@ -1171,7 +1194,9 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1171 struct ufs_super_block_third * usb3; 1194 struct ufs_super_block_third * usb3;
1172 unsigned new_mount_opt, ufstype; 1195 unsigned new_mount_opt, ufstype;
1173 unsigned flags; 1196 unsigned flags;
1174 1197
1198 lock_kernel();
1199 lock_super(sb);
1175 uspi = UFS_SB(sb)->s_uspi; 1200 uspi = UFS_SB(sb)->s_uspi;
1176 flags = UFS_SB(sb)->s_flags; 1201 flags = UFS_SB(sb)->s_flags;
1177 usb1 = ubh_get_usb_first(uspi); 1202 usb1 = ubh_get_usb_first(uspi);
@@ -1184,17 +1209,24 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1184 ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; 1209 ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
1185 new_mount_opt = 0; 1210 new_mount_opt = 0;
1186 ufs_set_opt (new_mount_opt, ONERROR_LOCK); 1211 ufs_set_opt (new_mount_opt, ONERROR_LOCK);
1187 if (!ufs_parse_options (data, &new_mount_opt)) 1212 if (!ufs_parse_options (data, &new_mount_opt)) {
1213 unlock_super(sb);
1214 unlock_kernel();
1188 return -EINVAL; 1215 return -EINVAL;
1216 }
1189 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { 1217 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
1190 new_mount_opt |= ufstype; 1218 new_mount_opt |= ufstype;
1191 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { 1219 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1192 printk("ufstype can't be changed during remount\n"); 1220 printk("ufstype can't be changed during remount\n");
1221 unlock_super(sb);
1222 unlock_kernel();
1193 return -EINVAL; 1223 return -EINVAL;
1194 } 1224 }
1195 1225
1196 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1226 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1197 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1227 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1228 unlock_super(sb);
1229 unlock_kernel();
1198 return 0; 1230 return 0;
1199 } 1231 }
1200 1232
@@ -1219,6 +1251,8 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1219#ifndef CONFIG_UFS_FS_WRITE 1251#ifndef CONFIG_UFS_FS_WRITE
1220 printk("ufs was compiled with read-only support, " 1252 printk("ufs was compiled with read-only support, "
1221 "can't be mounted as read-write\n"); 1253 "can't be mounted as read-write\n");
1254 unlock_super(sb);
1255 unlock_kernel();
1222 return -EINVAL; 1256 return -EINVAL;
1223#else 1257#else
1224 if (ufstype != UFS_MOUNT_UFSTYPE_SUN && 1258 if (ufstype != UFS_MOUNT_UFSTYPE_SUN &&
@@ -1227,16 +1261,22 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1227 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && 1261 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
1228 ufstype != UFS_MOUNT_UFSTYPE_UFS2) { 1262 ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
1229 printk("this ufstype is read-only supported\n"); 1263 printk("this ufstype is read-only supported\n");
1264 unlock_super(sb);
1265 unlock_kernel();
1230 return -EINVAL; 1266 return -EINVAL;
1231 } 1267 }
1232 if (!ufs_read_cylinder_structures(sb)) { 1268 if (!ufs_read_cylinder_structures(sb)) {
1233 printk("failed during remounting\n"); 1269 printk("failed during remounting\n");
1270 unlock_super(sb);
1271 unlock_kernel();
1234 return -EPERM; 1272 return -EPERM;
1235 } 1273 }
1236 sb->s_flags &= ~MS_RDONLY; 1274 sb->s_flags &= ~MS_RDONLY;
1237#endif 1275#endif
1238 } 1276 }
1239 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1277 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1278 unlock_super(sb);
1279 unlock_kernel();
1240 return 0; 1280 return 0;
1241} 1281}
1242 1282
@@ -1352,6 +1392,7 @@ static const struct super_operations ufs_super_ops = {
1352 .delete_inode = ufs_delete_inode, 1392 .delete_inode = ufs_delete_inode,
1353 .put_super = ufs_put_super, 1393 .put_super = ufs_put_super,
1354 .write_super = ufs_write_super, 1394 .write_super = ufs_write_super,
1395 .sync_fs = ufs_sync_fs,
1355 .statfs = ufs_statfs, 1396 .statfs = ufs_statfs,
1356 .remount_fs = ufs_remount, 1397 .remount_fs = ufs_remount,
1357 .show_options = ufs_show_options, 1398 .show_options = ufs_show_options,
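The 'ufs_write_super()'/'ufs_sync_fs()' split above follows the pattern
applied to several filesystems in this series: the real flushing moves
into '->sync_fs()', which the VFS can call from its sync paths, while
'->write_super()' shrinks to a dirty-flag-driven wrapper. Restated with
comments (same code as the hunk above):

	static void ufs_write_super(struct super_block *sb)
	{
		if (!(sb->s_flags & MS_RDONLY))
			ufs_sync_fs(sb, 1);	/* flush sb + cg summaries */
		else
			sb->s_dirt = 0;		/* nothing to write when RO */
	}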
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index d0c4acd4f1f3..644e77e13599 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -99,7 +99,6 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
99extern const struct inode_operations ufs_file_inode_operations; 99extern const struct inode_operations ufs_file_inode_operations;
100extern const struct file_operations ufs_file_operations; 100extern const struct file_operations ufs_file_operations;
101extern const struct address_space_operations ufs_aops; 101extern const struct address_space_operations ufs_aops;
102extern int ufs_sync_file(struct file *, struct dentry *, int);
103 102
104/* ialloc.c */ 103/* ialloc.c */
105extern void ufs_free_inode (struct inode *inode); 104extern void ufs_free_inode (struct inode *inode);
diff --git a/fs/xattr.c b/fs/xattr.c
index d51b8f9db921..1c3d0af59ddf 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -297,7 +297,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
297 return error; 297 return error;
298 dentry = f->f_path.dentry; 298 dentry = f->f_path.dentry;
299 audit_inode(NULL, dentry); 299 audit_inode(NULL, dentry);
300 error = mnt_want_write(f->f_path.mnt); 300 error = mnt_want_write_file(f);
301 if (!error) { 301 if (!error) {
302 error = setxattr(dentry, name, value, size, flags); 302 error = setxattr(dentry, name, value, size, flags);
303 mnt_drop_write(f->f_path.mnt); 303 mnt_drop_write(f->f_path.mnt);
@@ -524,7 +524,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
524 return error; 524 return error;
525 dentry = f->f_path.dentry; 525 dentry = f->f_path.dentry;
526 audit_inode(NULL, dentry); 526 audit_inode(NULL, dentry);
527 error = mnt_want_write(f->f_path.mnt); 527 error = mnt_want_write_file(f);
528 if (!error) { 528 if (!error) {
529 error = removexattr(dentry, name); 529 error = removexattr(dentry, name);
530 mnt_drop_write(f->f_path.mnt); 530 mnt_drop_write(f->f_path.mnt);
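Both hunks above swap mnt_want_write(f->f_path.mnt) for mnt_want_write_file(f) while keeping the same acquire/operate/drop bracket. A minimal sketch of that bracket, with the inode modification left abstract (example_write_locked_op is an illustrative name):

	static int example_write_locked_op(struct file *f)
	{
		int error = mnt_want_write_file(f);	/* pin the mount writable */

		if (!error) {
			/* ... modify dentry/inode state here ... */
			mnt_drop_write(f->f_path.mnt);	/* always pair the drop */
		}
		return error;
	}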
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 29228f5899cd..480f28127f09 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -39,6 +39,7 @@ config XFS_QUOTA
39config XFS_POSIX_ACL 39config XFS_POSIX_ACL
40 bool "XFS POSIX ACL support" 40 bool "XFS POSIX ACL support"
41 depends on XFS_FS 41 depends on XFS_FS
42 select FS_POSIX_ACL
42 help 43 help
43 POSIX Access Control Lists (ACLs) support permissions for users and 44 POSIX Access Control Lists (ACLs) support permissions for users and
44 groups beyond the owner/group/world scheme. 45 groups beyond the owner/group/world scheme.
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 60f107e47fe9..7a59daed1782 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -40,7 +40,7 @@ xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o
40endif 40endif
41 41
42xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o 42xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
43xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o 43xfs-$(CONFIG_XFS_POSIX_ACL) += $(XFS_LINUX)/xfs_acl.o
44xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o 44xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o
45xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o 45xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o
46xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o 46xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o
@@ -88,8 +88,7 @@ xfs-y += xfs_alloc.o \
88 xfs_utils.o \ 88 xfs_utils.o \
89 xfs_vnodeops.o \ 89 xfs_vnodeops.o \
90 xfs_rw.o \ 90 xfs_rw.o \
91 xfs_dmops.o \ 91 xfs_dmops.o
92 xfs_qmops.o
93 92
94xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o \ 93xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o \
95 xfs_dir2_trace.o 94 xfs_dir2_trace.o
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 1cd3b55ee3d2..2d3f90afe5f1 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -53,7 +53,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
53 printk(KERN_ERR "XFS: possible memory allocation " 53 printk(KERN_ERR "XFS: possible memory allocation "
54 "deadlock in %s (mode:0x%x)\n", 54 "deadlock in %s (mode:0x%x)\n",
55 __func__, lflags); 55 __func__, lflags);
56 congestion_wait(WRITE, HZ/50); 56 congestion_wait(BLK_RW_ASYNC, HZ/50);
57 } while (1); 57 } while (1);
58} 58}
59 59
@@ -130,7 +130,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
130 printk(KERN_ERR "XFS: possible memory allocation " 130 printk(KERN_ERR "XFS: possible memory allocation "
131 "deadlock in %s (mode:0x%x)\n", 131 "deadlock in %s (mode:0x%x)\n",
132 __func__, lflags); 132 __func__, lflags);
133 congestion_wait(WRITE, HZ/50); 133 congestion_wait(BLK_RW_ASYNC, HZ/50);
134 } while (1); 134 } while (1);
135} 135}
136 136
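Both kmem.c hunks change the congestion_wait() queue argument from WRITE to BLK_RW_ASYNC; the loops they sit in retry the allocation indefinitely rather than failing the caller. A simplified sketch of that shape (example_alloc_retry and the may_fail flag are illustrative, not the real kmem_alloc signature):

	static void *example_alloc_retry(size_t size, gfp_t lflags, int may_fail)
	{
		void *ptr;

		do {
			ptr = kmalloc(size, lflags);
			if (ptr || may_fail)
				return ptr;
			printk(KERN_ERR "possible memory allocation deadlock "
					"(mode:0x%x)\n", lflags);
			congestion_wait(BLK_RW_ASYNC, HZ/50);	/* back off, retry */
		} while (1);
	}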
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
new file mode 100644
index 000000000000..b23a54506446
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -0,0 +1,468 @@
1/*
2 * Copyright (c) 2008, Christoph Hellwig
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_acl.h"
20#include "xfs_attr.h"
21#include "xfs_bmap_btree.h"
22#include "xfs_inode.h"
23#include "xfs_vnodeops.h"
24#include <linux/xattr.h>
25#include <linux/posix_acl_xattr.h>
26
27
28/*
29 * Locking scheme:
30 * - all ACL updates are protected by inode->i_mutex, which is taken before
31 * calling into this file.
32 */
33
34STATIC struct posix_acl *
35xfs_acl_from_disk(struct xfs_acl *aclp)
36{
37 struct posix_acl_entry *acl_e;
38 struct posix_acl *acl;
39 struct xfs_acl_entry *ace;
40 int count, i;
41
42 count = be32_to_cpu(aclp->acl_cnt);
43
44 acl = posix_acl_alloc(count, GFP_KERNEL);
45 if (!acl)
46 return ERR_PTR(-ENOMEM);
47
48 for (i = 0; i < count; i++) {
49 acl_e = &acl->a_entries[i];
50 ace = &aclp->acl_entry[i];
51
52 /*
53 * The tag is 32 bits on disk and 16 bits in core.
54 *
55 * Because every access to it goes through the core
56 * format first this is not a problem.
57 */
58 acl_e->e_tag = be32_to_cpu(ace->ae_tag);
59 acl_e->e_perm = be16_to_cpu(ace->ae_perm);
60
61 switch (acl_e->e_tag) {
62 case ACL_USER:
63 case ACL_GROUP:
64 acl_e->e_id = be32_to_cpu(ace->ae_id);
65 break;
66 case ACL_USER_OBJ:
67 case ACL_GROUP_OBJ:
68 case ACL_MASK:
69 case ACL_OTHER:
70 acl_e->e_id = ACL_UNDEFINED_ID;
71 break;
72 default:
73 goto fail;
74 }
75 }
76 return acl;
77
78fail:
79 posix_acl_release(acl);
80 return ERR_PTR(-EINVAL);
81}
82
83STATIC void
84xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
85{
86 const struct posix_acl_entry *acl_e;
87 struct xfs_acl_entry *ace;
88 int i;
89
90 aclp->acl_cnt = cpu_to_be32(acl->a_count);
91 for (i = 0; i < acl->a_count; i++) {
92 ace = &aclp->acl_entry[i];
93 acl_e = &acl->a_entries[i];
94
95 ace->ae_tag = cpu_to_be32(acl_e->e_tag);
96 ace->ae_id = cpu_to_be32(acl_e->e_id);
97 ace->ae_perm = cpu_to_be16(acl_e->e_perm);
98 }
99}
100
101struct posix_acl *
102xfs_get_acl(struct inode *inode, int type)
103{
104 struct xfs_inode *ip = XFS_I(inode);
105 struct posix_acl *acl;
106 struct xfs_acl *xfs_acl;
107 int len = sizeof(struct xfs_acl);
108 char *ea_name;
109 int error;
110
111 acl = get_cached_acl(inode, type);
112 if (acl != ACL_NOT_CACHED)
113 return acl;
114
115 switch (type) {
116 case ACL_TYPE_ACCESS:
117 ea_name = SGI_ACL_FILE;
118 break;
119 case ACL_TYPE_DEFAULT:
120 ea_name = SGI_ACL_DEFAULT;
121 break;
122 default:
123 BUG();
124 }
125
126 /*
 127 * If we have a cached ACL value just return it; no need to
 128 * go out to the disk.
129 */
130
131 xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
132 if (!xfs_acl)
133 return ERR_PTR(-ENOMEM);
134
135 error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT);
136 if (error) {
137 /*
 138 * If the attribute doesn't exist, make sure we have a negative
 139 * cache entry; for any other error, assume it is transient and
140 * leave the cache entry as ACL_NOT_CACHED.
141 */
142 if (error == -ENOATTR) {
143 acl = NULL;
144 goto out_update_cache;
145 }
146 goto out;
147 }
148
149 acl = xfs_acl_from_disk(xfs_acl);
150 if (IS_ERR(acl))
151 goto out;
152
153 out_update_cache:
154 set_cached_acl(inode, type, acl);
155 out:
156 kfree(xfs_acl);
157 return acl;
158}
159
160STATIC int
161xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
162{
163 struct xfs_inode *ip = XFS_I(inode);
164 char *ea_name;
165 int error;
166
167 if (S_ISLNK(inode->i_mode))
168 return -EOPNOTSUPP;
169
170 switch (type) {
171 case ACL_TYPE_ACCESS:
172 ea_name = SGI_ACL_FILE;
173 break;
174 case ACL_TYPE_DEFAULT:
175 if (!S_ISDIR(inode->i_mode))
176 return acl ? -EACCES : 0;
177 ea_name = SGI_ACL_DEFAULT;
178 break;
179 default:
180 return -EINVAL;
181 }
182
183 if (acl) {
184 struct xfs_acl *xfs_acl;
185 int len;
186
187 xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
188 if (!xfs_acl)
189 return -ENOMEM;
190
191 xfs_acl_to_disk(xfs_acl, acl);
192 len = sizeof(struct xfs_acl) -
193 (sizeof(struct xfs_acl_entry) *
194 (XFS_ACL_MAX_ENTRIES - acl->a_count));
195
196 error = -xfs_attr_set(ip, ea_name, (char *)xfs_acl,
197 len, ATTR_ROOT);
198
199 kfree(xfs_acl);
200 } else {
201 /*
202 * A NULL ACL argument means we want to remove the ACL.
203 */
204 error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT);
205
206 /*
 207 * If the attribute didn't exist to start with, that's fine.
208 */
209 if (error == -ENOATTR)
210 error = 0;
211 }
212
213 if (!error)
214 set_cached_acl(inode, type, acl);
215 return error;
216}
217
218int
219xfs_check_acl(struct inode *inode, int mask)
220{
221 struct xfs_inode *ip = XFS_I(inode);
222 struct posix_acl *acl;
223 int error = -EAGAIN;
224
225 xfs_itrace_entry(ip);
226
227 /*
 228 * If there is no attribute fork, no ACL exists on this inode and
229 * we can skip the whole exercise.
230 */
231 if (!XFS_IFORK_Q(ip))
232 return -EAGAIN;
233
234 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
235 if (IS_ERR(acl))
236 return PTR_ERR(acl);
237 if (acl) {
238 error = posix_acl_permission(inode, acl, mask);
239 posix_acl_release(acl);
240 }
241
242 return error;
243}
244
245static int
246xfs_set_mode(struct inode *inode, mode_t mode)
247{
248 int error = 0;
249
250 if (mode != inode->i_mode) {
251 struct iattr iattr;
252
253 iattr.ia_valid = ATTR_MODE;
254 iattr.ia_mode = mode;
255
256 error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
257 }
258
259 return error;
260}
261
262static int
263xfs_acl_exists(struct inode *inode, char *name)
264{
265 int len = sizeof(struct xfs_acl);
266
267 return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
268 ATTR_ROOT|ATTR_KERNOVAL) == 0);
269}
270
271int
272posix_acl_access_exists(struct inode *inode)
273{
274 return xfs_acl_exists(inode, SGI_ACL_FILE);
275}
276
277int
278posix_acl_default_exists(struct inode *inode)
279{
280 if (!S_ISDIR(inode->i_mode))
281 return 0;
282 return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
283}
284
285/*
286 * No need for i_mutex because the inode is not yet exposed to the VFS.
287 */
288int
289xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl)
290{
291 struct posix_acl *clone;
292 mode_t mode;
293 int error = 0, inherit = 0;
294
295 if (S_ISDIR(inode->i_mode)) {
296 error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl);
297 if (error)
298 return error;
299 }
300
301 clone = posix_acl_clone(default_acl, GFP_KERNEL);
302 if (!clone)
303 return -ENOMEM;
304
305 mode = inode->i_mode;
306 error = posix_acl_create_masq(clone, &mode);
307 if (error < 0)
308 goto out_release_clone;
309
310 /*
 311 * If posix_acl_create_masq returns a positive value, we need to
 312 * inherit a permission that can't be represented using the Unix
 313 * mode bits, and we actually need to set an ACL.
314 */
315 if (error > 0)
316 inherit = 1;
317
318 error = xfs_set_mode(inode, mode);
319 if (error)
320 goto out_release_clone;
321
322 if (inherit)
323 error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
324
325 out_release_clone:
326 posix_acl_release(clone);
327 return error;
328}
329
330int
331xfs_acl_chmod(struct inode *inode)
332{
333 struct posix_acl *acl, *clone;
334 int error;
335
336 if (S_ISLNK(inode->i_mode))
337 return -EOPNOTSUPP;
338
339 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
340 if (IS_ERR(acl) || !acl)
341 return PTR_ERR(acl);
342
343 clone = posix_acl_clone(acl, GFP_KERNEL);
344 posix_acl_release(acl);
345 if (!clone)
346 return -ENOMEM;
347
348 error = posix_acl_chmod_masq(clone, inode->i_mode);
349 if (!error)
350 error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
351
352 posix_acl_release(clone);
353 return error;
354}
355
356/*
357 * System xattr handlers.
358 *
359 * Currently Posix ACLs are the only system namespace extended attribute
360 * handlers supported by XFS, so we just implement the handlers here.
361 * If we ever support other system extended attributes this will need
362 * some refactoring.
363 */
364
365static int
366xfs_decode_acl(const char *name)
367{
368 if (strcmp(name, "posix_acl_access") == 0)
369 return ACL_TYPE_ACCESS;
370 else if (strcmp(name, "posix_acl_default") == 0)
371 return ACL_TYPE_DEFAULT;
372 return -EINVAL;
373}
374
375static int
376xfs_xattr_system_get(struct inode *inode, const char *name,
377 void *value, size_t size)
378{
379 struct posix_acl *acl;
380 int type, error;
381
382 type = xfs_decode_acl(name);
383 if (type < 0)
384 return type;
385
386 acl = xfs_get_acl(inode, type);
387 if (IS_ERR(acl))
388 return PTR_ERR(acl);
389 if (acl == NULL)
390 return -ENODATA;
391
392 error = posix_acl_to_xattr(acl, value, size);
393 posix_acl_release(acl);
394
395 return error;
396}
397
398static int
399xfs_xattr_system_set(struct inode *inode, const char *name,
400 const void *value, size_t size, int flags)
401{
402 struct posix_acl *acl = NULL;
403 int error = 0, type;
404
405 type = xfs_decode_acl(name);
406 if (type < 0)
407 return type;
408 if (flags & XATTR_CREATE)
409 return -EINVAL;
410 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
411 return value ? -EACCES : 0;
412 if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
413 return -EPERM;
414
415 if (!value)
416 goto set_acl;
417
418 acl = posix_acl_from_xattr(value, size);
419 if (!acl) {
420 /*
421 * acl_set_file(3) may request that we set default ACLs with
422 * zero length -- defend (gracefully) against that here.
423 */
424 goto out;
425 }
426 if (IS_ERR(acl)) {
427 error = PTR_ERR(acl);
428 goto out;
429 }
430
431 error = posix_acl_valid(acl);
432 if (error)
433 goto out_release;
434
435 error = -EINVAL;
436 if (acl->a_count > XFS_ACL_MAX_ENTRIES)
437 goto out_release;
438
439 if (type == ACL_TYPE_ACCESS) {
440 mode_t mode = inode->i_mode;
441 error = posix_acl_equiv_mode(acl, &mode);
442
443 if (error <= 0) {
444 posix_acl_release(acl);
445 acl = NULL;
446
447 if (error < 0)
448 return error;
449 }
450
451 error = xfs_set_mode(inode, mode);
452 if (error)
453 goto out_release;
454 }
455
456 set_acl:
457 error = xfs_set_acl(inode, type, acl);
458 out_release:
459 posix_acl_release(acl);
460 out:
461 return error;
462}
463
464struct xattr_handler xfs_xattr_system_handler = {
465 .prefix = XATTR_SYSTEM_PREFIX,
466 .get = xfs_xattr_system_get,
467 .set = xfs_xattr_system_set,
468};
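One non-obvious step in xfs_set_acl() above is the on-disk length computation. Restated as a standalone helper (example_acl_disk_len is an illustrative name, not in the patch): struct xfs_acl is declared with room for XFS_ACL_MAX_ENTRIES entries, so an ACL carrying only a_count of them occupies the full struct minus the unused tail.

	static size_t example_acl_disk_len(int a_count)
	{
		/* header plus a_count entries; the rest of the array is unused */
		return sizeof(struct xfs_acl) -
		       sizeof(struct xfs_acl_entry) *
		       (XFS_ACL_MAX_ENTRIES - a_count);
	}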
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 7ec89fc05b2b..aecf2519db76 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1268,6 +1268,14 @@ xfs_vm_writepage(
1268 if (!page_has_buffers(page)) 1268 if (!page_has_buffers(page))
1269 create_empty_buffers(page, 1 << inode->i_blkbits, 0); 1269 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
1270 1270
1271
1272 /*
1273 * VM calculation for nr_to_write seems off. Bump it way
 1274 * up; this gets simple streaming writes zippy again.
1275 * To be reviewed again after Jens' writeback changes.
1276 */
1277 wbc->nr_to_write *= 4;
1278
1271 /* 1279 /*
1272 * Convert delayed allocate, unwritten or unmapped space 1280 * Convert delayed allocate, unwritten or unmapped space
1273 * to real space and flush out to disk. 1281 * to real space and flush out to disk.
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e28800a9f2b5..965df1227d64 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -412,7 +412,7 @@ _xfs_buf_lookup_pages(
412 412
413 XFS_STATS_INC(xb_page_retries); 413 XFS_STATS_INC(xb_page_retries);
414 xfsbufd_wakeup(0, gfp_mask); 414 xfsbufd_wakeup(0, gfp_mask);
415 congestion_wait(WRITE, HZ/50); 415 congestion_wait(BLK_RW_ASYNC, HZ/50);
416 goto retry; 416 goto retry;
417 } 417 }
418 418
@@ -770,7 +770,7 @@ xfs_buf_associate_memory(
770 bp->b_pages = NULL; 770 bp->b_pages = NULL;
771 bp->b_addr = mem; 771 bp->b_addr = mem;
772 772
773 rval = _xfs_buf_get_pages(bp, page_count, 0); 773 rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
774 if (rval) 774 if (rval)
775 return rval; 775 return rval;
776 776
@@ -1501,7 +1501,7 @@ xfs_setsize_buftarg_early(
1501 struct block_device *bdev) 1501 struct block_device *bdev)
1502{ 1502{
1503 return xfs_setsize_buftarg_flags(btp, 1503 return xfs_setsize_buftarg_flags(btp,
1504 PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); 1504 PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0);
1505} 1505}
1506 1506
1507int 1507int
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index f4e255441574..0542fd507649 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -41,7 +41,6 @@
41#include "xfs_ioctl.h" 41#include "xfs_ioctl.h"
42 42
43#include <linux/dcache.h> 43#include <linux/dcache.h>
44#include <linux/smp_lock.h>
45 44
46static struct vm_operations_struct xfs_file_vm_ops; 45static struct vm_operations_struct xfs_file_vm_ops;
47 46
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 34eaab608e6e..5bb523d7f37e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -41,7 +41,6 @@
41#include "xfs_itable.h" 41#include "xfs_itable.h"
42#include "xfs_error.h" 42#include "xfs_error.h"
43#include "xfs_rw.h" 43#include "xfs_rw.h"
44#include "xfs_acl.h"
45#include "xfs_attr.h" 44#include "xfs_attr.h"
46#include "xfs_bmap.h" 45#include "xfs_bmap.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
@@ -899,7 +898,8 @@ xfs_ioctl_setattr(
899 struct xfs_mount *mp = ip->i_mount; 898 struct xfs_mount *mp = ip->i_mount;
900 struct xfs_trans *tp; 899 struct xfs_trans *tp;
901 unsigned int lock_flags = 0; 900 unsigned int lock_flags = 0;
902 struct xfs_dquot *udqp = NULL, *gdqp = NULL; 901 struct xfs_dquot *udqp = NULL;
902 struct xfs_dquot *gdqp = NULL;
903 struct xfs_dquot *olddquot = NULL; 903 struct xfs_dquot *olddquot = NULL;
904 int code; 904 int code;
905 905
@@ -919,7 +919,7 @@ xfs_ioctl_setattr(
919 * because the i_*dquot fields will get updated anyway. 919 * because the i_*dquot fields will get updated anyway.
920 */ 920 */
921 if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { 921 if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
922 code = XFS_QM_DQVOPALLOC(mp, ip, ip->i_d.di_uid, 922 code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
923 ip->i_d.di_gid, fa->fsx_projid, 923 ip->i_d.di_gid, fa->fsx_projid,
924 XFS_QMOPT_PQUOTA, &udqp, &gdqp); 924 XFS_QMOPT_PQUOTA, &udqp, &gdqp);
925 if (code) 925 if (code)
@@ -954,10 +954,11 @@ xfs_ioctl_setattr(
954 * Do a quota reservation only if projid is actually going to change. 954 * Do a quota reservation only if projid is actually going to change.
955 */ 955 */
956 if (mask & FSX_PROJID) { 956 if (mask & FSX_PROJID) {
957 if (XFS_IS_PQUOTA_ON(mp) && 957 if (XFS_IS_QUOTA_RUNNING(mp) &&
958 XFS_IS_PQUOTA_ON(mp) &&
958 ip->i_d.di_projid != fa->fsx_projid) { 959 ip->i_d.di_projid != fa->fsx_projid) {
959 ASSERT(tp); 960 ASSERT(tp);
960 code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp, 961 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
961 capable(CAP_FOWNER) ? 962 capable(CAP_FOWNER) ?
962 XFS_QMOPT_FORCE_RES : 0); 963 XFS_QMOPT_FORCE_RES : 0);
963 if (code) /* out of quota */ 964 if (code) /* out of quota */
@@ -1059,8 +1060,8 @@ xfs_ioctl_setattr(
1059 * in the transaction. 1060 * in the transaction.
1060 */ 1061 */
1061 if (ip->i_d.di_projid != fa->fsx_projid) { 1062 if (ip->i_d.di_projid != fa->fsx_projid) {
1062 if (XFS_IS_PQUOTA_ON(mp)) { 1063 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
1063 olddquot = XFS_QM_DQVOPCHOWN(mp, tp, ip, 1064 olddquot = xfs_qm_vop_chown(tp, ip,
1064 &ip->i_gdquot, gdqp); 1065 &ip->i_gdquot, gdqp);
1065 } 1066 }
1066 ip->i_d.di_projid = fa->fsx_projid; 1067 ip->i_d.di_projid = fa->fsx_projid;
@@ -1106,9 +1107,9 @@ xfs_ioctl_setattr(
1106 /* 1107 /*
1107 * Release any dquot(s) the inode had kept before chown. 1108 * Release any dquot(s) the inode had kept before chown.
1108 */ 1109 */
1109 XFS_QM_DQRELE(mp, olddquot); 1110 xfs_qm_dqrele(olddquot);
1110 XFS_QM_DQRELE(mp, udqp); 1111 xfs_qm_dqrele(udqp);
1111 XFS_QM_DQRELE(mp, gdqp); 1112 xfs_qm_dqrele(gdqp);
1112 1113
1113 if (code) 1114 if (code)
1114 return code; 1115 return code;
@@ -1122,8 +1123,8 @@ xfs_ioctl_setattr(
1122 return 0; 1123 return 0;
1123 1124
1124 error_return: 1125 error_return:
1125 XFS_QM_DQRELE(mp, udqp); 1126 xfs_qm_dqrele(udqp);
1126 XFS_QM_DQRELE(mp, gdqp); 1127 xfs_qm_dqrele(gdqp);
1127 xfs_trans_cancel(tp, 0); 1128 xfs_trans_cancel(tp, 0);
1128 if (lock_flags) 1129 if (lock_flags)
1129 xfs_iunlock(ip, lock_flags); 1130 xfs_iunlock(ip, lock_flags);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 6075382336d7..8070b34cc287 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_acl.h"
20#include "xfs_bit.h" 21#include "xfs_bit.h"
21#include "xfs_log.h" 22#include "xfs_log.h"
22#include "xfs_inum.h" 23#include "xfs_inum.h"
@@ -51,6 +52,7 @@
51#include <linux/capability.h> 52#include <linux/capability.h>
52#include <linux/xattr.h> 53#include <linux/xattr.h>
53#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/posix_acl.h>
54#include <linux/security.h> 56#include <linux/security.h>
55#include <linux/falloc.h> 57#include <linux/falloc.h>
56#include <linux/fiemap.h> 58#include <linux/fiemap.h>
@@ -202,9 +204,8 @@ xfs_vn_mknod(
202{ 204{
203 struct inode *inode; 205 struct inode *inode;
204 struct xfs_inode *ip = NULL; 206 struct xfs_inode *ip = NULL;
205 xfs_acl_t *default_acl = NULL; 207 struct posix_acl *default_acl = NULL;
206 struct xfs_name name; 208 struct xfs_name name;
207 int (*test_default_acl)(struct inode *) = _ACL_DEFAULT_EXISTS;
208 int error; 209 int error;
209 210
210 /* 211 /*
@@ -219,18 +220,14 @@ xfs_vn_mknod(
219 rdev = 0; 220 rdev = 0;
220 } 221 }
221 222
222 if (test_default_acl && test_default_acl(dir)) { 223 if (IS_POSIXACL(dir)) {
223 if (!_ACL_ALLOC(default_acl)) { 224 default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT);
224 return -ENOMEM; 225 if (IS_ERR(default_acl))
225 } 226 return -PTR_ERR(default_acl);
226 if (!_ACL_GET_DEFAULT(dir, default_acl)) {
227 _ACL_FREE(default_acl);
228 default_acl = NULL;
229 }
230 }
231 227
232 if (IS_POSIXACL(dir) && !default_acl) 228 if (!default_acl)
233 mode &= ~current_umask(); 229 mode &= ~current_umask();
230 }
234 231
235 xfs_dentry_to_name(&name, dentry); 232 xfs_dentry_to_name(&name, dentry);
236 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); 233 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
@@ -244,10 +241,10 @@ xfs_vn_mknod(
244 goto out_cleanup_inode; 241 goto out_cleanup_inode;
245 242
246 if (default_acl) { 243 if (default_acl) {
247 error = _ACL_INHERIT(inode, mode, default_acl); 244 error = -xfs_inherit_acl(inode, default_acl);
248 if (unlikely(error)) 245 if (unlikely(error))
249 goto out_cleanup_inode; 246 goto out_cleanup_inode;
250 _ACL_FREE(default_acl); 247 posix_acl_release(default_acl);
251 } 248 }
252 249
253 250
@@ -257,8 +254,7 @@ xfs_vn_mknod(
257 out_cleanup_inode: 254 out_cleanup_inode:
258 xfs_cleanup_inode(dir, inode, dentry); 255 xfs_cleanup_inode(dir, inode, dentry);
259 out_free_acl: 256 out_free_acl:
260 if (default_acl) 257 posix_acl_release(default_acl);
261 _ACL_FREE(default_acl);
262 return -error; 258 return -error;
263} 259}
264 260
@@ -488,26 +484,6 @@ xfs_vn_put_link(
488 kfree(s); 484 kfree(s);
489} 485}
490 486
491#ifdef CONFIG_XFS_POSIX_ACL
492STATIC int
493xfs_check_acl(
494 struct inode *inode,
495 int mask)
496{
497 struct xfs_inode *ip = XFS_I(inode);
498 int error;
499
500 xfs_itrace_entry(ip);
501
502 if (XFS_IFORK_Q(ip)) {
503 error = xfs_acl_iaccess(ip, mask, NULL);
504 if (error != -1)
505 return -error;
506 }
507
508 return -EAGAIN;
509}
510
511STATIC int 487STATIC int
512xfs_vn_permission( 488xfs_vn_permission(
513 struct inode *inode, 489 struct inode *inode,
@@ -515,9 +491,6 @@ xfs_vn_permission(
515{ 491{
516 return generic_permission(inode, mask, xfs_check_acl); 492 return generic_permission(inode, mask, xfs_check_acl);
517} 493}
518#else
519#define xfs_vn_permission NULL
520#endif
521 494
522STATIC int 495STATIC int
523xfs_vn_getattr( 496xfs_vn_getattr(
@@ -707,8 +680,8 @@ xfs_vn_fiemap(
707 else 680 else
708 bm.bmv_length = BTOBB(length); 681 bm.bmv_length = BTOBB(length);
709 682
710 /* our formatter will tell xfs_getbmap when to stop. */ 683 /* We add one because in getbmap world count includes the header */
711 bm.bmv_count = MAXEXTNUM; 684 bm.bmv_count = fieinfo->fi_extents_max + 1;
712 bm.bmv_iflags = BMV_IF_PREALLOC; 685 bm.bmv_iflags = BMV_IF_PREALLOC;
713 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) 686 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
714 bm.bmv_iflags |= BMV_IF_ATTRFORK; 687 bm.bmv_iflags |= BMV_IF_ATTRFORK;
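The bmv_count change above encodes the getbmap convention that the count includes a header slot, so holding fi_extents_max user extents needs one extra. A one-line sketch of that sizing (example_bmv_count is illustrative):

	static inline int example_bmv_count(unsigned int fi_extents_max)
	{
		return fi_extents_max + 1;	/* slot 0 is the getbmap header */
	}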
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index f65a53f8752f..6127e24062d0 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -24,7 +24,7 @@
24 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. 24 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
25 * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set. 25 * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
26 */ 26 */
27#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) 27#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
28# define XFS_BIG_BLKNOS 1 28# define XFS_BIG_BLKNOS 1
29# define XFS_BIG_INUMS 1 29# define XFS_BIG_INUMS 1
30#else 30#else
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 9142192ccbe6..7078974a6eee 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -42,7 +42,6 @@
42#include "xfs_error.h" 42#include "xfs_error.h"
43#include "xfs_itable.h" 43#include "xfs_itable.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_acl.h"
46#include "xfs_attr.h" 45#include "xfs_attr.h"
47#include "xfs_inode_item.h" 46#include "xfs_inode_item.h"
48#include "xfs_buf_item.h" 47#include "xfs_buf_item.h"
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 94d9a633d3d9..cb6e2cca214f 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -50,9 +50,11 @@ xfs_fs_quota_sync(
50{ 50{
51 struct xfs_mount *mp = XFS_M(sb); 51 struct xfs_mount *mp = XFS_M(sb);
52 52
53 if (sb->s_flags & MS_RDONLY)
54 return -EROFS;
53 if (!XFS_IS_QUOTA_RUNNING(mp)) 55 if (!XFS_IS_QUOTA_RUNNING(mp))
54 return -ENOSYS; 56 return -ENOSYS;
55 return -xfs_sync_inodes(mp, SYNC_DELWRI); 57 return -xfs_sync_data(mp, 0);
56} 58}
57 59
58STATIC int 60STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index bb685269f832..a220d36f789b 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -43,7 +43,6 @@
43#include "xfs_itable.h" 43#include "xfs_itable.h"
44#include "xfs_fsops.h" 44#include "xfs_fsops.h"
45#include "xfs_rw.h" 45#include "xfs_rw.h"
46#include "xfs_acl.h"
47#include "xfs_attr.h" 46#include "xfs_attr.h"
48#include "xfs_buf_item.h" 47#include "xfs_buf_item.h"
49#include "xfs_utils.h" 48#include "xfs_utils.h"
@@ -405,6 +404,14 @@ xfs_parseargs(
405 return EINVAL; 404 return EINVAL;
406 } 405 }
407 406
407#ifndef CONFIG_XFS_QUOTA
408 if (XFS_IS_QUOTA_RUNNING(mp)) {
409 cmn_err(CE_WARN,
410 "XFS: quota support not available in this kernel.");
411 return EINVAL;
412 }
413#endif
414
408 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && 415 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
409 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { 416 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
410 cmn_err(CE_WARN, 417 cmn_err(CE_WARN,
@@ -609,7 +616,7 @@ xfs_max_file_offset(
609 */ 616 */
610 617
611#if BITS_PER_LONG == 32 618#if BITS_PER_LONG == 32
612# if defined(CONFIG_LBD) 619# if defined(CONFIG_LBDAF)
613 ASSERT(sizeof(sector_t) == 8); 620 ASSERT(sizeof(sector_t) == 8);
614 pagefactor = PAGE_CACHE_SIZE; 621 pagefactor = PAGE_CACHE_SIZE;
615 bitshift = BITS_PER_LONG; 622 bitshift = BITS_PER_LONG;
@@ -1063,7 +1070,18 @@ xfs_fs_put_super(
1063 int unmount_event_flags = 0; 1070 int unmount_event_flags = 0;
1064 1071
1065 xfs_syncd_stop(mp); 1072 xfs_syncd_stop(mp);
1066 xfs_sync_inodes(mp, SYNC_ATTR|SYNC_DELWRI); 1073
1074 if (!(sb->s_flags & MS_RDONLY)) {
1075 /*
1076 * XXX(hch): this should be SYNC_WAIT.
1077 *
1078 * Or more likely not needed at all because the VFS is already
1079 * calling ->sync_fs after shutting down all filestem
1080 * operations and just before calling ->put_super.
1081 */
1082 xfs_sync_data(mp, 0);
1083 xfs_sync_attr(mp, 0);
1084 }
1067 1085
1068#ifdef HAVE_DMAPI 1086#ifdef HAVE_DMAPI
1069 if (mp->m_flags & XFS_MOUNT_DMAPI) { 1087 if (mp->m_flags & XFS_MOUNT_DMAPI) {
@@ -1098,21 +1116,11 @@ xfs_fs_put_super(
1098 xfs_freesb(mp); 1116 xfs_freesb(mp);
1099 xfs_icsb_destroy_counters(mp); 1117 xfs_icsb_destroy_counters(mp);
1100 xfs_close_devices(mp); 1118 xfs_close_devices(mp);
1101 xfs_qmops_put(mp);
1102 xfs_dmops_put(mp); 1119 xfs_dmops_put(mp);
1103 xfs_free_fsname(mp); 1120 xfs_free_fsname(mp);
1104 kfree(mp); 1121 kfree(mp);
1105} 1122}
1106 1123
1107STATIC void
1108xfs_fs_write_super(
1109 struct super_block *sb)
1110{
1111 if (!(sb->s_flags & MS_RDONLY))
1112 xfs_sync_fsdata(XFS_M(sb), 0);
1113 sb->s_dirt = 0;
1114}
1115
1116STATIC int 1124STATIC int
1117xfs_fs_sync_super( 1125xfs_fs_sync_super(
1118 struct super_block *sb, 1126 struct super_block *sb,
@@ -1137,7 +1145,6 @@ xfs_fs_sync_super(
1137 error = xfs_quiesce_data(mp); 1145 error = xfs_quiesce_data(mp);
1138 else 1146 else
1139 error = xfs_sync_fsdata(mp, 0); 1147 error = xfs_sync_fsdata(mp, 0);
1140 sb->s_dirt = 0;
1141 1148
1142 if (unlikely(laptop_mode)) { 1149 if (unlikely(laptop_mode)) {
1143 int prev_sync_seq = mp->m_sync_seq; 1150 int prev_sync_seq = mp->m_sync_seq;
@@ -1168,6 +1175,7 @@ xfs_fs_statfs(
1168{ 1175{
1169 struct xfs_mount *mp = XFS_M(dentry->d_sb); 1176 struct xfs_mount *mp = XFS_M(dentry->d_sb);
1170 xfs_sb_t *sbp = &mp->m_sb; 1177 xfs_sb_t *sbp = &mp->m_sb;
1178 struct xfs_inode *ip = XFS_I(dentry->d_inode);
1171 __uint64_t fakeinos, id; 1179 __uint64_t fakeinos, id;
1172 xfs_extlen_t lsize; 1180 xfs_extlen_t lsize;
1173 1181
@@ -1196,7 +1204,10 @@ xfs_fs_statfs(
1196 statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1204 statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
1197 spin_unlock(&mp->m_sb_lock); 1205 spin_unlock(&mp->m_sb_lock);
1198 1206
1199 XFS_QM_DQSTATVFS(XFS_I(dentry->d_inode), statp); 1207 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
1208 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
1209 (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
1210 xfs_qm_statvfs(ip, statp);
1200 return 0; 1211 return 0;
1201} 1212}
1202 1213
@@ -1404,16 +1415,13 @@ xfs_fs_fill_super(
1404 error = xfs_dmops_get(mp); 1415 error = xfs_dmops_get(mp);
1405 if (error) 1416 if (error)
1406 goto out_free_fsname; 1417 goto out_free_fsname;
1407 error = xfs_qmops_get(mp);
1408 if (error)
1409 goto out_put_dmops;
1410 1418
1411 if (silent) 1419 if (silent)
1412 flags |= XFS_MFSI_QUIET; 1420 flags |= XFS_MFSI_QUIET;
1413 1421
1414 error = xfs_open_devices(mp); 1422 error = xfs_open_devices(mp);
1415 if (error) 1423 if (error)
1416 goto out_put_qmops; 1424 goto out_put_dmops;
1417 1425
1418 if (xfs_icsb_init_counters(mp)) 1426 if (xfs_icsb_init_counters(mp))
1419 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; 1427 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
@@ -1443,7 +1451,6 @@ xfs_fs_fill_super(
1443 1451
1444 XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname); 1452 XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
1445 1453
1446 sb->s_dirt = 1;
1447 sb->s_magic = XFS_SB_MAGIC; 1454 sb->s_magic = XFS_SB_MAGIC;
1448 sb->s_blocksize = mp->m_sb.sb_blocksize; 1455 sb->s_blocksize = mp->m_sb.sb_blocksize;
1449 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; 1456 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1482,8 +1489,6 @@ xfs_fs_fill_super(
1482 out_destroy_counters: 1489 out_destroy_counters:
1483 xfs_icsb_destroy_counters(mp); 1490 xfs_icsb_destroy_counters(mp);
1484 xfs_close_devices(mp); 1491 xfs_close_devices(mp);
1485 out_put_qmops:
1486 xfs_qmops_put(mp);
1487 out_put_dmops: 1492 out_put_dmops:
1488 xfs_dmops_put(mp); 1493 xfs_dmops_put(mp);
1489 out_free_fsname: 1494 out_free_fsname:
@@ -1533,7 +1538,6 @@ static struct super_operations xfs_super_operations = {
1533 .write_inode = xfs_fs_write_inode, 1538 .write_inode = xfs_fs_write_inode,
1534 .clear_inode = xfs_fs_clear_inode, 1539 .clear_inode = xfs_fs_clear_inode,
1535 .put_super = xfs_fs_put_super, 1540 .put_super = xfs_fs_put_super,
1536 .write_super = xfs_fs_write_super,
1537 .sync_fs = xfs_fs_sync_super, 1541 .sync_fs = xfs_fs_sync_super,
1538 .freeze_fs = xfs_fs_freeze, 1542 .freeze_fs = xfs_fs_freeze,
1539 .statfs = xfs_fs_statfs, 1543 .statfs = xfs_fs_statfs,
@@ -1718,18 +1722,8 @@ xfs_init_zones(void)
1718 if (!xfs_ili_zone) 1722 if (!xfs_ili_zone)
1719 goto out_destroy_inode_zone; 1723 goto out_destroy_inode_zone;
1720 1724
1721#ifdef CONFIG_XFS_POSIX_ACL
1722 xfs_acl_zone = kmem_zone_init(sizeof(xfs_acl_t), "xfs_acl");
1723 if (!xfs_acl_zone)
1724 goto out_destroy_ili_zone;
1725#endif
1726
1727 return 0; 1725 return 0;
1728 1726
1729#ifdef CONFIG_XFS_POSIX_ACL
1730 out_destroy_ili_zone:
1731#endif
1732 kmem_zone_destroy(xfs_ili_zone);
1733 out_destroy_inode_zone: 1727 out_destroy_inode_zone:
1734 kmem_zone_destroy(xfs_inode_zone); 1728 kmem_zone_destroy(xfs_inode_zone);
1735 out_destroy_efi_zone: 1729 out_destroy_efi_zone:
@@ -1763,9 +1757,6 @@ xfs_init_zones(void)
1763STATIC void 1757STATIC void
1764xfs_destroy_zones(void) 1758xfs_destroy_zones(void)
1765{ 1759{
1766#ifdef CONFIG_XFS_POSIX_ACL
1767 kmem_zone_destroy(xfs_acl_zone);
1768#endif
1769 kmem_zone_destroy(xfs_ili_zone); 1760 kmem_zone_destroy(xfs_ili_zone);
1770 kmem_zone_destroy(xfs_inode_zone); 1761 kmem_zone_destroy(xfs_inode_zone);
1771 kmem_zone_destroy(xfs_efi_zone); 1762 kmem_zone_destroy(xfs_efi_zone);
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index f7ba76633c29..98ef624d9baf 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -43,166 +43,267 @@
43#include "xfs_buf_item.h" 43#include "xfs_buf_item.h"
44#include "xfs_inode_item.h" 44#include "xfs_inode_item.h"
45#include "xfs_rw.h" 45#include "xfs_rw.h"
46#include "xfs_quota.h"
46 47
47#include <linux/kthread.h> 48#include <linux/kthread.h>
48#include <linux/freezer.h> 49#include <linux/freezer.h>
49 50
50/*
51 * Sync all the inodes in the given AG according to the
52 * direction given by the flags.
53 */
54STATIC int
55xfs_sync_inodes_ag(
56 xfs_mount_t *mp,
57 int ag,
58 int flags)
59{
60 xfs_perag_t *pag = &mp->m_perag[ag];
61 int nr_found;
62 uint32_t first_index = 0;
63 int error = 0;
64 int last_error = 0;
65 51
66 do { 52STATIC xfs_inode_t *
67 struct inode *inode; 53xfs_inode_ag_lookup(
68 xfs_inode_t *ip = NULL; 54 struct xfs_mount *mp,
69 int lock_flags = XFS_ILOCK_SHARED; 55 struct xfs_perag *pag,
56 uint32_t *first_index,
57 int tag)
58{
59 int nr_found;
60 struct xfs_inode *ip;
70 61
71 /* 62 /*
72 * use a gang lookup to find the next inode in the tree 63 * use a gang lookup to find the next inode in the tree
73 * as the tree is sparse and a gang lookup walks to find 64 * as the tree is sparse and a gang lookup walks to find
74 * the number of objects requested. 65 * the number of objects requested.
75 */ 66 */
76 read_lock(&pag->pag_ici_lock); 67 read_lock(&pag->pag_ici_lock);
68 if (tag == XFS_ICI_NO_TAG) {
77 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 69 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
78 (void**)&ip, first_index, 1); 70 (void **)&ip, *first_index, 1);
71 } else {
72 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
73 (void **)&ip, *first_index, 1, tag);
74 }
75 if (!nr_found)
76 goto unlock;
79 77
80 if (!nr_found) { 78 /*
81 read_unlock(&pag->pag_ici_lock); 79 * Update the index for the next lookup. Catch overflows
82 break; 80 * into the next AG range which can occur if we have inodes
83 } 81 * in the last block of the AG and we are currently
82 * pointing to the last inode.
83 */
84 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
85 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
86 goto unlock;
84 87
85 /* 88 return ip;
86 * Update the index for the next lookup. Catch overflows
87 * into the next AG range which can occur if we have inodes
88 * in the last block of the AG and we are currently
89 * pointing to the last inode.
90 */
91 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
92 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
93 read_unlock(&pag->pag_ici_lock);
94 break;
95 }
96 89
97 /* nothing to sync during shutdown */ 90unlock:
98 if (XFS_FORCED_SHUTDOWN(mp)) { 91 read_unlock(&pag->pag_ici_lock);
99 read_unlock(&pag->pag_ici_lock); 92 return NULL;
100 return 0; 93}
101 }
102 94
103 /* 95STATIC int
104 * If we can't get a reference on the inode, it must be 96xfs_inode_ag_walk(
105 * in reclaim. Leave it for the reclaim code to flush. 97 struct xfs_mount *mp,
106 */ 98 xfs_agnumber_t ag,
107 inode = VFS_I(ip); 99 int (*execute)(struct xfs_inode *ip,
108 if (!igrab(inode)) { 100 struct xfs_perag *pag, int flags),
109 read_unlock(&pag->pag_ici_lock); 101 int flags,
110 continue; 102 int tag)
111 } 103{
112 read_unlock(&pag->pag_ici_lock); 104 struct xfs_perag *pag = &mp->m_perag[ag];
105 uint32_t first_index;
106 int last_error = 0;
107 int skipped;
113 108
114 /* avoid new or bad inodes */ 109restart:
115 if (is_bad_inode(inode) || 110 skipped = 0;
116 xfs_iflags_test(ip, XFS_INEW)) { 111 first_index = 0;
117 IRELE(ip); 112 do {
118 continue; 113 int error = 0;
119 } 114 xfs_inode_t *ip;
120 115
121 /* 116 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
122 * If we have to flush data or wait for I/O completion 117 if (!ip)
123 * we need to hold the iolock. 118 break;
124 */
125 if (flags & SYNC_DELWRI) {
126 if (VN_DIRTY(inode)) {
127 if (flags & SYNC_TRYLOCK) {
128 if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
129 lock_flags |= XFS_IOLOCK_SHARED;
130 } else {
131 xfs_ilock(ip, XFS_IOLOCK_SHARED);
132 lock_flags |= XFS_IOLOCK_SHARED;
133 }
134 if (lock_flags & XFS_IOLOCK_SHARED) {
135 error = xfs_flush_pages(ip, 0, -1,
136 (flags & SYNC_WAIT) ? 0
137 : XFS_B_ASYNC,
138 FI_NONE);
139 }
140 }
141 if (VN_CACHED(inode) && (flags & SYNC_IOWAIT))
142 xfs_ioend_wait(ip);
143 }
144 xfs_ilock(ip, XFS_ILOCK_SHARED);
145
146 if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
147 if (flags & SYNC_WAIT) {
148 xfs_iflock(ip);
149 if (!xfs_inode_clean(ip))
150 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
151 else
152 xfs_ifunlock(ip);
153 } else if (xfs_iflock_nowait(ip)) {
154 if (!xfs_inode_clean(ip))
155 error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
156 else
157 xfs_ifunlock(ip);
158 }
159 }
160 xfs_iput(ip, lock_flags);
161 119
120 error = execute(ip, pag, flags);
121 if (error == EAGAIN) {
122 skipped++;
123 continue;
124 }
162 if (error) 125 if (error)
163 last_error = error; 126 last_error = error;
164 /* 127 /*
165 * bail out if the filesystem is corrupted. 128 * bail out if the filesystem is corrupted.
166 */ 129 */
167 if (error == EFSCORRUPTED) 130 if (error == EFSCORRUPTED)
168 return XFS_ERROR(error); 131 break;
169 132
170 } while (nr_found); 133 } while (1);
171 134
135 if (skipped) {
136 delay(1);
137 goto restart;
138 }
139
140 xfs_put_perag(mp, pag);
172 return last_error; 141 return last_error;
173} 142}
174 143
175int 144int
176xfs_sync_inodes( 145xfs_inode_ag_iterator(
177 xfs_mount_t *mp, 146 struct xfs_mount *mp,
178 int flags) 147 int (*execute)(struct xfs_inode *ip,
148 struct xfs_perag *pag, int flags),
149 int flags,
150 int tag)
179{ 151{
180 int error; 152 int error = 0;
181 int last_error; 153 int last_error = 0;
182 int i; 154 xfs_agnumber_t ag;
183 int lflags = XFS_LOG_FORCE;
184 155
185 if (mp->m_flags & XFS_MOUNT_RDONLY) 156 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
186 return 0; 157 if (!mp->m_perag[ag].pag_ici_init)
187 error = 0; 158 continue;
188 last_error = 0; 159 error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
160 if (error) {
161 last_error = error;
162 if (error == EFSCORRUPTED)
163 break;
164 }
165 }
166 return XFS_ERROR(last_error);
167}
168
169/* must be called with pag_ici_lock held and releases it */
170int
171xfs_sync_inode_valid(
172 struct xfs_inode *ip,
173 struct xfs_perag *pag)
174{
175 struct inode *inode = VFS_I(ip);
176
177 /* nothing to sync during shutdown */
178 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
179 read_unlock(&pag->pag_ici_lock);
180 return EFSCORRUPTED;
181 }
189 182
183 /*
184 * If we can't get a reference on the inode, it must be in reclaim.
185 * Leave it for the reclaim code to flush. Also avoid inodes that
186 * haven't been fully initialised.
187 */
188 if (!igrab(inode)) {
189 read_unlock(&pag->pag_ici_lock);
190 return ENOENT;
191 }
192 read_unlock(&pag->pag_ici_lock);
193
194 if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
195 IRELE(ip);
196 return ENOENT;
197 }
198
199 return 0;
200}
201
202STATIC int
203xfs_sync_inode_data(
204 struct xfs_inode *ip,
205 struct xfs_perag *pag,
206 int flags)
207{
208 struct inode *inode = VFS_I(ip);
209 struct address_space *mapping = inode->i_mapping;
210 int error = 0;
211
212 error = xfs_sync_inode_valid(ip, pag);
213 if (error)
214 return error;
215
216 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
217 goto out_wait;
218
219 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
220 if (flags & SYNC_TRYLOCK)
221 goto out_wait;
222 xfs_ilock(ip, XFS_IOLOCK_SHARED);
223 }
224
225 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
226 0 : XFS_B_ASYNC, FI_NONE);
227 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
228
229 out_wait:
190 if (flags & SYNC_WAIT) 230 if (flags & SYNC_WAIT)
191 lflags |= XFS_LOG_SYNC; 231 xfs_ioend_wait(ip);
232 IRELE(ip);
233 return error;
234}
192 235
193 for (i = 0; i < mp->m_sb.sb_agcount; i++) { 236STATIC int
194 if (!mp->m_perag[i].pag_ici_init) 237xfs_sync_inode_attr(
195 continue; 238 struct xfs_inode *ip,
196 error = xfs_sync_inodes_ag(mp, i, flags); 239 struct xfs_perag *pag,
197 if (error) 240 int flags)
198 last_error = error; 241{
199 if (error == EFSCORRUPTED) 242 int error = 0;
200 break; 243
244 error = xfs_sync_inode_valid(ip, pag);
245 if (error)
246 return error;
247
248 xfs_ilock(ip, XFS_ILOCK_SHARED);
249 if (xfs_inode_clean(ip))
250 goto out_unlock;
251 if (!xfs_iflock_nowait(ip)) {
252 if (!(flags & SYNC_WAIT))
253 goto out_unlock;
254 xfs_iflock(ip);
201 } 255 }
202 if (flags & SYNC_DELWRI)
203 xfs_log_force(mp, 0, lflags);
204 256
205 return XFS_ERROR(last_error); 257 if (xfs_inode_clean(ip)) {
258 xfs_ifunlock(ip);
259 goto out_unlock;
260 }
261
262 error = xfs_iflush(ip, (flags & SYNC_WAIT) ?
263 XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
264
265 out_unlock:
266 xfs_iunlock(ip, XFS_ILOCK_SHARED);
267 IRELE(ip);
268 return error;
269}
270
271/*
272 * Write out pagecache data for the whole filesystem.
273 */
274int
275xfs_sync_data(
276 struct xfs_mount *mp,
277 int flags)
278{
279 int error;
280
281 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
282
283 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
284 XFS_ICI_NO_TAG);
285 if (error)
286 return XFS_ERROR(error);
287
288 xfs_log_force(mp, 0,
289 (flags & SYNC_WAIT) ?
290 XFS_LOG_FORCE | XFS_LOG_SYNC :
291 XFS_LOG_FORCE);
292 return 0;
293}
294
295/*
296 * Write out inode metadata (attributes) for the whole filesystem.
297 */
298int
299xfs_sync_attr(
300 struct xfs_mount *mp,
301 int flags)
302{
303 ASSERT((flags & ~SYNC_WAIT) == 0);
304
305 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
306 XFS_ICI_NO_TAG);
206} 307}
207 308
208STATIC int 309STATIC int
@@ -252,7 +353,7 @@ xfs_sync_fsdata(
252 * If this is xfssyncd() then only sync the superblock if we can 353 * If this is xfssyncd() then only sync the superblock if we can
253 * lock it without sleeping and it is not pinned. 354 * lock it without sleeping and it is not pinned.
254 */ 355 */
255 if (flags & SYNC_BDFLUSH) { 356 if (flags & SYNC_TRYLOCK) {
256 ASSERT(!(flags & SYNC_WAIT)); 357 ASSERT(!(flags & SYNC_WAIT));
257 358
258 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); 359 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
@@ -316,13 +417,13 @@ xfs_quiesce_data(
316 int error; 417 int error;
317 418
318 /* push non-blocking */ 419 /* push non-blocking */
319 xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH); 420 xfs_sync_data(mp, 0);
320 XFS_QM_DQSYNC(mp, SYNC_BDFLUSH); 421 xfs_qm_sync(mp, SYNC_TRYLOCK);
321 xfs_filestream_flush(mp); 422 xfs_filestream_flush(mp);
322 423
323 /* push and block */ 424 /* push and block */
324 xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT); 425 xfs_sync_data(mp, SYNC_WAIT);
325 XFS_QM_DQSYNC(mp, SYNC_WAIT); 426 xfs_qm_sync(mp, SYNC_WAIT);
326 427
327 /* write superblock and hoover up shutdown errors */ 428 /* write superblock and hoover up shutdown errors */
328 error = xfs_sync_fsdata(mp, 0); 429 error = xfs_sync_fsdata(mp, 0);
@@ -341,7 +442,7 @@ xfs_quiesce_fs(
341 int count = 0, pincount; 442 int count = 0, pincount;
342 443
343 xfs_flush_buftarg(mp->m_ddev_targp, 0); 444 xfs_flush_buftarg(mp->m_ddev_targp, 0);
344 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC); 445 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
345 446
346 /* 447 /*
347 * This loop must run at least twice. The first instance of the loop 448 * This loop must run at least twice. The first instance of the loop
@@ -350,7 +451,7 @@ xfs_quiesce_fs(
350 * logged before we can write the unmount record. 451 * logged before we can write the unmount record.
351 */ 452 */
352 do { 453 do {
353 xfs_sync_inodes(mp, SYNC_ATTR|SYNC_WAIT); 454 xfs_sync_attr(mp, SYNC_WAIT);
354 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); 455 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
355 if (!pincount) { 456 if (!pincount) {
356 delay(50); 457 delay(50);
@@ -433,8 +534,8 @@ xfs_flush_inodes_work(
433 void *arg) 534 void *arg)
434{ 535{
435 struct inode *inode = arg; 536 struct inode *inode = arg;
436 xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK); 537 xfs_sync_data(mp, SYNC_TRYLOCK);
437 xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK | SYNC_IOWAIT); 538 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
438 iput(inode); 539 iput(inode);
439} 540}
440 541
@@ -465,10 +566,10 @@ xfs_sync_worker(
465 566
466 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 567 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
467 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 568 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
468 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC); 569 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
469 /* dgc: errors ignored here */ 570 /* dgc: errors ignored here */
470 error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH); 571 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
471 error = xfs_sync_fsdata(mp, SYNC_BDFLUSH); 572 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
472 if (xfs_log_need_covered(mp)) 573 if (xfs_log_need_covered(mp))
473 error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE); 574 error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
474 } 575 }
@@ -569,7 +670,7 @@ xfs_reclaim_inode(
569 xfs_ifunlock(ip); 670 xfs_ifunlock(ip);
570 xfs_iunlock(ip, XFS_ILOCK_EXCL); 671 xfs_iunlock(ip, XFS_ILOCK_EXCL);
571 } 672 }
572 return 1; 673 return -EAGAIN;
573 } 674 }
574 __xfs_iflags_set(ip, XFS_IRECLAIM); 675 __xfs_iflags_set(ip, XFS_IRECLAIM);
575 spin_unlock(&ip->i_flags_lock); 676 spin_unlock(&ip->i_flags_lock);
@@ -607,6 +708,16 @@ xfs_reclaim_inode(
607 return 0; 708 return 0;
608} 709}
609 710
711void
712__xfs_inode_set_reclaim_tag(
713 struct xfs_perag *pag,
714 struct xfs_inode *ip)
715{
716 radix_tree_tag_set(&pag->pag_ici_root,
717 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
718 XFS_ICI_RECLAIM_TAG);
719}
720
610/* 721/*
611 * We set the inode flag atomically with the radix tree tag. 722 * We set the inode flag atomically with the radix tree tag.
612 * Once we get tag lookups on the radix tree, this inode flag 723 * Once we get tag lookups on the radix tree, this inode flag
@@ -621,8 +732,7 @@ xfs_inode_set_reclaim_tag(
621 732
622 read_lock(&pag->pag_ici_lock); 733 read_lock(&pag->pag_ici_lock);
623 spin_lock(&ip->i_flags_lock); 734 spin_lock(&ip->i_flags_lock);
624 radix_tree_tag_set(&pag->pag_ici_root, 735 __xfs_inode_set_reclaim_tag(pag, ip);
625 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
626 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 736 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
627 spin_unlock(&ip->i_flags_lock); 737 spin_unlock(&ip->i_flags_lock);
628 read_unlock(&pag->pag_ici_lock); 738 read_unlock(&pag->pag_ici_lock);
@@ -654,101 +764,27 @@ xfs_inode_clear_reclaim_tag(
654 xfs_put_perag(mp, pag); 764 xfs_put_perag(mp, pag);
655} 765}
656 766
657 767STATIC int
658STATIC void 768xfs_reclaim_inode_now(
659xfs_reclaim_inodes_ag( 769 struct xfs_inode *ip,
660 xfs_mount_t *mp, 770 struct xfs_perag *pag,
661 int ag, 771 int flags)
662 int noblock,
663 int mode)
664{ 772{
665 xfs_inode_t *ip = NULL; 773 /* ignore if already under reclaim */
666 xfs_perag_t *pag = &mp->m_perag[ag]; 774 if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
667 int nr_found;
668 uint32_t first_index;
669 int skipped;
670
671restart:
672 first_index = 0;
673 skipped = 0;
674 do {
675 /*
676 * use a gang lookup to find the next inode in the tree
677 * as the tree is sparse and a gang lookup walks to find
678 * the number of objects requested.
679 */
680 read_lock(&pag->pag_ici_lock);
681 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
682 (void**)&ip, first_index, 1,
683 XFS_ICI_RECLAIM_TAG);
684
685 if (!nr_found) {
686 read_unlock(&pag->pag_ici_lock);
687 break;
688 }
689
690 /*
691 * Update the index for the next lookup. Catch overflows
692 * into the next AG range which can occur if we have inodes
693 * in the last block of the AG and we are currently
694 * pointing to the last inode.
695 */
696 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
697 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
698 read_unlock(&pag->pag_ici_lock);
699 break;
700 }
701
702 /* ignore if already under reclaim */
703 if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
704 read_unlock(&pag->pag_ici_lock);
705 continue;
706 }
707
708 if (noblock) {
709 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
710 read_unlock(&pag->pag_ici_lock);
711 continue;
712 }
713 if (xfs_ipincount(ip) ||
714 !xfs_iflock_nowait(ip)) {
715 xfs_iunlock(ip, XFS_ILOCK_EXCL);
716 read_unlock(&pag->pag_ici_lock);
717 continue;
718 }
719 }
720 read_unlock(&pag->pag_ici_lock); 775 read_unlock(&pag->pag_ici_lock);
721 776 return 0;
722 /*
723 * hmmm - this is an inode already in reclaim. Do
724 * we even bother catching it here?
725 */
726 if (xfs_reclaim_inode(ip, noblock, mode))
727 skipped++;
728 } while (nr_found);
729
730 if (skipped) {
731 delay(1);
732 goto restart;
733 } 777 }
734 return; 778 read_unlock(&pag->pag_ici_lock);
735 779
780 return xfs_reclaim_inode(ip, 0, flags);
736} 781}
737 782
738int 783int
739xfs_reclaim_inodes( 784xfs_reclaim_inodes(
740 xfs_mount_t *mp, 785 xfs_mount_t *mp,
741 int noblock,
742 int mode) 786 int mode)
743{ 787{
744 int i; 788 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
745 789 XFS_ICI_RECLAIM_TAG);
746 for (i = 0; i < mp->m_sb.sb_agcount; i++) {
747 if (!mp->m_perag[i].pag_ici_init)
748 continue;
749 xfs_reclaim_inodes_ag(mp, i, noblock, mode);
750 }
751 return 0;
752} 790}
753
754
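The refactor above replaces the old monolithic per-AG loops with a generic xfs_inode_ag_iterator() driven by small execute() callbacks. A sketch of the callback contract as the new code defines it (example_execute is hypothetical): the lookup returns with pag_ici_lock held, xfs_sync_inode_valid() validates the inode and drops that lock, and the callback owns the inode reference it takes.

	STATIC int
	example_execute(
		struct xfs_inode	*ip,
		struct xfs_perag	*pag,
		int			flags)
	{
		int	error = xfs_sync_inode_valid(ip, pag);	/* drops pag_ici_lock */

		if (error)
			return error;
		/* ... per-inode work, mirroring xfs_sync_inode_data/attr ... */
		IRELE(ip);
		return 0;
	}

	/* usage: xfs_inode_ag_iterator(mp, example_execute, 0, XFS_ICI_NO_TAG); */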
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 308d5bf6dfbd..59120602588a 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -29,17 +29,14 @@ typedef struct xfs_sync_work {
29 struct completion *w_completion; 29 struct completion *w_completion;
30} xfs_sync_work_t; 30} xfs_sync_work_t;
31 31
32#define SYNC_ATTR 0x0001 /* sync attributes */ 32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
33#define SYNC_DELWRI 0x0002 /* look at delayed writes */ 33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
34#define SYNC_WAIT 0x0004 /* wait for i/o to complete */
35#define SYNC_BDFLUSH 0x0008 /* BDFLUSH is calling -- don't block */
36#define SYNC_IOWAIT 0x0010 /* wait for all I/O to complete */
37#define SYNC_TRYLOCK 0x0020 /* only try to lock inodes */
38 34
39int xfs_syncd_init(struct xfs_mount *mp); 35int xfs_syncd_init(struct xfs_mount *mp);
40void xfs_syncd_stop(struct xfs_mount *mp); 36void xfs_syncd_stop(struct xfs_mount *mp);
41 37
42int xfs_sync_inodes(struct xfs_mount *mp, int flags); 38int xfs_sync_attr(struct xfs_mount *mp, int flags);
39int xfs_sync_data(struct xfs_mount *mp, int flags);
43int xfs_sync_fsdata(struct xfs_mount *mp, int flags); 40int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
44 41
45int xfs_quiesce_data(struct xfs_mount *mp); 42int xfs_quiesce_data(struct xfs_mount *mp);
@@ -48,10 +45,17 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
48void xfs_flush_inodes(struct xfs_inode *ip); 45void xfs_flush_inodes(struct xfs_inode *ip);
49 46
50int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); 47int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
51int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode); 48int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
52 49
53void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); 50void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
51void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
54void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip); 52void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
55void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 53void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
56 struct xfs_inode *ip); 54 struct xfs_inode *ip);
55
56int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
57int xfs_inode_ag_iterator(struct xfs_mount *mp,
58 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
59 int flags, int tag);
60
57#endif 61#endif
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index 964621fde6ed..497c7fb75cc1 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -29,67 +29,6 @@
29#include <linux/xattr.h> 29#include <linux/xattr.h>
30 30
31 31
32/*
33 * ACL handling. Should eventually be moved into xfs_acl.c
34 */
35
36static int
37xfs_decode_acl(const char *name)
38{
39 if (strcmp(name, "posix_acl_access") == 0)
40 return _ACL_TYPE_ACCESS;
41 else if (strcmp(name, "posix_acl_default") == 0)
42 return _ACL_TYPE_DEFAULT;
43 return -EINVAL;
44}
45
46/*
47 * Get system extended attributes which at the moment only
48 * includes Posix ACLs.
49 */
50static int
51xfs_xattr_system_get(struct inode *inode, const char *name,
52 void *buffer, size_t size)
53{
54 int acl;
55
56 acl = xfs_decode_acl(name);
57 if (acl < 0)
58 return acl;
59
60 return xfs_acl_vget(inode, buffer, size, acl);
61}
62
63static int
64xfs_xattr_system_set(struct inode *inode, const char *name,
65 const void *value, size_t size, int flags)
66{
67 int acl;
68
69 acl = xfs_decode_acl(name);
70 if (acl < 0)
71 return acl;
72 if (flags & XATTR_CREATE)
73 return -EINVAL;
74
75 if (!value)
76 return xfs_acl_vremove(inode, acl);
77
78 return xfs_acl_vset(inode, (void *)value, size, acl);
79}
80
81static struct xattr_handler xfs_xattr_system_handler = {
82 .prefix = XATTR_SYSTEM_PREFIX,
83 .get = xfs_xattr_system_get,
84 .set = xfs_xattr_system_set,
85};
86
87
88/*
89 * Real xattr handling. The only difference between the namespaces is
90 * a flag passed to the low-level attr code.
91 */
92
93static int 32static int
94__xfs_xattr_get(struct inode *inode, const char *name, 33__xfs_xattr_get(struct inode *inode, const char *name,
95 void *value, size_t size, int xflags) 34 void *value, size_t size, int xflags)
@@ -199,7 +138,9 @@ struct xattr_handler *xfs_xattr_handlers[] = {
199 &xfs_xattr_user_handler, 138 &xfs_xattr_user_handler,
200 &xfs_xattr_trusted_handler, 139 &xfs_xattr_trusted_handler,
201 &xfs_xattr_security_handler, 140 &xfs_xattr_security_handler,
141#ifdef CONFIG_XFS_POSIX_ACL
202 &xfs_xattr_system_handler, 142 &xfs_xattr_system_handler,
143#endif
203 NULL 144 NULL
204}; 145};
205 146
@@ -310,7 +251,7 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
310 /* 251 /*
311 * Then add the two synthetic ACL attributes. 252 * Then add the two synthetic ACL attributes.
312 */ 253 */
313 if (xfs_acl_vhasacl_access(inode)) { 254 if (posix_acl_access_exists(inode)) {
314 error = list_one_attr(POSIX_ACL_XATTR_ACCESS, 255 error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
315 strlen(POSIX_ACL_XATTR_ACCESS) + 1, 256 strlen(POSIX_ACL_XATTR_ACCESS) + 1,
316 data, size, &context.count); 257 data, size, &context.count);
@@ -318,7 +259,7 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
318 return error; 259 return error;
319 } 260 }
320 261
321 if (xfs_acl_vhasacl_default(inode)) { 262 if (posix_acl_default_exists(inode)) {
322 error = list_one_attr(POSIX_ACL_XATTR_DEFAULT, 263 error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
323 strlen(POSIX_ACL_XATTR_DEFAULT) + 1, 264 strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
324 data, size, &context.count); 265 data, size, &context.count);
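
With CONFIG_XFS_POSIX_ACL now gating the system-namespace handler, the xfs_xattr_handlers[] table gains a conditional slot. A minimal userspace sketch of the same shape, a NULL-terminated table of prefix handlers with one entry compiled in conditionally; WANT_POSIX_ACL stands in for the kernel config symbol and every name here is invented for illustration:

	#include <stdio.h>
	#include <string.h>

	struct xattr_handler {
		const char *prefix;
		int (*get)(const char *name, void *buf, size_t size);
	};

	static int user_get(const char *name, void *buf, size_t size)
	{
		(void)buf; (void)size;
		printf("user.%s\n", name);
		return 0;
	}

	#ifdef WANT_POSIX_ACL		/* stand-in for CONFIG_XFS_POSIX_ACL */
	static int system_get(const char *name, void *buf, size_t size)
	{
		(void)buf; (void)size;
		printf("system.%s (ACL)\n", name);
		return 0;
	}
	static const struct xattr_handler system_handler = { "system.", system_get };
	#endif

	static const struct xattr_handler user_handler = { "user.", user_get };

	/* NULL-terminated, like xfs_xattr_handlers[] above */
	static const struct xattr_handler *handlers[] = {
		&user_handler,
	#ifdef WANT_POSIX_ACL
		&system_handler,
	#endif
		NULL
	};

	int main(void)
	{
		const char *name = "user.comment";

		for (const struct xattr_handler **h = handlers; *h; h++) {
			size_t n = strlen((*h)->prefix);
			if (!strncmp(name, (*h)->prefix, n))
				return (*h)->get(name + n, NULL, 0);
		}
		return -1;	/* no handler matched; -EOPNOTSUPP in the kernel */
	}
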
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index e4babcc63423..2f3f2229eaaf 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -42,7 +42,6 @@
42#include "xfs_error.h" 42#include "xfs_error.h"
43#include "xfs_itable.h" 43#include "xfs_itable.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_acl.h"
46#include "xfs_attr.h" 45#include "xfs_attr.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
48#include "xfs_trans_space.h" 47#include "xfs_trans_space.h"
@@ -1194,7 +1193,9 @@ void
1194xfs_qm_dqrele( 1193xfs_qm_dqrele(
1195 xfs_dquot_t *dqp) 1194 xfs_dquot_t *dqp)
1196{ 1195{
1197 ASSERT(dqp); 1196 if (!dqp)
1197 return;
1198
1198 xfs_dqtrace_entry(dqp, "DQRELE"); 1199 xfs_dqtrace_entry(dqp, "DQRELE");
1199 1200
1200 xfs_dqlock(dqp); 1201 xfs_dqlock(dqp);
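
The ASSERT(dqp) in xfs_qm_dqrele() becomes an early return, so a NULL dquot is now a harmless no-op in the kfree() style; the NULL-filtering wrapper xfs_qm_dqrele_null, deleted later in this patch, is thereby made redundant. A tiny sketch of the pattern, names invented:

	#include <stdlib.h>

	struct dquot { int refcount; };

	/* Release a reference; silently accept NULL so callers need no guard. */
	static void dq_release(struct dquot *dq)
	{
		if (!dq)
			return;
		if (--dq->refcount == 0)
			free(dq);
	}

	int main(void)
	{
		dq_release(NULL);	/* no-op, no crash */
		return 0;
	}
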
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index de0f402ddb4c..6533ead9b889 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -181,7 +181,6 @@ extern void xfs_qm_adjust_dqlimits(xfs_mount_t *,
181extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, 181extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
182 xfs_dqid_t, uint, uint, xfs_dquot_t **); 182 xfs_dqid_t, uint, uint, xfs_dquot_t **);
183extern void xfs_qm_dqput(xfs_dquot_t *); 183extern void xfs_qm_dqput(xfs_dquot_t *);
184extern void xfs_qm_dqrele(xfs_dquot_t *);
185extern void xfs_dqlock(xfs_dquot_t *); 184extern void xfs_dqlock(xfs_dquot_t *);
186extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *); 185extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *);
187extern void xfs_dqunlock(xfs_dquot_t *); 186extern void xfs_dqunlock(xfs_dquot_t *);
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 1728f6a7c4f5..d0d4a9a0bbd7 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -42,7 +42,6 @@
42#include "xfs_error.h" 42#include "xfs_error.h"
43#include "xfs_itable.h" 43#include "xfs_itable.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_acl.h"
46#include "xfs_attr.h" 45#include "xfs_attr.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
48#include "xfs_trans_priv.h" 47#include "xfs_trans_priv.h"
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 5b6695049e00..45b1bfef7388 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -42,7 +42,6 @@
42#include "xfs_error.h" 42#include "xfs_error.h"
43#include "xfs_bmap.h" 43#include "xfs_bmap.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_acl.h"
46#include "xfs_attr.h" 45#include "xfs_attr.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
48#include "xfs_trans_space.h" 47#include "xfs_trans_space.h"
@@ -287,11 +286,13 @@ xfs_qm_rele_quotafs_ref(
287 * Just destroy the quotainfo structure. 286 * Just destroy the quotainfo structure.
288 */ 287 */
289void 288void
290xfs_qm_unmount_quotadestroy( 289xfs_qm_unmount(
291 xfs_mount_t *mp) 290 struct xfs_mount *mp)
292{ 291{
293 if (mp->m_quotainfo) 292 if (mp->m_quotainfo) {
293 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
294 xfs_qm_destroy_quotainfo(mp); 294 xfs_qm_destroy_quotainfo(mp);
295 }
295} 296}
296 297
297 298
@@ -385,8 +386,13 @@ xfs_qm_mount_quotas(
385 if (error) { 386 if (error) {
386 xfs_fs_cmn_err(CE_WARN, mp, 387 xfs_fs_cmn_err(CE_WARN, mp,
387 "Failed to initialize disk quotas."); 388 "Failed to initialize disk quotas.");
389 return;
388 } 390 }
389 return; 391
392#ifdef QUOTADEBUG
393 if (XFS_IS_QUOTA_ON(mp))
394 xfs_qm_internalqcheck(mp);
395#endif
390} 396}
391 397
392/* 398/*
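
xfs_qm_mount_quotas() now bails out of the failure branch directly, so the QUOTADEBUG self-check added at the end runs only on success. The early-return shape, reduced to a hedged standalone sketch with invented names:

	#include <stdio.h>

	static int init_disk_quotas(void) { return 0; /* 0 = success */ }
	static void debug_selfcheck(void) { puts("quota self-check"); }

	static void mount_quotas(void)
	{
		if (init_disk_quotas() != 0) {
			fprintf(stderr, "Failed to initialize disk quotas.\n");
			return;	/* early exit, no fallthrough past the error */
		}
	#ifdef QUOTADEBUG
		debug_selfcheck();	/* success path only */
	#endif
	}

	int main(void)
	{
		mount_quotas();
		return 0;
	}
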
@@ -774,12 +780,11 @@ xfs_qm_dqattach_grouphint(
774 * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON 780 * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
775 * into account. 781 * into account.
776 * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. 782 * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
777 * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
778 * Inode may get unlocked and relocked in here, and the caller must deal with 783 * Inode may get unlocked and relocked in here, and the caller must deal with
779 * the consequences. 784 * the consequences.
780 */ 785 */
781int 786int
782xfs_qm_dqattach( 787xfs_qm_dqattach_locked(
783 xfs_inode_t *ip, 788 xfs_inode_t *ip,
784 uint flags) 789 uint flags)
785{ 790{
@@ -787,17 +792,14 @@ xfs_qm_dqattach(
787 uint nquotas = 0; 792 uint nquotas = 0;
788 int error = 0; 793 int error = 0;
789 794
790 if ((! XFS_IS_QUOTA_ON(mp)) || 795 if (!XFS_IS_QUOTA_RUNNING(mp) ||
791 (! XFS_NOT_DQATTACHED(mp, ip)) || 796 !XFS_IS_QUOTA_ON(mp) ||
792 (ip->i_ino == mp->m_sb.sb_uquotino) || 797 !XFS_NOT_DQATTACHED(mp, ip) ||
793 (ip->i_ino == mp->m_sb.sb_gquotino)) 798 ip->i_ino == mp->m_sb.sb_uquotino ||
799 ip->i_ino == mp->m_sb.sb_gquotino)
794 return 0; 800 return 0;
795 801
796 ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 || 802 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
797 xfs_isilocked(ip, XFS_ILOCK_EXCL));
798
799 if (! (flags & XFS_QMOPT_ILOCKED))
800 xfs_ilock(ip, XFS_ILOCK_EXCL);
801 803
802 if (XFS_IS_UQUOTA_ON(mp)) { 804 if (XFS_IS_UQUOTA_ON(mp)) {
803 error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, 805 error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
@@ -849,8 +851,7 @@ xfs_qm_dqattach(
849 xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); 851 xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
850 } 852 }
851 853
852 done: 854 done:
853
854#ifdef QUOTADEBUG 855#ifdef QUOTADEBUG
855 if (! error) { 856 if (! error) {
856 if (XFS_IS_UQUOTA_ON(mp)) 857 if (XFS_IS_UQUOTA_ON(mp))
@@ -858,15 +859,22 @@ xfs_qm_dqattach(
858 if (XFS_IS_OQUOTA_ON(mp)) 859 if (XFS_IS_OQUOTA_ON(mp))
859 ASSERT(ip->i_gdquot); 860 ASSERT(ip->i_gdquot);
860 } 861 }
862 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
861#endif 863#endif
864 return error;
865}
862 866
863 if (! (flags & XFS_QMOPT_ILOCKED)) 867int
864 xfs_iunlock(ip, XFS_ILOCK_EXCL); 868xfs_qm_dqattach(
869 struct xfs_inode *ip,
870 uint flags)
871{
872 int error;
873
874 xfs_ilock(ip, XFS_ILOCK_EXCL);
875 error = xfs_qm_dqattach_locked(ip, flags);
876 xfs_iunlock(ip, XFS_ILOCK_EXCL);
865 877
866#ifdef QUOTADEBUG
867 else
868 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
869#endif
870 return error; 878 return error;
871} 879}
872 880
@@ -896,11 +904,6 @@ xfs_qm_dqdetach(
896 } 904 }
897} 905}
898 906
899/*
900 * This is called to sync quotas. We can be told to use non-blocking
901 * semantics by either the SYNC_BDFLUSH flag or the absence of the
902 * SYNC_WAIT flag.
903 */
904int 907int
905xfs_qm_sync( 908xfs_qm_sync(
906 xfs_mount_t *mp, 909 xfs_mount_t *mp,
@@ -909,17 +912,13 @@ xfs_qm_sync(
909 int recl, restarts; 912 int recl, restarts;
910 xfs_dquot_t *dqp; 913 xfs_dquot_t *dqp;
911 uint flush_flags; 914 uint flush_flags;
912 boolean_t nowait;
913 int error; 915 int error;
914 916
915 if (! XFS_IS_QUOTA_ON(mp)) 917 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
916 return 0; 918 return 0;
917 919
920 flush_flags = (flags & SYNC_WAIT) ? XFS_QMOPT_SYNC : XFS_QMOPT_DELWRI;
918 restarts = 0; 921 restarts = 0;
919 /*
920 * We won't block unless we are asked to.
921 */
922 nowait = (boolean_t)(flags & SYNC_BDFLUSH || (flags & SYNC_WAIT) == 0);
923 922
924 again: 923 again:
925 xfs_qm_mplist_lock(mp); 924 xfs_qm_mplist_lock(mp);
@@ -939,18 +938,10 @@ xfs_qm_sync(
939 * don't 'seem' to be dirty. i.e. don't acquire dqlock. 938 * don't 'seem' to be dirty. i.e. don't acquire dqlock.
940 * This is very similar to what xfs_sync does with inodes. 939 * This is very similar to what xfs_sync does with inodes.
941 */ 940 */
942 if (flags & SYNC_BDFLUSH) { 941 if (flags & SYNC_TRYLOCK) {
943 if (! XFS_DQ_IS_DIRTY(dqp)) 942 if (!XFS_DQ_IS_DIRTY(dqp))
944 continue; 943 continue;
945 } 944 if (!xfs_qm_dqlock_nowait(dqp))
946
947 if (nowait) {
948 /*
949 * Try to acquire the dquot lock. We are NOT out of
950 * lock order, but we just don't want to wait for this
951 * lock, unless somebody wanted us to.
952 */
953 if (! xfs_qm_dqlock_nowait(dqp))
954 continue; 945 continue;
955 } else { 946 } else {
956 xfs_dqlock(dqp); 947 xfs_dqlock(dqp);
@@ -967,7 +958,7 @@ xfs_qm_sync(
967 /* XXX a sentinel would be better */ 958 /* XXX a sentinel would be better */
968 recl = XFS_QI_MPLRECLAIMS(mp); 959 recl = XFS_QI_MPLRECLAIMS(mp);
969 if (!xfs_dqflock_nowait(dqp)) { 960 if (!xfs_dqflock_nowait(dqp)) {
970 if (nowait) { 961 if (flags & SYNC_TRYLOCK) {
971 xfs_dqunlock(dqp); 962 xfs_dqunlock(dqp);
972 continue; 963 continue;
973 } 964 }
@@ -985,7 +976,6 @@ xfs_qm_sync(
985 * Let go of the mplist lock. We don't want to hold it 976 * Let go of the mplist lock. We don't want to hold it
986 * across a disk write 977 * across a disk write
987 */ 978 */
988 flush_flags = (nowait) ? XFS_QMOPT_DELWRI : XFS_QMOPT_SYNC;
989 xfs_qm_mplist_unlock(mp); 979 xfs_qm_mplist_unlock(mp);
990 xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH"); 980 xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH");
991 error = xfs_qm_dqflush(dqp, flush_flags); 981 error = xfs_qm_dqflush(dqp, flush_flags);
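
The rewritten loop keys non-blocking behaviour off a single SYNC_TRYLOCK test instead of the old SYNC_BDFLUSH/nowait pair, and hoists the flush_flags computation out of the loop: an unlocked dirty check first, then trylock-or-skip so a non-blocking sync never stalls on a busy dquot. A stripped-down sketch using pthreads in place of the dquot locks, illustrative only:

	#include <pthread.h>
	#include <stdio.h>

	#define SYNC_TRYLOCK 0x01
	#define SYNC_WAIT    0x02

	struct item {
		pthread_mutex_t lock;
		int dirty;
	};

	static void sync_items(struct item *items, int n, int flags)
	{
		for (int i = 0; i < n; i++) {
			struct item *ip = &items[i];

			if (flags & SYNC_TRYLOCK) {
				if (!ip->dirty)
					continue;	/* cheap unlocked check first */
				if (pthread_mutex_trylock(&ip->lock))
					continue;	/* busy: skip, don't block */
			} else {
				pthread_mutex_lock(&ip->lock);
			}
			if (ip->dirty) {
				printf("flushing item %d (%s)\n", i,
				       (flags & SYNC_WAIT) ? "sync" : "delwri");
				ip->dirty = 0;
			}
			pthread_mutex_unlock(&ip->lock);
		}
	}

	int main(void)
	{
		struct item it = { PTHREAD_MUTEX_INITIALIZER, 1 };
		sync_items(&it, 1, SYNC_TRYLOCK);
		return 0;
	}
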
@@ -2319,20 +2309,20 @@ xfs_qm_write_sb_changes(
2319 */ 2309 */
2320int 2310int
2321xfs_qm_vop_dqalloc( 2311xfs_qm_vop_dqalloc(
2322 xfs_mount_t *mp, 2312 struct xfs_inode *ip,
2323 xfs_inode_t *ip, 2313 uid_t uid,
2324 uid_t uid, 2314 gid_t gid,
2325 gid_t gid, 2315 prid_t prid,
2326 prid_t prid, 2316 uint flags,
2327 uint flags, 2317 struct xfs_dquot **O_udqpp,
2328 xfs_dquot_t **O_udqpp, 2318 struct xfs_dquot **O_gdqpp)
2329 xfs_dquot_t **O_gdqpp)
2330{ 2319{
2331 int error; 2320 struct xfs_mount *mp = ip->i_mount;
2332 xfs_dquot_t *uq, *gq; 2321 struct xfs_dquot *uq, *gq;
2333 uint lockflags; 2322 int error;
2323 uint lockflags;
2334 2324
2335 if (!XFS_IS_QUOTA_ON(mp)) 2325 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
2336 return 0; 2326 return 0;
2337 2327
2338 lockflags = XFS_ILOCK_EXCL; 2328 lockflags = XFS_ILOCK_EXCL;
@@ -2346,8 +2336,8 @@ xfs_qm_vop_dqalloc(
2346 * if necessary. The dquot(s) will not be locked. 2336 * if necessary. The dquot(s) will not be locked.
2347 */ 2337 */
2348 if (XFS_NOT_DQATTACHED(mp, ip)) { 2338 if (XFS_NOT_DQATTACHED(mp, ip)) {
2349 if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_DQALLOC | 2339 error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC);
2350 XFS_QMOPT_ILOCKED))) { 2340 if (error) {
2351 xfs_iunlock(ip, lockflags); 2341 xfs_iunlock(ip, lockflags);
2352 return error; 2342 return error;
2353 } 2343 }
@@ -2469,6 +2459,7 @@ xfs_qm_vop_chown(
2469 uint bfield = XFS_IS_REALTIME_INODE(ip) ? 2459 uint bfield = XFS_IS_REALTIME_INODE(ip) ?
2470 XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; 2460 XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
2471 2461
2462
2472 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2463 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2473 ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); 2464 ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
2474 2465
@@ -2508,13 +2499,13 @@ xfs_qm_vop_chown_reserve(
2508 xfs_dquot_t *gdqp, 2499 xfs_dquot_t *gdqp,
2509 uint flags) 2500 uint flags)
2510{ 2501{
2511 int error; 2502 xfs_mount_t *mp = ip->i_mount;
2512 xfs_mount_t *mp;
2513 uint delblks, blkflags, prjflags = 0; 2503 uint delblks, blkflags, prjflags = 0;
2514 xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; 2504 xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq;
2505 int error;
2506
2515 2507
2516 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2508 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2517 mp = ip->i_mount;
2518 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 2509 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
2519 2510
2520 delblks = ip->i_delayed_blks; 2511 delblks = ip->i_delayed_blks;
@@ -2582,28 +2573,23 @@ xfs_qm_vop_chown_reserve(
2582 2573
2583int 2574int
2584xfs_qm_vop_rename_dqattach( 2575xfs_qm_vop_rename_dqattach(
2585 xfs_inode_t **i_tab) 2576 struct xfs_inode **i_tab)
2586{ 2577{
2587 xfs_inode_t *ip; 2578 struct xfs_mount *mp = i_tab[0]->i_mount;
2588 int i; 2579 int i;
2589 int error;
2590 2580
2591 ip = i_tab[0]; 2581 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
2592
2593 if (! XFS_IS_QUOTA_ON(ip->i_mount))
2594 return 0; 2582 return 0;
2595 2583
2596 if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) { 2584 for (i = 0; (i < 4 && i_tab[i]); i++) {
2597 error = xfs_qm_dqattach(ip, 0); 2585 struct xfs_inode *ip = i_tab[i];
2598 if (error) 2586 int error;
2599 return error; 2587
2600 }
2601 for (i = 1; (i < 4 && i_tab[i]); i++) {
2602 /* 2588 /*
2603 * Watch out for duplicate entries in the table. 2589 * Watch out for duplicate entries in the table.
2604 */ 2590 */
2605 if ((ip = i_tab[i]) != i_tab[i-1]) { 2591 if (i == 0 || ip != i_tab[i-1]) {
2606 if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) { 2592 if (XFS_NOT_DQATTACHED(mp, ip)) {
2607 error = xfs_qm_dqattach(ip, 0); 2593 error = xfs_qm_dqattach(ip, 0);
2608 if (error) 2594 if (error)
2609 return error; 2595 return error;
@@ -2614,17 +2600,19 @@ xfs_qm_vop_rename_dqattach(
2614} 2600}
2615 2601
2616void 2602void
2617xfs_qm_vop_dqattach_and_dqmod_newinode( 2603xfs_qm_vop_create_dqattach(
2618 xfs_trans_t *tp, 2604 struct xfs_trans *tp,
2619 xfs_inode_t *ip, 2605 struct xfs_inode *ip,
2620 xfs_dquot_t *udqp, 2606 struct xfs_dquot *udqp,
2621 xfs_dquot_t *gdqp) 2607 struct xfs_dquot *gdqp)
2622{ 2608{
2623 if (!XFS_IS_QUOTA_ON(tp->t_mountp)) 2609 struct xfs_mount *mp = tp->t_mountp;
2610
2611 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
2624 return; 2612 return;
2625 2613
2626 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2614 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2627 ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); 2615 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
2628 2616
2629 if (udqp) { 2617 if (udqp) {
2630 xfs_dqlock(udqp); 2618 xfs_dqlock(udqp);
@@ -2632,7 +2620,7 @@ xfs_qm_vop_dqattach_and_dqmod_newinode(
2632 xfs_dqunlock(udqp); 2620 xfs_dqunlock(udqp);
2633 ASSERT(ip->i_udquot == NULL); 2621 ASSERT(ip->i_udquot == NULL);
2634 ip->i_udquot = udqp; 2622 ip->i_udquot = udqp;
2635 ASSERT(XFS_IS_UQUOTA_ON(tp->t_mountp)); 2623 ASSERT(XFS_IS_UQUOTA_ON(mp));
2636 ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); 2624 ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
2637 xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); 2625 xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
2638 } 2626 }
@@ -2642,8 +2630,8 @@ xfs_qm_vop_dqattach_and_dqmod_newinode(
2642 xfs_dqunlock(gdqp); 2630 xfs_dqunlock(gdqp);
2643 ASSERT(ip->i_gdquot == NULL); 2631 ASSERT(ip->i_gdquot == NULL);
2644 ip->i_gdquot = gdqp; 2632 ip->i_gdquot = gdqp;
2645 ASSERT(XFS_IS_OQUOTA_ON(tp->t_mountp)); 2633 ASSERT(XFS_IS_OQUOTA_ON(mp));
2646 ASSERT((XFS_IS_GQUOTA_ON(tp->t_mountp) ? 2634 ASSERT((XFS_IS_GQUOTA_ON(mp) ?
2647 ip->i_d.di_gid : ip->i_d.di_projid) == 2635 ip->i_d.di_gid : ip->i_d.di_projid) ==
2648 be32_to_cpu(gdqp->q_core.d_id)); 2636 be32_to_cpu(gdqp->q_core.d_id));
2649 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); 2637 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
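
The recurring refactor in this file is the split of xfs_qm_dqattach() into xfs_qm_dqattach_locked(), which asserts that the caller already holds the inode lock, plus a thin wrapper that takes and drops the lock itself; the old XFS_QMOPT_ILOCKED flag and its conditional locking disappear. A generic sketch of that split, with pthreads and a manual 'locked' flag standing in for the XFS inode lock and xfs_isilocked():

	#include <assert.h>
	#include <pthread.h>

	struct inode {
		pthread_mutex_t ilock;
		int locked;	/* stand-in for lockdep/xfs_isilocked() */
		int attached;
	};

	/* Caller must already hold ip->ilock. */
	static int dqattach_locked(struct inode *ip)
	{
		assert(ip->locked);
		ip->attached = 1;
		return 0;
	}

	/* Convenience wrapper for callers that don't hold the lock. */
	static int dqattach(struct inode *ip)
	{
		int error;

		pthread_mutex_lock(&ip->ilock);
		ip->locked = 1;
		error = dqattach_locked(ip);
		ip->locked = 0;
		pthread_mutex_unlock(&ip->ilock);
		return error;
	}

	int main(void)
	{
		struct inode ino = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
		return dqattach(&ino);
	}
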
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index a371954cae1b..495564b8af38 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -127,8 +127,6 @@ typedef struct xfs_quotainfo {
127} xfs_quotainfo_t; 127} xfs_quotainfo_t;
128 128
129 129
130extern xfs_dqtrxops_t xfs_trans_dquot_ops;
131
132extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); 130extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
133extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, 131extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
134 xfs_dquot_t *, xfs_dquot_t *, long, long, uint); 132 xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
@@ -159,17 +157,11 @@ typedef struct xfs_dquot_acct {
159#define XFS_QM_RTBWARNLIMIT 5 157#define XFS_QM_RTBWARNLIMIT 5
160 158
161extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); 159extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
162extern void xfs_qm_mount_quotas(xfs_mount_t *);
163extern int xfs_qm_quotacheck(xfs_mount_t *); 160extern int xfs_qm_quotacheck(xfs_mount_t *);
164extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *);
165extern void xfs_qm_unmount_quotas(xfs_mount_t *);
166extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); 161extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
167extern int xfs_qm_sync(xfs_mount_t *, int);
168 162
169/* dquot stuff */ 163/* dquot stuff */
170extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **); 164extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **);
171extern int xfs_qm_dqattach(xfs_inode_t *, uint);
172extern void xfs_qm_dqdetach(xfs_inode_t *);
173extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); 165extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint);
174extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); 166extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
175 167
@@ -183,19 +175,6 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
183extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); 175extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
184extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); 176extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
185 177
186/* vop stuff */
187extern int xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *,
188 uid_t, gid_t, prid_t, uint,
189 xfs_dquot_t **, xfs_dquot_t **);
190extern void xfs_qm_vop_dqattach_and_dqmod_newinode(
191 xfs_trans_t *, xfs_inode_t *,
192 xfs_dquot_t *, xfs_dquot_t *);
193extern int xfs_qm_vop_rename_dqattach(xfs_inode_t **);
194extern xfs_dquot_t * xfs_qm_vop_chown(xfs_trans_t *, xfs_inode_t *,
195 xfs_dquot_t **, xfs_dquot_t *);
196extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *,
197 xfs_dquot_t *, xfs_dquot_t *, uint);
198
199/* list stuff */ 178/* list stuff */
200extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *); 179extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
201extern void xfs_qm_freelist_unlink(xfs_dquot_t *); 180extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 63037c689a4b..a5346630dfae 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -42,7 +42,6 @@
42#include "xfs_rtalloc.h" 42#include "xfs_rtalloc.h"
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_acl.h"
46#include "xfs_attr.h" 45#include "xfs_attr.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
48#include "xfs_qm.h" 47#include "xfs_qm.h"
@@ -84,7 +83,7 @@ xfs_fill_statvfs_from_dquot(
84 * return a statvfs of the project, not the entire filesystem. 83 * return a statvfs of the project, not the entire filesystem.
85 * This makes such trees appear as if they are filesystems in themselves. 84 * This makes such trees appear as if they are filesystems in themselves.
86 */ 85 */
87STATIC void 86void
88xfs_qm_statvfs( 87xfs_qm_statvfs(
89 xfs_inode_t *ip, 88 xfs_inode_t *ip,
90 struct kstatfs *statp) 89 struct kstatfs *statp)
@@ -92,20 +91,13 @@ xfs_qm_statvfs(
92 xfs_mount_t *mp = ip->i_mount; 91 xfs_mount_t *mp = ip->i_mount;
93 xfs_dquot_t *dqp; 92 xfs_dquot_t *dqp;
94 93
95 if (!(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
96 !((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
97 (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
98 return;
99
100 if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) { 94 if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) {
101 xfs_disk_dquot_t *dp = &dqp->q_core; 95 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
102
103 xfs_fill_statvfs_from_dquot(statp, dp);
104 xfs_qm_dqput(dqp); 96 xfs_qm_dqput(dqp);
105 } 97 }
106} 98}
107 99
108STATIC int 100int
109xfs_qm_newmount( 101xfs_qm_newmount(
110 xfs_mount_t *mp, 102 xfs_mount_t *mp,
111 uint *needquotamount, 103 uint *needquotamount,
@@ -114,9 +106,6 @@ xfs_qm_newmount(
114 uint quotaondisk; 106 uint quotaondisk;
115 uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0; 107 uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0;
116 108
117 *quotaflags = 0;
118 *needquotamount = B_FALSE;
119
120 quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) && 109 quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) &&
121 (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT); 110 (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT);
122 111
@@ -179,66 +168,6 @@ xfs_qm_newmount(
179 return 0; 168 return 0;
180} 169}
181 170
182STATIC int
183xfs_qm_endmount(
184 xfs_mount_t *mp,
185 uint needquotamount,
186 uint quotaflags)
187{
188 if (needquotamount) {
189 ASSERT(mp->m_qflags == 0);
190 mp->m_qflags = quotaflags;
191 xfs_qm_mount_quotas(mp);
192 }
193
194#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
195 if (! (XFS_IS_QUOTA_ON(mp)))
196 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
197 else
198 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
199#endif
200
201#ifdef QUOTADEBUG
202 if (XFS_IS_QUOTA_ON(mp) && xfs_qm_internalqcheck(mp))
203 cmn_err(CE_WARN, "XFS: mount internalqcheck failed");
204#endif
205
206 return 0;
207}
208
209STATIC void
210xfs_qm_dqrele_null(
211 xfs_dquot_t *dq)
212{
213 /*
214 * Called from XFS, where we always check first for a NULL dquot.
215 */
216 if (!dq)
217 return;
218 xfs_qm_dqrele(dq);
219}
220
221
222struct xfs_qmops xfs_qmcore_xfs = {
223 .xfs_qminit = xfs_qm_newmount,
224 .xfs_qmdone = xfs_qm_unmount_quotadestroy,
225 .xfs_qmmount = xfs_qm_endmount,
226 .xfs_qmunmount = xfs_qm_unmount_quotas,
227 .xfs_dqrele = xfs_qm_dqrele_null,
228 .xfs_dqattach = xfs_qm_dqattach,
229 .xfs_dqdetach = xfs_qm_dqdetach,
230 .xfs_dqpurgeall = xfs_qm_dqpurge_all,
231 .xfs_dqvopalloc = xfs_qm_vop_dqalloc,
232 .xfs_dqvopcreate = xfs_qm_vop_dqattach_and_dqmod_newinode,
233 .xfs_dqvoprename = xfs_qm_vop_rename_dqattach,
234 .xfs_dqvopchown = xfs_qm_vop_chown,
235 .xfs_dqvopchownresv = xfs_qm_vop_chown_reserve,
236 .xfs_dqstatvfs = xfs_qm_statvfs,
237 .xfs_dqsync = xfs_qm_sync,
238 .xfs_dqtrxops = &xfs_trans_dquot_ops,
239};
240EXPORT_SYMBOL(xfs_qmcore_xfs);
241
242void __init 171void __init
243xfs_qm_init(void) 172xfs_qm_init(void)
244{ 173{
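
The deleted xfs_qmcore_xfs table was a vector of function pointers through which the rest of XFS reached the quota code; with that module boundary gone, callers invoke the xfs_qm_* functions directly behind a cheap is-running test. A compressed sketch of the before/after, with invented names:

	#include <stdio.h>

	static int quotas_running = 1;

	static int qm_dqattach(int ino) { printf("attach %d\n", ino); return 0; }

	/* Old style: an ops vector the core dereferenced on every call. */
	struct qmops {
		int (*dqattach)(int ino);
	};
	static const struct qmops qmcore = { .dqattach = qm_dqattach };

	static int attach_via_ops(int ino)
	{
		return qmcore.dqattach ? qmcore.dqattach(ino) : 0;
	}

	/* New style: direct call, gated by a cheap state test. */
	static int attach_direct(int ino)
	{
		if (!quotas_running)
			return 0;
		return qm_dqattach(ino);
	}

	int main(void)
	{
		attach_via_ops(1);
		return attach_direct(2);
	}
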
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 709f5f545cf5..21b08c0396a1 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -42,7 +42,6 @@
42#include "xfs_rtalloc.h" 42#include "xfs_rtalloc.h"
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_acl.h"
46#include "xfs_attr.h" 45#include "xfs_attr.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
48#include "xfs_qm.h" 47#include "xfs_qm.h"
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index c7b66f6506ce..4e4276b956e8 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -45,7 +45,6 @@
45#include "xfs_rtalloc.h" 45#include "xfs_rtalloc.h"
46#include "xfs_error.h" 46#include "xfs_error.h"
47#include "xfs_rw.h" 47#include "xfs_rw.h"
48#include "xfs_acl.h"
49#include "xfs_attr.h" 48#include "xfs_attr.h"
50#include "xfs_buf_item.h" 49#include "xfs_buf_item.h"
51#include "xfs_utils.h" 50#include "xfs_utils.h"
@@ -847,105 +846,55 @@ xfs_qm_export_flags(
847} 846}
848 847
849 848
850/* 849STATIC int
851 * Release all the dquots on the inodes in an AG. 850xfs_dqrele_inode(
852 */ 851 struct xfs_inode *ip,
853STATIC void 852 struct xfs_perag *pag,
854xfs_qm_dqrele_inodes_ag( 853 int flags)
855 xfs_mount_t *mp,
856 int ag,
857 uint flags)
858{ 854{
859 xfs_inode_t *ip = NULL; 855 int error;
860 xfs_perag_t *pag = &mp->m_perag[ag];
861 int first_index = 0;
862 int nr_found;
863
864 do {
865 /*
866 * use a gang lookup to find the next inode in the tree
867 * as the tree is sparse and a gang lookup walks to find
868 * the number of objects requested.
869 */
870 read_lock(&pag->pag_ici_lock);
871 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
872 (void**)&ip, first_index, 1);
873
874 if (!nr_found) {
875 read_unlock(&pag->pag_ici_lock);
876 break;
877 }
878
879 /*
880 * Update the index for the next lookup. Catch overflows
881 * into the next AG range which can occur if we have inodes
882 * in the last block of the AG and we are currently
883 * pointing to the last inode.
884 */
885 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
886 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
887 read_unlock(&pag->pag_ici_lock);
888 break;
889 }
890
891 /* skip quota inodes */
892 if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
893 ASSERT(ip->i_udquot == NULL);
894 ASSERT(ip->i_gdquot == NULL);
895 read_unlock(&pag->pag_ici_lock);
896 continue;
897 }
898 856
899 /* 857 /* skip quota inodes */
900 * If we can't get a reference on the inode, it must be 858 if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) {
901 * in reclaim. Leave it for the reclaim code to flush. 859 ASSERT(ip->i_udquot == NULL);
902 */ 860 ASSERT(ip->i_gdquot == NULL);
903 if (!igrab(VFS_I(ip))) {
904 read_unlock(&pag->pag_ici_lock);
905 continue;
906 }
907 read_unlock(&pag->pag_ici_lock); 861 read_unlock(&pag->pag_ici_lock);
862 return 0;
863 }
908 864
909 /* avoid new inodes though we shouldn't find any here */ 865 error = xfs_sync_inode_valid(ip, pag);
910 if (xfs_iflags_test(ip, XFS_INEW)) { 866 if (error)
911 IRELE(ip); 867 return error;
912 continue;
913 }
914 868
915 xfs_ilock(ip, XFS_ILOCK_EXCL); 869 xfs_ilock(ip, XFS_ILOCK_EXCL);
916 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { 870 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
917 xfs_qm_dqrele(ip->i_udquot); 871 xfs_qm_dqrele(ip->i_udquot);
918 ip->i_udquot = NULL; 872 ip->i_udquot = NULL;
919 } 873 }
920 if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && 874 if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) {
921 ip->i_gdquot) { 875 xfs_qm_dqrele(ip->i_gdquot);
922 xfs_qm_dqrele(ip->i_gdquot); 876 ip->i_gdquot = NULL;
923 ip->i_gdquot = NULL; 877 }
924 } 878 xfs_iput(ip, XFS_ILOCK_EXCL);
925 xfs_iput(ip, XFS_ILOCK_EXCL); 879 IRELE(ip);
926 880
927 } while (nr_found); 881 return 0;
928} 882}
929 883
884
930/* 885/*
931 * Go thru all the inodes in the file system, releasing their dquots. 886 * Go thru all the inodes in the file system, releasing their dquots.
887 *
932 * Note that the mount structure gets modified to indicate that quotas are off 888 * Note that the mount structure gets modified to indicate that quotas are off
933 * AFTER this, in the case of quotaoff. This also gets called from 889 * AFTER this, in the case of quotaoff.
934 * xfs_rootumount.
935 */ 890 */
936void 891void
937xfs_qm_dqrele_all_inodes( 892xfs_qm_dqrele_all_inodes(
938 struct xfs_mount *mp, 893 struct xfs_mount *mp,
939 uint flags) 894 uint flags)
940{ 895{
941 int i;
942
943 ASSERT(mp->m_quotainfo); 896 ASSERT(mp->m_quotainfo);
944 for (i = 0; i < mp->m_sb.sb_agcount; i++) { 897 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
945 if (!mp->m_perag[i].pag_ici_init)
946 continue;
947 xfs_qm_dqrele_inodes_ag(mp, i, flags);
948 }
949} 898}
950 899
951/*------------------------------------------------------------------------*/ 900/*------------------------------------------------------------------------*/
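
The open-coded per-AG radix-tree walk in xfs_qm_dqrele_inodes_ag() collapses into a per-inode callback, xfs_dqrele_inode(), handed to the new generic xfs_inode_ag_iterator() declared in xfs_sync.h earlier in this patch. A stripped-down sketch of the callback-iterator shape, with a plain array standing in for the per-AG inode trees:

	#include <stdio.h>

	struct inode { int ino; int udquot; };

	/* Generic walk: apply 'execute' to every inode, stop on first error. */
	static int inode_iterator(struct inode *inodes, int n,
				  int (*execute)(struct inode *ip, int flags),
				  int flags)
	{
		for (int i = 0; i < n; i++) {
			int error = execute(&inodes[i], flags);
			if (error)
				return error;
		}
		return 0;
	}

	/* Per-inode work, analogous to xfs_dqrele_inode(). */
	static int dqrele_inode(struct inode *ip, int flags)
	{
		(void)flags;
		if (ip->udquot) {
			printf("releasing dquot of inode %d\n", ip->ino);
			ip->udquot = 0;
		}
		return 0;
	}

	int main(void)
	{
		struct inode inodes[] = { { 1, 1 }, { 2, 0 }, { 3, 1 } };
		return inode_iterator(inodes, 3, dqrele_inode, 0);
	}
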
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 447173bcf96d..97ac9640be98 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -42,7 +42,6 @@
42#include "xfs_rtalloc.h" 42#include "xfs_rtalloc.h"
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_acl.h"
46#include "xfs_attr.h" 45#include "xfs_attr.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
48#include "xfs_trans_priv.h" 47#include "xfs_trans_priv.h"
@@ -111,7 +110,7 @@ xfs_trans_log_dquot(
111 * Carry forward whatever is left of the quota blk reservation to 110 * Carry forward whatever is left of the quota blk reservation to
112 * the spanky new transaction 111 * the spanky new transaction
113 */ 112 */
114STATIC void 113void
115xfs_trans_dup_dqinfo( 114xfs_trans_dup_dqinfo(
116 xfs_trans_t *otp, 115 xfs_trans_t *otp,
117 xfs_trans_t *ntp) 116 xfs_trans_t *ntp)
@@ -167,19 +166,17 @@ xfs_trans_dup_dqinfo(
167/* 166/*
168 * Wrap around mod_dquot to account for both user and group quotas. 167 * Wrap around mod_dquot to account for both user and group quotas.
169 */ 168 */
170STATIC void 169void
171xfs_trans_mod_dquot_byino( 170xfs_trans_mod_dquot_byino(
172 xfs_trans_t *tp, 171 xfs_trans_t *tp,
173 xfs_inode_t *ip, 172 xfs_inode_t *ip,
174 uint field, 173 uint field,
175 long delta) 174 long delta)
176{ 175{
177 xfs_mount_t *mp; 176 xfs_mount_t *mp = tp->t_mountp;
178
179 ASSERT(tp);
180 mp = tp->t_mountp;
181 177
182 if (!XFS_IS_QUOTA_ON(mp) || 178 if (!XFS_IS_QUOTA_RUNNING(mp) ||
179 !XFS_IS_QUOTA_ON(mp) ||
183 ip->i_ino == mp->m_sb.sb_uquotino || 180 ip->i_ino == mp->m_sb.sb_uquotino ||
184 ip->i_ino == mp->m_sb.sb_gquotino) 181 ip->i_ino == mp->m_sb.sb_gquotino)
185 return; 182 return;
@@ -229,6 +226,7 @@ xfs_trans_mod_dquot(
229 xfs_dqtrx_t *qtrx; 226 xfs_dqtrx_t *qtrx;
230 227
231 ASSERT(tp); 228 ASSERT(tp);
229 ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
232 qtrx = NULL; 230 qtrx = NULL;
233 231
234 if (tp->t_dqinfo == NULL) 232 if (tp->t_dqinfo == NULL)
@@ -346,7 +344,7 @@ xfs_trans_dqlockedjoin(
346 * Unreserve just the reservations done by this transaction. 344 * Unreserve just the reservations done by this transaction.
347 * dquot is still left locked at exit. 345 * dquot is still left locked at exit.
348 */ 346 */
349STATIC void 347void
350xfs_trans_apply_dquot_deltas( 348xfs_trans_apply_dquot_deltas(
351 xfs_trans_t *tp) 349 xfs_trans_t *tp)
352{ 350{
@@ -357,7 +355,7 @@ xfs_trans_apply_dquot_deltas(
357 long totalbdelta; 355 long totalbdelta;
358 long totalrtbdelta; 356 long totalrtbdelta;
359 357
360 if (! (tp->t_flags & XFS_TRANS_DQ_DIRTY)) 358 if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY))
361 return; 359 return;
362 360
363 ASSERT(tp->t_dqinfo); 361 ASSERT(tp->t_dqinfo);
@@ -531,7 +529,7 @@ xfs_trans_apply_dquot_deltas(
531 * we simply throw those away, since that's the expected behavior 529 * we simply throw those away, since that's the expected behavior
532 * when a transaction is curtailed without a commit. 530 * when a transaction is curtailed without a commit.
533 */ 531 */
534STATIC void 532void
535xfs_trans_unreserve_and_mod_dquots( 533xfs_trans_unreserve_and_mod_dquots(
536 xfs_trans_t *tp) 534 xfs_trans_t *tp)
537{ 535{
@@ -768,7 +766,7 @@ xfs_trans_reserve_quota_bydquots(
768{ 766{
769 int resvd = 0, error; 767 int resvd = 0, error;
770 768
771 if (!XFS_IS_QUOTA_ON(mp)) 769 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
772 return 0; 770 return 0;
773 771
774 if (tp && tp->t_dqinfo == NULL) 772 if (tp && tp->t_dqinfo == NULL)
@@ -811,18 +809,17 @@ xfs_trans_reserve_quota_bydquots(
811 * This doesn't change the actual usage, just the reservation. 809 * This doesn't change the actual usage, just the reservation.
812 * The inode sent in is locked. 810 * The inode sent in is locked.
813 */ 811 */
814STATIC int 812int
815xfs_trans_reserve_quota_nblks( 813xfs_trans_reserve_quota_nblks(
816 xfs_trans_t *tp, 814 struct xfs_trans *tp,
817 xfs_mount_t *mp, 815 struct xfs_inode *ip,
818 xfs_inode_t *ip, 816 long nblks,
819 long nblks, 817 long ninos,
820 long ninos, 818 uint flags)
821 uint flags)
822{ 819{
823 int error; 820 struct xfs_mount *mp = ip->i_mount;
824 821
825 if (!XFS_IS_QUOTA_ON(mp)) 822 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
826 return 0; 823 return 0;
827 if (XFS_IS_PQUOTA_ON(mp)) 824 if (XFS_IS_PQUOTA_ON(mp))
828 flags |= XFS_QMOPT_ENOSPC; 825 flags |= XFS_QMOPT_ENOSPC;
@@ -831,7 +828,6 @@ xfs_trans_reserve_quota_nblks(
831 ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); 828 ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
832 829
833 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 830 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
834 ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
835 ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == 831 ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
836 XFS_TRANS_DQ_RES_RTBLKS || 832 XFS_TRANS_DQ_RES_RTBLKS ||
837 (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == 833 (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
@@ -840,11 +836,9 @@ xfs_trans_reserve_quota_nblks(
840 /* 836 /*
841 * Reserve nblks against these dquots, with trans as the mediator. 837 * Reserve nblks against these dquots, with trans as the mediator.
842 */ 838 */
843 error = xfs_trans_reserve_quota_bydquots(tp, mp, 839 return xfs_trans_reserve_quota_bydquots(tp, mp,
844 ip->i_udquot, ip->i_gdquot, 840 ip->i_udquot, ip->i_gdquot,
845 nblks, ninos, 841 nblks, ninos, flags);
846 flags);
847 return error;
848} 842}
849 843
850/* 844/*
@@ -895,25 +889,15 @@ STATIC void
895xfs_trans_alloc_dqinfo( 889xfs_trans_alloc_dqinfo(
896 xfs_trans_t *tp) 890 xfs_trans_t *tp)
897{ 891{
898 (tp)->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP); 892 tp->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP);
899} 893}
900 894
901STATIC void 895void
902xfs_trans_free_dqinfo( 896xfs_trans_free_dqinfo(
903 xfs_trans_t *tp) 897 xfs_trans_t *tp)
904{ 898{
905 if (!tp->t_dqinfo) 899 if (!tp->t_dqinfo)
906 return; 900 return;
907 kmem_zone_free(xfs_Gqm->qm_dqtrxzone, (tp)->t_dqinfo); 901 kmem_zone_free(xfs_Gqm->qm_dqtrxzone, tp->t_dqinfo);
908 (tp)->t_dqinfo = NULL; 902 tp->t_dqinfo = NULL;
909} 903}
910
911xfs_dqtrxops_t xfs_trans_dquot_ops = {
912 .qo_dup_dqinfo = xfs_trans_dup_dqinfo,
913 .qo_free_dqinfo = xfs_trans_free_dqinfo,
914 .qo_mod_dquot_byino = xfs_trans_mod_dquot_byino,
915 .qo_apply_dquot_deltas = xfs_trans_apply_dquot_deltas,
916 .qo_reserve_quota_nblks = xfs_trans_reserve_quota_nblks,
917 .qo_reserve_quota_bydquots = xfs_trans_reserve_quota_bydquots,
918 .qo_unreserve_and_mod_dquots = xfs_trans_unreserve_and_mod_dquots,
919};
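
The guard pattern repeated across these quota hunks is two-staged: a cheap XFS_IS_QUOTA_RUNNING() mount-flag test short-circuits before the finer-grained XFS_IS_QUOTA_ON() accounting checks, so paths on quota-less mounts pay almost nothing. A small standalone sketch of the idea, flag names abbreviated and invented:

	#include <stdio.h>

	#define QUOTA_RUNNING 0x1	/* subsystem initialised on this mount */
	#define QUOTA_UQ_ON   0x2	/* user-quota accounting enabled */

	struct mount { unsigned qflags; };

	static int reserve_quota(struct mount *mp, long nblks)
	{
		/* Cheapest test first; short-circuits everything when
		 * quotas were never set up on this mount. */
		if (!(mp->qflags & QUOTA_RUNNING) || !(mp->qflags & QUOTA_UQ_ON))
			return 0;

		printf("reserving %ld blocks against the user dquot\n", nblks);
		return 0;
	}

	int main(void)
	{
		struct mount mp = { QUOTA_RUNNING | QUOTA_UQ_ON };
		return reserve_quota(&mp, 8);
	}
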
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
deleted file mode 100644
index a8cdd73999a4..000000000000
--- a/fs/xfs/xfs_acl.c
+++ /dev/null
@@ -1,874 +0,0 @@
1/*
2 * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_inum.h"
23#include "xfs_ag.h"
24#include "xfs_dir2.h"
25#include "xfs_bmap_btree.h"
26#include "xfs_alloc_btree.h"
27#include "xfs_ialloc_btree.h"
28#include "xfs_dir2_sf.h"
29#include "xfs_attr_sf.h"
30#include "xfs_dinode.h"
31#include "xfs_inode.h"
32#include "xfs_btree.h"
33#include "xfs_acl.h"
34#include "xfs_attr.h"
35#include "xfs_vnodeops.h"
36
37#include <linux/capability.h>
38#include <linux/posix_acl_xattr.h>
39
40STATIC int xfs_acl_setmode(struct inode *, xfs_acl_t *, int *);
41STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *);
42STATIC void xfs_acl_get_endian(xfs_acl_t *);
43STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
44STATIC int xfs_acl_invalid(xfs_acl_t *);
45STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *);
46STATIC void xfs_acl_get_attr(struct inode *, xfs_acl_t *, int, int, int *);
47STATIC void xfs_acl_set_attr(struct inode *, xfs_acl_t *, int, int *);
48STATIC int xfs_acl_allow_set(struct inode *, int);
49
50kmem_zone_t *xfs_acl_zone;
51
52
53/*
54 * Test for existence of access ACL attribute as efficiently as possible.
55 */
56int
57xfs_acl_vhasacl_access(
58 struct inode *vp)
59{
60 int error;
61
62 xfs_acl_get_attr(vp, NULL, _ACL_TYPE_ACCESS, ATTR_KERNOVAL, &error);
63 return (error == 0);
64}
65
66/*
67 * Test for existence of default ACL attribute as efficiently as possible.
68 */
69int
70xfs_acl_vhasacl_default(
71 struct inode *vp)
72{
73 int error;
74
75 if (!S_ISDIR(vp->i_mode))
76 return 0;
77 xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error);
78 return (error == 0);
79}
80
81/*
82 * Convert from extended attribute representation to in-memory for XFS.
83 */
84STATIC int
85posix_acl_xattr_to_xfs(
86 posix_acl_xattr_header *src,
87 size_t size,
88 xfs_acl_t *dest)
89{
90 posix_acl_xattr_entry *src_entry;
91 xfs_acl_entry_t *dest_entry;
92 int n;
93
94 if (!src || !dest)
95 return EINVAL;
96
97 if (size < sizeof(posix_acl_xattr_header))
98 return EINVAL;
99
100 if (src->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
101 return EOPNOTSUPP;
102
103 memset(dest, 0, sizeof(xfs_acl_t));
104 dest->acl_cnt = posix_acl_xattr_count(size);
105 if (dest->acl_cnt < 0 || dest->acl_cnt > XFS_ACL_MAX_ENTRIES)
106 return EINVAL;
107
108 /*
109 * acl_set_file(3) may request that we set default ACLs with
110 * zero length -- defend (gracefully) against that here.
111 */
112 if (!dest->acl_cnt)
113 return 0;
114
115 src_entry = (posix_acl_xattr_entry *)((char *)src + sizeof(*src));
116 dest_entry = &dest->acl_entry[0];
117
118 for (n = 0; n < dest->acl_cnt; n++, src_entry++, dest_entry++) {
119 dest_entry->ae_perm = le16_to_cpu(src_entry->e_perm);
120 if (_ACL_PERM_INVALID(dest_entry->ae_perm))
121 return EINVAL;
122 dest_entry->ae_tag = le16_to_cpu(src_entry->e_tag);
123 switch(dest_entry->ae_tag) {
124 case ACL_USER:
125 case ACL_GROUP:
126 dest_entry->ae_id = le32_to_cpu(src_entry->e_id);
127 break;
128 case ACL_USER_OBJ:
129 case ACL_GROUP_OBJ:
130 case ACL_MASK:
131 case ACL_OTHER:
132 dest_entry->ae_id = ACL_UNDEFINED_ID;
133 break;
134 default:
135 return EINVAL;
136 }
137 }
138 if (xfs_acl_invalid(dest))
139 return EINVAL;
140
141 return 0;
142}
143
144/*
145 * Comparison function called from xfs_sort().
146 * Primary key is ae_tag, secondary key is ae_id.
147 */
148STATIC int
149xfs_acl_entry_compare(
150 const void *va,
151 const void *vb)
152{
153 xfs_acl_entry_t *a = (xfs_acl_entry_t *)va,
154 *b = (xfs_acl_entry_t *)vb;
155
156 if (a->ae_tag == b->ae_tag)
157 return (a->ae_id - b->ae_id);
158 return (a->ae_tag - b->ae_tag);
159}
160
161/*
162 * Convert from in-memory XFS to extended attribute representation.
163 */
164STATIC int
165posix_acl_xfs_to_xattr(
166 xfs_acl_t *src,
167 posix_acl_xattr_header *dest,
168 size_t size)
169{
170 int n;
171 size_t new_size = posix_acl_xattr_size(src->acl_cnt);
172 posix_acl_xattr_entry *dest_entry;
173 xfs_acl_entry_t *src_entry;
174
175 if (size < new_size)
176 return -ERANGE;
177
178 /* Need to sort src XFS ACL by <ae_tag,ae_id> */
179 xfs_sort(src->acl_entry, src->acl_cnt, sizeof(src->acl_entry[0]),
180 xfs_acl_entry_compare);
181
182 dest->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
183 dest_entry = &dest->a_entries[0];
184 src_entry = &src->acl_entry[0];
185 for (n = 0; n < src->acl_cnt; n++, dest_entry++, src_entry++) {
186 dest_entry->e_perm = cpu_to_le16(src_entry->ae_perm);
187 if (_ACL_PERM_INVALID(src_entry->ae_perm))
188 return -EINVAL;
189 dest_entry->e_tag = cpu_to_le16(src_entry->ae_tag);
190 switch (src_entry->ae_tag) {
191 case ACL_USER:
192 case ACL_GROUP:
193 dest_entry->e_id = cpu_to_le32(src_entry->ae_id);
194 break;
195 case ACL_USER_OBJ:
196 case ACL_GROUP_OBJ:
197 case ACL_MASK:
198 case ACL_OTHER:
199 dest_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
200 break;
201 default:
202 return -EINVAL;
203 }
204 }
205 return new_size;
206}
207
208int
209xfs_acl_vget(
210 struct inode *vp,
211 void *acl,
212 size_t size,
213 int kind)
214{
215 int error;
216 xfs_acl_t *xfs_acl = NULL;
217 posix_acl_xattr_header *ext_acl = acl;
218 int flags = 0;
219
220 if(size) {
221 if (!(_ACL_ALLOC(xfs_acl))) {
222 error = ENOMEM;
223 goto out;
224 }
225 memset(xfs_acl, 0, sizeof(xfs_acl_t));
226 } else
227 flags = ATTR_KERNOVAL;
228
229 xfs_acl_get_attr(vp, xfs_acl, kind, flags, &error);
230 if (error)
231 goto out;
232
233 if (!size) {
234 error = -posix_acl_xattr_size(XFS_ACL_MAX_ENTRIES);
235 } else {
236 if (xfs_acl_invalid(xfs_acl)) {
237 error = EINVAL;
238 goto out;
239 }
240 if (kind == _ACL_TYPE_ACCESS)
241 xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, xfs_acl);
242 error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size);
243 }
244out:
245 if(xfs_acl)
246 _ACL_FREE(xfs_acl);
247 return -error;
248}
249
250int
251xfs_acl_vremove(
252 struct inode *vp,
253 int kind)
254{
255 int error;
256
257 error = xfs_acl_allow_set(vp, kind);
258 if (!error) {
259 error = xfs_attr_remove(XFS_I(vp),
260 kind == _ACL_TYPE_DEFAULT?
261 SGI_ACL_DEFAULT: SGI_ACL_FILE,
262 ATTR_ROOT);
263 if (error == ENOATTR)
264 error = 0; /* 'scool */
265 }
266 return -error;
267}
268
269int
270xfs_acl_vset(
271 struct inode *vp,
272 void *acl,
273 size_t size,
274 int kind)
275{
276 posix_acl_xattr_header *ext_acl = acl;
277 xfs_acl_t *xfs_acl;
278 int error;
279 int basicperms = 0; /* more than std unix perms? */
280
281 if (!acl)
282 return -EINVAL;
283
284 if (!(_ACL_ALLOC(xfs_acl)))
285 return -ENOMEM;
286
287 error = posix_acl_xattr_to_xfs(ext_acl, size, xfs_acl);
288 if (error) {
289 _ACL_FREE(xfs_acl);
290 return -error;
291 }
292 if (!xfs_acl->acl_cnt) {
293 _ACL_FREE(xfs_acl);
294 return 0;
295 }
296
297 error = xfs_acl_allow_set(vp, kind);
298
299 /* Incoming ACL exists, set file mode based on its value */
300 if (!error && kind == _ACL_TYPE_ACCESS)
301 error = xfs_acl_setmode(vp, xfs_acl, &basicperms);
302
303 if (error)
304 goto out;
305
306 /*
307 * If we have more than std unix permissions, set up the actual attr.
308 * Otherwise, delete any existing attr. This prevents us from
309 * having actual attrs for permissions that can be stored in the
310 * standard permission bits.
311 */
312 if (!basicperms) {
313 xfs_acl_set_attr(vp, xfs_acl, kind, &error);
314 } else {
315 error = -xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
316 }
317
318out:
319 _ACL_FREE(xfs_acl);
320 return -error;
321}
322
323int
324xfs_acl_iaccess(
325 xfs_inode_t *ip,
326 mode_t mode,
327 cred_t *cr)
328{
329 xfs_acl_t *acl;
330 int rval;
331 struct xfs_name acl_name = {SGI_ACL_FILE, SGI_ACL_FILE_SIZE};
332
333 if (!(_ACL_ALLOC(acl)))
334 return -1;
335
336 /* If the file has no ACL return -1. */
337 rval = sizeof(xfs_acl_t);
338 if (xfs_attr_fetch(ip, &acl_name, (char *)acl, &rval, ATTR_ROOT)) {
339 _ACL_FREE(acl);
340 return -1;
341 }
342 xfs_acl_get_endian(acl);
343
344 /* If the file has an empty ACL return -1. */
345 if (acl->acl_cnt == XFS_ACL_NOT_PRESENT) {
346 _ACL_FREE(acl);
347 return -1;
348 }
349
350 /* Synchronize ACL with mode bits */
351 xfs_acl_sync_mode(ip->i_d.di_mode, acl);
352
353 rval = xfs_acl_access(ip->i_d.di_uid, ip->i_d.di_gid, acl, mode, cr);
354 _ACL_FREE(acl);
355 return rval;
356}
357
358STATIC int
359xfs_acl_allow_set(
360 struct inode *vp,
361 int kind)
362{
363 if (vp->i_flags & (S_IMMUTABLE|S_APPEND))
364 return EPERM;
365 if (kind == _ACL_TYPE_DEFAULT && !S_ISDIR(vp->i_mode))
366 return ENOTDIR;
367 if (vp->i_sb->s_flags & MS_RDONLY)
368 return EROFS;
369 if (XFS_I(vp)->i_d.di_uid != current_fsuid() && !capable(CAP_FOWNER))
370 return EPERM;
371 return 0;
372}
373
374/*
375 * Note: cr is only used here for the capability check if the ACL test fails.
 376 * It is not used to find out the credential's uid or groups etc., as was
377 * done in IRIX. It is assumed that the uid and groups for the current
378 * thread are taken from "current" instead of the cr parameter.
379 */
380STATIC int
381xfs_acl_access(
382 uid_t fuid,
383 gid_t fgid,
384 xfs_acl_t *fap,
385 mode_t md,
386 cred_t *cr)
387{
388 xfs_acl_entry_t matched;
389 int i, allows;
390 int maskallows = -1; /* true, but not 1, either */
391 int seen_userobj = 0;
392
393 matched.ae_tag = 0; /* Invalid type */
394 matched.ae_perm = 0;
395
396 for (i = 0; i < fap->acl_cnt; i++) {
397 /*
398 * Break out if we've got a user_obj entry or
399 * a user entry and the mask (and have processed USER_OBJ)
400 */
401 if (matched.ae_tag == ACL_USER_OBJ)
402 break;
403 if (matched.ae_tag == ACL_USER) {
404 if (maskallows != -1 && seen_userobj)
405 break;
406 if (fap->acl_entry[i].ae_tag != ACL_MASK &&
407 fap->acl_entry[i].ae_tag != ACL_USER_OBJ)
408 continue;
409 }
410 /* True if this entry allows the requested access */
411 allows = ((fap->acl_entry[i].ae_perm & md) == md);
412
413 switch (fap->acl_entry[i].ae_tag) {
414 case ACL_USER_OBJ:
415 seen_userobj = 1;
416 if (fuid != current_fsuid())
417 continue;
418 matched.ae_tag = ACL_USER_OBJ;
419 matched.ae_perm = allows;
420 break;
421 case ACL_USER:
422 if (fap->acl_entry[i].ae_id != current_fsuid())
423 continue;
424 matched.ae_tag = ACL_USER;
425 matched.ae_perm = allows;
426 break;
427 case ACL_GROUP_OBJ:
428 if ((matched.ae_tag == ACL_GROUP_OBJ ||
429 matched.ae_tag == ACL_GROUP) && !allows)
430 continue;
431 if (!in_group_p(fgid))
432 continue;
433 matched.ae_tag = ACL_GROUP_OBJ;
434 matched.ae_perm = allows;
435 break;
436 case ACL_GROUP:
437 if ((matched.ae_tag == ACL_GROUP_OBJ ||
438 matched.ae_tag == ACL_GROUP) && !allows)
439 continue;
440 if (!in_group_p(fap->acl_entry[i].ae_id))
441 continue;
442 matched.ae_tag = ACL_GROUP;
443 matched.ae_perm = allows;
444 break;
445 case ACL_MASK:
446 maskallows = allows;
447 break;
448 case ACL_OTHER:
449 if (matched.ae_tag != 0)
450 continue;
451 matched.ae_tag = ACL_OTHER;
452 matched.ae_perm = allows;
453 break;
454 }
455 }
456 /*
457 * First possibility is that no matched entry allows access.
458 * The capability to override DAC may exist, so check for it.
459 */
460 switch (matched.ae_tag) {
461 case ACL_OTHER:
462 case ACL_USER_OBJ:
463 if (matched.ae_perm)
464 return 0;
465 break;
466 case ACL_USER:
467 case ACL_GROUP_OBJ:
468 case ACL_GROUP:
469 if (maskallows && matched.ae_perm)
470 return 0;
471 break;
472 case 0:
473 break;
474 }
475
476 /* EACCES tells generic_permission to check for capability overrides */
477 return EACCES;
478}
479
480/*
481 * ACL validity checker.
482 * This acl validation routine checks each ACL entry read in makes sense.
483 */
484STATIC int
485xfs_acl_invalid(
486 xfs_acl_t *aclp)
487{
488 xfs_acl_entry_t *entry, *e;
489 int user = 0, group = 0, other = 0, mask = 0;
490 int mask_required = 0;
491 int i, j;
492
493 if (!aclp)
494 goto acl_invalid;
495
496 if (aclp->acl_cnt > XFS_ACL_MAX_ENTRIES)
497 goto acl_invalid;
498
499 for (i = 0; i < aclp->acl_cnt; i++) {
500 entry = &aclp->acl_entry[i];
501 switch (entry->ae_tag) {
502 case ACL_USER_OBJ:
503 if (user++)
504 goto acl_invalid;
505 break;
506 case ACL_GROUP_OBJ:
507 if (group++)
508 goto acl_invalid;
509 break;
510 case ACL_OTHER:
511 if (other++)
512 goto acl_invalid;
513 break;
514 case ACL_USER:
515 case ACL_GROUP:
516 for (j = i + 1; j < aclp->acl_cnt; j++) {
517 e = &aclp->acl_entry[j];
518 if (e->ae_id == entry->ae_id &&
519 e->ae_tag == entry->ae_tag)
520 goto acl_invalid;
521 }
522 mask_required++;
523 break;
524 case ACL_MASK:
525 if (mask++)
526 goto acl_invalid;
527 break;
528 default:
529 goto acl_invalid;
530 }
531 }
532 if (!user || !group || !other || (mask_required && !mask))
533 goto acl_invalid;
534 else
535 return 0;
536acl_invalid:
537 return EINVAL;
538}
539
540/*
541 * Do ACL endian conversion.
542 */
543STATIC void
544xfs_acl_get_endian(
545 xfs_acl_t *aclp)
546{
547 xfs_acl_entry_t *ace, *end;
548
549 INT_SET(aclp->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
550 end = &aclp->acl_entry[0]+aclp->acl_cnt;
551 for (ace = &aclp->acl_entry[0]; ace < end; ace++) {
552 INT_SET(ace->ae_tag, ARCH_CONVERT, ace->ae_tag);
553 INT_SET(ace->ae_id, ARCH_CONVERT, ace->ae_id);
554 INT_SET(ace->ae_perm, ARCH_CONVERT, ace->ae_perm);
555 }
556}
557
558/*
559 * Get the ACL from the EA and do endian conversion.
560 */
561STATIC void
562xfs_acl_get_attr(
563 struct inode *vp,
564 xfs_acl_t *aclp,
565 int kind,
566 int flags,
567 int *error)
568{
569 int len = sizeof(xfs_acl_t);
570
571 ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
572 flags |= ATTR_ROOT;
573 *error = xfs_attr_get(XFS_I(vp),
574 kind == _ACL_TYPE_ACCESS ?
575 SGI_ACL_FILE : SGI_ACL_DEFAULT,
576 (char *)aclp, &len, flags);
577 if (*error || (flags & ATTR_KERNOVAL))
578 return;
579 xfs_acl_get_endian(aclp);
580}
581
582/*
583 * Set the EA with the ACL and do endian conversion.
584 */
585STATIC void
586xfs_acl_set_attr(
587 struct inode *vp,
588 xfs_acl_t *aclp,
589 int kind,
590 int *error)
591{
592 xfs_acl_entry_t *ace, *newace, *end;
593 xfs_acl_t *newacl;
594 int len;
595
596 if (!(_ACL_ALLOC(newacl))) {
597 *error = ENOMEM;
598 return;
599 }
600
601 len = sizeof(xfs_acl_t) -
602 (sizeof(xfs_acl_entry_t) * (XFS_ACL_MAX_ENTRIES - aclp->acl_cnt));
603 end = &aclp->acl_entry[0]+aclp->acl_cnt;
604 for (ace = &aclp->acl_entry[0], newace = &newacl->acl_entry[0];
605 ace < end;
606 ace++, newace++) {
607 INT_SET(newace->ae_tag, ARCH_CONVERT, ace->ae_tag);
608 INT_SET(newace->ae_id, ARCH_CONVERT, ace->ae_id);
609 INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
610 }
611 INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
612 *error = xfs_attr_set(XFS_I(vp),
613 kind == _ACL_TYPE_ACCESS ?
614 SGI_ACL_FILE: SGI_ACL_DEFAULT,
615 (char *)newacl, len, ATTR_ROOT);
616 _ACL_FREE(newacl);
617}
618
619int
620xfs_acl_vtoacl(
621 struct inode *vp,
622 xfs_acl_t *access_acl,
623 xfs_acl_t *default_acl)
624{
625 int error = 0;
626
627 if (access_acl) {
628 /*
629 * Get the Access ACL and the mode. If either cannot
630 * be obtained for some reason, invalidate the access ACL.
631 */
632 xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error);
633 if (error)
634 access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
635 else /* We have a good ACL and the file mode, synchronize. */
636 xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, access_acl);
637 }
638
639 if (default_acl) {
640 xfs_acl_get_attr(vp, default_acl, _ACL_TYPE_DEFAULT, 0, &error);
641 if (error)
642 default_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
643 }
644 return error;
645}
646
647/*
648 * This function retrieves the parent directory's acl, processes it
649 * and lets the child inherit the acl(s) that it should.
650 */
651int
652xfs_acl_inherit(
653 struct inode *vp,
654 mode_t mode,
655 xfs_acl_t *pdaclp)
656{
657 xfs_acl_t *cacl;
658 int error = 0;
659 int basicperms = 0;
660
661 /*
662 * If the parent does not have a default ACL, or it's an
663 * invalid ACL, we're done.
664 */
665 if (!vp)
666 return 0;
667 if (!pdaclp || xfs_acl_invalid(pdaclp))
668 return 0;
669
670 /*
671 * Copy the default ACL of the containing directory to
672 * the access ACL of the new file and use the mode that
673 * was passed in to set up the correct initial values for
674 * the u::,g::[m::], and o:: entries. This is what makes
 675 * umask() "work" with ACLs.
676 */
677
678 if (!(_ACL_ALLOC(cacl)))
679 return ENOMEM;
680
681 memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
682 xfs_acl_filter_mode(mode, cacl);
683 error = xfs_acl_setmode(vp, cacl, &basicperms);
684 if (error)
685 goto out_error;
686
687 /*
688 * Set the Default and Access ACL on the file. The mode is already
689 * set on the file, so we don't need to worry about that.
690 *
691 * If the new file is a directory, its default ACL is a copy of
692 * the containing directory's default ACL.
693 */
694 if (S_ISDIR(vp->i_mode))
695 xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
696 if (!error && !basicperms)
697 xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
698out_error:
699 _ACL_FREE(cacl);
700 return error;
701}
702
703/*
704 * Set up the correct mode on the file based on the supplied ACL. This
705 * makes sure that the mode on the file reflects the state of the
706 * u::,g::[m::], and o:: entries in the ACL. Since the mode is where
707 * the ACL is going to get the permissions for these entries, we must
708 * synchronize the mode whenever we set the ACL on a file.
709 */
710STATIC int
711xfs_acl_setmode(
712 struct inode *vp,
713 xfs_acl_t *acl,
714 int *basicperms)
715{
716 struct iattr iattr;
717 xfs_acl_entry_t *ap;
718 xfs_acl_entry_t *gap = NULL;
719 int i, nomask = 1;
720
721 *basicperms = 1;
722
723 if (acl->acl_cnt == XFS_ACL_NOT_PRESENT)
724 return 0;
725
726 /*
727 * Copy the u::, g::, o::, and m:: bits from the ACL into the
728 * mode. The m:: bits take precedence over the g:: bits.
729 */
730 iattr.ia_valid = ATTR_MODE;
731 iattr.ia_mode = XFS_I(vp)->i_d.di_mode;
732 iattr.ia_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
733 ap = acl->acl_entry;
734 for (i = 0; i < acl->acl_cnt; ++i) {
735 switch (ap->ae_tag) {
736 case ACL_USER_OBJ:
737 iattr.ia_mode |= ap->ae_perm << 6;
738 break;
739 case ACL_GROUP_OBJ:
740 gap = ap;
741 break;
742 case ACL_MASK: /* more than just standard modes */
743 nomask = 0;
744 iattr.ia_mode |= ap->ae_perm << 3;
745 *basicperms = 0;
746 break;
747 case ACL_OTHER:
748 iattr.ia_mode |= ap->ae_perm;
749 break;
750 default: /* more than just standard modes */
751 *basicperms = 0;
752 break;
753 }
754 ap++;
755 }
756
757 /* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */
758 if (gap && nomask)
759 iattr.ia_mode |= gap->ae_perm << 3;
760
761 return xfs_setattr(XFS_I(vp), &iattr, 0);
762}
763
764/*
765 * The permissions for the special ACL entries (u::, g::[m::], o::) are
766 * actually stored in the file mode (if there is both a group and a mask,
767 * the group is stored in the ACL entry and the mask is stored on the file).
768 * This allows the mode to remain automatically in sync with the ACL without
769 * the need for a call-back to the ACL system at every point where the mode
770 * could change. This function takes the permissions from the specified mode
771 * and places it in the supplied ACL.
772 *
773 * This implementation draws its validity from the fact that, when the ACL
774 * was assigned, the mode was copied from the ACL.
775 * If the mode did not change, therefore, the mode remains exactly what was
776 * taken from the special ACL entries at assignment.
777 * If a subsequent chmod() was done, the POSIX spec says that the change in
778 * mode must cause an update to the ACL seen at user level and used for
779 * access checks. Before and after a mode change, therefore, the file mode
780 * most accurately reflects what the special ACL entries should permit/deny.
781 *
782 * CAVEAT: If someone sets the SGI_ACL_FILE attribute directly,
783 * the existing mode bits will override whatever is in the
784 * ACL. Similarly, if there is a pre-existing ACL that was
785 * never in sync with its mode (owing to a bug in 6.5 and
786 * before), it will now magically (or mystically) be
787 * synchronized. This could cause slight astonishment, but
788 * it is better than inconsistent permissions.
789 *
790 * The supplied ACL is a template that may contain any combination
791 * of special entries. These are treated as place holders when we fill
792 * out the ACL. This routine does not add or remove special entries, it
793 * simply unites each special entry with its associated set of permissions.
794 */
-STATIC void
-xfs_acl_sync_mode(
-	mode_t		mode,
-	xfs_acl_t	*acl)
-{
-	int		i, nomask = 1;
-	xfs_acl_entry_t	*ap;
-	xfs_acl_entry_t	*gap = NULL;
-
-	/*
-	 * Set ACL entries.  POSIX1003.1eD16 requires that the MASK
-	 * be set instead of the GROUP entry, if there is a MASK.
-	 */
-	for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
-		switch (ap->ae_tag) {
-		case ACL_USER_OBJ:
-			ap->ae_perm = (mode >> 6) & 0x7;
-			break;
-		case ACL_GROUP_OBJ:
-			gap = ap;
-			break;
-		case ACL_MASK:
-			nomask = 0;
-			ap->ae_perm = (mode >> 3) & 0x7;
-			break;
-		case ACL_OTHER:
-			ap->ae_perm = mode & 0x7;
-			break;
-		default:
-			break;
-		}
-	}
-	/* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
-	if (gap && nomask)
-		gap->ae_perm = (mode >> 3) & 0x7;
-}
-
-/*
- * When inheriting an Access ACL from a directory Default ACL,
- * the ACL bits are set to the intersection of the ACL default
- * permission bits and the file permission bits in mode.  If there
- * are no permission bits on the file then we must not grant them
- * in the ACL.  This is what makes umask() work with ACLs.
- */
-STATIC void
-xfs_acl_filter_mode(
-	mode_t		mode,
-	xfs_acl_t	*acl)
-{
-	int		i, nomask = 1;
-	xfs_acl_entry_t	*ap;
-	xfs_acl_entry_t	*gap = NULL;
-
-	/*
-	 * Set ACL entries.  POSIX1003.1eD16 requires that the MASK
-	 * be merged with the GROUP entry, if there is a MASK.
-	 */
-	for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
-		switch (ap->ae_tag) {
-		case ACL_USER_OBJ:
-			ap->ae_perm &= (mode >> 6) & 0x7;
-			break;
-		case ACL_GROUP_OBJ:
-			gap = ap;
-			break;
-		case ACL_MASK:
-			nomask = 0;
-			ap->ae_perm &= (mode >> 3) & 0x7;
-			break;
-		case ACL_OTHER:
-			ap->ae_perm &= mode & 0x7;
-			break;
-		default:
-			break;
-		}
-	}
-	/* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
-	if (gap && nomask)
-		gap->ae_perm &= (mode >> 3) & 0x7;
-}
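A hedged user-space illustration of the intersection rule (again not part of the patch; values are made up): with a default ACL entry of d:g::rwx and a file created under umask 027, the inherited group permission is ANDed down to what the creation mode allows.

#include <stdio.h>

int main(void)
{
	unsigned int default_group = 07;	/* d:g::rwx */
	unsigned int mode = 0666 & ~027;	/* open(..., 0666) under umask 027 -> 0640 */
	unsigned int inherited = default_group & ((mode >> 3) & 07);

	printf("inherited group perm = %o\n", inherited);	/* 4 -> r-- */
	return 0;
}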
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 642f1db4def4..947b150df8ed 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -18,81 +18,44 @@
 #ifndef __XFS_ACL_H__
 #define __XFS_ACL_H__
 
-/*
- * Access Control Lists
- */
-typedef __uint16_t	xfs_acl_perm_t;
-typedef __int32_t	xfs_acl_tag_t;
-typedef __int32_t	xfs_acl_id_t;
+struct inode;
+struct posix_acl;
+struct xfs_inode;
 
 #define XFS_ACL_MAX_ENTRIES 25
 #define XFS_ACL_NOT_PRESENT (-1)
 
-typedef struct xfs_acl_entry {
-	xfs_acl_tag_t	ae_tag;
-	xfs_acl_id_t	ae_id;
-	xfs_acl_perm_t	ae_perm;
-} xfs_acl_entry_t;
-
-typedef struct xfs_acl {
-	__int32_t	acl_cnt;
-	xfs_acl_entry_t	acl_entry[XFS_ACL_MAX_ENTRIES];
-} xfs_acl_t;
+/* On-disk XFS access control list structure */
+struct xfs_acl {
+	__be32		acl_cnt;
+	struct xfs_acl_entry {
+		__be32	ae_tag;
+		__be32	ae_id;
+		__be16	ae_perm;
+	} acl_entry[XFS_ACL_MAX_ENTRIES];
+};
 
 /* On-disk XFS extended attribute names */
 #define SGI_ACL_FILE		"SGI_ACL_FILE"
 #define SGI_ACL_DEFAULT		"SGI_ACL_DEFAULT"
 #define SGI_ACL_FILE_SIZE	(sizeof(SGI_ACL_FILE)-1)
 #define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
 
-#define _ACL_TYPE_ACCESS	1
-#define _ACL_TYPE_DEFAULT	2
-
 #ifdef CONFIG_XFS_POSIX_ACL
-
-struct vattr;
-struct xfs_inode;
-
-extern struct kmem_zone *xfs_acl_zone;
-#define xfs_acl_zone_init(zone, name)	\
-	(zone) = kmem_zone_init(sizeof(xfs_acl_t), (name))
-#define xfs_acl_zone_destroy(zone)	kmem_zone_destroy(zone)
-
-extern int xfs_acl_inherit(struct inode *, mode_t mode, xfs_acl_t *);
-extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
-extern int xfs_acl_vtoacl(struct inode *, xfs_acl_t *, xfs_acl_t *);
-extern int xfs_acl_vhasacl_access(struct inode *);
-extern int xfs_acl_vhasacl_default(struct inode *);
-extern int xfs_acl_vset(struct inode *, void *, size_t, int);
-extern int xfs_acl_vget(struct inode *, void *, size_t, int);
-extern int xfs_acl_vremove(struct inode *, int);
-
-#define _ACL_PERM_INVALID(perm)	((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
-
-#define _ACL_INHERIT(c,m,d)	(xfs_acl_inherit(c,m,d))
-#define _ACL_GET_ACCESS(pv,pa)	(xfs_acl_vtoacl(pv,pa,NULL) == 0)
-#define _ACL_GET_DEFAULT(pv,pd)	(xfs_acl_vtoacl(pv,NULL,pd) == 0)
-#define _ACL_ACCESS_EXISTS	xfs_acl_vhasacl_access
-#define _ACL_DEFAULT_EXISTS	xfs_acl_vhasacl_default
-
-#define _ACL_ALLOC(a)		((a) = kmem_zone_alloc(xfs_acl_zone, KM_SLEEP))
-#define _ACL_FREE(a)		((a)? kmem_zone_free(xfs_acl_zone, (a)):(void)0)
-
+extern int xfs_check_acl(struct inode *inode, int mask);
+extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
+extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
+extern int xfs_acl_chmod(struct inode *inode);
+extern int posix_acl_access_exists(struct inode *inode);
+extern int posix_acl_default_exists(struct inode *inode);
+
+extern struct xattr_handler xfs_xattr_system_handler;
 #else
-#define xfs_acl_zone_init(zone,name)
-#define xfs_acl_zone_destroy(zone)
-#define xfs_acl_vset(v,p,sz,t)		(-EOPNOTSUPP)
-#define xfs_acl_vget(v,p,sz,t)		(-EOPNOTSUPP)
-#define xfs_acl_vremove(v,t)		(-EOPNOTSUPP)
-#define xfs_acl_vhasacl_access(v)	(0)
-#define xfs_acl_vhasacl_default(v)	(0)
-#define _ACL_ALLOC(a)			(1)	/* successfully allocate nothing */
-#define _ACL_FREE(a)			((void)0)
-#define _ACL_INHERIT(c,m,d)		(0)
-#define _ACL_GET_ACCESS(pv,pa)		(0)
-#define _ACL_GET_DEFAULT(pv,pd)		(0)
-#define _ACL_ACCESS_EXISTS		(NULL)
-#define _ACL_DEFAULT_EXISTS		(NULL)
-#endif
-
+# define xfs_check_acl			NULL
+# define xfs_get_acl(inode, type)	NULL
+# define xfs_inherit_acl(inode, default_acl) 0
+# define xfs_acl_chmod(inode)		0
+# define posix_acl_access_exists(inode)	0
+# define posix_acl_default_exists(inode) 0
+#endif	/* CONFIG_XFS_POSIX_ACL */
 #endif	/* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index c8641f713caa..f24b50b68d03 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -212,6 +212,8 @@ typedef struct xfs_perag
 /*
  * tags for inode radix tree
  */
+#define XFS_ICI_NO_TAG		(-1)	/* special flag for an untagged lookup
+					   in xfs_inode_ag_iterator */
 #define XFS_ICI_RECLAIM_TAG	0	/* inode is to be reclaimed */
 
 #define	XFS_AG_MAXLEVELS(mp)		((mp)->m_ag_maxlevels)
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index 53d5e70d1360..0902249354a0 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -73,28 +73,6 @@ static inline void be64_add_cpu(__be64 *a, __s64 b)
 
 #endif	/* __KERNEL__ */
 
-/* do we need conversion? */
-#define ARCH_NOCONVERT 1
-#ifdef XFS_NATIVE_HOST
-# define ARCH_CONVERT	ARCH_NOCONVERT
-#else
-# define ARCH_CONVERT	0
-#endif
-
-/* generic swapping macros */
-
-#ifndef HAVE_SWABMACROS
-#define INT_SWAP16(type,var) ((typeof(type))(__swab16((__u16)(var))))
-#define INT_SWAP32(type,var) ((typeof(type))(__swab32((__u32)(var))))
-#define INT_SWAP64(type,var) ((typeof(type))(__swab64((__u64)(var))))
-#endif
-
-#define INT_SWAP(type, var) \
-	((sizeof(type) == 8) ? INT_SWAP64(type,var) : \
-	((sizeof(type) == 4) ? INT_SWAP32(type,var) : \
-	((sizeof(type) == 2) ? INT_SWAP16(type,var) : \
-	(var))))
-
 /*
  * get and set integers from potentially unaligned locations
  */
@@ -107,16 +85,6 @@ static inline void be64_add_cpu(__be64 *a, __s64 b)
 	((__u8*)(pointer))[1] = (((value)     ) & 0xff); \
 }
 
-/* does not return a value */
-#define INT_SET(reference,arch,valueref) \
-	(__builtin_constant_p(valueref) ? \
-	(void)( (reference) = ( ((arch) != ARCH_NOCONVERT) ? (INT_SWAP((reference),(valueref))) : (valueref)) ) : \
-	(void)( \
-		((reference) = (valueref)), \
-		( ((arch) != ARCH_NOCONVERT) ? (reference) = INT_SWAP((reference),(reference)) : 0 ) \
-	) \
-	)
-
 /*
  * In directories inode numbers are stored as unaligned arrays of unsigned
  * 8bit integers on disk.
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 5fde1654b430..4ece1906bd41 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -45,7 +45,6 @@
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_trans_space.h"
-#include "xfs_acl.h"
 #include "xfs_rw.h"
 #include "xfs_vnodeops.h"
 
@@ -249,8 +248,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
 	/*
 	 * Attach the dquots to the inode.
 	 */
-	if ((error = XFS_QM_DQATTACH(mp, dp, 0)))
-		return (error);
+	error = xfs_qm_dqattach(dp, 0);
+	if (error)
+		return error;
 
 	/*
 	 * If the inode doesn't have an attribute fork, add one.
@@ -311,7 +311,7 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
 	}
 	xfs_ilock(dp, XFS_ILOCK_EXCL);
 
-	error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, args.total, 0,
+	error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
 			rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
 			       XFS_QMOPT_RES_REGBLKS);
 	if (error) {
@@ -501,8 +501,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
 	/*
 	 * Attach the dquots to the inode.
 	 */
-	if ((error = XFS_QM_DQATTACH(mp, dp, 0)))
-		return (error);
+	error = xfs_qm_dqattach(dp, 0);
+	if (error)
+		return error;
 
 	/*
 	 * Start our first transaction of the day.
@@ -2009,7 +2010,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 		dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
 		blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
 		error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
-				     blkcnt, XFS_BUF_LOCK, &bp);
+				     blkcnt,
+				     XFS_BUF_LOCK | XBF_DONT_BLOCK,
+				     &bp);
 		if (error)
 			return(error);
 
@@ -2140,8 +2143,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
 		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
 
-		bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno,
-				       blkcnt, XFS_BUF_LOCK);
+		bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt,
+				       XFS_BUF_LOCK | XBF_DONT_BLOCK);
 		ASSERT(bp);
 		ASSERT(!XFS_BUF_GETERROR(bp));
 
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index ca7c6005a487..8ee5b5a76a2a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2691,7 +2691,7 @@ xfs_bmap_rtalloc(
 		 * Adjust the disk quota also. This was reserved
 		 * earlier.
 		 */
-		XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip,
+		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
 			ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
 					XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
 	} else {
@@ -2995,7 +2995,7 @@ xfs_bmap_btalloc(
 		 * Adjust the disk quota also. This was reserved
 		 * earlier.
 		 */
-		XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip,
+		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
 			ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
 					XFS_TRANS_DQ_BCOUNT,
 			(long) args.len);
@@ -3066,7 +3066,7 @@ xfs_bmap_btree_to_extents(
 		return error;
 	xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
 	ip->i_d.di_nblocks--;
-	XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
 	xfs_trans_binval(tp, cbp);
 	if (cur->bc_bufs[0] == cbp)
 		cur->bc_bufs[0] = NULL;
@@ -3386,7 +3386,7 @@ xfs_bmap_del_extent(
 	 * Adjust quota data.
 	 */
 	if (qfield)
-		XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, qfield, (long)-nblks);
+		xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
 
 	/*
 	 * Account for change in delayed indirect blocks.
@@ -3523,7 +3523,7 @@ xfs_bmap_extents_to_btree(
 	*firstblock = cur->bc_private.b.firstblock = args.fsbno;
 	cur->bc_private.b.allocated++;
 	ip->i_d.di_nblocks++;
-	XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
 	abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
 	/*
 	 * Fill in the child block.
@@ -3690,7 +3690,7 @@ xfs_bmap_local_to_extents(
 		XFS_BMAP_TRACE_POST_UPDATE("new", ip, 0, whichfork);
 		XFS_IFORK_NEXT_SET(ip, whichfork, 1);
 		ip->i_d.di_nblocks = 1;
-		XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip,
+		xfs_trans_mod_dquot_byino(tp, ip,
 			XFS_TRANS_DQ_BCOUNT, 1L);
 		flags |= xfs_ilog_fext(whichfork);
 	} else {
@@ -4048,7 +4048,7 @@ xfs_bmap_add_attrfork(
 			XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
 		goto error0;
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, blks, 0, rsvd ?
+	error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
 			XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
 			XFS_QMOPT_RES_REGBLKS);
 	if (error) {
@@ -4983,10 +4983,11 @@ xfs_bmapi(
 				 * adjusted later.  We return if we haven't
 				 * allocated blocks already inside this loop.
 				 */
-				if ((error = XFS_TRANS_RESERVE_QUOTA_NBLKS(
-						mp, NULL, ip, (long)alen, 0,
-						rt ? XFS_QMOPT_RES_RTBLKS :
-						     XFS_QMOPT_RES_REGBLKS))) {
+				error = xfs_trans_reserve_quota_nblks(
+						NULL, ip, (long)alen, 0,
+						rt ? XFS_QMOPT_RES_RTBLKS :
+						     XFS_QMOPT_RES_REGBLKS);
+				if (error) {
 					if (n == 0) {
 						*nmap = 0;
 						ASSERT(cur == NULL);
@@ -5035,8 +5036,8 @@ xfs_bmapi(
 			if (XFS_IS_QUOTA_ON(mp))
 				/* unreserve the blocks now */
 				(void)
-				XFS_TRANS_UNRESERVE_QUOTA_NBLKS(
-					mp, NULL, ip,
+				xfs_trans_unreserve_quota_nblks(
+					NULL, ip,
 					(long)alen, 0, rt ?
 					XFS_QMOPT_RES_RTBLKS :
 					XFS_QMOPT_RES_REGBLKS);
@@ -5691,14 +5692,14 @@ xfs_bunmapi(
 				do_div(rtexts, mp->m_sb.sb_rextsize);
 				xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
 						(int64_t)rtexts, rsvd);
-				(void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp,
-					NULL, ip, -((long)del.br_blockcount), 0,
+				(void)xfs_trans_reserve_quota_nblks(NULL,
+					ip, -((long)del.br_blockcount), 0,
 					XFS_QMOPT_RES_RTBLKS);
 			} else {
 				xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
 						(int64_t)del.br_blockcount, rsvd);
-				(void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp,
-					NULL, ip, -((long)del.br_blockcount), 0,
+				(void)xfs_trans_reserve_quota_nblks(NULL,
+					ip, -((long)del.br_blockcount), 0,
 					XFS_QMOPT_RES_REGBLKS);
 			}
 			ip->i_delayed_blks -= del.br_blockcount;
@@ -6008,7 +6009,7 @@ xfs_getbmap(
 	 */
 	error = ENOMEM;
 	subnex = 16;
-	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL);
+	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
 	if (!map)
 		goto out_unlock_ilock;
 
@@ -6085,6 +6086,7 @@ xfs_getbmap(
 		break;
 	}
 
+	kmem_free(out);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 0760d352586f..5c1ade06578e 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -590,7 +590,7 @@ xfs_bmbt_alloc_block(
 	cur->bc_private.b.allocated++;
 	cur->bc_private.b.ip->i_d.di_nblocks++;
 	xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
-	XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
+	xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
 			XFS_TRANS_DQ_BCOUNT, 1L);
 
 	new->l = cpu_to_be64(args.fsbno);
@@ -618,7 +618,7 @@ xfs_bmbt_free_block(
 	ip->i_d.di_nblocks--;
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
 	xfs_trans_binval(tp, bp);
 	return 0;
 }
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e9df99574829..26717388acf5 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -120,8 +120,8 @@ xfs_btree_check_sblock(
 			XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
 		if (bp)
 			xfs_buftrace("SBTREE ERROR", bp);
-		XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW,
-				 cur->bc_mp);
+		XFS_CORRUPTION_ERROR("xfs_btree_check_sblock",
+			XFS_ERRLEVEL_LOW, cur->bc_mp, block);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 	return 0;
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 9ff6e57a5075..2847bbc1c534 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2201,7 +2201,7 @@ kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */
 xfs_da_state_t *
 xfs_da_state_alloc(void)
 {
-	return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP);
+	return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
 }
 
 /*
@@ -2261,9 +2261,9 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra)
 	int		off;
 
 	if (nbuf == 1)
-		dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP);
+		dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS);
 	else
-		dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP);
+		dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS);
 	dabuf->dirty = 0;
 #ifdef XFS_DABUF_DEBUG
 	dabuf->ra = ra;
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index c657bec6d951..bb1d58eb3982 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -256,7 +256,7 @@ xfs_dir_cilookup_result(
 	    !(args->op_flags & XFS_DA_OP_CILOOKUP))
 		return EEXIST;
 
-	args->value = kmem_alloc(len, KM_MAYFAIL);
+	args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
 	if (!args->value)
 		return ENOMEM;
 
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 6c87c8f304ef..edf8bdf4141f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -542,10 +542,8 @@ xfs_filestream_associate(
 	 * waiting for the lock because someone else is waiting on the lock we
 	 * hold and we cannot drop that as we are in a transaction here.
 	 *
-	 * Lucky for us, this inversion is rarely a problem because it's a
-	 * directory inode that we are trying to lock here and that means the
-	 * only place that matters is xfs_sync_inodes() and SYNC_DELWRI is
-	 * used. i.e. freeze, remount-ro, quotasync or unmount.
+	 * Lucky for us, this inversion is not a problem because it's a
+	 * directory inode that we are trying to lock here.
 	 *
 	 * So, if we can't get the iolock without sleeping then just give up
 	 */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index f7c06fac8229..c4ea51b55dce 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -239,10 +239,13 @@ typedef struct xfs_fsop_resblks {
  * Minimum and maximum sizes needed for growth checks
  */
 #define XFS_MIN_AG_BLOCKS	64
-#define XFS_MIN_LOG_BLOCKS	512
-#define XFS_MAX_LOG_BLOCKS	(64 * 1024)
-#define XFS_MIN_LOG_BYTES	(256 * 1024)
-#define XFS_MAX_LOG_BYTES	(128 * 1024 * 1024)
+#define XFS_MIN_LOG_BLOCKS	512ULL
+#define XFS_MAX_LOG_BLOCKS	(1024 * 1024ULL)
+#define XFS_MIN_LOG_BYTES	(10 * 1024 * 1024ULL)
+
+/* keep the maximum size under 2^31 by a small amount */
+#define XFS_MAX_LOG_BYTES \
+	((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES)
 
 /*
  * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
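For scale, a worked check of the new ceiling (a sketch, not part of the patch): the old limit was 128 MiB, while the new XFS_MAX_LOG_BYTES comes out to 2^31 minus XFS_MIN_LOG_BYTES.

#include <stdio.h>

int main(void)
{
	unsigned long long min_bytes = 10 * 1024 * 1024ULL;	/* XFS_MIN_LOG_BYTES */
	unsigned long long max_bytes = (2 * 1024 * 1024 * 1024ULL) - min_bytes;

	printf("%llu\n", max_bytes);	/* 2136997888, safely below 2147483648 */
	return 0;
}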
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cbd451bb4848..2d0b3e1da9e6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -167,17 +167,25 @@ xfs_growfs_data_private(
 	new = nb - mp->m_sb.sb_dblocks;
 	oagcount = mp->m_sb.sb_agcount;
 	if (nagcount > oagcount) {
+		void *new_perag, *old_perag;
+
 		xfs_filestream_flush(mp);
+
+		new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
+					KM_MAYFAIL);
+		if (!new_perag)
+			return XFS_ERROR(ENOMEM);
+
 		down_write(&mp->m_peraglock);
-		mp->m_perag = kmem_realloc(mp->m_perag,
-			sizeof(xfs_perag_t) * nagcount,
-			sizeof(xfs_perag_t) * oagcount,
-			KM_SLEEP);
-		memset(&mp->m_perag[oagcount], 0,
-			(nagcount - oagcount) * sizeof(xfs_perag_t));
+		memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
+		old_perag = mp->m_perag;
+		mp->m_perag = new_perag;
+
 		mp->m_flags |= XFS_MOUNT_32BITINODES;
 		nagimax = xfs_initialize_perag(mp, nagcount);
 		up_write(&mp->m_peraglock);
+
+		kmem_free(old_perag);
 	}
 	tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
 	tp->t_flags |= XFS_TRANS_RESERVE;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 89b81eedce6a..ecbf8b4d2e2e 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_types.h"
+#include "xfs_acl.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
@@ -63,6 +64,10 @@ xfs_inode_alloc(
 	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
 	if (!ip)
 		return NULL;
+	if (inode_init_always(mp->m_super, VFS_I(ip))) {
+		kmem_zone_free(xfs_inode_zone, ip);
+		return NULL;
+	}
 
 	ASSERT(atomic_read(&ip->i_iocount) == 0);
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
@@ -104,17 +109,6 @@ xfs_inode_alloc(
 #ifdef XFS_DIR2_TRACE
 	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
 #endif
-	/*
-	 * Now initialise the VFS inode. We do this after the xfs_inode
-	 * initialisation as internal failures will result in ->destroy_inode
-	 * being called and that will pass down through the reclaim path and
-	 * free the XFS inode. This path requires the XFS inode to already be
-	 * initialised. Hence if this call fails, the xfs_inode has already
-	 * been freed and we should not reference it at all in the error
-	 * handling.
-	 */
-	if (!inode_init_always(mp->m_super, VFS_I(ip)))
-		return NULL;
 
 	/* prevent anyone from using this yet */
 	VFS_I(ip)->i_state = I_NEW|I_LOCK;
@@ -122,6 +116,71 @@ xfs_inode_alloc(
 
 	return ip;
 }
 
+STATIC void
+xfs_inode_free(
+	struct xfs_inode	*ip)
+{
+	switch (ip->i_d.di_mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFDIR:
+	case S_IFLNK:
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+		break;
+	}
+
+	if (ip->i_afp)
+		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+#ifdef XFS_INODE_TRACE
+	ktrace_free(ip->i_trace);
+#endif
+#ifdef XFS_BMAP_TRACE
+	ktrace_free(ip->i_xtrace);
+#endif
+#ifdef XFS_BTREE_TRACE
+	ktrace_free(ip->i_btrace);
+#endif
+#ifdef XFS_RW_TRACE
+	ktrace_free(ip->i_rwtrace);
+#endif
+#ifdef XFS_ILOCK_TRACE
+	ktrace_free(ip->i_lock_trace);
+#endif
+#ifdef XFS_DIR2_TRACE
+	ktrace_free(ip->i_dir_trace);
+#endif
+
+	if (ip->i_itemp) {
+		/*
+		 * Only if we are shutting down the fs will we see an
+		 * inode still in the AIL. If it is there, we should remove
+		 * it to prevent a use-after-free from occurring.
+		 */
+		xfs_log_item_t	*lip = &ip->i_itemp->ili_item;
+		struct xfs_ail	*ailp = lip->li_ailp;
+
+		ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
+				XFS_FORCED_SHUTDOWN(ip->i_mount));
+		if (lip->li_flags & XFS_LI_IN_AIL) {
+			spin_lock(&ailp->xa_lock);
+			if (lip->li_flags & XFS_LI_IN_AIL)
+				xfs_trans_ail_delete(ailp, lip);
+			else
+				spin_unlock(&ailp->xa_lock);
+		}
+		xfs_inode_item_destroy(ip);
+		ip->i_itemp = NULL;
+	}
+
+	/* asserts to verify all state is correct here */
+	ASSERT(atomic_read(&ip->i_iocount) == 0);
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(completion_done(&ip->i_flush));
+
+	kmem_zone_free(xfs_inode_zone, ip);
+}
+
 /*
  * Check the validity of the inode we just found in the cache
  */
@@ -132,80 +191,82 @@ xfs_iget_cache_hit(
 	int			flags,
 	int			lock_flags) __releases(pag->pag_ici_lock)
 {
+	struct inode		*inode = VFS_I(ip);
 	struct xfs_mount	*mp = ip->i_mount;
-	int			error = EAGAIN;
+	int			error;
+
+	spin_lock(&ip->i_flags_lock);
 
 	/*
-	 * If INEW is set this inode is being set up
-	 * If IRECLAIM is set this inode is being torn down
-	 * Pause and try again.
+	 * If we are racing with another cache hit that is currently
+	 * instantiating this inode or currently recycling it out of
+	 * reclaimable state, wait for the initialisation to complete
+	 * before continuing.
+	 *
+	 * XXX(hch): eventually we should do something equivalent to
+	 *	     wait_on_inode to wait for these flags to be cleared
+	 *	     instead of polling for it.
 	 */
-	if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
 		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
 		goto out_error;
 	}
 
-	/* If IRECLAIMABLE is set, we've torn down the vfs inode part */
-	if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
-
-		/*
-		 * If lookup is racing with unlink, then we should return an
-		 * error immediately so we don't remove it from the reclaim
-		 * list and potentially leak the inode.
-		 */
-		if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
-			error = ENOENT;
-			goto out_error;
-		}
+	/*
+	 * If lookup is racing with unlink return an error immediately.
+	 */
+	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+		error = ENOENT;
+		goto out_error;
+	}
 
+	/*
+	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+	 * Need to carefully get it back into useable state.
+	 */
+	if (ip->i_flags & XFS_IRECLAIMABLE) {
 		xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
 
 		/*
-		 * We need to re-initialise the VFS inode as it has been
-		 * 'freed' by the VFS. Do this here so we can deal with
-		 * errors cleanly, then tag it so it can be set up correctly
-		 * later.
+		 * We need to set XFS_INEW atomically with clearing the
+		 * reclaimable tag so that we do have an indicator of the
+		 * inode still being initialized.
 		 */
-		if (!inode_init_always(mp->m_super, VFS_I(ip))) {
-			error = ENOMEM;
-			goto out_error;
-		}
+		ip->i_flags |= XFS_INEW;
+		ip->i_flags &= ~XFS_IRECLAIMABLE;
+		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
 
-		/*
-		 * We must set the XFS_INEW flag before clearing the
-		 * XFS_IRECLAIMABLE flag so that if a racing lookup does
-		 * not find the XFS_IRECLAIMABLE above but has the igrab()
-		 * below succeed we can safely check XFS_INEW to detect
-		 * that this inode is still being initialised.
-		 */
-		xfs_iflags_set(ip, XFS_INEW);
-		xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
+		spin_unlock(&ip->i_flags_lock);
+		read_unlock(&pag->pag_ici_lock);
 
-		/* clear the radix tree reclaim flag as well. */
-		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
-	} else if (!igrab(VFS_I(ip))) {
+		error = -inode_init_always(mp->m_super, inode);
+		if (error) {
+			/*
+			 * Re-initializing the inode failed, and we are in deep
+			 * trouble.  Try to re-add it to the reclaim list.
+			 */
+			read_lock(&pag->pag_ici_lock);
+			spin_lock(&ip->i_flags_lock);
+
+			ip->i_flags &= ~XFS_INEW;
+			ip->i_flags |= XFS_IRECLAIMABLE;
+			__xfs_inode_set_reclaim_tag(pag, ip);
+			goto out_error;
+		}
+		inode->i_state = I_LOCK|I_NEW;
+	} else {
 		/* If the VFS inode is being torn down, pause and try again. */
-		XFS_STATS_INC(xs_ig_frecycle);
-		goto out_error;
-	} else if (xfs_iflags_test(ip, XFS_INEW)) {
-		/*
-		 * We are racing with another cache hit that is
-		 * currently recycling this inode out of the XFS_IRECLAIMABLE
-		 * state. Wait for the initialisation to complete before
-		 * continuing.
-		 */
-		wait_on_inode(VFS_I(ip));
-	}
+		if (!igrab(inode)) {
+			error = EAGAIN;
+			goto out_error;
+		}
 
-	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-		error = ENOENT;
-		iput(VFS_I(ip));
-		goto out_error;
+		/* We've got a live one. */
+		spin_unlock(&ip->i_flags_lock);
+		read_unlock(&pag->pag_ici_lock);
 	}
 
-	/* We've got a live one. */
-	read_unlock(&pag->pag_ici_lock);
-
 	if (lock_flags != 0)
 		xfs_ilock(ip, lock_flags);
 
@@ -215,6 +276,7 @@ xfs_iget_cache_hit(
 	return 0;
 
 out_error:
+	spin_unlock(&ip->i_flags_lock);
 	read_unlock(&pag->pag_ici_lock);
 	return error;
 }
@@ -298,7 +360,8 @@ out_preload_end:
 	if (lock_flags)
 		xfs_iunlock(ip, lock_flags);
 out_destroy:
-	xfs_destroy_inode(ip);
+	__destroy_inode(VFS_I(ip));
+	xfs_inode_free(ip);
 	return error;
 }
 
@@ -500,68 +563,10 @@ xfs_ireclaim(
 	 * ilock one but will still hold the iolock.
 	 */
 	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	/*
-	 * Release dquots (and their references) if any.
-	 */
-	XFS_QM_DQDETACH(ip->i_mount, ip);
+	xfs_qm_dqdetach(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
-	switch (ip->i_d.di_mode & S_IFMT) {
-	case S_IFREG:
-	case S_IFDIR:
-	case S_IFLNK:
-		xfs_idestroy_fork(ip, XFS_DATA_FORK);
-		break;
-	}
-
-	if (ip->i_afp)
-		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-
-#ifdef XFS_INODE_TRACE
-	ktrace_free(ip->i_trace);
-#endif
-#ifdef XFS_BMAP_TRACE
-	ktrace_free(ip->i_xtrace);
-#endif
-#ifdef XFS_BTREE_TRACE
-	ktrace_free(ip->i_btrace);
-#endif
-#ifdef XFS_RW_TRACE
-	ktrace_free(ip->i_rwtrace);
-#endif
-#ifdef XFS_ILOCK_TRACE
-	ktrace_free(ip->i_lock_trace);
-#endif
-#ifdef XFS_DIR2_TRACE
-	ktrace_free(ip->i_dir_trace);
-#endif
-	if (ip->i_itemp) {
-		/*
-		 * Only if we are shutting down the fs will we see an
-		 * inode still in the AIL. If it is there, we should remove
-		 * it to prevent a use-after-free from occurring.
-		 */
-		xfs_log_item_t	*lip = &ip->i_itemp->ili_item;
-		struct xfs_ail	*ailp = lip->li_ailp;
-
-		ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
-				XFS_FORCED_SHUTDOWN(ip->i_mount));
-		if (lip->li_flags & XFS_LI_IN_AIL) {
-			spin_lock(&ailp->xa_lock);
-			if (lip->li_flags & XFS_LI_IN_AIL)
-				xfs_trans_ail_delete(ailp, lip);
-			else
-				spin_unlock(&ailp->xa_lock);
-		}
-		xfs_inode_item_destroy(ip);
-		ip->i_itemp = NULL;
-	}
-	/* asserts to verify all state is correct here */
-	ASSERT(atomic_read(&ip->i_iocount) == 0);
-	ASSERT(atomic_read(&ip->i_pincount) == 0);
-	ASSERT(!spin_is_locked(&ip->i_flags_lock));
-	ASSERT(completion_done(&ip->i_flush));
-	kmem_zone_free(xfs_inode_zone, ip);
+	xfs_inode_free(ip);
 }
 
 /*
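The cache-hit rework above boils down to a small state decision. A toy model of the four outcomes (illustrative only; hypothetical names, not the kernel API):

enum hit_action { HIT_RETRY, HIT_ENOENT, HIT_RECYCLE, HIT_GRAB };

#define F_NEW		0x1	/* stands in for XFS_INEW */
#define F_RECLAIM	0x2	/* stands in for XFS_IRECLAIM */
#define F_RECLAIMABLE	0x4	/* stands in for XFS_IRECLAIMABLE */

static enum hit_action classify_hit(unsigned int flags, int mode, int creating)
{
	if (flags & (F_NEW | F_RECLAIM))
		return HIT_RETRY;	/* racing setup or teardown: EAGAIN */
	if (mode == 0 && !creating)
		return HIT_ENOENT;	/* racing unlink: fail the lookup */
	if (flags & F_RECLAIMABLE)
		return HIT_RECYCLE;	/* VFS inode torn down: reinitialise */
	return HIT_GRAB;		/* live inode: igrab() and go */
}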
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 123b20c8cbf2..da428b3fe0f5 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -49,7 +49,6 @@
 #include "xfs_utils.h"
 #include "xfs_dir2_trace.h"
 #include "xfs_quota.h"
-#include "xfs_acl.h"
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
 
@@ -344,6 +343,16 @@ xfs_iformat(
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
+	if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+		     !ip->i_mount->m_rtdev_targp)) {
+		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
+			"corrupt dinode %Lu, has realtime flag set.",
+			ip->i_ino);
+		XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
+				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
 	switch (ip->i_d.di_mode & S_IFMT) {
 	case S_IFIFO:
 	case S_IFCHR:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f879c1bc4b96..65f24a3cc992 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -18,6 +18,7 @@
 #ifndef	__XFS_INODE_H__
 #define	__XFS_INODE_H__
 
+struct posix_acl;
 struct xfs_dinode;
 struct xfs_inode;
 
@@ -309,23 +310,6 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
 }
 
 /*
- * Get rid of a partially initialized inode.
- *
- * We have to go through destroy_inode to make sure allocations
- * from init_inode_always like the security data are undone.
- *
- * We mark the inode bad so that it takes the short cut in
- * the reclaim path instead of going through the flush path
- * which doesn't make sense for an inode that has never seen the
- * light of day.
- */
-static inline void xfs_destroy_inode(struct xfs_inode *ip)
-{
-	make_bad_inode(VFS_I(ip));
-	return destroy_inode(VFS_I(ip));
-}
-
-/*
  * i_flags helper functions
  */
 static inline void
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 5aaa2d7ec155..67ae5555a30a 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -42,7 +42,6 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
@@ -385,7 +384,7 @@ xfs_iomap_write_direct(
 	 * Make sure that the dquots are there. This doesn't hold
 	 * the ilock across a disk read.
 	 */
-	error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED);
+	error = xfs_qm_dqattach_locked(ip, 0);
 	if (error)
 		return XFS_ERROR(error);
 
@@ -444,8 +443,7 @@ xfs_iomap_write_direct(
 	if (error)
 		goto error_out;
 
-	error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
-			qblocks, 0, quota_flag);
+	error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
 	if (error)
 		goto error1;
 
@@ -495,7 +493,7 @@ xfs_iomap_write_direct(
 
 error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
 	xfs_bmap_cancel(&free_list);
-	XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
+	xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
 
 error1:	/* Just cancel transaction */
 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
@@ -582,7 +580,7 @@ xfs_iomap_write_delay(
 	 * Make sure that the dquots are there. This doesn't hold
 	 * the ilock across a disk read.
 	 */
-	error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
+	error = xfs_qm_dqattach_locked(ip, 0);
 	if (error)
 		return XFS_ERROR(error);
 
@@ -684,7 +682,8 @@ xfs_iomap_write_allocate(
 	/*
 	 * Make sure that the dquots are there.
 	 */
-	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
 		return XFS_ERROR(error);
 
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3750f04ede0b..9dbdff3ea484 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3180,7 +3180,7 @@ try_again:
 STATIC void
 xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
 {
-	ASSERT(spin_is_locked(&log->l_icloglock));
+	assert_spin_locked(&log->l_icloglock);
 
 	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
 		xlog_state_switch_iclogs(log, iclog, 0);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7ba450116d4f..47da2fb45377 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1975,16 +1975,30 @@ xlog_recover_do_reg_buffer(
 	error = 0;
 	if (buf_f->blf_flags &
 	   (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
+		if (item->ri_buf[i].i_addr == NULL) {
+			cmn_err(CE_ALERT,
+				"XFS: NULL dquot in %s.", __func__);
+			goto next;
+		}
+		if (item->ri_buf[i].i_len < sizeof(xfs_dqblk_t)) {
+			cmn_err(CE_ALERT,
+				"XFS: dquot too small (%d) in %s.",
+				item->ri_buf[i].i_len, __func__);
+			goto next;
+		}
 		error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
 					item->ri_buf[i].i_addr,
 					-1, 0, XFS_QMOPT_DOWARN,
 					"dquot_buf_recover");
+		if (error)
+			goto next;
 	}
-	if (!error)
-		memcpy(xfs_buf_offset(bp,
+
+	memcpy(xfs_buf_offset(bp,
 		(uint)bit << XFS_BLI_SHIFT),	/* dest */
 		item->ri_buf[i].i_addr,		/* source */
 		nbits<<XFS_BLI_SHIFT);		/* length */
+ next:
 	i++;
 	bit += nbits;
 }
@@ -2615,7 +2629,19 @@ xlog_recover_do_dquot_trans(
 		return (0);
 
 	recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
-	ASSERT(recddq);
+
+	if (item->ri_buf[1].i_addr == NULL) {
+		cmn_err(CE_ALERT,
+			"XFS: NULL dquot in %s.", __func__);
+		return XFS_ERROR(EIO);
+	}
+	if (item->ri_buf[1].i_len < sizeof(xfs_dqblk_t)) {
+		cmn_err(CE_ALERT,
+			"XFS: dquot too small (%d) in %s.",
+			item->ri_buf[1].i_len, __func__);
+		return XFS_ERROR(EIO);
+	}
+
 	/*
 	 * This type of quotas was turned off, so ignore this record.
 	 */
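The pattern added in these two hunks generalises: a region recovered from the log is untrusted until both its pointer and its length have been checked against the structure it claims to contain. A compact sketch of that guard (hypothetical types and names; not kernel code):

#include <stddef.h>

struct recovered_region {
	void		*addr;	/* may be NULL on a torn log record */
	unsigned int	len;	/* bytes actually recovered */
};

/* returns nonzero only if the region can hold an object of 'need' bytes */
static int region_holds(const struct recovered_region *r, size_t need)
{
	return r->addr != NULL && r->len >= need;
}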
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 65a99725d0cc..5c6f092659c1 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -960,6 +960,53 @@ xfs_check_sizes(xfs_mount_t *mp)
 }
 
 /*
+ * Clear the quotaflags in memory and in the superblock.
+ */
+int
+xfs_mount_reset_sbqflags(
+	struct xfs_mount	*mp)
+{
+	int			error;
+	struct xfs_trans	*tp;
+
+	mp->m_qflags = 0;
+
+	/*
+	 * It is OK to look at sb_qflags here in mount path,
+	 * without m_sb_lock.
+	 */
+	if (mp->m_sb.sb_qflags == 0)
+		return 0;
+	spin_lock(&mp->m_sb_lock);
+	mp->m_sb.sb_qflags = 0;
+	spin_unlock(&mp->m_sb_lock);
+
+	/*
+	 * If the fs is readonly, let the incore superblock run
+	 * with quotas off but don't flush the update out to disk
+	 */
+	if (mp->m_flags & XFS_MOUNT_RDONLY)
+		return 0;
+
+#ifdef QUOTADEBUG
+	xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
+#endif
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
+	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+				  XFS_DEFAULT_LOG_COUNT);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		xfs_fs_cmn_err(CE_ALERT, mp,
+			"xfs_mount_reset_sbqflags: Superblock update failed!");
+		return error;
+	}
+
+	xfs_mod_sb(tp, XFS_SB_QFLAGS);
+	return xfs_trans_commit(tp, 0);
+}
+
+/*
  * This function does the following on an initial mount of a file system:
  *	- reads the superblock from disk and init the mount struct
  *	- if we're a 32-bit kernel, do a size check on the superblock
@@ -976,7 +1023,8 @@ xfs_mountfs(
 	xfs_sb_t	*sbp = &(mp->m_sb);
 	xfs_inode_t	*rip;
 	__uint64_t	resblks;
-	uint		quotamount, quotaflags;
+	uint		quotamount = 0;
+	uint		quotaflags = 0;
 	int		error = 0;
 
 	xfs_mount_common(mp, sbp);
@@ -1210,9 +1258,28 @@ xfs_mountfs(
 	/*
 	 * Initialise the XFS quota management subsystem for this mount
 	 */
-	error = XFS_QM_INIT(mp, &quotamount, &quotaflags);
-	if (error)
-		goto out_rtunmount;
+	if (XFS_IS_QUOTA_RUNNING(mp)) {
+		error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
+		if (error)
+			goto out_rtunmount;
+	} else {
+		ASSERT(!XFS_IS_QUOTA_ON(mp));
+
+		/*
+		 * If a file system had quotas running earlier, but decided to
+		 * mount without -o uquota/pquota/gquota options, revoke the
+		 * quotachecked license.
+		 */
+		if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
+			cmn_err(CE_NOTE,
+				"XFS: resetting qflags for filesystem %s",
+				mp->m_fsname);
+
+			error = xfs_mount_reset_sbqflags(mp);
+			if (error)
+				return error;
+		}
+	}
 
 	/*
 	 * Finish recovering the file system.  This part needed to be
@@ -1228,9 +1295,19 @@ xfs_mountfs(
 	/*
 	 * Complete the quota initialisation, post-log-replay component.
 	 */
-	error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
-	if (error)
-		goto out_rtunmount;
+	if (quotamount) {
+		ASSERT(mp->m_qflags == 0);
+		mp->m_qflags = quotaflags;
+
+		xfs_qm_mount_quotas(mp);
+	}
+
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+	if (XFS_IS_QUOTA_ON(mp))
+		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
+	else
+		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
+#endif
 
 	/*
 	 * Now we are mounted, reserve a small amount of unused space for
@@ -1279,12 +1356,7 @@ xfs_unmountfs(
 	__uint64_t		resblks;
 	int			error;
 
-	/*
-	 * Release dquot that rootinode, rbmino and rsumino might be holding,
-	 * and release the quota inodes.
-	 */
-	XFS_QM_UNMOUNT(mp);
-
+	xfs_qm_unmount_quotas(mp);
 	xfs_rtunmount_inodes(mp);
 	IRELE(mp->m_rootip);
 
@@ -1299,12 +1371,9 @@ xfs_unmountfs(
 	 * need to force the log first.
 	 */
 	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
-	xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_ASYNC);
-
-	XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
+	xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC);
 
-	if (mp->m_quotainfo)
-		XFS_QM_DONE(mp);
+	xfs_qm_unmount(mp);
 
 	/*
 	 * Flush out the log synchronously so that we know for sure
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d6a64392f983..a5122382afde 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -64,6 +64,8 @@ struct xfs_swapext;
 struct xfs_mru_cache;
 struct xfs_nameops;
 struct xfs_ail;
+struct xfs_quotainfo;
+
 
 /*
  * Prototypes and functions for the Data Migration subsystem.
@@ -107,86 +109,6 @@ typedef struct xfs_dmops {
 	(*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl)
 
 
-/*
- * Prototypes and functions for the Quota Management subsystem.
- */
-
-struct xfs_dquot;
-struct xfs_dqtrxops;
-struct xfs_quotainfo;
-
-typedef int	(*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
-typedef int	(*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
-typedef void	(*xfs_qmunmount_t)(struct xfs_mount *);
-typedef void	(*xfs_qmdone_t)(struct xfs_mount *);
-typedef void	(*xfs_dqrele_t)(struct xfs_dquot *);
-typedef int	(*xfs_dqattach_t)(struct xfs_inode *, uint);
-typedef void	(*xfs_dqdetach_t)(struct xfs_inode *);
-typedef int	(*xfs_dqpurgeall_t)(struct xfs_mount *, uint);
-typedef int	(*xfs_dqvopalloc_t)(struct xfs_mount *,
-			struct xfs_inode *, uid_t, gid_t, prid_t, uint,
-			struct xfs_dquot **, struct xfs_dquot **);
-typedef void	(*xfs_dqvopcreate_t)(struct xfs_trans *, struct xfs_inode *,
-			struct xfs_dquot *, struct xfs_dquot *);
-typedef int	(*xfs_dqvoprename_t)(struct xfs_inode **);
-typedef struct xfs_dquot * (*xfs_dqvopchown_t)(
-			struct xfs_trans *, struct xfs_inode *,
-			struct xfs_dquot **, struct xfs_dquot *);
-typedef int	(*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
-			struct xfs_dquot *, struct xfs_dquot *, uint);
-typedef void	(*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
-typedef int	(*xfs_dqsync_t)(struct xfs_mount *, int flags);
-
-typedef struct xfs_qmops {
-	xfs_qminit_t		xfs_qminit;
-	xfs_qmdone_t		xfs_qmdone;
-	xfs_qmmount_t		xfs_qmmount;
-	xfs_qmunmount_t		xfs_qmunmount;
-	xfs_dqrele_t		xfs_dqrele;
-	xfs_dqattach_t		xfs_dqattach;
-	xfs_dqdetach_t		xfs_dqdetach;
-	xfs_dqpurgeall_t	xfs_dqpurgeall;
-	xfs_dqvopalloc_t	xfs_dqvopalloc;
-	xfs_dqvopcreate_t	xfs_dqvopcreate;
-	xfs_dqvoprename_t	xfs_dqvoprename;
-	xfs_dqvopchown_t	xfs_dqvopchown;
-	xfs_dqvopchownresv_t	xfs_dqvopchownresv;
-	xfs_dqstatvfs_t		xfs_dqstatvfs;
-	xfs_dqsync_t		xfs_dqsync;
-	struct xfs_dqtrxops	*xfs_dqtrxops;
-} xfs_qmops_t;
-
-#define XFS_QM_INIT(mp, mnt, fl) \
-	(*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl)
-#define XFS_QM_MOUNT(mp, mnt, fl) \
-	(*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl)
-#define XFS_QM_UNMOUNT(mp) \
-	(*(mp)->m_qm_ops->xfs_qmunmount)(mp)
-#define XFS_QM_DONE(mp) \
-	(*(mp)->m_qm_ops->xfs_qmdone)(mp)
-#define XFS_QM_DQRELE(mp, dq) \
-	(*(mp)->m_qm_ops->xfs_dqrele)(dq)
-#define XFS_QM_DQATTACH(mp, ip, fl) \
-	(*(mp)->m_qm_ops->xfs_dqattach)(ip, fl)
-#define XFS_QM_DQDETACH(mp, ip) \
-	(*(mp)->m_qm_ops->xfs_dqdetach)(ip)
-#define XFS_QM_DQPURGEALL(mp, fl) \
-	(*(mp)->m_qm_ops->xfs_dqpurgeall)(mp, fl)
-#define XFS_QM_DQVOPALLOC(mp, ip, uid, gid, prid, fl, dq1, dq2) \
-	(*(mp)->m_qm_ops->xfs_dqvopalloc)(mp, ip, uid, gid, prid, fl, dq1, dq2)
-#define XFS_QM_DQVOPCREATE(mp, tp, ip, dq1, dq2) \
-	(*(mp)->m_qm_ops->xfs_dqvopcreate)(tp, ip, dq1, dq2)
-#define XFS_QM_DQVOPRENAME(mp, ip) \
-	(*(mp)->m_qm_ops->xfs_dqvoprename)(ip)
-#define XFS_QM_DQVOPCHOWN(mp, tp, ip, dqp, dq) \
-	(*(mp)->m_qm_ops->xfs_dqvopchown)(tp, ip, dqp, dq)
-#define XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, dq1, dq2, fl) \
-	(*(mp)->m_qm_ops->xfs_dqvopchownresv)(tp, ip, dq1, dq2, fl)
-#define XFS_QM_DQSTATVFS(ip, statp) \
-	(*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp)
-#define XFS_QM_DQSYNC(mp, flags) \
-	(*(mp)->m_qm_ops->xfs_dqsync)(mp, flags)
-
 #ifdef HAVE_PERCPU_SB
 
 /*
@@ -510,8 +432,6 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 
 extern int	xfs_dmops_get(struct xfs_mount *);
 extern void	xfs_dmops_put(struct xfs_mount *);
-extern int	xfs_qmops_get(struct xfs_mount *);
-extern void	xfs_qmops_put(struct xfs_mount *);
 
 extern struct xfs_dmops xfs_dmcore_xfs;
 
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
deleted file mode 100644
index e101790ea8e7..000000000000
--- a/fs/xfs/xfs_qmops.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_quota.h"
-#include "xfs_error.h"
-
-
-STATIC struct xfs_dquot *
-xfs_dqvopchown_default(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	struct xfs_dquot	**dqp,
-	struct xfs_dquot	*dq)
-{
-	return NULL;
-}
-
-/*
- * Clear the quotaflags in memory and in the superblock.
- */
-int
-xfs_mount_reset_sbqflags(xfs_mount_t *mp)
-{
-	int			error;
-	xfs_trans_t		*tp;
-
-	mp->m_qflags = 0;
-	/*
-	 * It is OK to look at sb_qflags here in mount path,
-	 * without m_sb_lock.
-	 */
-	if (mp->m_sb.sb_qflags == 0)
-		return 0;
-	spin_lock(&mp->m_sb_lock);
-	mp->m_sb.sb_qflags = 0;
-	spin_unlock(&mp->m_sb_lock);
-
-	/*
-	 * if the fs is readonly, let the incore superblock run
-	 * with quotas off but don't flush the update out to disk
-	 */
-	if (mp->m_flags & XFS_MOUNT_RDONLY)
-		return 0;
-#ifdef QUOTADEBUG
-	xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
-#endif
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-	if ((error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
-				      XFS_DEFAULT_LOG_COUNT))) {
-		xfs_trans_cancel(tp, 0);
-		xfs_fs_cmn_err(CE_ALERT, mp,
-			"xfs_mount_reset_sbqflags: Superblock update failed!");
-		return error;
-	}
-	xfs_mod_sb(tp, XFS_SB_QFLAGS);
-	error = xfs_trans_commit(tp, 0);
-	return error;
-}
-
-STATIC int
-xfs_noquota_init(
-	xfs_mount_t	*mp,
-	uint		*needquotamount,
-	uint		*quotaflags)
-{
-	int	error = 0;
-
-	*quotaflags = 0;
-	*needquotamount = B_FALSE;
-
-	ASSERT(!XFS_IS_QUOTA_ON(mp));
-
-	/*
-	 * If a file system had quotas running earlier, but decided to
-	 * mount without -o uquota/pquota/gquota options, revoke the
-	 * quotachecked license.
-	 */
-	if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
-		cmn_err(CE_NOTE,
-			"XFS resetting qflags for filesystem %s",
-			mp->m_fsname);
-
-		error = xfs_mount_reset_sbqflags(mp);
-	}
-	return error;
-}
-
-static struct xfs_qmops xfs_qmcore_stub = {
-	.xfs_qminit		= (xfs_qminit_t) xfs_noquota_init,
-	.xfs_qmdone		= (xfs_qmdone_t) fs_noerr,
-	.xfs_qmmount		= (xfs_qmmount_t) fs_noerr,
-	.xfs_qmunmount		= (xfs_qmunmount_t) fs_noerr,
-	.xfs_dqrele		= (xfs_dqrele_t) fs_noerr,
-	.xfs_dqattach		= (xfs_dqattach_t) fs_noerr,
-	.xfs_dqdetach		= (xfs_dqdetach_t) fs_noerr,
-	.xfs_dqpurgeall		= (xfs_dqpurgeall_t) fs_noerr,
-	.xfs_dqvopalloc		= (xfs_dqvopalloc_t) fs_noerr,
-	.xfs_dqvopcreate	= (xfs_dqvopcreate_t) fs_noerr,
-	.xfs_dqvoprename	= (xfs_dqvoprename_t) fs_noerr,
-	.xfs_dqvopchown		= xfs_dqvopchown_default,
-	.xfs_dqvopchownresv	= (xfs_dqvopchownresv_t) fs_noerr,
-	.xfs_dqstatvfs		= (xfs_dqstatvfs_t) fs_noval,
-	.xfs_dqsync		= (xfs_dqsync_t) fs_noerr,
-};
-
-int
-xfs_qmops_get(struct xfs_mount *mp)
-{
-	if (XFS_IS_QUOTA_RUNNING(mp)) {
-#ifdef CONFIG_XFS_QUOTA
-		mp->m_qm_ops = &xfs_qmcore_xfs;
-#else
-		cmn_err(CE_WARN,
139 "XFS: qouta support not available in this kernel.");
-                return EINVAL;
-#endif
-        } else {
-                mp->m_qm_ops = &xfs_qmcore_stub;
-        }
-
-        return 0;
-}
-
-void
-xfs_qmops_put(struct xfs_mount *mp)
-{
-}
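
Note the trick the deleted stub table relied on: generic no-op helpers (fs_noerr, fs_noval) cast to each operation's function-pointer type. A reduced illustration of that pattern, continuing the hypothetical types from the sketch above (calling through a cast, incompatible function-pointer type is technically undefined behaviour in ISO C, one more reason to prefer the #ifdef stubs that replace it):

        /* Sketch: one no-op helper cast to many operation signatures. */
        static int fs_noerr(void) { return 0; }

        typedef int (*qm_mount_t)(struct mount_ctx *);
        typedef int (*qm_sync_t)(struct mount_ctx *, int);

        static const struct qm_ops stub_ops = {
                .qm_mount = (qm_mount_t)fs_noerr,  /* always "succeeds" */
                .qm_sync  = (qm_sync_t)fs_noerr,
        };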
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index f5d1202dde25..3ec91ac74c2a 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -197,7 +197,6 @@ typedef struct xfs_qoff_logformat {
 #define XFS_QMOPT_UMOUNTING     0x0000100 /* filesys is being unmounted */
 #define XFS_QMOPT_DOLOG         0x0000200 /* log buf changes (in quotacheck) */
 #define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
-#define XFS_QMOPT_ILOCKED       0x0000800 /* inode is already locked (excl) */
 #define XFS_QMOPT_DQREPAIR      0x0001000 /* repair dquot if damaged */
 #define XFS_QMOPT_GQUOTA        0x0002000 /* group dquot requested */
 #define XFS_QMOPT_ENOSPC        0x0004000 /* enospc instead of edquot (prj) */
@@ -302,69 +301,79 @@ typedef struct xfs_dqtrx {
         long            qt_delrtb_delta;  /* delayed RT blk count changes */
 } xfs_dqtrx_t;
 
-/*
- * Dquot transaction functions, used if quota is enabled.
- */
-typedef void    (*qo_dup_dqinfo_t)(struct xfs_trans *, struct xfs_trans *);
-typedef void    (*qo_mod_dquot_byino_t)(struct xfs_trans *,
-                        struct xfs_inode *, uint, long);
-typedef void    (*qo_free_dqinfo_t)(struct xfs_trans *);
-typedef void    (*qo_apply_dquot_deltas_t)(struct xfs_trans *);
-typedef void    (*qo_unreserve_and_mod_dquots_t)(struct xfs_trans *);
-typedef int     (*qo_reserve_quota_nblks_t)(
-                        struct xfs_trans *, struct xfs_mount *,
-                        struct xfs_inode *, long, long, uint);
-typedef int     (*qo_reserve_quota_bydquots_t)(
-                        struct xfs_trans *, struct xfs_mount *,
-                        struct xfs_dquot *, struct xfs_dquot *,
-                        long, long, uint);
-typedef struct xfs_dqtrxops {
-        qo_dup_dqinfo_t                 qo_dup_dqinfo;
-        qo_free_dqinfo_t                qo_free_dqinfo;
-        qo_mod_dquot_byino_t            qo_mod_dquot_byino;
-        qo_apply_dquot_deltas_t         qo_apply_dquot_deltas;
-        qo_reserve_quota_nblks_t        qo_reserve_quota_nblks;
-        qo_reserve_quota_bydquots_t     qo_reserve_quota_bydquots;
-        qo_unreserve_and_mod_dquots_t   qo_unreserve_and_mod_dquots;
-} xfs_dqtrxops_t;
-
-#define XFS_DQTRXOP(mp, tp, op, args...) \
-        ((mp)->m_qm_ops->xfs_dqtrxops ? \
-        ((mp)->m_qm_ops->xfs_dqtrxops->op)(tp, ## args) : 0)
-
-#define XFS_DQTRXOP_VOID(mp, tp, op, args...) \
-        ((mp)->m_qm_ops->xfs_dqtrxops ? \
-        ((mp)->m_qm_ops->xfs_dqtrxops->op)(tp, ## args) : (void)0)
-
-#define XFS_TRANS_DUP_DQINFO(mp, otp, ntp) \
-        XFS_DQTRXOP_VOID(mp, otp, qo_dup_dqinfo, ntp)
-#define XFS_TRANS_FREE_DQINFO(mp, tp) \
-        XFS_DQTRXOP_VOID(mp, tp, qo_free_dqinfo)
-#define XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, field, delta) \
-        XFS_DQTRXOP_VOID(mp, tp, qo_mod_dquot_byino, ip, field, delta)
-#define XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp) \
-        XFS_DQTRXOP_VOID(mp, tp, qo_apply_dquot_deltas)
-#define XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, ninos, fl) \
-        XFS_DQTRXOP(mp, tp, qo_reserve_quota_nblks, mp, ip, nblks, ninos, fl)
-#define XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, fl) \
-        XFS_DQTRXOP(mp, tp, qo_reserve_quota_bydquots, mp, ud, gd, nb, ni, fl)
-#define XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp) \
-        XFS_DQTRXOP_VOID(mp, tp, qo_unreserve_and_mod_dquots)
-
-#define XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, ninos, flags) \
-        XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, -(nblks), -(ninos), flags)
-#define XFS_TRANS_RESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \
-        XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, \
-                                f | XFS_QMOPT_RES_REGBLKS)
-#define XFS_TRANS_UNRESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \
-        XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, -(nb), -(ni), \
+#ifdef CONFIG_XFS_QUOTA
+extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *);
+extern void xfs_trans_free_dqinfo(struct xfs_trans *);
+extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
+                uint, long);
+extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
+extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
+extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
+                struct xfs_inode *, long, long, uint);
+extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
+                struct xfs_mount *, struct xfs_dquot *,
+                struct xfs_dquot *, long, long, uint);
+
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
+                struct xfs_dquot **, struct xfs_dquot **);
+extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
+                struct xfs_dquot *, struct xfs_dquot *);
+extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
+extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
+                struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
+extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
+                struct xfs_dquot *, struct xfs_dquot *, uint);
+extern int xfs_qm_dqattach(struct xfs_inode *, uint);
+extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
+extern void xfs_qm_dqdetach(struct xfs_inode *);
+extern void xfs_qm_dqrele(struct xfs_dquot *);
+extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *);
+extern int xfs_qm_sync(struct xfs_mount *, int);
+extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *);
+extern void xfs_qm_mount_quotas(struct xfs_mount *);
+extern void xfs_qm_unmount(struct xfs_mount *);
+extern void xfs_qm_unmount_quotas(struct xfs_mount *);
+
+#else
+static inline int
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
+                uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp)
+{
+        *udqp = NULL;
+        *gdqp = NULL;
+        return 0;
+}
+#define xfs_trans_dup_dqinfo(tp, tp2)
+#define xfs_trans_free_dqinfo(tp)
+#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
+#define xfs_trans_apply_dquot_deltas(tp)
+#define xfs_trans_unreserve_and_mod_dquots(tp)
+#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags)  (0)
+#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl)  (0)
+#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
+#define xfs_qm_vop_rename_dqattach(it)          (0)
+#define xfs_qm_vop_chown(tp, ip, old, new)      (NULL)
+#define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl)  (0)
+#define xfs_qm_dqattach(ip, fl)                 (0)
+#define xfs_qm_dqattach_locked(ip, fl)          (0)
+#define xfs_qm_dqdetach(ip)
+#define xfs_qm_dqrele(d)
+#define xfs_qm_statvfs(ip, s)
+#define xfs_qm_sync(mp, fl)                     (0)
+#define xfs_qm_newmount(mp, a, b)               (0)
+#define xfs_qm_mount_quotas(mp)
+#define xfs_qm_unmount(mp)
+#define xfs_qm_unmount_quotas(mp)               (0)
+#endif /* CONFIG_XFS_QUOTA */
+
+#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
+        xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
+#define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \
+        xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
                                 f | XFS_QMOPT_RES_REGBLKS)
 
 extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *);
 extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
 
-extern struct xfs_qmops xfs_qmcore_xfs;
-
 #endif /* __KERNEL__ */
-
 #endif /* __XFS_QUOTA_H__ */
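
The shape of the change above is worth spelling out: an ops vector consulted at run time becomes a compile-time switch. A minimal sketch of the pattern, using a hypothetical subsystem name:

        #ifdef CONFIG_FOO
        extern int foo_reserve(struct foo_ctx *cp, long nblks);
        #else
        /*
         * Compile-out stubs: statement-like calls expand to nothing,
         * value-like calls expand to a constant 0 ("success").
         */
        #define foo_reserve(cp, nblks)  (0)
        #endif

With CONFIG_XFS_QUOTA disabled, the stubs above let callers keep writing plain error checks such as "error = xfs_qm_dqattach(ip, 0);" and have the compiler discard the dead branch entirely.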
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 58f85e9cd11d..b81deea0ce19 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -166,7 +166,8 @@ xfs_rename(
         /*
          * Attach the dquots to the inodes
          */
-        if ((error = XFS_QM_DQVOPRENAME(mp, inodes))) {
+        error = xfs_qm_vop_rename_dqattach(inodes);
+        if (error) {
                 xfs_trans_cancel(tp, cancel_flags);
                 goto std_return;
         }
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 36f3a21c54d2..fea68615ed23 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -41,7 +41,6 @@
 #include "xfs_ialloc.h"
 #include "xfs_attr.h"
 #include "xfs_bmap.h"
-#include "xfs_acl.h"
 #include "xfs_error.h"
 #include "xfs_buf_item.h"
 #include "xfs_rw.h"
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 8570b826fedd..66b849358e62 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -297,7 +297,7 @@ xfs_trans_dup(
         tp->t_rtx_res = tp->t_rtx_res_used;
         ntp->t_pflags = tp->t_pflags;
 
-        XFS_TRANS_DUP_DQINFO(tp->t_mountp, tp, ntp);
+        xfs_trans_dup_dqinfo(tp, ntp);
 
         atomic_inc(&tp->t_mountp->m_active_trans);
         return ntp;
@@ -628,8 +628,6 @@ xfs_trans_apply_sb_deltas(
         xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount),
                           offsetof(xfs_dsb_t, sb_frextents) +
                           sizeof(sbp->sb_frextents) - 1);
-
-        tp->t_mountp->m_super->s_dirt = 1;
 }
 
 /*
@@ -831,7 +829,7 @@ shut_us_down:
                  * means is that we have some (non-persistent) quota
                  * reservations that need to be unreserved.
                  */
-                XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp);
+                xfs_trans_unreserve_and_mod_dquots(tp);
                 if (tp->t_ticket) {
                         commit_lsn = xfs_log_done(mp, tp->t_ticket,
                                                         NULL, log_flags);
@@ -850,10 +848,9 @@ shut_us_down:
         /*
          * If we need to update the superblock, then do it now.
          */
-        if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
+        if (tp->t_flags & XFS_TRANS_SB_DIRTY)
                 xfs_trans_apply_sb_deltas(tp);
-        }
-        XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp);
+        xfs_trans_apply_dquot_deltas(tp);
 
         /*
          * Ask each log item how many log_vector entries it will
@@ -1058,7 +1055,7 @@ xfs_trans_uncommit(
         }
 
         xfs_trans_unreserve_and_mod_sb(tp);
-        XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(tp->t_mountp, tp);
+        xfs_trans_unreserve_and_mod_dquots(tp);
 
         xfs_trans_free_items(tp, flags);
         xfs_trans_free_busy(tp);
@@ -1183,7 +1180,7 @@ xfs_trans_cancel(
         }
 #endif
         xfs_trans_unreserve_and_mod_sb(tp);
-        XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp);
+        xfs_trans_unreserve_and_mod_dquots(tp);
 
         if (tp->t_ticket) {
                 if (flags & XFS_TRANS_RELEASE_LOG_RES) {
@@ -1213,7 +1210,7 @@ xfs_trans_free(
         xfs_trans_t     *tp)
 {
         atomic_dec(&tp->t_mountp->m_active_trans);
-        XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp);
+        xfs_trans_free_dqinfo(tp);
         kmem_zone_free(xfs_trans_zone, tp);
 }
 
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 79b9e5ea5359..4d88616bde91 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -166,7 +166,7 @@ xfs_dir_ialloc(
                 xfs_buf_relse(ialloc_context);
                 if (dqinfo) {
                         tp->t_dqinfo = dqinfo;
-                        XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp);
+                        xfs_trans_free_dqinfo(tp);
                 }
                 *tpp = ntp;
                 *ipp = NULL;
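
A recurring cleanup in the xfs_vnodeops.c hunks below is also worth noting: the old code folded the call into the condition, the new code separates them, matching the style that checkpatch.pl enforces ("do not use assignment in if condition"). Using a function that actually appears in this diff:

        /* Before: assignment hidden inside the if condition. */
        if ((error = xfs_qm_dqattach(ip, 0)))
                return error;

        /* After: the call reads as a statement, the check stands alone. */
        error = xfs_qm_dqattach(ip, 0);
        if (error)
                return error;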
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 19cf90a9c762..492d75bae2bf 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -42,6 +42,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
+#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_rw.h"
 #include "xfs_error.h"
@@ -118,7 +119,7 @@ xfs_setattr(
                  */
                 ASSERT(udqp == NULL);
                 ASSERT(gdqp == NULL);
-                code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, ip->i_d.di_projid,
+                code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid,
                                          qflags, &udqp, &gdqp);
                 if (code)
                         return code;
@@ -180,10 +181,11 @@ xfs_setattr(
                  * Do a quota reservation only if uid/gid is actually
                  * going to change.
                  */
-                if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
-                    (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
+                if (XFS_IS_QUOTA_RUNNING(mp) &&
+                    ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
+                     (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
                         ASSERT(tp);
-                        code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
+                        code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
                                                 capable(CAP_FOWNER) ?
                                                 XFS_QMOPT_FORCE_RES : 0);
                         if (code)       /* out of quota */
@@ -217,7 +219,7 @@ xfs_setattr(
         /*
          * Make sure that the dquots are attached to the inode.
          */
-        code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
+        code = xfs_qm_dqattach_locked(ip, 0);
         if (code)
                 goto error_return;
 
@@ -351,21 +353,21 @@ xfs_setattr(
                  * in the transaction.
                  */
                 if (iuid != uid) {
-                        if (XFS_IS_UQUOTA_ON(mp)) {
+                        if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
                                 ASSERT(mask & ATTR_UID);
                                 ASSERT(udqp);
-                                olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
+                                olddquot1 = xfs_qm_vop_chown(tp, ip,
                                                         &ip->i_udquot, udqp);
                         }
                         ip->i_d.di_uid = uid;
                         inode->i_uid = uid;
                 }
                 if (igid != gid) {
-                        if (XFS_IS_GQUOTA_ON(mp)) {
+                        if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
                                 ASSERT(mask & ATTR_GID);
                                 ASSERT(gdqp);
-                                olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
+                                olddquot2 = xfs_qm_vop_chown(tp, ip,
                                                         &ip->i_gdquot, gdqp);
                         }
                         ip->i_d.di_gid = gid;
@@ -461,13 +463,25 @@ xfs_setattr(
         /*
          * Release any dquot(s) the inode had kept before chown.
          */
-        XFS_QM_DQRELE(mp, olddquot1);
-        XFS_QM_DQRELE(mp, olddquot2);
-        XFS_QM_DQRELE(mp, udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(olddquot1);
+        xfs_qm_dqrele(olddquot2);
+        xfs_qm_dqrele(udqp);
+        xfs_qm_dqrele(gdqp);
 
-        if (code) {
+        if (code)
                 return code;
+
+        /*
+         * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
+         *           update. We could avoid this with linked transactions
+         *           and passing down the transaction pointer all the way
+         *           to attr_set. No previous user of the generic
+         *           Posix ACL code seems to care about this issue either.
+         */
+        if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
+                code = -xfs_acl_chmod(inode);
+                if (code)
+                        return XFS_ERROR(code);
         }
 
         if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
@@ -482,8 +496,8 @@ xfs_setattr(
                 commit_flags |= XFS_TRANS_ABORT;
                 /* FALLTHROUGH */
  error_return:
-        XFS_QM_DQRELE(mp, udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(udqp);
+        xfs_qm_dqrele(gdqp);
         if (tp) {
                 xfs_trans_cancel(tp, commit_flags);
         }
@@ -524,7 +538,9 @@ xfs_readlink_bmap(
                 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
                 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 
-                bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
+                bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt),
+                                        XBF_LOCK | XBF_MAPPED |
+                                        XBF_DONT_BLOCK);
                 error = XFS_BUF_GETERROR(bp);
                 if (error) {
                         xfs_ioerror_alert("xfs_readlink",
@@ -739,7 +755,8 @@ xfs_free_eofblocks(
         /*
          * Attach the dquots to the inode up front.
          */
-        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+        error = xfs_qm_dqattach(ip, 0);
+        if (error)
                 return error;
 
         /*
@@ -1181,7 +1198,8 @@ xfs_inactive(
 
         ASSERT(ip->i_d.di_nlink == 0);
 
-        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+        error = xfs_qm_dqattach(ip, 0);
+        if (error)
                 return VN_INACTIVE_CACHE;
 
         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
@@ -1307,7 +1325,7 @@ xfs_inactive(
         /*
          * Credit the quota account(s). The inode is gone.
          */
-        XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
+        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
 
         /*
          * Just ignore errors at this point. There is nothing we can
@@ -1323,11 +1341,11 @@ xfs_inactive(
                 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
                         "xfs_trans_commit() returned error %d", error);
         }
+
         /*
          * Release the dquots held by inode, if any.
          */
-        XFS_QM_DQDETACH(mp, ip);
-
+        xfs_qm_dqdetach(ip);
         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 
  out:
@@ -1427,8 +1445,7 @@ xfs_create(
         /*
          * Make sure that we have allocated dquot(s) on disk.
          */
-        error = XFS_QM_DQVOPALLOC(mp, dp,
-                        current_fsuid(), current_fsgid(), prid,
+        error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
         if (error)
                 goto std_return;
@@ -1489,7 +1506,7 @@ xfs_create(
         /*
          * Reserve disk quota and the inode.
          */
-        error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
+        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
         if (error)
                 goto out_trans_cancel;
 
@@ -1561,7 +1578,7 @@ xfs_create(
          * These ids of the inode couldn't have changed since the new
          * inode has been locked ever since it was created.
          */
-        XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
+        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
 
         /*
          * xfs_trans_commit normally decrements the vnode ref count
@@ -1580,8 +1597,8 @@ xfs_create(
                 goto out_dqrele;
         }
 
-        XFS_QM_DQRELE(mp, udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(udqp);
+        xfs_qm_dqrele(gdqp);
 
         *ipp = ip;
 
@@ -1602,8 +1619,8 @@ xfs_create(
  out_trans_cancel:
         xfs_trans_cancel(tp, cancel_flags);
  out_dqrele:
-        XFS_QM_DQRELE(mp, udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(udqp);
+        xfs_qm_dqrele(gdqp);
 
         if (unlock_dp_on_error)
                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -1837,11 +1854,11 @@ xfs_remove(
                 return error;
         }
 
-        error = XFS_QM_DQATTACH(mp, dp, 0);
+        error = xfs_qm_dqattach(dp, 0);
         if (error)
                 goto std_return;
 
-        error = XFS_QM_DQATTACH(mp, ip, 0);
+        error = xfs_qm_dqattach(ip, 0);
         if (error)
                 goto std_return;
 
@@ -2028,11 +2045,11 @@ xfs_link(
 
         /* Return through std_return after this point. */
 
-        error = XFS_QM_DQATTACH(mp, sip, 0);
+        error = xfs_qm_dqattach(sip, 0);
         if (error)
                 goto std_return;
 
-        error = XFS_QM_DQATTACH(mp, tdp, 0);
+        error = xfs_qm_dqattach(tdp, 0);
         if (error)
                 goto std_return;
 
@@ -2205,8 +2222,7 @@ xfs_symlink(
         /*
          * Make sure that we have allocated dquot(s) on disk.
          */
-        error = XFS_QM_DQVOPALLOC(mp, dp,
-                        current_fsuid(), current_fsgid(), prid,
+        error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
         if (error)
                 goto std_return;
@@ -2248,7 +2264,7 @@ xfs_symlink(
         /*
          * Reserve disk quota : blocks and inode.
          */
-        error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
+        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
         if (error)
                 goto error_return;
 
@@ -2288,7 +2304,7 @@ xfs_symlink(
         /*
          * Also attach the dquot(s) to it, if applicable.
          */
-        XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
+        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
 
         if (resblks)
                 resblks -= XFS_IALLOC_SPACE_RES(mp);
@@ -2376,8 +2392,8 @@ xfs_symlink(
                 goto error2;
         }
         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-        XFS_QM_DQRELE(mp, udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(udqp);
+        xfs_qm_dqrele(gdqp);
 
         /* Fall through to std_return with error = 0 or errno from
          * xfs_trans_commit */
@@ -2401,8 +2417,8 @@ std_return:
         cancel_flags |= XFS_TRANS_ABORT;
  error_return:
         xfs_trans_cancel(tp, cancel_flags);
-        XFS_QM_DQRELE(mp, udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(udqp);
+        xfs_qm_dqrele(gdqp);
 
         if (unlock_dp_on_error)
                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -2541,7 +2557,8 @@ xfs_alloc_file_space(
         if (XFS_FORCED_SHUTDOWN(mp))
                 return XFS_ERROR(EIO);
 
-        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+        error = xfs_qm_dqattach(ip, 0);
+        if (error)
                 return error;
 
         if (len <= 0)
@@ -2628,8 +2645,8 @@ retry:
                         break;
                 }
                 xfs_ilock(ip, XFS_ILOCK_EXCL);
-                error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
-                                                      qblocks, 0, quota_flag);
+                error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
+                                                      0, quota_flag);
                 if (error)
                         goto error1;
 
@@ -2688,7 +2705,7 @@ dmapi_enospc_check:
 
 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
         xfs_bmap_cancel(&free_list);
-        XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
+        xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
 
 error1: /* Just cancel transaction */
         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
@@ -2827,7 +2844,8 @@ xfs_free_file_space(
 
         xfs_itrace_entry(ip);
 
-        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+        error = xfs_qm_dqattach(ip, 0);
+        if (error)
                 return error;
 
         error = 0;
@@ -2953,9 +2971,9 @@ xfs_free_file_space(
                         break;
                 }
                 xfs_ilock(ip, XFS_ILOCK_EXCL);
-                error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
-                                ip->i_udquot, ip->i_gdquot, resblks, 0,
-                                XFS_QMOPT_RES_REGBLKS);
+                error = xfs_trans_reserve_quota(tp, mp,
+                                ip->i_udquot, ip->i_gdquot,
+                                resblks, 0, XFS_QMOPT_RES_REGBLKS);
                 if (error)
                         goto error1;
 
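
One detail in the xfs_setattr hunk above deserves a comment: xfs_acl_chmod() follows the usual Linux convention of returning a negative errno, while XFS internally passes positive error codes around, hence the negation before the XFS_ERROR() wrapper:

        /* Negate the Linux-style return into XFS's positive-errno world. */
        code = -xfs_acl_chmod(inode);
        if (code)
                return XFS_ERROR(code);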
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 04373c6c61ff..a9e102de71a1 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,6 +18,7 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
 #define XFS_ATTR_DMI            0x01    /* invocation from a DMI function */
 #define XFS_ATTR_NONBLOCK       0x02    /* return EAGAIN if operation would block */
 #define XFS_ATTR_NOLOCK         0x04    /* Don't grab any conflicting locks */
+#define XFS_ATTR_NOACL          0x08    /* Don't call xfs_acl_chmod */
 
 int xfs_readlink(struct xfs_inode *ip, char *link);
 int xfs_fsync(struct xfs_inode *ip);
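
The new XFS_ATTR_NOACL flag closes a recursion hazard: xfs_setattr now calls xfs_acl_chmod on mode changes, so a caller that is itself applying an ACL needs a way to opt out. A hedged sketch of such a caller (hypothetical helper name; the real call site lives in the ACL code):

        /* Apply the mode derived from an ACL without re-entering ACL code. */
        static int set_mode_from_acl(struct xfs_inode *ip, umode_t mode)
        {
                struct iattr iattr;

                iattr.ia_valid = ATTR_MODE;
                iattr.ia_mode = mode;
                return xfs_setattr(ip, &iattr, XFS_ATTR_NOACL);
        }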