Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/Kconfig | 5
-rw-r--r-- fs/9p/Makefile | 1
-rw-r--r-- fs/9p/acl.c | 4
-rw-r--r-- fs/9p/v9fs.h | 42
-rw-r--r-- fs/9p/v9fs_vfs.h | 1
-rw-r--r-- fs/9p/vfs_dentry.c | 2
-rw-r--r-- fs/9p/vfs_inode.c | 876
-rw-r--r-- fs/9p/vfs_inode_dotl.c | 824
-rw-r--r-- fs/9p/vfs_super.c | 8
-rw-r--r-- fs/9p/xattr.c | 2
-rw-r--r-- fs/Kconfig | 19
-rw-r--r-- fs/adfs/dir.c | 1
-rw-r--r-- fs/adfs/super.c | 4
-rw-r--r-- fs/affs/affs.h | 1
-rw-r--r-- fs/affs/namei.c | 3
-rw-r--r-- fs/affs/super.c | 6
-rw-r--r-- fs/afs/cmservice.c | 12
-rw-r--r-- fs/afs/dir.c | 5
-rw-r--r-- fs/afs/inode.c | 3
-rw-r--r-- fs/afs/internal.h | 3
-rw-r--r-- fs/afs/main.c | 13
-rw-r--r-- fs/afs/mntpt.c | 63
-rw-r--r-- fs/afs/rxrpc.c | 2
-rw-r--r-- fs/afs/server.c | 13
-rw-r--r-- fs/afs/super.c | 1
-rw-r--r-- fs/afs/vlocation.c | 14
-rw-r--r-- fs/aio.c | 31
-rw-r--r-- fs/anon_inodes.c | 29
-rw-r--r-- fs/autofs4/autofs_i.h | 113
-rw-r--r-- fs/autofs4/dev-ioctl.c | 2
-rw-r--r-- fs/autofs4/expire.c | 55
-rw-r--r-- fs/autofs4/inode.c | 114
-rw-r--r-- fs/autofs4/root.c | 743
-rw-r--r-- fs/autofs4/symlink.c | 3
-rw-r--r-- fs/autofs4/waitq.c | 17
-rw-r--r-- fs/befs/endian.h | 16
-rw-r--r-- fs/befs/linuxvfs.c | 2
-rw-r--r-- fs/binfmt_elf.c | 23
-rw-r--r-- fs/bio-integrity.c | 7
-rw-r--r-- fs/block_dev.c | 768
-rw-r--r-- fs/btrfs/Kconfig | 2
-rw-r--r-- fs/btrfs/Makefile | 2
-rw-r--r-- fs/btrfs/acl.c | 4
-rw-r--r-- fs/btrfs/btrfs_inode.h | 2
-rw-r--r-- fs/btrfs/compression.c | 329
-rw-r--r-- fs/btrfs/compression.h | 72
-rw-r--r-- fs/btrfs/ctree.c | 8
-rw-r--r-- fs/btrfs/ctree.h | 49
-rw-r--r-- fs/btrfs/disk-io.c | 412
-rw-r--r-- fs/btrfs/disk-io.h | 1
-rw-r--r-- fs/btrfs/export.c | 12
-rw-r--r-- fs/btrfs/extent-tree.c | 90
-rw-r--r-- fs/btrfs/extent_io.c | 7
-rw-r--r-- fs/btrfs/extent_io.h | 17
-rw-r--r-- fs/btrfs/extent_map.c | 2
-rw-r--r-- fs/btrfs/extent_map.h | 3
-rw-r--r-- fs/btrfs/file.c | 126
-rw-r--r-- fs/btrfs/inode.c | 199
-rw-r--r-- fs/btrfs/ioctl.c | 220
-rw-r--r-- fs/btrfs/ioctl.h | 12
-rw-r--r-- fs/btrfs/lzo.c | 420
-rw-r--r-- fs/btrfs/ordered-data.c | 18
-rw-r--r-- fs/btrfs/ordered-data.h | 8
-rw-r--r-- fs/btrfs/super.c | 282
-rw-r--r-- fs/btrfs/transaction.c | 11
-rw-r--r-- fs/btrfs/transaction.h | 1
-rw-r--r-- fs/btrfs/volumes.c | 654
-rw-r--r-- fs/btrfs/volumes.h | 29
-rw-r--r-- fs/btrfs/xattr.c | 18
-rw-r--r-- fs/btrfs/zlib.c | 369
-rw-r--r-- fs/ceph/Makefile | 23
-rw-r--r-- fs/ceph/debugfs.c | 9
-rw-r--r-- fs/ceph/dir.c | 20
-rw-r--r-- fs/ceph/export.c | 2
-rw-r--r-- fs/ceph/inode.c | 4
-rw-r--r-- fs/ceph/mds_client.c | 56
-rw-r--r-- fs/ceph/mds_client.h | 2
-rw-r--r-- fs/ceph/super.c | 13
-rw-r--r-- fs/ceph/super.h | 2
-rw-r--r-- fs/char_dev.c | 15
-rw-r--r-- fs/cifs/cache.c | 16
-rw-r--r-- fs/cifs/cifs_debug.c | 32
-rw-r--r-- fs/cifs/cifs_dfs_ref.c | 120
-rw-r--r-- fs/cifs/cifs_fs_sb.h | 1
-rw-r--r-- fs/cifs/cifs_spnego.c | 10
-rw-r--r-- fs/cifs/cifs_unicode.c | 127
-rw-r--r-- fs/cifs/cifsacl.c | 13
-rw-r--r-- fs/cifs/cifsencrypt.c | 6
-rw-r--r-- fs/cifs/cifsfs.c | 67
-rw-r--r-- fs/cifs/cifsfs.h | 21
-rw-r--r-- fs/cifs/cifsglob.h | 73
-rw-r--r-- fs/cifs/cifspdu.h | 62
-rw-r--r-- fs/cifs/cifsproto.h | 9
-rw-r--r-- fs/cifs/cifssmb.c | 118
-rw-r--r-- fs/cifs/connect.c | 654
-rw-r--r-- fs/cifs/dir.c | 33
-rw-r--r-- fs/cifs/file.c | 522
-rw-r--r-- fs/cifs/inode.c | 30
-rw-r--r-- fs/cifs/link.c | 4
-rw-r--r-- fs/cifs/misc.c | 73
-rw-r--r-- fs/cifs/netmisc.c | 8
-rw-r--r-- fs/cifs/readdir.c | 6
-rw-r--r-- fs/cifs/sess.c | 150
-rw-r--r-- fs/cifs/transport.c | 436
-rw-r--r-- fs/coda/cache.c | 5
-rw-r--r-- fs/coda/cnode.c | 3
-rw-r--r-- fs/coda/coda_cache.h | 22
-rw-r--r-- fs/coda/coda_fs_i.h | 58
-rw-r--r-- fs/coda/coda_linux.c | 3
-rw-r--r-- fs/coda/coda_linux.h | 101
-rw-r--r-- fs/coda/dir.c | 9
-rw-r--r-- fs/coda/file.c | 3
-rw-r--r-- fs/coda/inode.c | 8
-rw-r--r-- fs/coda/pioctl.c | 4
-rw-r--r-- fs/coda/psdev.c | 4
-rw-r--r-- fs/coda/symlink.c | 4
-rw-r--r-- fs/coda/upcall.c | 5
-rw-r--r-- fs/compat.c | 10
-rw-r--r-- fs/configfs/Kconfig | 4
-rw-r--r-- fs/configfs/configfs_internal.h | 1
-rw-r--r-- fs/configfs/dir.c | 6
-rw-r--r-- fs/configfs/mount.c | 1
-rw-r--r-- fs/cramfs/inode.c | 110
-rw-r--r-- fs/dcache.c | 18
-rw-r--r-- fs/direct-io.c | 10
-rw-r--r-- fs/dlm/Kconfig | 3
-rw-r--r-- fs/dlm/lowcomms.c | 63
-rw-r--r-- fs/ecryptfs/crypto.c | 30
-rw-r--r-- fs/ecryptfs/ecryptfs_kernel.h | 2
-rw-r--r-- fs/ecryptfs/file.c | 28
-rw-r--r-- fs/ecryptfs/inode.c | 33
-rw-r--r-- fs/ecryptfs/keystore.c | 26
-rw-r--r-- fs/ecryptfs/main.c | 165
-rw-r--r-- fs/ecryptfs/mmap.c | 35
-rw-r--r-- fs/eventpoll.c | 20
-rw-r--r-- fs/ext2/dir.c | 19
-rw-r--r-- fs/ext2/namei.c | 2
-rw-r--r-- fs/ext2/super.c | 25
-rw-r--r-- fs/ext2/xattr.c | 10
-rw-r--r-- fs/ext3/balloc.c | 266
-rw-r--r-- fs/ext3/dir.c | 15
-rw-r--r-- fs/ext3/inode.c | 6
-rw-r--r-- fs/ext3/ioctl.c | 22
-rw-r--r-- fs/ext3/namei.c | 138
-rw-r--r-- fs/ext3/resize.c | 65
-rw-r--r-- fs/ext3/super.c | 101
-rw-r--r-- fs/ext3/xattr.c | 2
-rw-r--r-- fs/ext4/balloc.c | 3
-rw-r--r-- fs/ext4/dir.c | 56
-rw-r--r-- fs/ext4/ext4.h | 97
-rw-r--r-- fs/ext4/ext4_extents.h | 8
-rw-r--r-- fs/ext4/ext4_jbd2.h | 2
-rw-r--r-- fs/ext4/extents.c | 105
-rw-r--r-- fs/ext4/file.c | 24
-rw-r--r-- fs/ext4/fsync.c | 4
-rw-r--r-- fs/ext4/ialloc.c | 2
-rw-r--r-- fs/ext4/inode.c | 81
-rw-r--r-- fs/ext4/mballoc.c | 55
-rw-r--r-- fs/ext4/migrate.c | 2
-rw-r--r-- fs/ext4/namei.c | 69
-rw-r--r-- fs/ext4/page-io.c | 7
-rw-r--r-- fs/ext4/resize.c | 64
-rw-r--r-- fs/ext4/super.c | 325
-rw-r--r-- fs/ext4/xattr.c | 28
-rw-r--r-- fs/fat/fat.h | 3
-rw-r--r-- fs/fat/inode.c | 13
-rw-r--r-- fs/fat/namei_msdos.c | 27
-rw-r--r-- fs/fat/namei_vfat.c | 27
-rw-r--r-- fs/file_table.c | 2
-rw-r--r-- fs/fs-writeback.c | 105
-rw-r--r-- fs/fs_struct.c | 35
-rw-r--r-- fs/fscache/operation.c | 2
-rw-r--r-- fs/fuse/dev.c | 156
-rw-r--r-- fs/fuse/dir.c | 54
-rw-r--r-- fs/fuse/file.c | 66
-rw-r--r-- fs/fuse/fuse_i.h | 27
-rw-r--r-- fs/fuse/inode.c | 40
-rw-r--r-- fs/gfs2/export.c | 13
-rw-r--r-- fs/gfs2/file.c | 258
-rw-r--r-- fs/gfs2/incore.h | 1
-rw-r--r-- fs/gfs2/inode.c | 72
-rw-r--r-- fs/gfs2/inode.h | 1
-rw-r--r-- fs/gfs2/ops_fstype.c | 10
-rw-r--r-- fs/gfs2/ops_inode.c | 256
-rw-r--r-- fs/gfs2/super.c | 1
-rw-r--r-- fs/hfs/dir.c | 2
-rw-r--r-- fs/hfs/super.c | 3
-rw-r--r-- fs/hfsplus/dir.c | 1
-rw-r--r-- fs/hfsplus/super.c | 2
-rw-r--r-- fs/hostfs/hostfs_kern.c | 2
-rw-r--r-- fs/hpfs/dentry.c | 7
-rw-r--r-- fs/hpfs/dir.c | 1
-rw-r--r-- fs/hpfs/hpfs_fn.h | 2
-rw-r--r-- fs/hpfs/inode.c | 2
-rw-r--r-- fs/hpfs/super.c | 2
-rw-r--r-- fs/internal.h | 4
-rw-r--r-- fs/ioctl.c | 10
-rw-r--r-- fs/isofs/inode.c | 13
-rw-r--r-- fs/isofs/namei.c | 2
-rw-r--r-- fs/jbd/transaction.c | 2
-rw-r--r-- fs/jbd2/journal.c | 34
-rw-r--r-- fs/jbd2/recovery.c | 2
-rw-r--r-- fs/jbd2/transaction.c | 8
-rw-r--r-- fs/jffs2/build.c | 5
-rw-r--r-- fs/jffs2/jffs2_fs_sb.h | 2
-rw-r--r-- fs/jffs2/xattr.c | 12
-rw-r--r-- fs/jfs/jfs_logmgr.c | 17
-rw-r--r-- fs/jfs/namei.c | 10
-rw-r--r-- fs/jfs/super.c | 6
-rw-r--r-- fs/libfs.c | 4
-rw-r--r-- fs/lockd/Makefile | 6
-rw-r--r-- fs/lockd/clnt4xdr.c | 605
-rw-r--r-- fs/lockd/clntlock.c | 4
-rw-r--r-- fs/lockd/clntproc.c | 18
-rw-r--r-- fs/lockd/clntxdr.c | 627
-rw-r--r-- fs/lockd/host.c | 409
-rw-r--r-- fs/lockd/mon.c | 110
-rw-r--r-- fs/lockd/svc4proc.c | 20
-rw-r--r-- fs/lockd/svclock.c | 34
-rw-r--r-- fs/lockd/svcproc.c | 28
-rw-r--r-- fs/lockd/xdr.c | 287
-rw-r--r-- fs/lockd/xdr4.c | 255
-rw-r--r-- fs/locks.c | 8
-rw-r--r-- fs/logfs/dev_bdev.c | 7
-rw-r--r-- fs/mbcache.c | 12
-rw-r--r-- fs/minix/namei.c | 2
-rw-r--r-- fs/mpage.c | 49
-rw-r--r-- fs/namei.c | 432
-rw-r--r-- fs/namespace.c | 223
-rw-r--r-- fs/ncpfs/dir.c | 19
-rw-r--r-- fs/ncpfs/file.c | 3
-rw-r--r-- fs/ncpfs/inode.c | 6
-rw-r--r-- fs/ncpfs/ioctl.c | 4
-rw-r--r-- fs/ncpfs/mmap.c | 4
-rw-r--r-- fs/ncpfs/ncp_fs.h | 98
-rw-r--r-- fs/ncpfs/ncp_fs_i.h | 29
-rw-r--r-- fs/ncpfs/ncp_fs_sb.h | 176
-rw-r--r-- fs/ncpfs/ncplib_kernel.c | 2
-rw-r--r-- fs/ncpfs/ncplib_kernel.h | 2
-rw-r--r-- fs/ncpfs/ncpsign_kernel.c | 1
-rw-r--r-- fs/ncpfs/ncpsign_kernel.h | 2
-rw-r--r-- fs/ncpfs/sock.c | 2
-rw-r--r-- fs/ncpfs/symlink.c | 4
-rw-r--r-- fs/nfs/callback.c | 83
-rw-r--r-- fs/nfs/callback.h | 59
-rw-r--r-- fs/nfs/callback_proc.c | 326
-rw-r--r-- fs/nfs/callback_xdr.c | 143
-rw-r--r-- fs/nfs/client.c | 302
-rw-r--r-- fs/nfs/delegation.c | 362
-rw-r--r-- fs/nfs/delegation.h | 1
-rw-r--r-- fs/nfs/dir.c | 92
-rw-r--r-- fs/nfs/getroot.c | 6
-rw-r--r-- fs/nfs/idmap.c | 2
-rw-r--r-- fs/nfs/inode.c | 7
-rw-r--r-- fs/nfs/internal.h | 20
-rw-r--r-- fs/nfs/mount_clnt.c | 83
-rw-r--r-- fs/nfs/namespace.c | 77
-rw-r--r-- fs/nfs/nfs2xdr.c | 1294
-rw-r--r-- fs/nfs/nfs3xdr.c | 2817
-rw-r--r-- fs/nfs/nfs4_fs.h | 13
-rw-r--r-- fs/nfs/nfs4filelayout.c | 6
-rw-r--r-- fs/nfs/nfs4proc.c | 188
-rw-r--r-- fs/nfs/nfs4renewd.c | 11
-rw-r--r-- fs/nfs/nfs4state.c | 293
-rw-r--r-- fs/nfs/nfs4xdr.c | 1426
-rw-r--r-- fs/nfs/pagelist.c | 7
-rw-r--r-- fs/nfs/pnfs.c | 524
-rw-r--r-- fs/nfs/pnfs.h | 76
-rw-r--r-- fs/nfs/proc.c | 5
-rw-r--r-- fs/nfs/super.c | 19
-rw-r--r-- fs/nfs/unlink.c | 2
-rw-r--r-- fs/nfsd/acl.h | 59
-rw-r--r-- fs/nfsd/export.c | 4
-rw-r--r-- fs/nfsd/idmap.h | 62
-rw-r--r-- fs/nfsd/nfs3proc.c | 8
-rw-r--r-- fs/nfsd/nfs4acl.c | 2
-rw-r--r-- fs/nfsd/nfs4callback.c | 841
-rw-r--r-- fs/nfsd/nfs4idmap.c | 15
-rw-r--r-- fs/nfsd/nfs4proc.c | 59
-rw-r--r-- fs/nfsd/nfs4recover.c | 1
-rw-r--r-- fs/nfsd/nfs4state.c | 243
-rw-r--r-- fs/nfsd/nfs4xdr.c | 115
-rw-r--r-- fs/nfsd/nfsctl.c | 4
-rw-r--r-- fs/nfsd/nfsd.h | 1
-rw-r--r-- fs/nfsd/nfsproc.c | 6
-rw-r--r-- fs/nfsd/nfssvc.c | 2
-rw-r--r-- fs/nfsd/state.h | 16
-rw-r--r-- fs/nfsd/vfs.c | 95
-rw-r--r-- fs/nfsd/xdr4.h | 9
-rw-r--r-- fs/nilfs2/bmap.c | 47
-rw-r--r-- fs/nilfs2/btnode.c | 3
-rw-r--r-- fs/nilfs2/dir.c | 3
-rw-r--r-- fs/nilfs2/file.c | 1
-rw-r--r-- fs/nilfs2/ifile.c | 11
-rw-r--r-- fs/nilfs2/inode.c | 180
-rw-r--r-- fs/nilfs2/ioctl.c | 12
-rw-r--r-- fs/nilfs2/mdt.c | 32
-rw-r--r-- fs/nilfs2/namei.c | 1
-rw-r--r-- fs/nilfs2/nilfs.h | 13
-rw-r--r-- fs/nilfs2/page.c | 86
-rw-r--r-- fs/nilfs2/page.h | 3
-rw-r--r-- fs/nilfs2/recovery.c | 2
-rw-r--r-- fs/nilfs2/sb.h | 8
-rw-r--r-- fs/nilfs2/segment.c | 43
-rw-r--r-- fs/nilfs2/super.c | 38
-rw-r--r-- fs/nilfs2/the_nilfs.c | 6
-rw-r--r-- fs/nilfs2/the_nilfs.h | 3
-rw-r--r-- fs/notify/fanotify/Kconfig | 2
-rw-r--r-- fs/ntfs/Makefile | 2
-rw-r--r-- fs/ntfs/file.c | 35
-rw-r--r-- fs/ntfs/super.c | 6
-rw-r--r-- fs/ocfs2/Kconfig | 5
-rw-r--r-- fs/ocfs2/alloc.c | 77
-rw-r--r-- fs/ocfs2/alloc.h | 4
-rw-r--r-- fs/ocfs2/aops.c | 59
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c | 248
-rw-r--r-- fs/ocfs2/cluster/netdebug.c | 286
-rw-r--r-- fs/ocfs2/cluster/tcp.c | 145
-rw-r--r-- fs/ocfs2/cluster/tcp_internal.h | 33
-rw-r--r-- fs/ocfs2/dlm/dlmast.c | 76
-rw-r--r-- fs/ocfs2/dlm/dlmcommon.h | 86
-rw-r--r-- fs/ocfs2/dlm/dlmdebug.c | 200
-rw-r--r-- fs/ocfs2/dlm/dlmdebug.h | 5
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c | 10
-rw-r--r-- fs/ocfs2/dlm/dlmlock.c | 3
-rw-r--r-- fs/ocfs2/dlm/dlmthread.c | 132
-rw-r--r-- fs/ocfs2/export.c | 6
-rw-r--r-- fs/ocfs2/file.c | 18
-rw-r--r-- fs/ocfs2/inode.c | 2
-rw-r--r-- fs/ocfs2/namei.c | 10
-rw-r--r-- fs/ocfs2/ocfs2.h | 5
-rw-r--r-- fs/ocfs2/suballoc.c | 2
-rw-r--r-- fs/ocfs2/super.c | 6
-rw-r--r-- fs/open.c | 11
-rw-r--r-- fs/partitions/check.c | 106
-rw-r--r-- fs/pipe.c | 16
-rw-r--r-- fs/proc/Kconfig | 6
-rw-r--r-- fs/proc/Makefile | 2
-rw-r--r-- fs/proc/array.c | 28
-rw-r--r-- fs/proc/base.c | 57
-rw-r--r-- fs/proc/consoles.c (renamed from fs/proc/proc_console.c) | 4
-rw-r--r-- fs/proc/devices.c | 4
-rw-r--r-- fs/proc/generic.c | 17
-rw-r--r-- fs/proc/inode.c | 7
-rw-r--r-- fs/proc/internal.h | 5
-rw-r--r-- fs/proc/kcore.c | 2
-rw-r--r-- fs/proc/meminfo.c | 14
-rw-r--r-- fs/proc/page.c | 16
-rw-r--r-- fs/proc/proc_tty.c | 26
-rw-r--r-- fs/proc/softirqs.c | 6
-rw-r--r-- fs/proc/stat.c | 2
-rw-r--r-- fs/proc/task_mmu.c | 12
-rw-r--r-- fs/proc/task_nommu.c | 7
-rw-r--r-- fs/quota/dquot.c | 36
-rw-r--r-- fs/quota/quota.c | 41
-rw-r--r-- fs/quota/quota_tree.c | 9
-rw-r--r-- fs/read_write.c | 27
-rw-r--r-- fs/reiserfs/journal.c | 21
-rw-r--r-- fs/reiserfs/prints.c | 4
-rw-r--r-- fs/reiserfs/super.c | 17
-rw-r--r-- fs/select.c | 2
-rw-r--r-- fs/splice.c | 43
-rw-r--r-- fs/squashfs/Kconfig | 18
-rw-r--r-- fs/squashfs/Makefile | 1
-rw-r--r-- fs/squashfs/block.c | 1
-rw-r--r-- fs/squashfs/cache.c | 1
-rw-r--r-- fs/squashfs/decompressor.c | 16
-rw-r--r-- fs/squashfs/decompressor.h | 9
-rw-r--r-- fs/squashfs/fragment.c | 1
-rw-r--r-- fs/squashfs/id.c | 1
-rw-r--r-- fs/squashfs/lzo_wrapper.c | 1
-rw-r--r-- fs/squashfs/squashfs.h | 8
-rw-r--r-- fs/squashfs/squashfs_fs.h | 1
-rw-r--r-- fs/squashfs/squashfs_fs_i.h | 6
-rw-r--r-- fs/squashfs/xattr_id.c | 1
-rw-r--r-- fs/squashfs/xz_wrapper.c | 153
-rw-r--r-- fs/squashfs/zlib_wrapper.c | 15
-rw-r--r-- fs/stat.c | 4
-rw-r--r-- fs/super.c | 21
-rw-r--r-- fs/sysfs/Kconfig | 2
-rw-r--r-- fs/sysfs/group.c | 10
-rw-r--r-- fs/sysfs/inode.c | 1
-rw-r--r-- fs/sysfs/sysfs.h | 1
-rw-r--r-- fs/sysv/namei.c | 1
-rw-r--r-- fs/sysv/super.c | 8
-rw-r--r-- fs/udf/Kconfig | 1
-rw-r--r-- fs/udf/balloc.c | 3
-rw-r--r-- fs/udf/dir.c | 5
-rw-r--r-- fs/udf/file.c | 11
-rw-r--r-- fs/udf/ialloc.c | 21
-rw-r--r-- fs/udf/inode.c | 51
-rw-r--r-- fs/udf/namei.c | 107
-rw-r--r-- fs/udf/partition.c | 27
-rw-r--r-- fs/udf/super.c | 67
-rw-r--r-- fs/udf/symlink.c | 12
-rw-r--r-- fs/udf/udf_i.h | 13
-rw-r--r-- fs/udf/udf_sb.h | 22
-rw-r--r-- fs/udf/udfdecl.h | 4
-rw-r--r-- fs/xfs/Makefile | 1
-rw-r--r-- fs/xfs/linux-2.6/sv.h | 59
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.c | 425
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.h | 16
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.c | 238
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.h | 29
-rw-r--r-- fs/xfs/linux-2.6/xfs_discard.c | 191
-rw-r--r-- fs/xfs/linux-2.6/xfs_discard.h | 8
-rw-r--r-- fs/xfs/linux-2.6/xfs_export.c | 12
-rw-r--r-- fs/xfs/linux-2.6/xfs_file.c | 587
-rw-r--r-- fs/xfs/linux-2.6/xfs_ioctl.c | 3
-rw-r--r-- fs/xfs/linux-2.6/xfs_iops.c | 54
-rw-r--r-- fs/xfs/linux-2.6/xfs_linux.h | 1
-rw-r--r-- fs/xfs/linux-2.6/xfs_super.c | 31
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.c | 103
-rw-r--r-- fs/xfs/linux-2.6/xfs_sysctl.c | 23
-rw-r--r-- fs/xfs/linux-2.6/xfs_trace.h | 92
-rw-r--r-- fs/xfs/quota/xfs_dquot.c | 1
-rw-r--r-- fs/xfs/support/debug.c | 112
-rw-r--r-- fs/xfs/support/debug.h | 25
-rw-r--r-- fs/xfs/xfs_ag.h | 2
-rw-r--r-- fs/xfs/xfs_alloc.c | 361
-rw-r--r-- fs/xfs/xfs_alloc.h | 25
-rw-r--r-- fs/xfs/xfs_attr_leaf.c | 4
-rw-r--r-- fs/xfs/xfs_btree.c | 9
-rw-r--r-- fs/xfs/xfs_buf_item.c | 179
-rw-r--r-- fs/xfs/xfs_buf_item.h | 11
-rw-r--r-- fs/xfs/xfs_error.c | 31
-rw-r--r-- fs/xfs/xfs_error.h | 18
-rw-r--r-- fs/xfs/xfs_extfree_item.c | 97
-rw-r--r-- fs/xfs/xfs_extfree_item.h | 11
-rw-r--r-- fs/xfs/xfs_fsops.c | 11
-rw-r--r-- fs/xfs/xfs_fsops.h | 2
-rw-r--r-- fs/xfs/xfs_iget.c | 79
-rw-r--r-- fs/xfs/xfs_inode.c | 54
-rw-r--r-- fs/xfs/xfs_inode.h | 15
-rw-r--r-- fs/xfs/xfs_inode_item.c | 90
-rw-r--r-- fs/xfs/xfs_iomap.c | 233
-rw-r--r-- fs/xfs/xfs_iomap.h | 27
-rw-r--r-- fs/xfs/xfs_log.c | 741
-rw-r--r-- fs/xfs/xfs_log_cil.c | 17
-rw-r--r-- fs/xfs/xfs_log_priv.h | 127
-rw-r--r-- fs/xfs/xfs_log_recover.c | 622
-rw-r--r-- fs/xfs/xfs_mount.c | 23
-rw-r--r-- fs/xfs/xfs_mount.h | 14
-rw-r--r-- fs/xfs/xfs_trans.c | 81
-rw-r--r-- fs/xfs/xfs_trans.h | 2
-rw-r--r-- fs/xfs/xfs_trans_ail.c | 232
-rw-r--r-- fs/xfs/xfs_trans_extfree.c | 8
-rw-r--r-- fs/xfs/xfs_trans_priv.h | 35
-rw-r--r-- fs/xfs/xfs_vnodeops.c | 61
449 files changed, 21562 insertions, 13761 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 7e0511476797..814ac4e213a8 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -9,6 +9,8 @@ config 9P_FS
 
 	  If unsure, say N.
 
+if 9P_FS
+
 config 9P_FSCACHE
 	bool "Enable 9P client caching support (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
@@ -20,7 +22,6 @@ config 9P_FSCACHE
 
 config 9P_FS_POSIX_ACL
 	bool "9P POSIX Access Control Lists"
-	depends on 9P_FS
 	select FS_POSIX_ACL
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
@@ -30,3 +31,5 @@ config 9P_FS_POSIX_ACL
 	  Linux website <http://acl.bestbits.at/>.
 
 	  If you don't know what Access Control Lists are, say N
+
+endif
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index f8ba37effd1b..ab8c12780634 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_9P_FS) := 9p.o
 9p-objs := \
 	vfs_super.o \
 	vfs_inode.o \
+	vfs_inode_dotl.o \
 	vfs_addr.o \
 	vfs_file.o \
 	vfs_dir.o \
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 6e58c4ca1e6e..02a2cf616318 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -28,7 +28,7 @@ static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
 {
 	ssize_t size;
 	void *value = NULL;
-	struct posix_acl *acl = NULL;;
+	struct posix_acl *acl = NULL;
 
 	size = v9fs_fid_xattr_get(fid, name, NULL, 0);
 	if (size > 0) {
@@ -365,7 +365,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 	case ACL_TYPE_DEFAULT:
 		name = POSIX_ACL_XATTR_DEFAULT;
 		if (!S_ISDIR(inode->i_mode)) {
-			retval = -EINVAL;
+			retval = acl ? -EINVAL : 0;
 			goto err_out;
 		}
 		break;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index cb6396855e2d..c4b5d8864f0d 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -113,9 +113,27 @@ struct v9fs_session_info {
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 								char *);
-void v9fs_session_close(struct v9fs_session_info *v9ses);
-void v9fs_session_cancel(struct v9fs_session_info *v9ses);
-void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
+extern void v9fs_session_close(struct v9fs_session_info *v9ses);
+extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
+extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+			struct nameidata *nameidata);
+extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry);
+extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			void *p);
+extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses,
+				struct p9_fid *fid,
+				struct super_block *sb);
+
+extern const struct inode_operations v9fs_dir_inode_operations_dotl;
+extern const struct inode_operations v9fs_file_inode_operations_dotl;
+extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
+extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses,
+					struct p9_fid *fid,
+					struct super_block *sb);
 
 /* other default globals */
 #define V9FS_PORT	564
@@ -138,3 +156,21 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
 {
 	return v9ses->flags & V9FS_PROTO_2000L;
 }
+
+/**
+ * v9fs_inode_from_fid - Helper routine to populate an inode by
+ * issuing a attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+		    struct super_block *sb)
+{
+	if (v9fs_proto_dotl(v9ses))
+		return v9fs_inode_dotl(v9ses, fid, sb);
+	else
+		return v9fs_inode(v9ses, fid, sb);
+}
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index bab0eac873f4..b789f8e597ec 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -59,7 +59,6 @@ void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
 void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
-void v9fs_dentry_release(struct dentry *);
 int v9fs_uflags2omode(int uflags, int extended);
 
 ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 466d2a4fc5cb..233b7d4ffe5e 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -86,7 +86,7 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
  *
  */
 
-void v9fs_dentry_release(struct dentry *dentry)
+static void v9fs_dentry_release(struct dentry *dentry)
 {
 	struct v9fs_dentry *dent;
 	struct p9_fid *temp, *current_fid;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 59782981b225..b76a40bdf4c2 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -49,15 +49,8 @@
 
 static const struct inode_operations v9fs_dir_inode_operations;
 static const struct inode_operations v9fs_dir_inode_operations_dotu;
-static const struct inode_operations v9fs_dir_inode_operations_dotl;
 static const struct inode_operations v9fs_file_inode_operations;
-static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
-static const struct inode_operations v9fs_symlink_inode_operations_dotl;
-
-static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
-		dev_t rdev);
 
 /**
  * unixmode2p9mode - convert unix mode bits to plan 9
@@ -251,41 +244,6 @@ void v9fs_destroy_inode(struct inode *inode)
 #endif
 
 /**
- * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
- * new file system object. This checks the S_ISGID to determine the owning
- * group of the new file system object.
- */
-
-static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
-{
-	BUG_ON(dir_inode == NULL);
-
-	if (dir_inode->i_mode & S_ISGID) {
-		/* set_gid bit is set.*/
-		return dir_inode->i_gid;
-	}
-	return current_fsgid();
-}
-
-/**
- * v9fs_dentry_from_dir_inode - helper function to get the dentry from
- * dir inode.
- *
- */
-
-static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
-{
-	struct dentry *dentry;
-
-	spin_lock(&inode->i_lock);
-	/* Directory should have only one entry. */
-	BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
-	dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
-	spin_unlock(&inode->i_lock);
-	return dentry;
-}
-
-/**
  * v9fs_get_inode - helper function to setup an inode
  * @sb: superblock
  * @mode: mode to setup inode with
@@ -454,7 +412,7 @@ void v9fs_evict_inode(struct inode *inode)
 #endif
 }
 
-static struct inode *
+struct inode *
 v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 	struct super_block *sb)
 {
@@ -489,60 +447,6 @@ error:
 	return ERR_PTR(err);
 }
 
-static struct inode *
-v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-	struct super_block *sb)
-{
-	struct inode *ret = NULL;
-	int err;
-	struct p9_stat_dotl *st;
-
-	st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
-	if (IS_ERR(st))
-		return ERR_CAST(st);
-
-	ret = v9fs_get_inode(sb, st->st_mode);
-	if (IS_ERR(ret)) {
-		err = PTR_ERR(ret);
-		goto error;
-	}
-
-	v9fs_stat2inode_dotl(st, ret);
-	ret->i_ino = v9fs_qid2ino(&st->qid);
-#ifdef CONFIG_9P_FSCACHE
-	v9fs_vcookie_set_qid(ret, &st->qid);
-	v9fs_cache_inode_get_cookie(ret);
-#endif
-	err = v9fs_get_acl(ret, fid);
-	if (err) {
-		iput(ret);
-		goto error;
-	}
-	kfree(st);
-	return ret;
-error:
-	kfree(st);
-	return ERR_PTR(err);
-}
-
-/**
- * v9fs_inode_from_fid - Helper routine to populate an inode by
- * issuing a attribute request
- * @v9ses: session information
- * @fid: fid to issue attribute request for
- * @sb: superblock on which to create inode
- *
- */
-static inline struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-		    struct super_block *sb)
-{
-	if (v9fs_proto_dotl(v9ses))
-		return v9fs_inode_dotl(v9ses, fid, sb);
-	else
-		return v9fs_inode(v9ses, fid, sb);
-}
-
 /**
  * v9fs_remove - helper function to remove files and directories
  * @dir: directory inode that is being deleted
@@ -633,12 +537,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
 		goto error;
 	}
-
-	if (v9ses->cache)
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-	else
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-
 	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
 	if (err < 0)
@@ -657,144 +555,6 @@ error:
 }
 
 /**
- * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
- * @dir: directory inode that is being created
- * @dentry: dentry that is being deleted
- * @mode: create permissions
- * @nd: path information
- *
- */
-
-static int
-v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
-		struct nameidata *nd)
-{
-	int err = 0;
-	char *name = NULL;
-	gid_t gid;
-	int flags;
-	mode_t mode;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid = NULL;
-	struct p9_fid *dfid, *ofid;
-	struct file *filp;
-	struct p9_qid qid;
-	struct inode *inode;
-	struct posix_acl *pacl = NULL, *dacl = NULL;
-
-	v9ses = v9fs_inode2v9ses(dir);
-	if (nd && nd->flags & LOOKUP_OPEN)
-		flags = nd->intent.open.flags - 1;
-	else {
-		/*
-		 * create call without LOOKUP_OPEN is due
-		 * to mknod of regular files. So use mknod
-		 * operation.
-		 */
-		return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
-	}
-
-	name = (char *) dentry->d_name.name;
-	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
-			"mode:0x%x\n", name, flags, omode);
-
-	dfid = v9fs_fid_lookup(dentry->d_parent);
-	if (IS_ERR(dfid)) {
-		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-		return err;
-	}
-
-	/* clone a fid to use for creation */
-	ofid = p9_client_walk(dfid, 0, NULL, 1);
-	if (IS_ERR(ofid)) {
-		err = PTR_ERR(ofid);
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-		return err;
-	}
-
-	gid = v9fs_get_fsgid_for_create(dir);
-
-	mode = omode;
-	/* Update mode based on ACL value */
-	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
-	if (err) {
-		P9_DPRINTK(P9_DEBUG_VFS,
-			   "Failed to get acl values in creat %d\n", err);
-		goto error;
-	}
-	err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
-	if (err < 0) {
-		P9_DPRINTK(P9_DEBUG_VFS,
-			   "p9_client_open_dotl failed in creat %d\n",
-			   err);
-		goto error;
-	}
-	/* instantiate inode and assign the unopened fid to the dentry */
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE ||
-			(nd && nd->flags & LOOKUP_OPEN)) {
-		fid = p9_client_walk(dfid, 1, &name, 1);
-		if (IS_ERR(fid)) {
-			err = PTR_ERR(fid);
-			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-				err);
-			fid = NULL;
-			goto error;
-		}
-
-		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-				err);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-		d_instantiate(dentry, inode);
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
-		/* The fid would get clunked via a dput */
-		fid = NULL;
-	} else {
-		/*
-		 * Not in cached mode. No need to populate
-		 * inode with stat. We need to get an inode
-		 * so that we can set the acl with dentry
-		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-		d_instantiate(dentry, inode);
-	}
-	/* Now set the ACL based on the default value */
-	v9fs_set_create_acl(dentry, dacl, pacl);
-
-	/* if we are opening a file, assign the open fid to the file */
-	if (nd && nd->flags & LOOKUP_OPEN) {
-		filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
-		if (IS_ERR(filp)) {
-			p9_client_clunk(ofid);
-			return PTR_ERR(filp);
-		}
-		filp->private_data = ofid;
-	} else
-		p9_client_clunk(ofid);
-
-	return 0;
-
-error:
-	if (ofid)
-		p9_client_clunk(ofid);
-	if (fid)
-		p9_client_clunk(fid);
-	return err;
-}
-
-/**
  * v9fs_vfs_create - VFS hook to create files
  * @dir: directory inode that is being created
  * @dentry: dentry that is being deleted
@@ -884,107 +644,6 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return err;
 }
 
-
-/**
- * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
- * @dir: inode that is being unlinked
- * @dentry: dentry that is being unlinked
- * @mode: mode for new directory
- *
- */
-
-static int v9fs_vfs_mkdir_dotl(struct inode *dir,
-			struct dentry *dentry, int omode)
-{
-	int err;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid = NULL, *dfid = NULL;
-	gid_t gid;
-	char *name;
-	mode_t mode;
-	struct inode *inode;
-	struct p9_qid qid;
-	struct dentry *dir_dentry;
-	struct posix_acl *dacl = NULL, *pacl = NULL;
-
-	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
-	err = 0;
-	v9ses = v9fs_inode2v9ses(dir);
-
-	omode |= S_IFDIR;
-	if (dir->i_mode & S_ISGID)
-		omode |= S_ISGID;
-
-	dir_dentry = v9fs_dentry_from_dir_inode(dir);
-	dfid = v9fs_fid_lookup(dir_dentry);
-	if (IS_ERR(dfid)) {
-		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-		dfid = NULL;
-		goto error;
-	}
-
-	gid = v9fs_get_fsgid_for_create(dir);
-	mode = omode;
-	/* Update mode based on ACL value */
-	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
-	if (err) {
-		P9_DPRINTK(P9_DEBUG_VFS,
-			   "Failed to get acl values in mkdir %d\n", err);
-		goto error;
-	}
-	name = (char *) dentry->d_name.name;
-	err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
-	if (err < 0)
-		goto error;
-
-	/* instantiate inode and assign the unopened fid to the dentry */
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		fid = p9_client_walk(dfid, 1, &name, 1);
-		if (IS_ERR(fid)) {
-			err = PTR_ERR(fid);
-			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-				err);
-			fid = NULL;
-			goto error;
-		}
-
-		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-				err);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-		d_instantiate(dentry, inode);
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
-		fid = NULL;
-	} else {
-		/*
-		 * Not in cached mode. No need to populate
-		 * inode with stat. We need to get an inode
-		 * so that we can set the acl with dentry
-		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-		d_instantiate(dentry, inode);
-	}
-	/* Now set the ACL based on the default value */
-	v9fs_set_create_acl(dentry, dacl, pacl);
-
-error:
-	if (fid)
-		p9_client_clunk(fid);
-	return err;
-}
-
 /**
  * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
  * @dir: inode that is being walked from
@@ -993,7 +652,7 @@ error:
  *
  */
 
-static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 				      struct nameidata *nameidata)
 {
 	struct super_block *sb;
@@ -1040,11 +699,6 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 		goto error_iput;
 
 inst_out:
-	if (v9ses->cache)
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-	else
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-
 	d_add(dentry, inode);
 	return NULL;
 
@@ -1063,7 +717,7 @@ error:
  *
  */
 
-static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
+int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
 {
 	return v9fs_remove(i, d, 0);
 }
@@ -1075,7 +729,7 @@ static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
  *
  */
 
-static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
+int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
 {
 	return v9fs_remove(i, d, 1);
 }
@@ -1089,7 +743,7 @@ static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
  *
  */
 
-static int
+int
 v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		struct inode *new_dir, struct dentry *new_dentry)
 {
@@ -1196,42 +850,6 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	return 0;
 }
 
-static int
-v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
-		 struct kstat *stat)
-{
-	int err;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid;
-	struct p9_stat_dotl *st;
-
-	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
-	err = -EPERM;
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
-		return simple_getattr(mnt, dentry, stat);
-
-	fid = v9fs_fid_lookup(dentry);
-	if (IS_ERR(fid))
-		return PTR_ERR(fid);
-
-	/* Ask for all the fields in stat structure. Server will return
-	 * whatever it supports
-	 */
-
-	st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
-	if (IS_ERR(st))
-		return PTR_ERR(st);
-
-	v9fs_stat2inode_dotl(st, dentry->d_inode);
-	generic_fillattr(dentry->d_inode, stat);
-	/* Change block size to what the server returned */
-	stat->blksize = st->st_blksize;
-
-	kfree(st);
-	return 0;
-}
-
 /**
  * v9fs_vfs_setattr - set file metadata
  * @dentry: file whose metadata to set
@@ -1291,64 +909,6 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 }
 
 /**
- * v9fs_vfs_setattr_dotl - set file metadata
- * @dentry: file whose metadata to set
- * @iattr: metadata assignment structure
- *
- */
-
-int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
-{
-	int retval;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid;
-	struct p9_iattr_dotl p9attr;
-
-	P9_DPRINTK(P9_DEBUG_VFS, "\n");
-
-	retval = inode_change_ok(dentry->d_inode, iattr);
-	if (retval)
-		return retval;
-
-	p9attr.valid = iattr->ia_valid;
-	p9attr.mode = iattr->ia_mode;
-	p9attr.uid = iattr->ia_uid;
-	p9attr.gid = iattr->ia_gid;
-	p9attr.size = iattr->ia_size;
-	p9attr.atime_sec = iattr->ia_atime.tv_sec;
-	p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
-	p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
-	p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
-
-	retval = -EPERM;
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
-	fid = v9fs_fid_lookup(dentry);
-	if (IS_ERR(fid))
-		return PTR_ERR(fid);
-
-	retval = p9_client_setattr(fid, &p9attr);
-	if (retval < 0)
-		return retval;
-
-	if ((iattr->ia_valid & ATTR_SIZE) &&
-	    iattr->ia_size != i_size_read(dentry->d_inode)) {
-		retval = vmtruncate(dentry->d_inode, iattr->ia_size);
-		if (retval)
-			return retval;
-	}
-
-	setattr_copy(dentry->d_inode, iattr);
-	mark_inode_dirty(dentry->d_inode);
-	if (iattr->ia_valid & ATTR_MODE) {
-		/* We also want to update ACL when we update mode bits */
-		retval = v9fs_acl_chmod(dentry);
-		if (retval < 0)
-			return retval;
-	}
-	return 0;
-}
-
-/**
  * v9fs_stat2inode - populate an inode structure with mistat info
  * @stat: Plan 9 metadata (mistat) structure
  * @inode: inode to populate
@@ -1426,77 +986,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 }
 
 /**
- * v9fs_stat2inode_dotl - populate an inode structure with stat info
- * @stat: stat structure
- * @inode: inode to populate
- * @sb: superblock of filesystem
- *
- */
-
-void
-v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
-{
-
-	if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
-		inode->i_atime.tv_sec = stat->st_atime_sec;
-		inode->i_atime.tv_nsec = stat->st_atime_nsec;
-		inode->i_mtime.tv_sec = stat->st_mtime_sec;
-		inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
-		inode->i_ctime.tv_sec = stat->st_ctime_sec;
-		inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
-		inode->i_uid = stat->st_uid;
-		inode->i_gid = stat->st_gid;
-		inode->i_nlink = stat->st_nlink;
-		inode->i_mode = stat->st_mode;
-		inode->i_rdev = new_decode_dev(stat->st_rdev);
-
-		if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
-			init_special_inode(inode, inode->i_mode, inode->i_rdev);
-
-		i_size_write(inode, stat->st_size);
-		inode->i_blocks = stat->st_blocks;
-	} else {
-		if (stat->st_result_mask & P9_STATS_ATIME) {
-			inode->i_atime.tv_sec = stat->st_atime_sec;
-			inode->i_atime.tv_nsec = stat->st_atime_nsec;
-		}
-		if (stat->st_result_mask & P9_STATS_MTIME) {
-			inode->i_mtime.tv_sec = stat->st_mtime_sec;
-			inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
-		}
-		if (stat->st_result_mask & P9_STATS_CTIME) {
-			inode->i_ctime.tv_sec = stat->st_ctime_sec;
-			inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
-		}
-		if (stat->st_result_mask & P9_STATS_UID)
-			inode->i_uid = stat->st_uid;
-		if (stat->st_result_mask & P9_STATS_GID)
-			inode->i_gid = stat->st_gid;
-		if (stat->st_result_mask & P9_STATS_NLINK)
-			inode->i_nlink = stat->st_nlink;
-		if (stat->st_result_mask & P9_STATS_MODE) {
-			inode->i_mode = stat->st_mode;
-			if ((S_ISBLK(inode->i_mode)) ||
-						(S_ISCHR(inode->i_mode)))
-				init_special_inode(inode, inode->i_mode,
-								inode->i_rdev);
-		}
-		if (stat->st_result_mask & P9_STATS_RDEV)
-			inode->i_rdev = new_decode_dev(stat->st_rdev);
-		if (stat->st_result_mask & P9_STATS_SIZE)
-			i_size_write(inode, stat->st_size);
-		if (stat->st_result_mask & P9_STATS_BLOCKS)
-			inode->i_blocks = stat->st_blocks;
-	}
-	if (stat->st_result_mask & P9_STATS_GEN)
-		inode->i_generation = stat->st_gen;
-
-	/* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
-	 * because the inode structure does not have fields for them.
-	 */
-}
-
-/**
  * v9fs_qid2ino - convert qid into inode number
  * @qid: qid to hash
  *
@@ -1602,7 +1091,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
  *
  */
 
-static void
+void
 v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
 {
 	char *s = nd_get_link(nd);
@@ -1646,94 +1135,6 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 }
 
 /**
- * v9fs_vfs_symlink_dotl - helper function to create symlinks
- * @dir: directory inode containing symlink
- * @dentry: dentry for symlink
- * @symname: symlink data
- *
- * See Also: 9P2000.L RFC for more information
- *
- */
-
-static int
-v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
-		const char *symname)
-{
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *dfid;
-	struct p9_fid *fid = NULL;
-	struct inode *inode;
-	struct p9_qid qid;
-	char *name;
-	int err;
-	gid_t gid;
-
-	name = (char *) dentry->d_name.name;
-	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
-			dir->i_ino, name, symname);
-	v9ses = v9fs_inode2v9ses(dir);
-
-	dfid = v9fs_fid_lookup(dentry->d_parent);
-	if (IS_ERR(dfid)) {
-		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-		return err;
-	}
-
-	gid = v9fs_get_fsgid_for_create(dir);
-
-	/* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
-	err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
-
-	if (err < 0) {
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
-		goto error;
-	}
-
-	if (v9ses->cache) {
-		/* Now walk from the parent so we can get an unopened fid. */
-		fid = p9_client_walk(dfid, 1, &name, 1);
-		if (IS_ERR(fid)) {
-			err = PTR_ERR(fid);
-			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-				err);
-			fid = NULL;
-			goto error;
-		}
-
-		/* instantiate inode and assign the unopened fid to dentry */
-		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-				err);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-		d_instantiate(dentry, inode);
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
-		fid = NULL;
-	} else {
-		/* Not in cached mode. No need to populate inode with stat */
-		inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-		d_instantiate(dentry, inode);
-	}
-
-error:
-	if (fid)
-		p9_client_clunk(fid);
-
-	return err;
-}
-
-/**
  * v9fs_vfs_symlink - helper function to create symlinks
  * @dir: directory inode containing symlink
  * @dentry: dentry for symlink
@@ -1792,77 +1193,6 @@ clunk_fid:
 }
 
 /**
- * v9fs_vfs_link_dotl - create a hardlink for dotl
- * @old_dentry: dentry for file to link to
- * @dir: inode destination for new link
- * @dentry: dentry for link
- *
- */
-
-static int
-v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
-		struct dentry *dentry)
-{
-	int err;
-	struct p9_fid *dfid, *oldfid;
-	char *name;
-	struct v9fs_session_info *v9ses;
-	struct dentry *dir_dentry;
-
-	P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
-			dir->i_ino, old_dentry->d_name.name,
-			dentry->d_name.name);
-
-	v9ses = v9fs_inode2v9ses(dir);
-	dir_dentry = v9fs_dentry_from_dir_inode(dir);
-	dfid = v9fs_fid_lookup(dir_dentry);
-	if (IS_ERR(dfid))
-		return PTR_ERR(dfid);
-
-	oldfid = v9fs_fid_lookup(old_dentry);
-	if (IS_ERR(oldfid))
-		return PTR_ERR(oldfid);
-
-	name = (char *) dentry->d_name.name;
-
-	err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
-
-	if (err < 0) {
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
-		return err;
-	}
-
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		/* Get the latest stat info from server. */
-		struct p9_fid *fid;
-		struct p9_stat_dotl *st;
-
-		fid = v9fs_fid_lookup(old_dentry);
-		if (IS_ERR(fid))
-			return PTR_ERR(fid);
-
-		st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
-		if (IS_ERR(st))
-			return PTR_ERR(st);
-
-		v9fs_stat2inode_dotl(st, old_dentry->d_inode);
-
-		kfree(st);
-	} else {
-		/* Caching disabled. No need to get upto date stat info.
-		 * This dentry will be released immediately. So, just hold the
-		 * inode
-		 */
-		ihold(old_dentry->d_inode);
-	}
-
-	d_set_d_op(dentry, old_dentry->d_op);
-	d_instantiate(dentry, old_dentry->d_inode);
-
-	return err;
-}
-
-/**
  * v9fs_vfs_mknod - create a special file
  * @dir: inode destination for new link
  * @dentry: dentry for file
@@ -1907,160 +1237,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	return retval;
 }
 
-/**
- * v9fs_vfs_mknod_dotl - create a special file
- * @dir: inode destination for new link
- * @dentry: dentry for file
- * @mode: mode for creation
- * @rdev: device associated with special file
- *
- */
-static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
-		dev_t rdev)
-{
-	int err;
-	char *name;
-	mode_t mode;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid = NULL, *dfid = NULL;
-	struct inode *inode;
-	gid_t gid;
-	struct p9_qid qid;
-	struct dentry *dir_dentry;
-	struct posix_acl *dacl = NULL, *pacl = NULL;
-
-	P9_DPRINTK(P9_DEBUG_VFS,
-		" %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
-		dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
-
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
-	v9ses = v9fs_inode2v9ses(dir);
-	dir_dentry = v9fs_dentry_from_dir_inode(dir);
-	dfid = v9fs_fid_lookup(dir_dentry);
-	if (IS_ERR(dfid)) {
-		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-		dfid = NULL;
-		goto error;
-	}
-
-	gid = v9fs_get_fsgid_for_create(dir);
-	mode = omode;
-	/* Update mode based on ACL value */
-	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
-	if (err) {
-		P9_DPRINTK(P9_DEBUG_VFS,
-			   "Failed to get acl values in mknod %d\n", err);
-		goto error;
-	}
-	name = (char *) dentry->d_name.name;
-
-	err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
-	if (err < 0)
-		goto error;
-
-	/* instantiate inode and assign the unopened fid to the dentry */
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		fid = p9_client_walk(dfid, 1, &name, 1);
-		if (IS_ERR(fid)) {
-			err = PTR_ERR(fid);
-			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-				err);
-			fid = NULL;
-			goto error;
-		}
-
-		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-				err);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-		d_instantiate(dentry, inode);
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
-		fid = NULL;
-	} else {
-		/*
-		 * Not in cached mode. No need to populate inode with stat.
-		 * socket syscall returns a fd, so we need instantiate
-		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-		d_instantiate(dentry, inode);
-	}
-	/* Now set the ACL based on the default value */
-	v9fs_set_create_acl(dentry, dacl, pacl);
-error:
-	if (fid)
-		p9_client_clunk(fid);
-	return err;
-}
-
-static int
-v9fs_vfs_readlink_dotl(struct dentry *dentry, char *buffer, int buflen)
-{
-	int retval;
-	struct p9_fid *fid;
-	char *target = NULL;
-
-	P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
-	retval = -EPERM;
-	fid = v9fs_fid_lookup(dentry);
-	if (IS_ERR(fid))
-		return PTR_ERR(fid);
-
-	retval = p9_client_readlink(fid, &target);
-	if (retval < 0)
-		return retval;
-
-	strncpy(buffer, target, buflen);
-	P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s\n", dentry->d_name.name, buffer);
-
-	retval = strnlen(buffer, buflen);
-	return retval;
-}
-
-/**
- * v9fs_vfs_follow_link_dotl - follow a symlink path
- * @dentry: dentry for symlink
- * @nd: nameidata
- *
- */
-
-static void *
-v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
-{
-	int len = 0;
-	char *link = __getname();
-
-	P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
-
-	if (!link)
-		link = ERR_PTR(-ENOMEM);
-	else {
-		len = v9fs_vfs_readlink_dotl(dentry, link, PATH_MAX);
-		if (len < 0) {
-			__putname(link);
-			link = ERR_PTR(len);
-		} else
-			link[min(len, PATH_MAX-1)] = 0;
-	}
-	nd_set_link(nd, link);
-
-	return NULL;
-}
-
 static const struct inode_operations v9fs_dir_inode_operations_dotu = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
@@ -2075,25 +1251,6 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
 	.setattr = v9fs_vfs_setattr,
 };
 
-static const struct inode_operations v9fs_dir_inode_operations_dotl = {
-	.create = v9fs_vfs_create_dotl,
-	.lookup = v9fs_vfs_lookup,
-	.link = v9fs_vfs_link_dotl,
-	.symlink = v9fs_vfs_symlink_dotl,
-	.unlink = v9fs_vfs_unlink,
-	.mkdir = v9fs_vfs_mkdir_dotl,
-	.rmdir = v9fs_vfs_rmdir,
-	.mknod = v9fs_vfs_mknod_dotl,
-	.rename = v9fs_vfs_rename,
-	.getattr = v9fs_vfs_getattr_dotl,
-	.setattr = v9fs_vfs_setattr_dotl,
-	.setxattr = generic_setxattr,
-	.getxattr = generic_getxattr,
-	.removexattr = generic_removexattr,
-	.listxattr = v9fs_listxattr,
-	.check_acl = v9fs_check_acl,
-};
-
 static const struct inode_operations v9fs_dir_inode_operations = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
@@ -2111,16 +1268,6 @@ static const struct inode_operations v9fs_file_inode_operations = {
 	.setattr = v9fs_vfs_setattr,
 };
 
-static const struct inode_operations v9fs_file_inode_operations_dotl = {
-	.getattr = v9fs_vfs_getattr_dotl,
-	.setattr = v9fs_vfs_setattr_dotl,
-	.setxattr = generic_setxattr,
-	.getxattr = generic_getxattr,
-	.removexattr = generic_removexattr,
-	.listxattr = v9fs_listxattr,
-	.check_acl = v9fs_check_acl,
-};
-
 static const struct inode_operations v9fs_symlink_inode_operations = {
 	.readlink = generic_readlink,
 	.follow_link = v9fs_vfs_follow_link,
@@ -2129,14 +1276,3 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
 	.setattr = v9fs_vfs_setattr,
 };
 
-static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
-	.readlink = v9fs_vfs_readlink_dotl,
-	.follow_link = v9fs_vfs_follow_link_dotl,
-	.put_link = v9fs_vfs_put_link,
-	.getattr = v9fs_vfs_getattr_dotl,
-	.setattr = v9fs_vfs_setattr_dotl,
-	.setxattr = generic_setxattr,
-	.getxattr = generic_getxattr,
-	.removexattr = generic_removexattr,
-	.listxattr = v9fs_listxattr,
-};
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
new file mode 100644
index 000000000000..fe3ffa9aace4
--- /dev/null
+++ b/fs/9p/vfs_inode_dotl.c
@@ -0,0 +1,824 @@
1/*
2 * linux/fs/9p/vfs_inode_dotl.c
3 *
4 * This file contains vfs inode ops for the 9P2000.L protocol.
5 *
6 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
7 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to:
20 * Free Software Foundation
21 * 51 Franklin Street, Fifth Floor
22 * Boston, MA 02111-1301 USA
23 *
24 */
25
26#include <linux/module.h>
27#include <linux/errno.h>
28#include <linux/fs.h>
29#include <linux/file.h>
30#include <linux/pagemap.h>
31#include <linux/stat.h>
32#include <linux/string.h>
33#include <linux/inet.h>
34#include <linux/namei.h>
35#include <linux/idr.h>
36#include <linux/sched.h>
37#include <linux/slab.h>
38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
40#include <net/9p/9p.h>
41#include <net/9p/client.h>
42
43#include "v9fs.h"
44#include "v9fs_vfs.h"
45#include "fid.h"
46#include "cache.h"
47#include "xattr.h"
48#include "acl.h"
49
50static int
51v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
52 dev_t rdev);
53
54/**
55 * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
56 * new file system object. This checks the S_ISGID to determine the owning
57 * group of the new file system object.
58 */
59
60static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
61{
62 BUG_ON(dir_inode == NULL);
63
64 if (dir_inode->i_mode & S_ISGID) {
65 /* set_gid bit is set.*/
66 return dir_inode->i_gid;
67 }
68 return current_fsgid();
69}
70
71/**
72 * v9fs_dentry_from_dir_inode - helper function to get the dentry from
73 * dir inode.
74 *
75 */
76
77static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
78{
79 struct dentry *dentry;
80
81 spin_lock(&inode->i_lock);
82 /* Directory should have only one entry. */
83 BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
84 dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
85 spin_unlock(&inode->i_lock);
86 return dentry;
87}
88
89struct inode *
90v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
91 struct super_block *sb)
92{
93 struct inode *ret = NULL;
94 int err;
95 struct p9_stat_dotl *st;
96
97 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
98 if (IS_ERR(st))
99 return ERR_CAST(st);
100
101 ret = v9fs_get_inode(sb, st->st_mode);
102 if (IS_ERR(ret)) {
103 err = PTR_ERR(ret);
104 goto error;
105 }
106
107 v9fs_stat2inode_dotl(st, ret);
108 ret->i_ino = v9fs_qid2ino(&st->qid);
109#ifdef CONFIG_9P_FSCACHE
110 v9fs_vcookie_set_qid(ret, &st->qid);
111 v9fs_cache_inode_get_cookie(ret);
112#endif
113 err = v9fs_get_acl(ret, fid);
114 if (err) {
115 iput(ret);
116 goto error;
117 }
118 kfree(st);
119 return ret;
120error:
121 kfree(st);
122 return ERR_PTR(err);
123}
124
125/**
126 * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
127 * @dir: directory inode that is being created
128 * @dentry: dentry that is being deleted
129 * @mode: create permissions
130 * @nd: path information
131 *
132 */
133
static int
v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
		struct nameidata *nd)
{
	int err = 0;
	char *name = NULL;
	gid_t gid;
	int flags;
	mode_t mode;
	struct v9fs_session_info *v9ses;
	struct p9_fid *fid = NULL;
	struct p9_fid *dfid, *ofid;
	struct file *filp;
	struct p9_qid qid;
	struct inode *inode;
	struct posix_acl *pacl = NULL, *dacl = NULL;

	v9ses = v9fs_inode2v9ses(dir);
	if (nd && nd->flags & LOOKUP_OPEN)
		flags = nd->intent.open.flags - 1;
	else {
		/*
		 * A create call without LOOKUP_OPEN comes from mknod on a
		 * regular file, so use the mknod operation instead.
		 */
		return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
	}

	name = (char *) dentry->d_name.name;
	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
			"mode:0x%x\n", name, flags, omode);

	dfid = v9fs_fid_lookup(dentry->d_parent);
	if (IS_ERR(dfid)) {
		err = PTR_ERR(dfid);
		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
		return err;
	}

	/* clone a fid to use for creation */
	ofid = p9_client_walk(dfid, 0, NULL, 1);
	if (IS_ERR(ofid)) {
		err = PTR_ERR(ofid);
		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
		return err;
	}

	gid = v9fs_get_fsgid_for_create(dir);

	mode = omode;
	/* Update mode based on ACL value */
	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
	if (err) {
		P9_DPRINTK(P9_DEBUG_VFS,
			"Failed to get acl values in creat %d\n", err);
		goto error;
	}
	err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
	if (err < 0) {
		P9_DPRINTK(P9_DEBUG_VFS,
			"p9_client_create_dotl failed in creat %d\n",
			err);
		goto error;
	}

	/* instantiate inode and assign the unopened fid to the dentry */
	fid = p9_client_walk(dfid, 1, &name, 1);
	if (IS_ERR(fid)) {
		err = PTR_ERR(fid);
		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
		fid = NULL;
		goto error;
	}
	inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
		goto error;
	}
	d_instantiate(dentry, inode);
	err = v9fs_fid_add(dentry, fid);
	if (err < 0)
		goto error;

	/* Now set the ACL based on the default value */
	v9fs_set_create_acl(dentry, dacl, pacl);

	/* Since we are opening a file, assign the open fid to the file */
	filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
	if (IS_ERR(filp)) {
		p9_client_clunk(ofid);
		return PTR_ERR(filp);
	}
	filp->private_data = ofid;
	return 0;

error:
	if (ofid)
		p9_client_clunk(ofid);
	if (fid)
		p9_client_clunk(fid);
	return err;
}

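/*
 * Editor's aside: from user space this handler is reached by open(2) with
 * O_CREAT on a mounted v9fs tree. A minimal sketch (the path is
 * illustrative):
 */
#if 0	/* user-space example, not part of this file */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/9p/newfile", O_CREAT | O_EXCL | O_RDWR, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	close(fd);
	return 0;
}
#endif
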
/**
 * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
 * @dir: inode of the parent directory
 * @dentry: dentry of the directory to be created
 * @omode: mode for the new directory
 *
 */

static int v9fs_vfs_mkdir_dotl(struct inode *dir,
			       struct dentry *dentry, int omode)
{
	int err;
	struct v9fs_session_info *v9ses;
	struct p9_fid *fid = NULL, *dfid = NULL;
	gid_t gid;
	char *name;
	mode_t mode;
	struct inode *inode;
	struct p9_qid qid;
	struct dentry *dir_dentry;
	struct posix_acl *dacl = NULL, *pacl = NULL;

	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
	err = 0;
	v9ses = v9fs_inode2v9ses(dir);

	omode |= S_IFDIR;
	if (dir->i_mode & S_ISGID)
		omode |= S_ISGID;

	dir_dentry = v9fs_dentry_from_dir_inode(dir);
	dfid = v9fs_fid_lookup(dir_dentry);
	if (IS_ERR(dfid)) {
		err = PTR_ERR(dfid);
		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
		dfid = NULL;
		goto error;
	}

	gid = v9fs_get_fsgid_for_create(dir);
	mode = omode;
	/* Update mode based on ACL value */
	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
	if (err) {
		P9_DPRINTK(P9_DEBUG_VFS,
			"Failed to get acl values in mkdir %d\n", err);
		goto error;
	}
	name = (char *) dentry->d_name.name;
	err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
	if (err < 0)
		goto error;

	/* instantiate inode and assign the unopened fid to the dentry */
	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
		fid = p9_client_walk(dfid, 1, &name, 1);
		if (IS_ERR(fid)) {
			err = PTR_ERR(fid);
			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
				err);
			fid = NULL;
			goto error;
		}

		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
				err);
			goto error;
		}
		d_instantiate(dentry, inode);
		err = v9fs_fid_add(dentry, fid);
		if (err < 0)
			goto error;
		fid = NULL;
	} else {
		/*
		 * Not in cached mode. There is no need to populate the
		 * inode with stat, but we still need an inode so that the
		 * default ACL can be set via the dentry.
		 */
		inode = v9fs_get_inode(dir->i_sb, mode);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			goto error;
		}
		d_instantiate(dentry, inode);
	}
	/* Now set the ACL based on the default value */
	v9fs_set_create_acl(dentry, dacl, pacl);

error:
	if (fid)
		p9_client_clunk(fid);
	return err;
}

static int
v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
		 struct kstat *stat)
{
	int err;
	struct v9fs_session_info *v9ses;
	struct p9_fid *fid;
	struct p9_stat_dotl *st;

	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
	err = -EPERM;
	v9ses = v9fs_inode2v9ses(dentry->d_inode);
	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
		return simple_getattr(mnt, dentry, stat);

	fid = v9fs_fid_lookup(dentry);
	if (IS_ERR(fid))
		return PTR_ERR(fid);

	/*
	 * Ask for all the fields in the stat structure. The server will
	 * return whatever fields it supports.
	 */
	st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
	if (IS_ERR(st))
		return PTR_ERR(st);

	v9fs_stat2inode_dotl(st, dentry->d_inode);
	generic_fillattr(dentry->d_inode, stat);
	/* Change block size to what the server returned */
	stat->blksize = st->st_blksize;

	kfree(st);
	return 0;
}

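/*
 * Editor's aside: on a non-cached mount, stat(2) funnels into
 * v9fs_vfs_getattr_dotl(), so st_blksize is whatever the server reported.
 * A minimal sketch (the path is illustrative):
 */
#if 0	/* user-space example, not part of this file */
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;

	if (stat("/mnt/9p/file", &st)) {
		perror("stat");
		return 1;
	}
	printf("server block size: %ld\n", (long) st.st_blksize);
	return 0;
}
#endif
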
/**
 * v9fs_vfs_setattr_dotl - set file metadata
 * @dentry: file whose metadata to set
 * @iattr: metadata assignment structure
 *
 */

int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
{
	int retval;
	struct v9fs_session_info *v9ses;
	struct p9_fid *fid;
	struct p9_iattr_dotl p9attr;

	P9_DPRINTK(P9_DEBUG_VFS, "\n");

	retval = inode_change_ok(dentry->d_inode, iattr);
	if (retval)
		return retval;

	p9attr.valid = iattr->ia_valid;
	p9attr.mode = iattr->ia_mode;
	p9attr.uid = iattr->ia_uid;
	p9attr.gid = iattr->ia_gid;
	p9attr.size = iattr->ia_size;
	p9attr.atime_sec = iattr->ia_atime.tv_sec;
	p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
	p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
	p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;

	retval = -EPERM;
	v9ses = v9fs_inode2v9ses(dentry->d_inode);
	fid = v9fs_fid_lookup(dentry);
	if (IS_ERR(fid))
		return PTR_ERR(fid);

	retval = p9_client_setattr(fid, &p9attr);
	if (retval < 0)
		return retval;

	if ((iattr->ia_valid & ATTR_SIZE) &&
	    iattr->ia_size != i_size_read(dentry->d_inode)) {
		retval = vmtruncate(dentry->d_inode, iattr->ia_size);
		if (retval)
			return retval;
	}

	setattr_copy(dentry->d_inode, iattr);
	mark_inode_dirty(dentry->d_inode);
	if (iattr->ia_valid & ATTR_MODE) {
		/* We also want to update ACL when we update mode bits */
		retval = v9fs_acl_chmod(dentry);
		if (retval < 0)
			return retval;
	}
	return 0;
}

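/*
 * Editor's aside: each metadata syscall sets a different subset of iattr
 * fields, and hence a different p9attr.valid mask above. A minimal sketch
 * (the path is illustrative):
 */
#if 0	/* user-space example, not part of this file */
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	if (chmod("/mnt/9p/file", 0600))	/* ATTR_MODE */
		perror("chmod");
	if (truncate("/mnt/9p/file", 4096))	/* ATTR_SIZE */
		perror("truncate");
	return 0;
}
#endif
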
/**
 * v9fs_stat2inode_dotl - populate an inode structure with stat info
 * @stat: stat structure
 * @inode: inode to populate
 *
 */

void
v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
{
	if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
		inode->i_atime.tv_sec = stat->st_atime_sec;
		inode->i_atime.tv_nsec = stat->st_atime_nsec;
		inode->i_mtime.tv_sec = stat->st_mtime_sec;
		inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
		inode->i_ctime.tv_sec = stat->st_ctime_sec;
		inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
		inode->i_uid = stat->st_uid;
		inode->i_gid = stat->st_gid;
		inode->i_nlink = stat->st_nlink;
		inode->i_mode = stat->st_mode;
		inode->i_rdev = new_decode_dev(stat->st_rdev);

		if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
			init_special_inode(inode, inode->i_mode, inode->i_rdev);

		i_size_write(inode, stat->st_size);
		inode->i_blocks = stat->st_blocks;
	} else {
		if (stat->st_result_mask & P9_STATS_ATIME) {
			inode->i_atime.tv_sec = stat->st_atime_sec;
			inode->i_atime.tv_nsec = stat->st_atime_nsec;
		}
		if (stat->st_result_mask & P9_STATS_MTIME) {
			inode->i_mtime.tv_sec = stat->st_mtime_sec;
			inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
		}
		if (stat->st_result_mask & P9_STATS_CTIME) {
			inode->i_ctime.tv_sec = stat->st_ctime_sec;
			inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
		}
		if (stat->st_result_mask & P9_STATS_UID)
			inode->i_uid = stat->st_uid;
		if (stat->st_result_mask & P9_STATS_GID)
			inode->i_gid = stat->st_gid;
		if (stat->st_result_mask & P9_STATS_NLINK)
			inode->i_nlink = stat->st_nlink;
		if (stat->st_result_mask & P9_STATS_MODE) {
			inode->i_mode = stat->st_mode;
			if ((S_ISBLK(inode->i_mode)) ||
						(S_ISCHR(inode->i_mode)))
				init_special_inode(inode, inode->i_mode,
							inode->i_rdev);
		}
		if (stat->st_result_mask & P9_STATS_RDEV)
			inode->i_rdev = new_decode_dev(stat->st_rdev);
		if (stat->st_result_mask & P9_STATS_SIZE)
			i_size_write(inode, stat->st_size);
		if (stat->st_result_mask & P9_STATS_BLOCKS)
			inode->i_blocks = stat->st_blocks;
	}
	if (stat->st_result_mask & P9_STATS_GEN)
		inode->i_generation = stat->st_gen;

	/*
	 * Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
	 * because the inode structure does not have fields for them.
	 */
}

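/*
 * Editor's aside: the function above is an instance of the "validity mask"
 * pattern: a fast path when every basic field is present, and per-field
 * checks otherwise. A self-contained toy version, for illustration only:
 */
#if 0	/* user-space example, not part of this file */
#include <stdio.h>

#define ST_MODE		(1 << 0)
#define ST_SIZE		(1 << 1)
#define ST_BASIC	(ST_MODE | ST_SIZE)

struct reply { unsigned mask; unsigned mode; long size; };
struct node { unsigned mode; long size; };

static void apply(const struct reply *r, struct node *n)
{
	if ((r->mask & ST_BASIC) == ST_BASIC) {
		/* fast path: every basic field is valid */
		n->mode = r->mode;
		n->size = r->size;
		return;
	}
	if (r->mask & ST_MODE)	/* otherwise, field by field */
		n->mode = r->mode;
	if (r->mask & ST_SIZE)
		n->size = r->size;
}

int main(void)
{
	struct reply r = { ST_SIZE, 0, 42 };
	struct node n = { 0644, 0 };

	apply(&r, &n);
	printf("mode %o size %ld\n", n.mode, n.size);
	return 0;
}
#endif
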
static int
v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
		const char *symname)
{
	struct v9fs_session_info *v9ses;
	struct p9_fid *dfid;
	struct p9_fid *fid = NULL;
	struct inode *inode;
	struct p9_qid qid;
	char *name;
	int err;
	gid_t gid;

	name = (char *) dentry->d_name.name;
	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
			dir->i_ino, name, symname);
	v9ses = v9fs_inode2v9ses(dir);

	dfid = v9fs_fid_lookup(dentry->d_parent);
	if (IS_ERR(dfid)) {
		err = PTR_ERR(dfid);
		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
		return err;
	}

	gid = v9fs_get_fsgid_for_create(dir);

	/* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
	err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);

	if (err < 0) {
		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
		goto error;
	}

	if (v9ses->cache) {
		/* Now walk from the parent so we can get an unopened fid. */
		fid = p9_client_walk(dfid, 1, &name, 1);
		if (IS_ERR(fid)) {
			err = PTR_ERR(fid);
			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
					err);
			fid = NULL;
			goto error;
		}

		/* instantiate inode and assign the unopened fid to dentry */
		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
					err);
			goto error;
		}
		d_instantiate(dentry, inode);
		err = v9fs_fid_add(dentry, fid);
		if (err < 0)
			goto error;
		fid = NULL;
	} else {
		/* Not in cached mode. No need to populate inode with stat */
		inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			goto error;
		}
		d_instantiate(dentry, inode);
	}

error:
	if (fid)
		p9_client_clunk(fid);

	return err;
}

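/*
 * Editor's aside: symlink(2) reaches the handler above, and readlink(2)
 * ends up in v9fs_vfs_follow_link_dotl() via generic_readlink. A minimal
 * round trip (paths are illustrative):
 */
#if 0	/* user-space example, not part of this file */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;

	if (symlink("target", "/mnt/9p/alink")) {
		perror("symlink");
		return 1;
	}
	n = readlink("/mnt/9p/alink", buf, sizeof(buf) - 1);
	if (n < 0) {
		perror("readlink");
		return 1;
	}
	buf[n] = '\0';
	printf("-> %s\n", buf);
	return 0;
}
#endif
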
/**
 * v9fs_vfs_link_dotl - create a hardlink for dotl
 * @old_dentry: dentry for file to link to
 * @dir: inode destination for new link
 * @dentry: dentry for link
 *
 */

static int
v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
		struct dentry *dentry)
{
	int err;
	struct p9_fid *dfid, *oldfid;
	char *name;
	struct v9fs_session_info *v9ses;
	struct dentry *dir_dentry;

	P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
			dir->i_ino, old_dentry->d_name.name,
			dentry->d_name.name);

	v9ses = v9fs_inode2v9ses(dir);
	dir_dentry = v9fs_dentry_from_dir_inode(dir);
	dfid = v9fs_fid_lookup(dir_dentry);
	if (IS_ERR(dfid))
		return PTR_ERR(dfid);

	oldfid = v9fs_fid_lookup(old_dentry);
	if (IS_ERR(oldfid))
		return PTR_ERR(oldfid);

	name = (char *) dentry->d_name.name;

	err = p9_client_link(dfid, oldfid, name);

	if (err < 0) {
		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
		return err;
	}

	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
		/* Get the latest stat info from server. */
		struct p9_fid *fid;
		struct p9_stat_dotl *st;

		fid = v9fs_fid_lookup(old_dentry);
		if (IS_ERR(fid))
			return PTR_ERR(fid);

		st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
		if (IS_ERR(st))
			return PTR_ERR(st);

		v9fs_stat2inode_dotl(st, old_dentry->d_inode);

		kfree(st);
	} else {
		/*
		 * Caching disabled. There is no need to get up-to-date stat
		 * info; this dentry will be released immediately, so just
		 * hold the inode.
		 */
		ihold(old_dentry->d_inode);
	}
	d_instantiate(dentry, old_dentry->d_inode);

	return err;
}

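/*
 * Editor's aside: link(2) reaches the handler above; on a caching mount
 * the extra getattr keeps st_nlink in sync with the server. A minimal
 * sketch (paths are illustrative):
 */
#if 0	/* user-space example, not part of this file */
#include <stdio.h>
#include <unistd.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;

	if (link("/mnt/9p/file", "/mnt/9p/hardlink")) {
		perror("link");
		return 1;
	}
	if (!stat("/mnt/9p/file", &st))
		printf("nlink now %ld\n", (long) st.st_nlink);
	return 0;
}
#endif
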
/**
 * v9fs_vfs_mknod_dotl - create a special file
 * @dir: inode of the parent directory
 * @dentry: dentry for file
 * @omode: mode for creation
 * @rdev: device associated with special file
 *
 */
static int
v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
		dev_t rdev)
{
	int err;
	char *name;
	mode_t mode;
	struct v9fs_session_info *v9ses;
	struct p9_fid *fid = NULL, *dfid = NULL;
	struct inode *inode;
	gid_t gid;
	struct p9_qid qid;
	struct dentry *dir_dentry;
	struct posix_acl *dacl = NULL, *pacl = NULL;

	P9_DPRINTK(P9_DEBUG_VFS,
		" %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
		dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));

	if (!new_valid_dev(rdev))
		return -EINVAL;

	v9ses = v9fs_inode2v9ses(dir);
	dir_dentry = v9fs_dentry_from_dir_inode(dir);
	dfid = v9fs_fid_lookup(dir_dentry);
	if (IS_ERR(dfid)) {
		err = PTR_ERR(dfid);
		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
		dfid = NULL;
		goto error;
	}

	gid = v9fs_get_fsgid_for_create(dir);
	mode = omode;
	/* Update mode based on ACL value */
	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
	if (err) {
		P9_DPRINTK(P9_DEBUG_VFS,
			"Failed to get acl values in mknod %d\n", err);
		goto error;
	}
	name = (char *) dentry->d_name.name;

	err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
	if (err < 0)
		goto error;

	/* instantiate inode and assign the unopened fid to the dentry */
	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
		fid = p9_client_walk(dfid, 1, &name, 1);
		if (IS_ERR(fid)) {
			err = PTR_ERR(fid);
			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
				err);
			fid = NULL;
			goto error;
		}

		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
				err);
			goto error;
		}
		d_instantiate(dentry, inode);
		err = v9fs_fid_add(dentry, fid);
		if (err < 0)
			goto error;
		fid = NULL;
	} else {
		/*
		 * Not in cached mode. There is no need to populate the inode
		 * with stat, but the socket syscall expects an instantiated
		 * dentry, so instantiate one here.
		 */
		inode = v9fs_get_inode(dir->i_sb, mode);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			goto error;
		}
		d_instantiate(dentry, inode);
	}
	/* Now set the ACL based on the default value */
	v9fs_set_create_acl(dentry, dacl, pacl);
error:
	if (fid)
		p9_client_clunk(fid);
	return err;
}

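/*
 * Editor's aside: mknod(2) reaches the handler above. A FIFO needs no
 * privileges, while device nodes (S_IFCHR/S_IFBLK) additionally require
 * CAP_MKNOD. A minimal sketch (the path is illustrative):
 */
#if 0	/* user-space example, not part of this file */
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	if (mknod("/mnt/9p/afifo", S_IFIFO | 0644, 0)) {
		perror("mknod");
		return 1;
	}
	return 0;
}
#endif
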
/**
 * v9fs_vfs_follow_link_dotl - follow a symlink path
 * @dentry: dentry for symlink
 * @nd: nameidata
 *
 */

static void *
v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
{
	int retval;
	struct p9_fid *fid;
	char *link = __getname();
	char *target;

	P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);

	if (!link) {
		link = ERR_PTR(-ENOMEM);
		goto ndset;
	}
	fid = v9fs_fid_lookup(dentry);
	if (IS_ERR(fid)) {
		__putname(link);
		link = ERR_CAST(fid);
		goto ndset;
	}
	retval = p9_client_readlink(fid, &target);
	if (!retval) {
		/* bound the copy: __getname() buffers are PATH_MAX bytes */
		strlcpy(link, target, PATH_MAX);
		kfree(target);
		goto ndset;
	}
	__putname(link);
	link = ERR_PTR(retval);
ndset:
	nd_set_link(nd, link);
	return NULL;
}

const struct inode_operations v9fs_dir_inode_operations_dotl = {
	.create = v9fs_vfs_create_dotl,
	.lookup = v9fs_vfs_lookup,
	.link = v9fs_vfs_link_dotl,
	.symlink = v9fs_vfs_symlink_dotl,
	.unlink = v9fs_vfs_unlink,
	.mkdir = v9fs_vfs_mkdir_dotl,
	.rmdir = v9fs_vfs_rmdir,
	.mknod = v9fs_vfs_mknod_dotl,
	.rename = v9fs_vfs_rename,
	.getattr = v9fs_vfs_getattr_dotl,
	.setattr = v9fs_vfs_setattr_dotl,
	.setxattr = generic_setxattr,
	.getxattr = generic_getxattr,
	.removexattr = generic_removexattr,
	.listxattr = v9fs_listxattr,
	.check_acl = v9fs_check_acl,
};

const struct inode_operations v9fs_file_inode_operations_dotl = {
	.getattr = v9fs_vfs_getattr_dotl,
	.setattr = v9fs_vfs_setattr_dotl,
	.setxattr = generic_setxattr,
	.getxattr = generic_getxattr,
	.removexattr = generic_removexattr,
	.listxattr = v9fs_listxattr,
	.check_acl = v9fs_check_acl,
};

const struct inode_operations v9fs_symlink_inode_operations_dotl = {
	.readlink = generic_readlink,
	.follow_link = v9fs_vfs_follow_link_dotl,
	.put_link = v9fs_vfs_put_link,
	.getattr = v9fs_vfs_getattr_dotl,
	.setattr = v9fs_vfs_setattr_dotl,
	.setxattr = generic_setxattr,
	.getxattr = generic_getxattr,
	.removexattr = generic_removexattr,
	.listxattr = v9fs_listxattr,
};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index c55c614500ad..dbaabe3b8131 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -141,6 +141,11 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
141 } 141 }
142 v9fs_fill_super(sb, v9ses, flags, data); 142 v9fs_fill_super(sb, v9ses, flags, data);
143 143
144 if (v9ses->cache)
145 sb->s_d_op = &v9fs_cached_dentry_operations;
146 else
147 sb->s_d_op = &v9fs_dentry_operations;
148
144 inode = v9fs_get_inode(sb, S_IFDIR | mode); 149 inode = v9fs_get_inode(sb, S_IFDIR | mode);
145 if (IS_ERR(inode)) { 150 if (IS_ERR(inode)) {
146 retval = PTR_ERR(inode); 151 retval = PTR_ERR(inode);
@@ -217,9 +222,6 @@ static void v9fs_kill_super(struct super_block *s)
217 222
218 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); 223 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
219 224
220 if (s->s_root)
221 v9fs_dentry_release(s->s_root); /* clunk root */
222
223 kill_anon_super(s); 225 kill_anon_super(s);
224 226
225 v9fs_session_cancel(v9ses); 227 v9fs_session_cancel(v9ses);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 43ec7df84336..d288773871b3 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -133,7 +133,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
133 "p9_client_xattrcreate failed %d\n", retval); 133 "p9_client_xattrcreate failed %d\n", retval);
134 goto error; 134 goto error;
135 } 135 }
136 msize = fid->clnt->msize;; 136 msize = fid->clnt->msize;
137 while (value_len) { 137 while (value_len) {
138 if (value_len > (msize - P9_IOHDRSZ)) 138 if (value_len > (msize - P9_IOHDRSZ))
139 write_count = msize - P9_IOHDRSZ; 139 write_count = msize - P9_IOHDRSZ;
diff --git a/fs/Kconfig b/fs/Kconfig
index 771f457402d4..3db9caa57edc 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -30,15 +30,6 @@ config FS_MBCACHE
30source "fs/reiserfs/Kconfig" 30source "fs/reiserfs/Kconfig"
31source "fs/jfs/Kconfig" 31source "fs/jfs/Kconfig"
32 32
33config FS_POSIX_ACL
34# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4)
35#
36# NOTE: you can implement Posix ACLs without these helpers (XFS does).
37# Never use this symbol for ifdefs.
38#
39 bool
40 default n
41
42source "fs/xfs/Kconfig" 33source "fs/xfs/Kconfig"
43source "fs/gfs2/Kconfig" 34source "fs/gfs2/Kconfig"
44source "fs/ocfs2/Kconfig" 35source "fs/ocfs2/Kconfig"
@@ -47,11 +38,19 @@ source "fs/nilfs2/Kconfig"
47 38
48endif # BLOCK 39endif # BLOCK
49 40
41# Posix ACL utility routines
42#
43# Note: Posix ACLs can be implemented without these helpers. Never use
44# this symbol for ifdefs in core code.
45#
46config FS_POSIX_ACL
47 def_bool n
48
50config EXPORTFS 49config EXPORTFS
51 tristate 50 tristate
52 51
53config FILE_LOCKING 52config FILE_LOCKING
54 bool "Enable POSIX file locking API" if EMBEDDED 53 bool "Enable POSIX file locking API" if EXPERT
55 default y 54 default y
56 help 55 help
57 This option enables standard file locking support, required 56 This option enables standard file locking support, required
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index bf7693c384f9..3b4a764ed780 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -276,7 +276,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
276 struct object_info obj; 276 struct object_info obj;
277 int error; 277 int error;
278 278
279 d_set_d_op(dentry, &adfs_dentry_operations);
280 lock_kernel(); 279 lock_kernel();
281 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); 280 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
282 if (error == 0) { 281 if (error == 0) {
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index a4041b52fbca..2d7954049fbe 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -473,6 +473,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
473 asb->s_namelen = ADFS_F_NAME_LEN; 473 asb->s_namelen = ADFS_F_NAME_LEN;
474 } 474 }
475 475
476 sb->s_d_op = &adfs_dentry_operations;
476 root = adfs_iget(sb, &root_obj); 477 root = adfs_iget(sb, &root_obj);
477 sb->s_root = d_alloc_root(root); 478 sb->s_root = d_alloc_root(root);
478 if (!sb->s_root) { 479 if (!sb->s_root) {
@@ -483,8 +484,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
483 kfree(asb->s_map); 484 kfree(asb->s_map);
484 adfs_error(sb, "get root inode failed\n"); 485 adfs_error(sb, "get root inode failed\n");
485 goto error; 486 goto error;
486 } else 487 }
487 d_set_d_op(sb->s_root, &adfs_dentry_operations);
488 unlock_kernel(); 488 unlock_kernel();
489 return 0; 489 return 0;
490 490
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a8cbdeb34025..0e95f73a7023 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -201,6 +201,7 @@ extern const struct address_space_operations affs_aops;
201extern const struct address_space_operations affs_aops_ofs; 201extern const struct address_space_operations affs_aops_ofs;
202 202
203extern const struct dentry_operations affs_dentry_operations; 203extern const struct dentry_operations affs_dentry_operations;
204extern const struct dentry_operations affs_intl_dentry_operations;
204 205
205static inline void 206static inline void
206affs_set_blocksize(struct super_block *sb, int size) 207affs_set_blocksize(struct super_block *sb, int size)
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 944a4042fb65..e3e9efc1fdd8 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -32,7 +32,7 @@ const struct dentry_operations affs_dentry_operations = {
32 .d_compare = affs_compare_dentry, 32 .d_compare = affs_compare_dentry,
33}; 33};
34 34
35static const struct dentry_operations affs_intl_dentry_operations = { 35const struct dentry_operations affs_intl_dentry_operations = {
36 .d_hash = affs_intl_hash_dentry, 36 .d_hash = affs_intl_hash_dentry,
37 .d_compare = affs_intl_compare_dentry, 37 .d_compare = affs_intl_compare_dentry,
38}; 38};
@@ -240,7 +240,6 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
240 if (IS_ERR(inode)) 240 if (IS_ERR(inode))
241 return ERR_CAST(inode); 241 return ERR_CAST(inode);
242 } 242 }
243 d_set_d_op(dentry, AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations);
244 d_add(dentry, inode); 243 d_add(dentry, inode);
245 return NULL; 244 return NULL;
246} 245}
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d39081bbe7ce..b31507d0f9b9 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -477,12 +477,16 @@ got_root:
477 goto out_error_noinode; 477 goto out_error_noinode;
478 } 478 }
479 479
480 if (AFFS_SB(sb)->s_flags & SF_INTL)
481 sb->s_d_op = &affs_intl_dentry_operations;
482 else
483 sb->s_d_op = &affs_dentry_operations;
484
480 sb->s_root = d_alloc_root(root_inode); 485 sb->s_root = d_alloc_root(root_inode);
481 if (!sb->s_root) { 486 if (!sb->s_root) {
482 printk(KERN_ERR "AFFS: Get root inode failed\n"); 487 printk(KERN_ERR "AFFS: Get root inode failed\n");
483 goto out_error; 488 goto out_error;
484 } 489 }
485 d_set_d_op(sb->s_root, &affs_dentry_operations);
486 490
487 pr_debug("AFFS: s_flags=%lX\n",sb->s_flags); 491 pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
488 return 0; 492 return 0;
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a3bcec75c54a..1c8c6cc6de30 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -289,7 +289,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
289 call->server = server; 289 call->server = server;
290 290
291 INIT_WORK(&call->work, SRXAFSCB_CallBack); 291 INIT_WORK(&call->work, SRXAFSCB_CallBack);
292 schedule_work(&call->work); 292 queue_work(afs_wq, &call->work);
293 return 0; 293 return 0;
294} 294}
295 295
@@ -336,7 +336,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
336 call->server = server; 336 call->server = server;
337 337
338 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); 338 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
339 schedule_work(&call->work); 339 queue_work(afs_wq, &call->work);
340 return 0; 340 return 0;
341} 341}
342 342
@@ -367,7 +367,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
367 call->server = server; 367 call->server = server;
368 368
369 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); 369 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
370 schedule_work(&call->work); 370 queue_work(afs_wq, &call->work);
371 return 0; 371 return 0;
372} 372}
373 373
@@ -400,7 +400,7 @@ static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
400 call->state = AFS_CALL_REPLYING; 400 call->state = AFS_CALL_REPLYING;
401 401
402 INIT_WORK(&call->work, SRXAFSCB_Probe); 402 INIT_WORK(&call->work, SRXAFSCB_Probe);
403 schedule_work(&call->work); 403 queue_work(afs_wq, &call->work);
404 return 0; 404 return 0;
405} 405}
406 406
@@ -496,7 +496,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
496 call->state = AFS_CALL_REPLYING; 496 call->state = AFS_CALL_REPLYING;
497 497
498 INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); 498 INIT_WORK(&call->work, SRXAFSCB_ProbeUuid);
499 schedule_work(&call->work); 499 queue_work(afs_wq, &call->work);
500 return 0; 500 return 0;
501} 501}
502 502
@@ -580,6 +580,6 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
580 call->state = AFS_CALL_REPLYING; 580 call->state = AFS_CALL_REPLYING;
581 581
582 INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself); 582 INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself);
583 schedule_work(&call->work); 583 queue_work(afs_wq, &call->work);
584 return 0; 584 return 0;
585} 585}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 34a3263d60a4..20c106f24927 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -62,10 +62,11 @@ const struct inode_operations afs_dir_inode_operations = {
62 .setattr = afs_setattr, 62 .setattr = afs_setattr,
63}; 63};
64 64
65static const struct dentry_operations afs_fs_dentry_operations = { 65const struct dentry_operations afs_fs_dentry_operations = {
66 .d_revalidate = afs_d_revalidate, 66 .d_revalidate = afs_d_revalidate,
67 .d_delete = afs_d_delete, 67 .d_delete = afs_d_delete,
68 .d_release = afs_d_release, 68 .d_release = afs_d_release,
69 .d_automount = afs_d_automount,
69}; 70};
70 71
71#define AFS_DIR_HASHTBL_SIZE 128 72#define AFS_DIR_HASHTBL_SIZE 128
@@ -582,8 +583,6 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
582 } 583 }
583 584
584success: 585success:
585 d_set_d_op(dentry, &afs_fs_dentry_operations);
586
587 d_add(dentry, inode); 586 d_add(dentry, inode);
588 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }", 587 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
589 fid.vnode, 588 fid.vnode,
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0747339011c3..db66c5201474 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -184,7 +184,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
184 inode->i_generation = 0; 184 inode->i_generation = 0;
185 185
186 set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); 186 set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
187 inode->i_flags |= S_NOATIME; 187 set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
188 inode->i_flags |= S_AUTOMOUNT | S_NOATIME;
188 unlock_new_inode(inode); 189 unlock_new_inode(inode);
189 _leave(" = %p", inode); 190 _leave(" = %p", inode);
190 return inode; 191 return inode;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6d4bc1c8ff60..5a9b6843bac1 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -486,6 +486,7 @@ extern bool afs_cm_incoming_call(struct afs_call *);
486 * dir.c 486 * dir.c
487 */ 487 */
488extern const struct inode_operations afs_dir_inode_operations; 488extern const struct inode_operations afs_dir_inode_operations;
489extern const struct dentry_operations afs_fs_dentry_operations;
489extern const struct file_operations afs_dir_file_operations; 490extern const struct file_operations afs_dir_file_operations;
490 491
491/* 492/*
@@ -576,6 +577,7 @@ extern int afs_drop_inode(struct inode *);
576/* 577/*
577 * main.c 578 * main.c
578 */ 579 */
580extern struct workqueue_struct *afs_wq;
579extern struct afs_uuid afs_uuid; 581extern struct afs_uuid afs_uuid;
580 582
581/* 583/*
@@ -590,6 +592,7 @@ extern const struct inode_operations afs_mntpt_inode_operations;
590extern const struct inode_operations afs_autocell_inode_operations; 592extern const struct inode_operations afs_autocell_inode_operations;
591extern const struct file_operations afs_mntpt_file_operations; 593extern const struct file_operations afs_mntpt_file_operations;
592 594
595extern struct vfsmount *afs_d_automount(struct path *);
593extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); 596extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
594extern void afs_mntpt_kill_timer(void); 597extern void afs_mntpt_kill_timer(void);
595 598
diff --git a/fs/afs/main.c b/fs/afs/main.c
index cfd1cbe25b22..42dd2e499ed8 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -30,6 +30,7 @@ module_param(rootcell, charp, 0);
30MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); 30MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
31 31
32struct afs_uuid afs_uuid; 32struct afs_uuid afs_uuid;
33struct workqueue_struct *afs_wq;
33 34
34/* 35/*
35 * get a client UUID 36 * get a client UUID
@@ -87,10 +88,16 @@ static int __init afs_init(void)
87 if (ret < 0) 88 if (ret < 0)
88 return ret; 89 return ret;
89 90
91 /* create workqueue */
92 ret = -ENOMEM;
93 afs_wq = alloc_workqueue("afs", 0, 0);
94 if (!afs_wq)
95 return ret;
96
90 /* register the /proc stuff */ 97 /* register the /proc stuff */
91 ret = afs_proc_init(); 98 ret = afs_proc_init();
92 if (ret < 0) 99 if (ret < 0)
93 return ret; 100 goto error_proc;
94 101
95#ifdef CONFIG_AFS_FSCACHE 102#ifdef CONFIG_AFS_FSCACHE
96 /* we want to be able to cache */ 103 /* we want to be able to cache */
@@ -140,6 +147,8 @@ error_cell_init:
140error_cache: 147error_cache:
141#endif 148#endif
142 afs_proc_cleanup(); 149 afs_proc_cleanup();
150error_proc:
151 destroy_workqueue(afs_wq);
143 rcu_barrier(); 152 rcu_barrier();
144 printk(KERN_ERR "kAFS: failed to register: %d\n", ret); 153 printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
145 return ret; 154 return ret;
@@ -163,7 +172,7 @@ static void __exit afs_exit(void)
163 afs_purge_servers(); 172 afs_purge_servers();
164 afs_callback_update_kill(); 173 afs_callback_update_kill();
165 afs_vlocation_purge(); 174 afs_vlocation_purge();
166 flush_scheduled_work(); 175 destroy_workqueue(afs_wq);
167 afs_cell_purge(); 176 afs_cell_purge();
168#ifdef CONFIG_AFS_FSCACHE 177#ifdef CONFIG_AFS_FSCACHE
169 fscache_unregister_netfs(&afs_cache_netfs); 178 fscache_unregister_netfs(&afs_cache_netfs);
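The hunks above move kAFS from the shared system workqueue onto a private
afs_wq, so pending work can be flushed and freed deterministically at module
exit. A minimal sketch of that alloc_workqueue()/queue_work()/
destroy_workqueue() lifecycle, assuming a toy module (the names demo_wq and
demo_work_fn are illustrative):

	#include <linux/module.h>
	#include <linux/workqueue.h>

	static struct workqueue_struct *demo_wq;

	static void demo_work_fn(struct work_struct *work)
	{
		pr_info("demo work ran\n");
	}

	static DECLARE_WORK(demo_work, demo_work_fn);

	static int __init demo_init(void)
	{
		/* a private queue, independent of the system workqueue */
		demo_wq = alloc_workqueue("demo", 0, 0);
		if (!demo_wq)
			return -ENOMEM;
		queue_work(demo_wq, &demo_work);
		return 0;
	}

	static void __exit demo_exit(void)
	{
		/* flushes any pending work before freeing the queue */
		destroy_workqueue(demo_wq);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");

Because destroy_workqueue() flushes the queue first, the exit path above no
longer needs the old flush_scheduled_work() call.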
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 6153417caf57..aa59184151d0 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -24,7 +24,6 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
24 struct dentry *dentry, 24 struct dentry *dentry,
25 struct nameidata *nd); 25 struct nameidata *nd);
26static int afs_mntpt_open(struct inode *inode, struct file *file); 26static int afs_mntpt_open(struct inode *inode, struct file *file);
27static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd);
28static void afs_mntpt_expiry_timed_out(struct work_struct *work); 27static void afs_mntpt_expiry_timed_out(struct work_struct *work);
29 28
30const struct file_operations afs_mntpt_file_operations = { 29const struct file_operations afs_mntpt_file_operations = {
@@ -34,13 +33,11 @@ const struct file_operations afs_mntpt_file_operations = {
34 33
35const struct inode_operations afs_mntpt_inode_operations = { 34const struct inode_operations afs_mntpt_inode_operations = {
36 .lookup = afs_mntpt_lookup, 35 .lookup = afs_mntpt_lookup,
37 .follow_link = afs_mntpt_follow_link,
38 .readlink = page_readlink, 36 .readlink = page_readlink,
39 .getattr = afs_getattr, 37 .getattr = afs_getattr,
40}; 38};
41 39
42const struct inode_operations afs_autocell_inode_operations = { 40const struct inode_operations afs_autocell_inode_operations = {
43 .follow_link = afs_mntpt_follow_link,
44 .getattr = afs_getattr, 41 .getattr = afs_getattr,
45}; 42};
46 43
@@ -88,6 +85,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
88 _debug("symlink is a mountpoint"); 85 _debug("symlink is a mountpoint");
89 spin_lock(&vnode->lock); 86 spin_lock(&vnode->lock);
90 set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); 87 set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
88 vnode->vfs_inode.i_flags |= S_AUTOMOUNT;
91 spin_unlock(&vnode->lock); 89 spin_unlock(&vnode->lock);
92 } 90 }
93 91
@@ -238,52 +236,24 @@ error_no_devname:
238} 236}
239 237
240/* 238/*
241 * follow a link from a mountpoint directory, thus causing it to be mounted 239 * handle an automount point
242 */ 240 */
243static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) 241struct vfsmount *afs_d_automount(struct path *path)
244{ 242{
245 struct vfsmount *newmnt; 243 struct vfsmount *newmnt;
246 int err;
247 244
248 _enter("%p{%s},{%s:%p{%s},}", 245 _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name);
249 dentry,
250 dentry->d_name.name,
251 nd->path.mnt->mnt_devname,
252 dentry,
253 nd->path.dentry->d_name.name);
254
255 dput(nd->path.dentry);
256 nd->path.dentry = dget(dentry);
257 246
258 newmnt = afs_mntpt_do_automount(nd->path.dentry); 247 newmnt = afs_mntpt_do_automount(path->dentry);
259 if (IS_ERR(newmnt)) { 248 if (IS_ERR(newmnt))
260 path_put(&nd->path); 249 return newmnt;
261 return (void *)newmnt;
262 }
263
264 mntget(newmnt);
265 err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts);
266 switch (err) {
267 case 0:
268 path_put(&nd->path);
269 nd->path.mnt = newmnt;
270 nd->path.dentry = dget(newmnt->mnt_root);
271 schedule_delayed_work(&afs_mntpt_expiry_timer,
272 afs_mntpt_expiry_timeout * HZ);
273 break;
274 case -EBUSY:
275 /* someone else made a mount here whilst we were busy */
276 while (d_mountpoint(nd->path.dentry) &&
277 follow_down(&nd->path))
278 ;
279 err = 0;
280 default:
281 mntput(newmnt);
282 break;
283 }
284 250
285 _leave(" = %d", err); 251 mntget(newmnt); /* prevent immediate expiration */
286 return ERR_PTR(err); 252 mnt_set_expiry(newmnt, &afs_vfsmounts);
253 queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
254 afs_mntpt_expiry_timeout * HZ);
255 _leave(" = %p {%s}", newmnt, newmnt->mnt_devname);
256 return newmnt;
287} 257}
288 258
289/* 259/*
@@ -295,8 +265,8 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work)
295 265
296 if (!list_empty(&afs_vfsmounts)) { 266 if (!list_empty(&afs_vfsmounts)) {
297 mark_mounts_for_expiry(&afs_vfsmounts); 267 mark_mounts_for_expiry(&afs_vfsmounts);
298 schedule_delayed_work(&afs_mntpt_expiry_timer, 268 queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
299 afs_mntpt_expiry_timeout * HZ); 269 afs_mntpt_expiry_timeout * HZ);
300 } 270 }
301 271
302 _leave(""); 272 _leave("");
@@ -310,6 +280,5 @@ void afs_mntpt_kill_timer(void)
310 _enter(""); 280 _enter("");
311 281
312 ASSERT(list_empty(&afs_vfsmounts)); 282 ASSERT(list_empty(&afs_vfsmounts));
313 cancel_delayed_work(&afs_mntpt_expiry_timer); 283 cancel_delayed_work_sync(&afs_mntpt_expiry_timer);
314 flush_scheduled_work();
315} 284}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 654d8fdbf01f..e45a323aebb4 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -410,7 +410,7 @@ static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
410 if (!call) { 410 if (!call) {
411 /* its an incoming call for our callback service */ 411 /* its an incoming call for our callback service */
412 skb_queue_tail(&afs_incoming_calls, skb); 412 skb_queue_tail(&afs_incoming_calls, skb);
413 schedule_work(&afs_collect_incoming_call_work); 413 queue_work(afs_wq, &afs_collect_incoming_call_work);
414 } else { 414 } else {
415 /* route the messages directly to the appropriate call */ 415 /* route the messages directly to the appropriate call */
416 skb_queue_tail(&call->rx_queue, skb); 416 skb_queue_tail(&call->rx_queue, skb);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 9fdc7fe3a7bc..d59b7516e943 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -238,8 +238,8 @@ void afs_put_server(struct afs_server *server)
238 if (atomic_read(&server->usage) == 0) { 238 if (atomic_read(&server->usage) == 0) {
239 list_move_tail(&server->grave, &afs_server_graveyard); 239 list_move_tail(&server->grave, &afs_server_graveyard);
240 server->time_of_death = get_seconds(); 240 server->time_of_death = get_seconds();
241 schedule_delayed_work(&afs_server_reaper, 241 queue_delayed_work(afs_wq, &afs_server_reaper,
242 afs_server_timeout * HZ); 242 afs_server_timeout * HZ);
243 } 243 }
244 spin_unlock(&afs_server_graveyard_lock); 244 spin_unlock(&afs_server_graveyard_lock);
245 _leave(" [dead]"); 245 _leave(" [dead]");
@@ -285,10 +285,11 @@ static void afs_reap_server(struct work_struct *work)
285 expiry = server->time_of_death + afs_server_timeout; 285 expiry = server->time_of_death + afs_server_timeout;
286 if (expiry > now) { 286 if (expiry > now) {
287 delay = (expiry - now) * HZ; 287 delay = (expiry - now) * HZ;
288 if (!schedule_delayed_work(&afs_server_reaper, delay)) { 288 if (!queue_delayed_work(afs_wq, &afs_server_reaper,
289 delay)) {
289 cancel_delayed_work(&afs_server_reaper); 290 cancel_delayed_work(&afs_server_reaper);
290 schedule_delayed_work(&afs_server_reaper, 291 queue_delayed_work(afs_wq, &afs_server_reaper,
291 delay); 292 delay);
292 } 293 }
293 break; 294 break;
294 } 295 }
@@ -323,5 +324,5 @@ void __exit afs_purge_servers(void)
323{ 324{
324 afs_server_timeout = 0; 325 afs_server_timeout = 0;
325 cancel_delayed_work(&afs_server_reaper); 326 cancel_delayed_work(&afs_server_reaper);
326 schedule_delayed_work(&afs_server_reaper, 0); 327 queue_delayed_work(afs_wq, &afs_server_reaper, 0);
327} 328}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index f901a9d7c111..fb240e8766d6 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -336,6 +336,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
336 if (!root) 336 if (!root)
337 goto error; 337 goto error;
338 338
339 sb->s_d_op = &afs_fs_dentry_operations;
339 sb->s_root = root; 340 sb->s_root = root;
340 341
341 _leave(" = 0"); 342 _leave(" = 0");
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 9ac260d1361d..431984d2e372 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -507,8 +507,8 @@ void afs_put_vlocation(struct afs_vlocation *vl)
507 _debug("buried"); 507 _debug("buried");
508 list_move_tail(&vl->grave, &afs_vlocation_graveyard); 508 list_move_tail(&vl->grave, &afs_vlocation_graveyard);
509 vl->time_of_death = get_seconds(); 509 vl->time_of_death = get_seconds();
510 schedule_delayed_work(&afs_vlocation_reap, 510 queue_delayed_work(afs_wq, &afs_vlocation_reap,
511 afs_vlocation_timeout * HZ); 511 afs_vlocation_timeout * HZ);
512 512
513 /* suspend updates on this record */ 513 /* suspend updates on this record */
514 if (!list_empty(&vl->update)) { 514 if (!list_empty(&vl->update)) {
@@ -561,11 +561,11 @@ static void afs_vlocation_reaper(struct work_struct *work)
561 if (expiry > now) { 561 if (expiry > now) {
562 delay = (expiry - now) * HZ; 562 delay = (expiry - now) * HZ;
563 _debug("delay %lu", delay); 563 _debug("delay %lu", delay);
564 if (!schedule_delayed_work(&afs_vlocation_reap, 564 if (!queue_delayed_work(afs_wq, &afs_vlocation_reap,
565 delay)) { 565 delay)) {
566 cancel_delayed_work(&afs_vlocation_reap); 566 cancel_delayed_work(&afs_vlocation_reap);
567 schedule_delayed_work(&afs_vlocation_reap, 567 queue_delayed_work(afs_wq, &afs_vlocation_reap,
568 delay); 568 delay);
569 } 569 }
570 break; 570 break;
571 } 571 }
@@ -620,7 +620,7 @@ void afs_vlocation_purge(void)
620 destroy_workqueue(afs_vlocation_update_worker); 620 destroy_workqueue(afs_vlocation_update_worker);
621 621
622 cancel_delayed_work(&afs_vlocation_reap); 622 cancel_delayed_work(&afs_vlocation_reap);
623 schedule_delayed_work(&afs_vlocation_reap, 0); 623 queue_delayed_work(afs_wq, &afs_vlocation_reap, 0);
624} 624}
625 625
626/* 626/*
diff --git a/fs/aio.c b/fs/aio.c
index 8c8f6c5b6d79..fc557a3be0a9 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -87,7 +87,7 @@ static int __init aio_setup(void)
87 87
88 aio_wq = create_workqueue("aio"); 88 aio_wq = create_workqueue("aio");
89 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); 89 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
90 BUG_ON(!abe_pool); 90 BUG_ON(!aio_wq || !abe_pool);
91 91
92 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); 92 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
93 93
@@ -798,29 +798,12 @@ static void aio_queue_work(struct kioctx * ctx)
798 queue_delayed_work(aio_wq, &ctx->wq, timeout); 798 queue_delayed_work(aio_wq, &ctx->wq, timeout);
799} 799}
800 800
801
802/*
803 * aio_run_iocbs:
804 * Process all pending retries queued on the ioctx
805 * run list.
806 * Assumes it is operating within the aio issuer's mm
807 * context.
808 */
809static inline void aio_run_iocbs(struct kioctx *ctx)
810{
811 int requeue;
812
813 spin_lock_irq(&ctx->ctx_lock);
814
815 requeue = __aio_run_iocbs(ctx);
816 spin_unlock_irq(&ctx->ctx_lock);
817 if (requeue)
818 aio_queue_work(ctx);
819}
820
821/* 801/*
822 * just like aio_run_iocbs, but keeps running them until 802 * aio_run_all_iocbs:
823 * the list stays empty 803 * Process all pending retries queued on the ioctx
804 * run list, and keep running them until the list
805 * stays empty.
806 * Assumes it is operating within the aio issuer's mm context.
824 */ 807 */
825static inline void aio_run_all_iocbs(struct kioctx *ctx) 808static inline void aio_run_all_iocbs(struct kioctx *ctx)
826{ 809{
@@ -1839,7 +1822,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
1839 long ret = -EINVAL; 1822 long ret = -EINVAL;
1840 1823
1841 if (likely(ioctx)) { 1824 if (likely(ioctx)) {
1842 if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0)) 1825 if (likely(min_nr <= nr && min_nr >= 0))
1843 ret = read_events(ioctx, min_nr, nr, events, timeout); 1826 ret = read_events(ioctx, min_nr, nr, events, timeout);
1844 put_ioctx(ioctx); 1827 put_ioctx(ioctx);
1845 } 1828 }
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 5fd38112a6ca..c5567cb78432 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -26,12 +26,6 @@ static struct vfsmount *anon_inode_mnt __read_mostly;
26static struct inode *anon_inode_inode; 26static struct inode *anon_inode_inode;
27static const struct file_operations anon_inode_fops; 27static const struct file_operations anon_inode_fops;
28 28
29static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
30 int flags, const char *dev_name, void *data)
31{
32 return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC);
33}
34
35/* 29/*
36 * anon_inodefs_dname() is called from d_path(). 30 * anon_inodefs_dname() is called from d_path().
37 */ 31 */
@@ -41,14 +35,22 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
41 dentry->d_name.name); 35 dentry->d_name.name);
42} 36}
43 37
38static const struct dentry_operations anon_inodefs_dentry_operations = {
39 .d_dname = anon_inodefs_dname,
40};
41
42static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
43 int flags, const char *dev_name, void *data)
44{
45 return mount_pseudo(fs_type, "anon_inode:", NULL,
46 &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
47}
48
44static struct file_system_type anon_inode_fs_type = { 49static struct file_system_type anon_inode_fs_type = {
45 .name = "anon_inodefs", 50 .name = "anon_inodefs",
46 .mount = anon_inodefs_mount, 51 .mount = anon_inodefs_mount,
47 .kill_sb = kill_anon_super, 52 .kill_sb = kill_anon_super,
48}; 53};
49static const struct dentry_operations anon_inodefs_dentry_operations = {
50 .d_dname = anon_inodefs_dname,
51};
52 54
53/* 55/*
54 * nop .set_page_dirty method so that people can use .page_mkwrite on 56 * nop .set_page_dirty method so that people can use .page_mkwrite on
@@ -64,9 +66,9 @@ static const struct address_space_operations anon_aops = {
64}; 66};
65 67
66/** 68/**
67 * anon_inode_getfd - creates a new file instance by hooking it up to an 69 * anon_inode_getfile - creates a new file instance by hooking it up to an
68 * anonymous inode, and a dentry that describe the "class" 70 * anonymous inode, and a dentry that describe the "class"
69 * of the file 71 * of the file
70 * 72 *
71 * @name: [in] name of the "class" of the new file 73 * @name: [in] name of the "class" of the new file
72 * @fops: [in] file operations for the new file 74 * @fops: [in] file operations for the new file
@@ -113,7 +115,6 @@ struct file *anon_inode_getfile(const char *name,
113 */ 115 */
114 ihold(anon_inode_inode); 116 ihold(anon_inode_inode);
115 117
116 d_set_d_op(path.dentry, &anon_inodefs_dentry_operations);
117 d_instantiate(path.dentry, anon_inode_inode); 118 d_instantiate(path.dentry, anon_inode_inode);
118 119
119 error = -ENFILE; 120 error = -ENFILE;
@@ -232,7 +233,7 @@ static int __init anon_inode_init(void)
232 return 0; 233 return 0;
233 234
234err_mntput: 235err_mntput:
235 mntput_long(anon_inode_mnt); 236 mntput(anon_inode_mnt);
236err_unregister_filesystem: 237err_unregister_filesystem:
237 unregister_filesystem(&anon_inode_fs_type); 238 unregister_filesystem(&anon_inode_fs_type);
238err_exit: 239err_exit:
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 0fffe1c24cec..54f923792728 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -88,18 +88,9 @@ struct autofs_info {
88 88
89 uid_t uid; 89 uid_t uid;
90 gid_t gid; 90 gid_t gid;
91
92 mode_t mode;
93 size_t size;
94
95 void (*free)(struct autofs_info *);
96 union {
97 const char *symlink;
98 } u;
99}; 91};
100 92
101#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ 93#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
102#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */
103#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ 94#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */
104 95
105struct autofs_wait_queue { 96struct autofs_wait_queue {
@@ -176,14 +167,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
176 return 0; 167 return 0;
177} 168}
178 169
179static inline void autofs4_copy_atime(struct file *src, struct file *dst) 170struct inode *autofs4_get_inode(struct super_block *, mode_t);
180{
181 dst->f_path.dentry->d_inode->i_atime =
182 src->f_path.dentry->d_inode->i_atime;
183 return;
184}
185
186struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *);
187void autofs4_free_ino(struct autofs_info *); 171void autofs4_free_ino(struct autofs_info *);
188 172
189/* Expiration */ 173/* Expiration */
@@ -212,16 +196,89 @@ void autofs_dev_ioctl_exit(void);
212 196
213extern const struct inode_operations autofs4_symlink_inode_operations; 197extern const struct inode_operations autofs4_symlink_inode_operations;
214extern const struct inode_operations autofs4_dir_inode_operations; 198extern const struct inode_operations autofs4_dir_inode_operations;
215extern const struct inode_operations autofs4_root_inode_operations;
216extern const struct inode_operations autofs4_indirect_root_inode_operations;
217extern const struct inode_operations autofs4_direct_root_inode_operations;
218extern const struct file_operations autofs4_dir_operations; 199extern const struct file_operations autofs4_dir_operations;
219extern const struct file_operations autofs4_root_operations; 200extern const struct file_operations autofs4_root_operations;
201extern const struct dentry_operations autofs4_dentry_operations;
202
203/* VFS automount flags management functions */
204
205static inline void __managed_dentry_set_automount(struct dentry *dentry)
206{
207 dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
208}
209
210static inline void managed_dentry_set_automount(struct dentry *dentry)
211{
212 spin_lock(&dentry->d_lock);
213 __managed_dentry_set_automount(dentry);
214 spin_unlock(&dentry->d_lock);
215}
216
217static inline void __managed_dentry_clear_automount(struct dentry *dentry)
218{
219 dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
220}
221
222static inline void managed_dentry_clear_automount(struct dentry *dentry)
223{
224 spin_lock(&dentry->d_lock);
225 __managed_dentry_clear_automount(dentry);
226 spin_unlock(&dentry->d_lock);
227}
228
229static inline void __managed_dentry_set_transit(struct dentry *dentry)
230{
231 dentry->d_flags |= DCACHE_MANAGE_TRANSIT;
232}
233
234static inline void managed_dentry_set_transit(struct dentry *dentry)
235{
236 spin_lock(&dentry->d_lock);
237 __managed_dentry_set_transit(dentry);
238 spin_unlock(&dentry->d_lock);
239}
240
241static inline void __managed_dentry_clear_transit(struct dentry *dentry)
242{
243 dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT;
244}
245
246static inline void managed_dentry_clear_transit(struct dentry *dentry)
247{
248 spin_lock(&dentry->d_lock);
249 __managed_dentry_clear_transit(dentry);
250 spin_unlock(&dentry->d_lock);
251}
252
253static inline void __managed_dentry_set_managed(struct dentry *dentry)
254{
255 dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
256}
257
258static inline void managed_dentry_set_managed(struct dentry *dentry)
259{
260 spin_lock(&dentry->d_lock);
261 __managed_dentry_set_managed(dentry);
262 spin_unlock(&dentry->d_lock);
263}
264
265static inline void __managed_dentry_clear_managed(struct dentry *dentry)
266{
267 dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
268}
269
270static inline void managed_dentry_clear_managed(struct dentry *dentry)
271{
272 spin_lock(&dentry->d_lock);
273 __managed_dentry_clear_managed(dentry);
274 spin_unlock(&dentry->d_lock);
275}
220 276
221/* Initializing function */ 277/* Initializing function */
222 278
223int autofs4_fill_super(struct super_block *, void *, int); 279int autofs4_fill_super(struct super_block *, void *, int);
224struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info *sbi, mode_t mode); 280struct autofs_info *autofs4_new_ino(struct autofs_sb_info *);
281void autofs4_clean_ino(struct autofs_info *);
225 282
226/* Queue management functions */ 283/* Queue management functions */
227 284
@@ -229,19 +286,6 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
229int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); 286int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
230void autofs4_catatonic_mode(struct autofs_sb_info *); 287void autofs4_catatonic_mode(struct autofs_sb_info *);
231 288
232static inline int autofs4_follow_mount(struct path *path)
233{
234 int res = 0;
235
236 while (d_mountpoint(path->dentry)) {
237 int followed = follow_down(path);
238 if (!followed)
239 break;
240 res = 1;
241 }
242 return res;
243}
244
245static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) 289static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
246{ 290{
247 return new_encode_dev(sbi->sb->s_dev); 291 return new_encode_dev(sbi->sb->s_dev);
@@ -294,5 +338,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry)
294 return; 338 return;
295} 339}
296 340
297void autofs4_dentry_release(struct dentry *);
298extern void autofs4_kill_sb(struct super_block *); 341extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index eff9a419469a..1442da4860e5 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -551,7 +551,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
551 551
552 err = have_submounts(path.dentry); 552 err = have_submounts(path.dentry);
553 553
554 if (follow_down(&path)) 554 if (follow_down_one(&path))
555 magic = path.mnt->mnt_sb->s_magic; 555 magic = path.mnt->mnt_sb->s_magic;
556 } 556 }
557 557
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index cc1d01365905..f43100b9662b 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -26,10 +26,6 @@ static inline int autofs4_can_expire(struct dentry *dentry,
26 if (ino == NULL) 26 if (ino == NULL)
27 return 0; 27 return 0;
28 28
29 /* No point expiring a pending mount */
30 if (ino->flags & AUTOFS_INF_PENDING)
31 return 0;
32
33 if (!do_now) { 29 if (!do_now) {
34 /* Too young to die */ 30 /* Too young to die */
35 if (!timeout || time_after(ino->last_used + timeout, now)) 31 if (!timeout || time_after(ino->last_used + timeout, now))
@@ -56,7 +52,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
56 52
57 path_get(&path); 53 path_get(&path);
58 54
59 if (!follow_down(&path)) 55 if (!follow_down_one(&path))
60 goto done; 56 goto done;
61 57
62 if (is_autofs4_dentry(path.dentry)) { 58 if (is_autofs4_dentry(path.dentry)) {
@@ -100,7 +96,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
100 struct dentry *p, *ret; 96 struct dentry *p, *ret;
101 97
102 if (prev == NULL) 98 if (prev == NULL)
103 return dget(prev); 99 return dget(root);
104 100
105 spin_lock(&autofs4_lock); 101 spin_lock(&autofs4_lock);
106relock: 102relock:
@@ -137,7 +133,7 @@ again:
137 spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED); 133 spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED);
138 /* Negative dentry - try next */ 134 /* Negative dentry - try next */
139 if (!simple_positive(ret)) { 135 if (!simple_positive(ret)) {
140 spin_unlock(&ret->d_lock); 136 spin_unlock(&p->d_lock);
141 p = ret; 137 p = ret;
142 goto again; 138 goto again;
143 } 139 }
@@ -283,6 +279,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
283 unsigned long timeout; 279 unsigned long timeout;
284 struct dentry *root = dget(sb->s_root); 280 struct dentry *root = dget(sb->s_root);
285 int do_now = how & AUTOFS_EXP_IMMEDIATE; 281 int do_now = how & AUTOFS_EXP_IMMEDIATE;
282 struct autofs_info *ino;
286 283
287 if (!root) 284 if (!root)
288 return NULL; 285 return NULL;
@@ -291,19 +288,21 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
291 timeout = sbi->exp_timeout; 288 timeout = sbi->exp_timeout;
292 289
293 spin_lock(&sbi->fs_lock); 290 spin_lock(&sbi->fs_lock);
291 ino = autofs4_dentry_ino(root);
292 /* No point expiring a pending mount */
293 if (ino->flags & AUTOFS_INF_PENDING) {
294 spin_unlock(&sbi->fs_lock);
295 return NULL;
296 }
297 managed_dentry_set_transit(root);
294 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { 298 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
295 struct autofs_info *ino = autofs4_dentry_ino(root); 299 struct autofs_info *ino = autofs4_dentry_ino(root);
296 if (d_mountpoint(root)) {
297 ino->flags |= AUTOFS_INF_MOUNTPOINT;
298 spin_lock(&root->d_lock);
299 root->d_flags &= ~DCACHE_MOUNTED;
300 spin_unlock(&root->d_lock);
301 }
302 ino->flags |= AUTOFS_INF_EXPIRING; 300 ino->flags |= AUTOFS_INF_EXPIRING;
303 init_completion(&ino->expire_complete); 301 init_completion(&ino->expire_complete);
304 spin_unlock(&sbi->fs_lock); 302 spin_unlock(&sbi->fs_lock);
305 return root; 303 return root;
306 } 304 }
305 managed_dentry_clear_transit(root);
307 spin_unlock(&sbi->fs_lock); 306 spin_unlock(&sbi->fs_lock);
308 dput(root); 307 dput(root);
309 308
@@ -340,6 +339,10 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
340 while ((dentry = get_next_positive_dentry(dentry, root))) { 339 while ((dentry = get_next_positive_dentry(dentry, root))) {
341 spin_lock(&sbi->fs_lock); 340 spin_lock(&sbi->fs_lock);
342 ino = autofs4_dentry_ino(dentry); 341 ino = autofs4_dentry_ino(dentry);
342 /* No point expiring a pending mount */
343 if (ino->flags & AUTOFS_INF_PENDING)
344 goto cont;
345 managed_dentry_set_transit(dentry);
343 346
344 /* 347 /*
345 * Case 1: (i) indirect mount or top level pseudo direct mount 348 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -399,6 +402,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
399 } 402 }
400 } 403 }
401next: 404next:
405 managed_dentry_clear_transit(dentry);
406cont:
402 spin_unlock(&sbi->fs_lock); 407 spin_unlock(&sbi->fs_lock);
403 } 408 }
404 return NULL; 409 return NULL;
@@ -479,6 +484,8 @@ int autofs4_expire_run(struct super_block *sb,
479 spin_lock(&sbi->fs_lock); 484 spin_lock(&sbi->fs_lock);
480 ino = autofs4_dentry_ino(dentry); 485 ino = autofs4_dentry_ino(dentry);
481 ino->flags &= ~AUTOFS_INF_EXPIRING; 486 ino->flags &= ~AUTOFS_INF_EXPIRING;
487 if (!d_unhashed(dentry))
488 managed_dentry_clear_transit(dentry);
482 complete_all(&ino->expire_complete); 489 complete_all(&ino->expire_complete);
483 spin_unlock(&sbi->fs_lock); 490 spin_unlock(&sbi->fs_lock);
484 491
@@ -504,18 +511,18 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
504 ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); 511 ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
505 512
506 spin_lock(&sbi->fs_lock); 513 spin_lock(&sbi->fs_lock);
507 if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
508 spin_lock(&sb->s_root->d_lock);
509 /*
510 * If we haven't been expired away, then reset
511 * mounted status.
512 */
513 if (mnt->mnt_parent != mnt)
514 sb->s_root->d_flags |= DCACHE_MOUNTED;
515 spin_unlock(&sb->s_root->d_lock);
516 ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
517 }
518 ino->flags &= ~AUTOFS_INF_EXPIRING; 514 ino->flags &= ~AUTOFS_INF_EXPIRING;
515 spin_lock(&dentry->d_lock);
516 if (ret)
517 __managed_dentry_clear_transit(dentry);
518 else {
519 if ((IS_ROOT(dentry) ||
520 (autofs_type_indirect(sbi->type) &&
521 IS_ROOT(dentry->d_parent))) &&
522 !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
523 __managed_dentry_set_automount(dentry);
524 }
525 spin_unlock(&dentry->d_lock);
519 complete_all(&ino->expire_complete); 526 complete_all(&ino->expire_complete);
520 spin_unlock(&sbi->fs_lock); 527 spin_unlock(&sbi->fs_lock);
521 dput(dentry); 528 dput(dentry);
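
The expire paths now bracket each candidate dentry with managed_dentry_set_transit()/managed_dentry_clear_transit(). Those helpers are supplied by the dcache rather than by this patch; they should look roughly like the sketch below, setting DCACHE_MANAGE_TRANSIT so that every path walk through the dentry is diverted into ->d_manage(), where autofs4_d_manage() blocks walkers until the expire either completes or is abandoned:

        /* lockless form: caller already holds dentry->d_lock */
        static inline void __managed_dentry_set_transit(struct dentry *dentry)
        {
                dentry->d_flags |= DCACHE_MANAGE_TRANSIT;
        }

        static inline void managed_dentry_set_transit(struct dentry *dentry)
        {
                spin_lock(&dentry->d_lock);
                __managed_dentry_set_transit(dentry);
                spin_unlock(&dentry->d_lock);
        }

The clear variants are symmetric, removing the flag under the same locking rule.
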
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index a7bdb9dcac84..180fa2425e49 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -22,77 +22,27 @@
22#include "autofs_i.h" 22#include "autofs_i.h"
23#include <linux/module.h> 23#include <linux/module.h>
24 24
25static void ino_lnkfree(struct autofs_info *ino) 25struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
26{ 26{
27 if (ino->u.symlink) { 27 struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
28 kfree(ino->u.symlink); 28 if (ino) {
29 ino->u.symlink = NULL;
30 }
31}
32
33struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
34 struct autofs_sb_info *sbi, mode_t mode)
35{
36 int reinit = 1;
37
38 if (ino == NULL) {
39 reinit = 0;
40 ino = kmalloc(sizeof(*ino), GFP_KERNEL);
41 }
42
43 if (ino == NULL)
44 return NULL;
45
46 if (!reinit) {
47 ino->flags = 0;
48 ino->inode = NULL;
49 ino->dentry = NULL;
50 ino->size = 0;
51 INIT_LIST_HEAD(&ino->active); 29 INIT_LIST_HEAD(&ino->active);
52 ino->active_count = 0;
53 INIT_LIST_HEAD(&ino->expiring); 30 INIT_LIST_HEAD(&ino->expiring);
54 atomic_set(&ino->count, 0); 31 ino->last_used = jiffies;
32 ino->sbi = sbi;
55 } 33 }
34 return ino;
35}
56 36
37void autofs4_clean_ino(struct autofs_info *ino)
38{
57 ino->uid = 0; 39 ino->uid = 0;
58 ino->gid = 0; 40 ino->gid = 0;
59 ino->mode = mode;
60 ino->last_used = jiffies; 41 ino->last_used = jiffies;
61
62 ino->sbi = sbi;
63
64 if (reinit && ino->free)
65 (ino->free)(ino);
66
67 memset(&ino->u, 0, sizeof(ino->u));
68
69 ino->free = NULL;
70
71 if (S_ISLNK(mode))
72 ino->free = ino_lnkfree;
73
74 return ino;
75} 42}
76 43
77void autofs4_free_ino(struct autofs_info *ino) 44void autofs4_free_ino(struct autofs_info *ino)
78{ 45{
79 struct autofs_info *p_ino;
80
81 if (ino->dentry) {
82 ino->dentry->d_fsdata = NULL;
83 if (ino->dentry->d_inode) {
84 struct dentry *parent = ino->dentry->d_parent;
85 if (atomic_dec_and_test(&ino->count)) {
86 p_ino = autofs4_dentry_ino(parent);
87 if (p_ino && parent != ino->dentry)
88 atomic_dec(&p_ino->count);
89 }
90 dput(ino->dentry);
91 }
92 ino->dentry = NULL;
93 }
94 if (ino->free)
95 (ino->free)(ino);
96 kfree(ino); 46 kfree(ino);
97} 47}
98 48
@@ -148,9 +98,16 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
148 return 0; 98 return 0;
149} 99}
150 100
101static void autofs4_evict_inode(struct inode *inode)
102{
103 end_writeback(inode);
104 kfree(inode->i_private);
105}
106
151static const struct super_operations autofs4_sops = { 107static const struct super_operations autofs4_sops = {
152 .statfs = simple_statfs, 108 .statfs = simple_statfs,
153 .show_options = autofs4_show_options, 109 .show_options = autofs4_show_options,
110 .evict_inode = autofs4_evict_inode,
154}; 111};
155 112
156enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, 113enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
@@ -240,21 +197,6 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
240 return (*pipefd < 0); 197 return (*pipefd < 0);
241} 198}
242 199
243static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi)
244{
245 struct autofs_info *ino;
246
247 ino = autofs4_init_ino(NULL, sbi, S_IFDIR | 0755);
248 if (!ino)
249 return NULL;
250
251 return ino;
252}
253
254static const struct dentry_operations autofs4_sb_dentry_operations = {
255 .d_release = autofs4_dentry_release,
256};
257
258int autofs4_fill_super(struct super_block *s, void *data, int silent) 200int autofs4_fill_super(struct super_block *s, void *data, int silent)
259{ 201{
260 struct inode * root_inode; 202 struct inode * root_inode;
@@ -292,15 +234,16 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
292 s->s_blocksize_bits = 10; 234 s->s_blocksize_bits = 10;
293 s->s_magic = AUTOFS_SUPER_MAGIC; 235 s->s_magic = AUTOFS_SUPER_MAGIC;
294 s->s_op = &autofs4_sops; 236 s->s_op = &autofs4_sops;
237 s->s_d_op = &autofs4_dentry_operations;
295 s->s_time_gran = 1; 238 s->s_time_gran = 1;
296 239
297 /* 240 /*
298 * Get the root inode and dentry, but defer checking for errors. 241 * Get the root inode and dentry, but defer checking for errors.
299 */ 242 */
300 ino = autofs4_mkroot(sbi); 243 ino = autofs4_new_ino(sbi);
301 if (!ino) 244 if (!ino)
302 goto fail_free; 245 goto fail_free;
303 root_inode = autofs4_get_inode(s, ino); 246 root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
304 if (!root_inode) 247 if (!root_inode)
305 goto fail_ino; 248 goto fail_ino;
306 249
@@ -309,7 +252,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
309 goto fail_iput; 252 goto fail_iput;
310 pipe = NULL; 253 pipe = NULL;
311 254
312 d_set_d_op(root, &autofs4_sb_dentry_operations);
313 root->d_fsdata = ino; 255 root->d_fsdata = ino;
314 256
315 /* Can this call block? */ 257 /* Can this call block? */
@@ -320,10 +262,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
320 goto fail_dput; 262 goto fail_dput;
321 } 263 }
322 264
265 if (autofs_type_trigger(sbi->type))
266 __managed_dentry_set_managed(root);
267
323 root_inode->i_fop = &autofs4_root_operations; 268 root_inode->i_fop = &autofs4_root_operations;
324 root_inode->i_op = autofs_type_trigger(sbi->type) ? 269 root_inode->i_op = &autofs4_dir_inode_operations;
325 &autofs4_direct_root_inode_operations :
326 &autofs4_indirect_root_inode_operations;
327 270
328 /* Couldn't this be tested earlier? */ 271 /* Couldn't this be tested earlier? */
329 if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || 272 if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
@@ -383,16 +326,14 @@ fail_unlock:
383 return -EINVAL; 326 return -EINVAL;
384} 327}
385 328
386struct inode *autofs4_get_inode(struct super_block *sb, 329struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode)
387 struct autofs_info *inf)
388{ 330{
389 struct inode *inode = new_inode(sb); 331 struct inode *inode = new_inode(sb);
390 332
391 if (inode == NULL) 333 if (inode == NULL)
392 return NULL; 334 return NULL;
393 335
394 inf->inode = inode; 336 inode->i_mode = mode;
395 inode->i_mode = inf->mode;
396 if (sb->s_root) { 337 if (sb->s_root) {
397 inode->i_uid = sb->s_root->d_inode->i_uid; 338 inode->i_uid = sb->s_root->d_inode->i_uid;
398 inode->i_gid = sb->s_root->d_inode->i_gid; 339 inode->i_gid = sb->s_root->d_inode->i_gid;
@@ -400,12 +341,11 @@ struct inode *autofs4_get_inode(struct super_block *sb,
400 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 341 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
401 inode->i_ino = get_next_ino(); 342 inode->i_ino = get_next_ino();
402 343
403 if (S_ISDIR(inf->mode)) { 344 if (S_ISDIR(mode)) {
404 inode->i_nlink = 2; 345 inode->i_nlink = 2;
405 inode->i_op = &autofs4_dir_inode_operations; 346 inode->i_op = &autofs4_dir_inode_operations;
406 inode->i_fop = &autofs4_dir_operations; 347 inode->i_fop = &autofs4_dir_operations;
407 } else if (S_ISLNK(inf->mode)) { 348 } else if (S_ISLNK(mode)) {
408 inode->i_size = inf->size;
409 inode->i_op = &autofs4_symlink_inode_operations; 349 inode->i_op = &autofs4_symlink_inode_operations;
410 } 350 }
411 351
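
The net effect of the inode.c changes is a much simpler ownership model: the dentry owns the autofs_info through d_fsdata (freed in ->d_release()), and the inode owns the symlink body through i_private (freed in the new ->evict_inode()), with no back-pointers between them. Roughly, at creation time (a sketch of the pattern, error unwinding elided, not literal patch code):

        struct autofs_info *ino;
        struct inode *inode;

        ino = autofs4_new_ino(sbi);             /* kzalloc'd; freed by ->d_release() */
        if (!ino)
                return -ENOMEM;
        dentry->d_fsdata = ino;

        inode = autofs4_get_inode(sb, S_IFLNK | 0555);
        if (!inode)
                return -ENOMEM;
        inode->i_private = target;              /* kfree()'d by autofs4_evict_inode() */
        d_add(dentry, inode);

Because kzalloc() zeroes the structure, autofs4_new_ino() only has to initialise the list heads and the few non-zero fields, which is why the old reinit dance in autofs4_init_ino() could be deleted.
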
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 651e4ef563b1..014e7aba3b08 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -35,10 +35,9 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
35#endif 35#endif
36static int autofs4_dir_open(struct inode *inode, struct file *file); 36static int autofs4_dir_open(struct inode *inode, struct file *file);
37static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 37static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
38static void *autofs4_follow_link(struct dentry *, struct nameidata *); 38static struct vfsmount *autofs4_d_automount(struct path *);
39 39static int autofs4_d_manage(struct dentry *, bool, bool);
40#define TRIGGER_FLAGS (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) 40static void autofs4_dentry_release(struct dentry *);
41#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE)
42 41
43const struct file_operations autofs4_root_operations = { 42const struct file_operations autofs4_root_operations = {
44 .open = dcache_dir_open, 43 .open = dcache_dir_open,
@@ -60,7 +59,7 @@ const struct file_operations autofs4_dir_operations = {
60 .llseek = dcache_dir_lseek, 59 .llseek = dcache_dir_lseek,
61}; 60};
62 61
63const struct inode_operations autofs4_indirect_root_inode_operations = { 62const struct inode_operations autofs4_dir_inode_operations = {
64 .lookup = autofs4_lookup, 63 .lookup = autofs4_lookup,
65 .unlink = autofs4_dir_unlink, 64 .unlink = autofs4_dir_unlink,
66 .symlink = autofs4_dir_symlink, 65 .symlink = autofs4_dir_symlink,
@@ -68,20 +67,10 @@ const struct inode_operations autofs4_indirect_root_inode_operations = {
68 .rmdir = autofs4_dir_rmdir, 67 .rmdir = autofs4_dir_rmdir,
69}; 68};
70 69
71const struct inode_operations autofs4_direct_root_inode_operations = { 70const struct dentry_operations autofs4_dentry_operations = {
72 .lookup = autofs4_lookup, 71 .d_automount = autofs4_d_automount,
73 .unlink = autofs4_dir_unlink, 72 .d_manage = autofs4_d_manage,
74 .mkdir = autofs4_dir_mkdir, 73 .d_release = autofs4_dentry_release,
75 .rmdir = autofs4_dir_rmdir,
76 .follow_link = autofs4_follow_link,
77};
78
79const struct inode_operations autofs4_dir_inode_operations = {
80 .lookup = autofs4_lookup,
81 .unlink = autofs4_dir_unlink,
82 .symlink = autofs4_dir_symlink,
83 .mkdir = autofs4_dir_mkdir,
84 .rmdir = autofs4_dir_rmdir,
85}; 74};
86 75
87static void autofs4_add_active(struct dentry *dentry) 76static void autofs4_add_active(struct dentry *dentry)
@@ -116,14 +105,6 @@ static void autofs4_del_active(struct dentry *dentry)
116 return; 105 return;
117} 106}
118 107
119static unsigned int autofs4_need_mount(unsigned int flags)
120{
121 unsigned int res = 0;
122 if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS))
123 res = 1;
124 return res;
125}
126
127static int autofs4_dir_open(struct inode *inode, struct file *file) 108static int autofs4_dir_open(struct inode *inode, struct file *file)
128{ 109{
129 struct dentry *dentry = file->f_path.dentry; 110 struct dentry *dentry = file->f_path.dentry;
@@ -158,278 +139,27 @@ out:
158 return dcache_dir_open(inode, file); 139 return dcache_dir_open(inode, file);
159} 140}
160 141
161static int try_to_fill_dentry(struct dentry *dentry, int flags) 142static void autofs4_dentry_release(struct dentry *de)
162{
163 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
164 struct autofs_info *ino = autofs4_dentry_ino(dentry);
165 int status;
166
167 DPRINTK("dentry=%p %.*s ino=%p",
168 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
169
170 /*
171 * Wait for a pending mount, triggering one if there
172 * isn't one already
173 */
174 if (dentry->d_inode == NULL) {
175 DPRINTK("waiting for mount name=%.*s",
176 dentry->d_name.len, dentry->d_name.name);
177
178 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
179
180 DPRINTK("mount done status=%d", status);
181
182 /* Turn this into a real negative dentry? */
183 if (status == -ENOENT) {
184 spin_lock(&sbi->fs_lock);
185 ino->flags &= ~AUTOFS_INF_PENDING;
186 spin_unlock(&sbi->fs_lock);
187 return status;
188 } else if (status) {
189 /* Return a negative dentry, but leave it "pending" */
190 return status;
191 }
192 /* Trigger mount for path component or follow link */
193 } else if (ino->flags & AUTOFS_INF_PENDING ||
194 autofs4_need_mount(flags)) {
195 DPRINTK("waiting for mount name=%.*s",
196 dentry->d_name.len, dentry->d_name.name);
197
198 spin_lock(&sbi->fs_lock);
199 ino->flags |= AUTOFS_INF_PENDING;
200 spin_unlock(&sbi->fs_lock);
201 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
202
203 DPRINTK("mount done status=%d", status);
204
205 if (status) {
206 spin_lock(&sbi->fs_lock);
207 ino->flags &= ~AUTOFS_INF_PENDING;
208 spin_unlock(&sbi->fs_lock);
209 return status;
210 }
211 }
212
213 /* Initialize expiry counter after successful mount */
214 ino->last_used = jiffies;
215
216 spin_lock(&sbi->fs_lock);
217 ino->flags &= ~AUTOFS_INF_PENDING;
218 spin_unlock(&sbi->fs_lock);
219
220 return 0;
221}
222
223/* For autofs direct mounts the follow link triggers the mount */
224static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
225{
226 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
227 struct autofs_info *ino = autofs4_dentry_ino(dentry);
228 int oz_mode = autofs4_oz_mode(sbi);
229 unsigned int lookup_type;
230 int status;
231
232 DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d",
233 dentry, dentry->d_name.len, dentry->d_name.name, oz_mode,
234 nd->flags);
235 /*
236 * For an expire of a covered direct or offset mount we need
237 * to break out of follow_down() at the autofs mount trigger
238 * (d_mounted--), so we can see the expiring flag, and manage
239 * the blocking and following here until the expire is completed.
240 */
241 if (oz_mode) {
242 spin_lock(&sbi->fs_lock);
243 if (ino->flags & AUTOFS_INF_EXPIRING) {
244 spin_unlock(&sbi->fs_lock);
245 /* Follow down to our covering mount. */
246 if (!follow_down(&nd->path))
247 goto done;
248 goto follow;
249 }
250 spin_unlock(&sbi->fs_lock);
251 goto done;
252 }
253
254 /* If an expire request is pending everyone must wait. */
255 autofs4_expire_wait(dentry);
256
257 /* We trigger a mount for almost all flags */
258 lookup_type = autofs4_need_mount(nd->flags);
259 spin_lock(&sbi->fs_lock);
260 spin_lock(&autofs4_lock);
261 spin_lock(&dentry->d_lock);
262 if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
263 spin_unlock(&dentry->d_lock);
264 spin_unlock(&autofs4_lock);
265 spin_unlock(&sbi->fs_lock);
266 goto follow;
267 }
268
269 /*
270 * If the dentry contains directories then it is an autofs
271 * multi-mount with no root mount offset. So don't try to
272 * mount it again.
273 */
274 if (ino->flags & AUTOFS_INF_PENDING ||
275 (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
276 spin_unlock(&dentry->d_lock);
277 spin_unlock(&autofs4_lock);
278 spin_unlock(&sbi->fs_lock);
279
280 status = try_to_fill_dentry(dentry, nd->flags);
281 if (status)
282 goto out_error;
283
284 goto follow;
285 }
286 spin_unlock(&dentry->d_lock);
287 spin_unlock(&autofs4_lock);
288 spin_unlock(&sbi->fs_lock);
289follow:
290 /*
291 * If there is no root mount it must be an autofs
292 * multi-mount with no root offset so we don't need
293 * to follow it.
294 */
295 if (d_mountpoint(dentry)) {
296 if (!autofs4_follow_mount(&nd->path)) {
297 status = -ENOENT;
298 goto out_error;
299 }
300 }
301
302done:
303 return NULL;
304
305out_error:
306 path_put(&nd->path);
307 return ERR_PTR(status);
308}
309
310/*
311 * Revalidate is called on every cache lookup. Some of those
312 * cache lookups may actually happen while the dentry is not
313 * yet completely filled in, and revalidate has to delay such
314 * lookups..
315 */
316static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
317{ 143{
318 struct inode *dir; 144 struct autofs_info *ino = autofs4_dentry_ino(de);
319 struct autofs_sb_info *sbi; 145 struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
320 int oz_mode;
321 int flags = nd ? nd->flags : 0;
322 int status = 1;
323
324 if (flags & LOOKUP_RCU)
325 return -ECHILD;
326
327 dir = dentry->d_parent->d_inode;
328 sbi = autofs4_sbi(dir->i_sb);
329 oz_mode = autofs4_oz_mode(sbi);
330
331 /* Pending dentry */
332 spin_lock(&sbi->fs_lock);
333 if (autofs4_ispending(dentry)) {
334 /* The daemon never causes a mount to trigger */
335 spin_unlock(&sbi->fs_lock);
336
337 if (oz_mode)
338 return 1;
339
340 /*
341 * If the directory has gone away due to an expire
342 * we have been called as ->d_revalidate() and so
343 * we need to return false and proceed to ->lookup().
344 */
345 if (autofs4_expire_wait(dentry) == -EAGAIN)
346 return 0;
347
348 /*
349 * A zero status is success otherwise we have a
350 * negative error code.
351 */
352 status = try_to_fill_dentry(dentry, flags);
353 if (status == 0)
354 return 1;
355
356 return status;
357 }
358 spin_unlock(&sbi->fs_lock);
359
360 /* Negative dentry.. invalidate if "old" */
361 if (dentry->d_inode == NULL)
362 return 0;
363
364 /* Check for a non-mountpoint directory with no contents */
365 spin_lock(&autofs4_lock);
366 spin_lock(&dentry->d_lock);
367 if (S_ISDIR(dentry->d_inode->i_mode) &&
368 !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
369 DPRINTK("dentry=%p %.*s, emptydir",
370 dentry, dentry->d_name.len, dentry->d_name.name);
371 spin_unlock(&dentry->d_lock);
372 spin_unlock(&autofs4_lock);
373
374 /* The daemon never causes a mount to trigger */
375 if (oz_mode)
376 return 1;
377
378 /*
379 * A zero status is success otherwise we have a
380 * negative error code.
381 */
382 status = try_to_fill_dentry(dentry, flags);
383 if (status == 0)
384 return 1;
385
386 return status;
387 }
388 spin_unlock(&dentry->d_lock);
389 spin_unlock(&autofs4_lock);
390
391 return 1;
392}
393
394void autofs4_dentry_release(struct dentry *de)
395{
396 struct autofs_info *inf;
397 146
398 DPRINTK("releasing %p", de); 147 DPRINTK("releasing %p", de);
399 148
400 inf = autofs4_dentry_ino(de); 149 if (!ino)
401 de->d_fsdata = NULL; 150 return;
402
403 if (inf) {
404 struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
405
406 if (sbi) {
407 spin_lock(&sbi->lookup_lock);
408 if (!list_empty(&inf->active))
409 list_del(&inf->active);
410 if (!list_empty(&inf->expiring))
411 list_del(&inf->expiring);
412 spin_unlock(&sbi->lookup_lock);
413 }
414
415 inf->dentry = NULL;
416 inf->inode = NULL;
417 151
418 autofs4_free_ino(inf); 152 if (sbi) {
153 spin_lock(&sbi->lookup_lock);
154 if (!list_empty(&ino->active))
155 list_del(&ino->active);
156 if (!list_empty(&ino->expiring))
157 list_del(&ino->expiring);
158 spin_unlock(&sbi->lookup_lock);
419 } 159 }
420}
421 160
422/* For dentries of directories in the root dir */ 161 autofs4_free_ino(ino);
423static const struct dentry_operations autofs4_root_dentry_operations = { 162}
424 .d_revalidate = autofs4_revalidate,
425 .d_release = autofs4_dentry_release,
426};
427
428/* For other dentries */
429static const struct dentry_operations autofs4_dentry_operations = {
430 .d_revalidate = autofs4_revalidate,
431 .d_release = autofs4_dentry_release,
432};
433 163
434static struct dentry *autofs4_lookup_active(struct dentry *dentry) 164static struct dentry *autofs4_lookup_active(struct dentry *dentry)
435{ 165{
@@ -541,51 +271,246 @@ next:
541 return NULL; 271 return NULL;
542} 272}
543 273
274static int autofs4_mount_wait(struct dentry *dentry)
275{
276 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
277 struct autofs_info *ino = autofs4_dentry_ino(dentry);
278 int status;
279
280 if (ino->flags & AUTOFS_INF_PENDING) {
281 DPRINTK("waiting for mount name=%.*s",
282 dentry->d_name.len, dentry->d_name.name);
283 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
284 DPRINTK("mount wait done status=%d", status);
285 ino->last_used = jiffies;
286 return status;
287 }
288 return 0;
289}
290
291static int do_expire_wait(struct dentry *dentry)
292{
293 struct dentry *expiring;
294
295 expiring = autofs4_lookup_expiring(dentry);
296 if (!expiring)
297 return autofs4_expire_wait(dentry);
298 else {
299 /*
300 * If we are racing with expire the request might not
301 * be quite complete, but the directory has been removed
302 * so it must have been successful, just wait for it.
303 */
304 autofs4_expire_wait(expiring);
305 autofs4_del_expiring(expiring);
306 dput(expiring);
307 }
308 return 0;
309}
310
311static struct dentry *autofs4_mountpoint_changed(struct path *path)
312{
313 struct dentry *dentry = path->dentry;
314 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
315
316 /*
317 * If this is an indirect mount the dentry could have gone away
318 * as a result of an expire and a new one created.
319 */
320 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
321 struct dentry *parent = dentry->d_parent;
322 struct dentry *new = d_lookup(parent, &dentry->d_name);
323 if (!new)
324 return NULL;
325 dput(path->dentry);
326 path->dentry = new;
327 }
328 return path->dentry;
329}
330
331static struct vfsmount *autofs4_d_automount(struct path *path)
332{
333 struct dentry *dentry = path->dentry;
334 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
335 struct autofs_info *ino = autofs4_dentry_ino(dentry);
336 int status;
337
338 DPRINTK("dentry=%p %.*s",
339 dentry, dentry->d_name.len, dentry->d_name.name);
340
341 /*
342 * Someone may have manually umounted this or it was a submount
343 * that has gone away.
344 */
345 spin_lock(&dentry->d_lock);
346 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
347 if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
348 (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
349 __managed_dentry_set_transit(path->dentry);
350 }
351 spin_unlock(&dentry->d_lock);
352
353 /* The daemon never triggers a mount. */
354 if (autofs4_oz_mode(sbi))
355 return NULL;
356
357 /*
358 * If an expire request is pending everyone must wait.
359 * If the expire fails we're still mounted so continue
360 * the follow and return. A return of -EAGAIN (which only
361 * happens with indirect mounts) means the expire completed
362 * and the directory was removed, so just go ahead and try
363 * the mount.
364 */
365 status = do_expire_wait(dentry);
366 if (status && status != -EAGAIN)
367 return NULL;
368
369 /* Callback to the daemon to perform the mount or wait */
370 spin_lock(&sbi->fs_lock);
371 if (ino->flags & AUTOFS_INF_PENDING) {
372 spin_unlock(&sbi->fs_lock);
373 status = autofs4_mount_wait(dentry);
374 if (status)
375 return ERR_PTR(status);
376 spin_lock(&sbi->fs_lock);
377 goto done;
378 }
379
380 /*
381 * If the dentry is a symlink it's equivalent to a directory
382 * having d_mountpoint() true, so there's no need to call back
383 * to the daemon.
384 */
385 if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))
386 goto done;
387 if (!d_mountpoint(dentry)) {
388 /*
389 * It's possible that user space hasn't removed directories
390 * after umounting a rootless multi-mount, although it
391 * should. For v5 have_submounts() is sufficient to handle
392 * this because the leaves of the directory tree under the
393 * mount never trigger mounts themselves (they have an autofs
394 * trigger mount mounted on them). But v4 pseudo direct mounts
395 * do need the leaves to trigger mounts. In this case we
396 * have no choice but to use the list_empty() check and
397 * require user space to behave.
398 */
399 if (sbi->version > 4) {
400 if (have_submounts(dentry))
401 goto done;
402 } else {
403 spin_lock(&dentry->d_lock);
404 if (!list_empty(&dentry->d_subdirs)) {
405 spin_unlock(&dentry->d_lock);
406 goto done;
407 }
408 spin_unlock(&dentry->d_lock);
409 }
410 ino->flags |= AUTOFS_INF_PENDING;
411 spin_unlock(&sbi->fs_lock);
412 status = autofs4_mount_wait(dentry);
413 if (status)
414 return ERR_PTR(status);
415 spin_lock(&sbi->fs_lock);
416 ino->flags &= ~AUTOFS_INF_PENDING;
417 }
418done:
419 if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
420 /*
421 * Any needed mounting has been completed and the path updated
422 * so turn this into a normal dentry so we don't continually
423 * call ->d_automount() and ->d_manage().
424 */
425 spin_lock(&dentry->d_lock);
426 __managed_dentry_clear_transit(dentry);
427 /*
428 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
429 * symlinks as in all other cases the dentry will be covered by
430 * an actual mount so ->d_automount() won't be called during
431 * the follow.
432 */
433 if ((!d_mountpoint(dentry) &&
434 !list_empty(&dentry->d_subdirs)) ||
435 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
436 __managed_dentry_clear_automount(dentry);
437 spin_unlock(&dentry->d_lock);
438 }
439 spin_unlock(&sbi->fs_lock);
440
441 /* Mount succeeded, check if we ended up with a new dentry */
442 dentry = autofs4_mountpoint_changed(path);
443 if (!dentry)
444 return ERR_PTR(-ENOENT);
445
446 return NULL;
447}
448
449int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
450{
451 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
452
453 DPRINTK("dentry=%p %.*s",
454 dentry, dentry->d_name.len, dentry->d_name.name);
455
456 /* The daemon never waits. */
457 if (autofs4_oz_mode(sbi) || mounting_here) {
458 if (!d_mountpoint(dentry))
459 return -EISDIR;
460 return 0;
461 }
462
463 /* We need to sleep, so we need pathwalk to be in ref-mode */
464 if (rcu_walk)
465 return -ECHILD;
466
467 /* Wait for pending expires */
468 do_expire_wait(dentry);
469
470 /*
471 * This dentry may be under construction so wait on mount
472 * completion.
473 */
474 return autofs4_mount_wait(dentry);
475}
476
544/* Lookups in the root directory */ 477/* Lookups in the root directory */
545static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 478static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
546{ 479{
547 struct autofs_sb_info *sbi; 480 struct autofs_sb_info *sbi;
548 struct autofs_info *ino; 481 struct autofs_info *ino;
549 struct dentry *expiring, *active; 482 struct dentry *active;
550 int oz_mode;
551 483
552 DPRINTK("name = %.*s", 484 DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name);
553 dentry->d_name.len, dentry->d_name.name);
554 485
555 /* File name too long to exist */ 486 /* File name too long to exist */
556 if (dentry->d_name.len > NAME_MAX) 487 if (dentry->d_name.len > NAME_MAX)
557 return ERR_PTR(-ENAMETOOLONG); 488 return ERR_PTR(-ENAMETOOLONG);
558 489
559 sbi = autofs4_sbi(dir->i_sb); 490 sbi = autofs4_sbi(dir->i_sb);
560 oz_mode = autofs4_oz_mode(sbi);
561 491
562 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", 492 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
563 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); 493 current->pid, task_pgrp_nr(current), sbi->catatonic,
494 autofs4_oz_mode(sbi));
564 495
565 active = autofs4_lookup_active(dentry); 496 active = autofs4_lookup_active(dentry);
566 if (active) { 497 if (active) {
567 dentry = active; 498 return active;
568 ino = autofs4_dentry_ino(dentry);
569 } else { 499 } else {
570 /* 500 /*
571 * Mark the dentry incomplete but don't hash it. We do this 501 * A dentry that is not within the root can never trigger a
572 * to serialize our inode creation operations (symlink and 502 * mount operation, unless the directory already exists, so we
573 * mkdir) which prevents deadlock during the callback to 503 * can return fail immediately. The daemon however does need
574 * the daemon. Subsequent user space lookups for the same 504 * to create directories within the file system.
575 * dentry are placed on the wait queue while the daemon
576 * itself is allowed passage unrestricted so the create
577 * operation itself can then hash the dentry. Finally,
578 * we check for the hashed dentry and return the newly
579 * hashed dentry.
580 */ 505 */
581 d_set_d_op(dentry, &autofs4_root_dentry_operations); 506 if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent))
507 return ERR_PTR(-ENOENT);
582 508
583 /* 509 /* Mark entries in the root as mount triggers */
584 * And we need to ensure that the same dentry is used for 510 if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
585 * all following lookup calls until it is hashed so that 511 __managed_dentry_set_managed(dentry);
586 * the dentry flags are persistent throughout the request. 512
587 */ 513 ino = autofs4_new_ino(sbi);
588 ino = autofs4_init_ino(NULL, sbi, 0555);
589 if (!ino) 514 if (!ino)
590 return ERR_PTR(-ENOMEM); 515 return ERR_PTR(-ENOMEM);
591 516
@@ -596,82 +521,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
596 521
597 d_instantiate(dentry, NULL); 522 d_instantiate(dentry, NULL);
598 } 523 }
599
600 if (!oz_mode) {
601 mutex_unlock(&dir->i_mutex);
602 expiring = autofs4_lookup_expiring(dentry);
603 if (expiring) {
604 /*
605 * If we are racing with expire the request might not
606 * be quite complete but the directory has been removed
607 * so it must have been successful, so just wait for it.
608 */
609 autofs4_expire_wait(expiring);
610 autofs4_del_expiring(expiring);
611 dput(expiring);
612 }
613
614 spin_lock(&sbi->fs_lock);
615 ino->flags |= AUTOFS_INF_PENDING;
616 spin_unlock(&sbi->fs_lock);
617 if (dentry->d_op && dentry->d_op->d_revalidate)
618 (dentry->d_op->d_revalidate)(dentry, nd);
619 mutex_lock(&dir->i_mutex);
620 }
621
622 /*
623 * If we are still pending, check if we had to handle
624 * a signal. If so we can force a restart..
625 */
626 if (ino->flags & AUTOFS_INF_PENDING) {
627 /* See if we were interrupted */
628 if (signal_pending(current)) {
629 sigset_t *sigset = &current->pending.signal;
630 if (sigismember (sigset, SIGKILL) ||
631 sigismember (sigset, SIGQUIT) ||
632 sigismember (sigset, SIGINT)) {
633 if (active)
634 dput(active);
635 return ERR_PTR(-ERESTARTNOINTR);
636 }
637 }
638 if (!oz_mode) {
639 spin_lock(&sbi->fs_lock);
640 ino->flags &= ~AUTOFS_INF_PENDING;
641 spin_unlock(&sbi->fs_lock);
642 }
643 }
644
645 /*
646 * If this dentry is unhashed, then we shouldn't honour this
647 * lookup. Returning ENOENT here doesn't do the right thing
648 * for all system calls, but it should be OK for the operations
649 * we permit from an autofs.
650 */
651 if (!oz_mode && d_unhashed(dentry)) {
652 /*
653 * A user space application can (and has done in the past)
654 * remove and re-create this directory during the callback.
655 * This can leave us with an unhashed dentry, but a
656 * successful mount! So we need to perform another
657 * cached lookup in case the dentry now exists.
658 */
659 struct dentry *parent = dentry->d_parent;
660 struct dentry *new = d_lookup(parent, &dentry->d_name);
661 if (new != NULL)
662 dentry = new;
663 else
664 dentry = ERR_PTR(-ENOENT);
665
666 if (active)
667 dput(active);
668
669 return dentry;
670 }
671
672 if (active)
673 return active;
674
675 return NULL; 524 return NULL;
676} 525}
677 526
@@ -683,6 +532,7 @@ static int autofs4_dir_symlink(struct inode *dir,
683 struct autofs_info *ino = autofs4_dentry_ino(dentry); 532 struct autofs_info *ino = autofs4_dentry_ino(dentry);
684 struct autofs_info *p_ino; 533 struct autofs_info *p_ino;
685 struct inode *inode; 534 struct inode *inode;
535 size_t size = strlen(symname);
686 char *cp; 536 char *cp;
687 537
688 DPRINTK("%s <- %.*s", symname, 538 DPRINTK("%s <- %.*s", symname,
@@ -691,45 +541,35 @@ static int autofs4_dir_symlink(struct inode *dir,
691 if (!autofs4_oz_mode(sbi)) 541 if (!autofs4_oz_mode(sbi))
692 return -EACCES; 542 return -EACCES;
693 543
694 ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555); 544 BUG_ON(!ino);
695 if (!ino) 545
696 return -ENOMEM; 546 autofs4_clean_ino(ino);
697 547
698 autofs4_del_active(dentry); 548 autofs4_del_active(dentry);
699 549
700 ino->size = strlen(symname); 550 cp = kmalloc(size + 1, GFP_KERNEL);
701 cp = kmalloc(ino->size + 1, GFP_KERNEL); 551 if (!cp)
702 if (!cp) {
703 if (!dentry->d_fsdata)
704 kfree(ino);
705 return -ENOMEM; 552 return -ENOMEM;
706 }
707 553
708 strcpy(cp, symname); 554 strcpy(cp, symname);
709 555
710 inode = autofs4_get_inode(dir->i_sb, ino); 556 inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
711 if (!inode) { 557 if (!inode) {
712 kfree(cp); 558 kfree(cp);
713 if (!dentry->d_fsdata) 559 if (!dentry->d_fsdata)
714 kfree(ino); 560 kfree(ino);
715 return -ENOMEM; 561 return -ENOMEM;
716 } 562 }
563 inode->i_private = cp;
564 inode->i_size = size;
717 d_add(dentry, inode); 565 d_add(dentry, inode);
718 566
719 if (dir == dir->i_sb->s_root->d_inode) 567 dget(dentry);
720 d_set_d_op(dentry, &autofs4_root_dentry_operations);
721 else
722 d_set_d_op(dentry, &autofs4_dentry_operations);
723
724 dentry->d_fsdata = ino;
725 ino->dentry = dget(dentry);
726 atomic_inc(&ino->count); 568 atomic_inc(&ino->count);
727 p_ino = autofs4_dentry_ino(dentry->d_parent); 569 p_ino = autofs4_dentry_ino(dentry->d_parent);
728 if (p_ino && dentry->d_parent != dentry) 570 if (p_ino && dentry->d_parent != dentry)
729 atomic_inc(&p_ino->count); 571 atomic_inc(&p_ino->count);
730 ino->inode = inode;
731 572
732 ino->u.symlink = cp;
733 dir->i_mtime = CURRENT_TIME; 573 dir->i_mtime = CURRENT_TIME;
734 574
735 return 0; 575 return 0;
@@ -782,6 +622,58 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
782 return 0; 622 return 0;
783} 623}
784 624
625/*
626 * Version 4 of autofs provides a pseudo direct mount implementation
627 * that relies on directories at the leaves of a directory tree under
628 * an indirect mount to trigger mounts. To allow for this we need to
629 * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves
630 * of the directory tree. There is no need to clear the automount flag
631 * following a mount or restore it after an expire because these mounts
632 * are always covered. However, it is necessary to ensure that these
633 * flags are clear on non-empty directories to avoid unnecessary calls
634 * during path walks.
635 */
636static void autofs_set_leaf_automount_flags(struct dentry *dentry)
637{
638 struct dentry *parent;
639
640 /* root and dentries in the root are already handled */
641 if (IS_ROOT(dentry->d_parent))
642 return;
643
644 managed_dentry_set_managed(dentry);
645
646 parent = dentry->d_parent;
647 /* only consider parents below dentries in the root */
648 if (IS_ROOT(parent->d_parent))
649 return;
650 managed_dentry_clear_managed(parent);
651 return;
652}
653
654static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
655{
656 struct list_head *d_child;
657 struct dentry *parent;
658
659 /* flags for dentries in the root are handled elsewhere */
660 if (IS_ROOT(dentry->d_parent))
661 return;
662
663 managed_dentry_clear_managed(dentry);
664
665 parent = dentry->d_parent;
666 /* only consider parents below dentries in the root */
667 if (IS_ROOT(parent->d_parent))
668 return;
669 d_child = &dentry->d_u.d_child;
670 /* Set parent managed if it's becoming empty */
671 if (d_child->next == &parent->d_subdirs &&
672 d_child->prev == &parent->d_subdirs)
673 managed_dentry_set_managed(parent);
674 return;
675}
676
785static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) 677static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
786{ 678{
787 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 679 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
@@ -809,6 +701,9 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
809 spin_unlock(&dentry->d_lock); 701 spin_unlock(&dentry->d_lock);
810 spin_unlock(&autofs4_lock); 702 spin_unlock(&autofs4_lock);
811 703
704 if (sbi->version < 5)
705 autofs_clear_leaf_automount_flags(dentry);
706
812 if (atomic_dec_and_test(&ino->count)) { 707 if (atomic_dec_and_test(&ino->count)) {
813 p_ino = autofs4_dentry_ino(dentry->d_parent); 708 p_ino = autofs4_dentry_ino(dentry->d_parent);
814 if (p_ino && dentry->d_parent != dentry) 709 if (p_ino && dentry->d_parent != dentry)
@@ -837,32 +732,25 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
837 DPRINTK("dentry %p, creating %.*s", 732 DPRINTK("dentry %p, creating %.*s",
838 dentry, dentry->d_name.len, dentry->d_name.name); 733 dentry, dentry->d_name.len, dentry->d_name.name);
839 734
840 ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555); 735 BUG_ON(!ino);
841 if (!ino) 736
842 return -ENOMEM; 737 autofs4_clean_ino(ino);
843 738
844 autofs4_del_active(dentry); 739 autofs4_del_active(dentry);
845 740
846 inode = autofs4_get_inode(dir->i_sb, ino); 741 inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555);
847 if (!inode) { 742 if (!inode)
848 if (!dentry->d_fsdata)
849 kfree(ino);
850 return -ENOMEM; 743 return -ENOMEM;
851 }
852 d_add(dentry, inode); 744 d_add(dentry, inode);
853 745
854 if (dir == dir->i_sb->s_root->d_inode) 746 if (sbi->version < 5)
855 d_set_d_op(dentry, &autofs4_root_dentry_operations); 747 autofs_set_leaf_automount_flags(dentry);
856 else
857 d_set_d_op(dentry, &autofs4_dentry_operations);
858 748
859 dentry->d_fsdata = ino; 749 dget(dentry);
860 ino->dentry = dget(dentry);
861 atomic_inc(&ino->count); 750 atomic_inc(&ino->count);
862 p_ino = autofs4_dentry_ino(dentry->d_parent); 751 p_ino = autofs4_dentry_ino(dentry->d_parent);
863 if (p_ino && dentry->d_parent != dentry) 752 if (p_ino && dentry->d_parent != dentry)
864 atomic_inc(&p_ino->count); 753 atomic_inc(&p_ino->count);
865 ino->inode = inode;
866 inc_nlink(dir); 754 inc_nlink(dir);
867 dir->i_mtime = CURRENT_TIME; 755 dir->i_mtime = CURRENT_TIME;
868 756
@@ -944,8 +832,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
944int is_autofs4_dentry(struct dentry *dentry) 832int is_autofs4_dentry(struct dentry *dentry)
945{ 833{
946 return dentry && dentry->d_inode && 834 return dentry && dentry->d_inode &&
947 (dentry->d_op == &autofs4_root_dentry_operations || 835 dentry->d_op == &autofs4_dentry_operations &&
948 dentry->d_op == &autofs4_dentry_operations) &&
949 dentry->d_fsdata != NULL; 836 dentry->d_fsdata != NULL;
950} 837}
951 838
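
With ->follow_link() and ->d_revalidate() triggering gone, the mount/expire protocol is driven entirely by the two new dentry operations, keyed off the per-dentry flags this patch sets (DCACHE_NEED_AUTOMOUNT via the managed helpers, DCACHE_MANAGE_TRANSIT while a transition is in flight). The walk-side contract for one path component, sketched from the VFS's point of view (simplified; the real logic lives in the namei/dcache core):

        /* sketch: what the path walker does per component */
        if (dentry->d_flags & DCACHE_MANAGE_TRANSIT) {
                err = dentry->d_op->d_manage(dentry, false, rcu_walk);
                if (err < 0)
                        return err;     /* e.g. -EISDIR, or -ECHILD to leave RCU walk */
        }
        if (d_mountpoint(dentry)) {
                /* cross onto the mounted filesystem */
        } else if (dentry->d_flags & DCACHE_NEED_AUTOMOUNT) {
                struct vfsmount *mnt = dentry->d_op->d_automount(path);
                if (IS_ERR(mnt))
                        return PTR_ERR(mnt);
                /* NULL means nothing to mount; keep walking in place */
        }

So autofs4_d_manage() is where waiters sleep and autofs4_d_automount() is where the daemon callback happens, replacing the old try_to_fill_dentry()/follow-link contortions.
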
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index b4ea82934d2e..f27c094a1919 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -14,8 +14,7 @@
14 14
15static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) 15static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
16{ 16{
17 struct autofs_info *ino = autofs4_dentry_ino(dentry); 17 nd_set_link(nd, dentry->d_inode->i_private);
18 nd_set_link(nd, (char *)ino->u.symlink);
19 return NULL; 18 return NULL;
20} 19}
21 20
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index c5f8459c905e..56010056b2e6 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -309,6 +309,9 @@ static int validate_request(struct autofs_wait_queue **wait,
309 * completed while we waited on the mutex ... 309 * completed while we waited on the mutex ...
310 */ 310 */
311 if (notify == NFY_MOUNT) { 311 if (notify == NFY_MOUNT) {
312 struct dentry *new = NULL;
313 int valid = 1;
314
312 /* 315 /*
313 * If the dentry was successfully mounted while we slept 316 * If the dentry was successfully mounted while we slept
314 * on the wait queue mutex we can return success. If it 317 * on the wait queue mutex we can return success. If it
@@ -316,8 +319,20 @@ static int validate_request(struct autofs_wait_queue **wait,
316 * a multi-mount with no mount at its base) we can 319 * a multi-mount with no mount at its base) we can
317 * continue on and create a new request. 320 * continue on and create a new request.
318 */ 321 */
322 if (!IS_ROOT(dentry)) {
323 if (dentry->d_inode && d_unhashed(dentry)) {
324 struct dentry *parent = dentry->d_parent;
325 new = d_lookup(parent, &dentry->d_name);
326 if (new)
327 dentry = new;
328 }
329 }
319 if (have_submounts(dentry)) 330 if (have_submounts(dentry))
320 return 0; 331 valid = 0;
332
333 if (new)
334 dput(new);
335 return valid;
321 } 336 }
322 337
323 return 1; 338 return 1;
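
The validate_request() change handles a daemon that rmdir()s and re-creates the trigger directory while a waiter sleeps on the queue mutex: the waiter can wake holding an unhashed dentry even though the mount actually succeeded. The idiom, pulled out as a sketch (hypothetical helper name):

        /* find the dentry currently hashed under this name, if any */
        static struct dentry *current_dentry(struct dentry *dentry)
        {
                if (dentry->d_inode && d_unhashed(dentry))
                        return d_lookup(dentry->d_parent, &dentry->d_name);
                return NULL;    /* caller must dput() a non-NULL result */
        }

autofs4_mountpoint_changed() in root.c applies the same idea after a successful ->d_automount().
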
diff --git a/fs/befs/endian.h b/fs/befs/endian.h
index 6cb84d896d05..27223878ba9f 100644
--- a/fs/befs/endian.h
+++ b/fs/befs/endian.h
@@ -102,22 +102,22 @@ cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
102} 102}
103 103
104static inline befs_data_stream 104static inline befs_data_stream
105fsds_to_cpu(const struct super_block *sb, befs_disk_data_stream n) 105fsds_to_cpu(const struct super_block *sb, const befs_disk_data_stream *n)
106{ 106{
107 befs_data_stream data; 107 befs_data_stream data;
108 int i; 108 int i;
109 109
110 for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i) 110 for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i)
111 data.direct[i] = fsrun_to_cpu(sb, n.direct[i]); 111 data.direct[i] = fsrun_to_cpu(sb, n->direct[i]);
112 112
113 data.max_direct_range = fs64_to_cpu(sb, n.max_direct_range); 113 data.max_direct_range = fs64_to_cpu(sb, n->max_direct_range);
114 data.indirect = fsrun_to_cpu(sb, n.indirect); 114 data.indirect = fsrun_to_cpu(sb, n->indirect);
115 data.max_indirect_range = fs64_to_cpu(sb, n.max_indirect_range); 115 data.max_indirect_range = fs64_to_cpu(sb, n->max_indirect_range);
116 data.double_indirect = fsrun_to_cpu(sb, n.double_indirect); 116 data.double_indirect = fsrun_to_cpu(sb, n->double_indirect);
117 data.max_double_indirect_range = fs64_to_cpu(sb, 117 data.max_double_indirect_range = fs64_to_cpu(sb,
118 n. 118 n->
119 max_double_indirect_range); 119 max_double_indirect_range);
120 data.size = fs64_to_cpu(sb, n.size); 120 data.size = fs64_to_cpu(sb, n->size);
121 121
122 return data; 122 return data;
123} 123}
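
fsds_to_cpu() previously took the whole befs_disk_data_stream by value, copying an on-disk structure of a dozen block runs plus range fields onto the stack at every call; passing a const pointer copies one machine word instead. The idiom in miniature (illustrative types only):

        struct big { __u64 v[64]; };                    /* 512 bytes */

        static __u64 first_by_value(struct big b)       /* copies 512 bytes per call */
        {
                return b.v[0];
        }

        static __u64 first_by_pointer(const struct big *b) /* copies 8 bytes per call */
        {
                return b->v[0];
        }

The const qualifier also lets the compiler and reviewers see that the endian conversion never modifies the on-disk image.
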
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index de93581b79a2..b1d0c794747b 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -390,7 +390,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
390 int num_blks; 390 int num_blks;
391 391
392 befs_ino->i_data.ds = 392 befs_ino->i_data.ds =
393 fsds_to_cpu(sb, raw_inode->data.datastream); 393 fsds_to_cpu(sb, &raw_inode->data.datastream);
394 394
395 num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds); 395 num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds);
396 inode->i_blocks = 396 inode->i_blocks =
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6884e198e0c7..d5b640ba6cb1 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -66,12 +66,11 @@ static int elf_core_dump(struct coredump_params *cprm);
66#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) 66#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
67 67
68static struct linux_binfmt elf_format = { 68static struct linux_binfmt elf_format = {
69 .module = THIS_MODULE, 69 .module = THIS_MODULE,
70 .load_binary = load_elf_binary, 70 .load_binary = load_elf_binary,
71 .load_shlib = load_elf_library, 71 .load_shlib = load_elf_library,
72 .core_dump = elf_core_dump, 72 .core_dump = elf_core_dump,
73 .min_coredump = ELF_EXEC_PAGESIZE, 73 .min_coredump = ELF_EXEC_PAGESIZE,
74 .hasvdso = 1
75}; 74};
76 75
77#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) 76#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
@@ -316,8 +315,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
316 return 0; 315 return 0;
317} 316}
318 317
319#ifndef elf_map
320
321static unsigned long elf_map(struct file *filep, unsigned long addr, 318static unsigned long elf_map(struct file *filep, unsigned long addr,
322 struct elf_phdr *eppnt, int prot, int type, 319 struct elf_phdr *eppnt, int prot, int type,
323 unsigned long total_size) 320 unsigned long total_size)
@@ -354,8 +351,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
354 return(map_addr); 351 return(map_addr);
355} 352}
356 353
357#endif /* !elf_map */
358
359static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) 354static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
360{ 355{
361 int i, first_idx = -1, last_idx = -1; 356 int i, first_idx = -1, last_idx = -1;
@@ -421,7 +416,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
421 goto out; 416 goto out;
422 417
423 retval = kernel_read(interpreter, interp_elf_ex->e_phoff, 418 retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
424 (char *)elf_phdata,size); 419 (char *)elf_phdata, size);
425 error = -EIO; 420 error = -EIO;
426 if (retval != size) { 421 if (retval != size) {
427 if (retval < 0) 422 if (retval < 0)
@@ -601,7 +596,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
601 goto out; 596 goto out;
602 if (!elf_check_arch(&loc->elf_ex)) 597 if (!elf_check_arch(&loc->elf_ex))
603 goto out; 598 goto out;
604 if (!bprm->file->f_op||!bprm->file->f_op->mmap) 599 if (!bprm->file->f_op || !bprm->file->f_op->mmap)
605 goto out; 600 goto out;
606 601
607 /* Now read in all of the header information */ 602 /* Now read in all of the header information */
@@ -761,8 +756,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
761 /* There was a PT_LOAD segment with p_memsz > p_filesz 756 /* There was a PT_LOAD segment with p_memsz > p_filesz
762 before this one. Map anonymous pages, if needed, 757 before this one. Map anonymous pages, if needed,
763 and clear the area. */ 758 and clear the area. */
764 retval = set_brk (elf_bss + load_bias, 759 retval = set_brk(elf_bss + load_bias,
765 elf_brk + load_bias); 760 elf_brk + load_bias);
766 if (retval) { 761 if (retval) {
767 send_sig(SIGKILL, current, 0); 762 send_sig(SIGKILL, current, 0);
768 goto out_free_dentry; 763 goto out_free_dentry;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 4d0ff5ee27b8..e49cce234c65 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -782,7 +782,12 @@ void __init bio_integrity_init(void)
782{ 782{
783 unsigned int i; 783 unsigned int i;
784 784
785 kintegrityd_wq = create_workqueue("kintegrityd"); 785 /*
786 * kintegrityd won't block much but may burn a lot of CPU cycles.
787 * Make it highpri CPU intensive wq with max concurrency of 1.
788 */
789 kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
790 WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
786 if (!kintegrityd_wq) 791 if (!kintegrityd_wq)
787 panic("Failed to create kintegrityd\n"); 792 panic("Failed to create kintegrityd\n");
788 793
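
create_workqueue() predates the concurrency-managed workqueue rework and encodes no policy; alloc_workqueue() makes the policy explicit. Here WQ_MEM_RECLAIM guarantees a rescuer thread so integrity processing can make forward progress under memory pressure, WQ_HIGHPRI queues the work at elevated priority, WQ_CPU_INTENSIVE keeps the CPU-hungry items from stalling concurrency management for other work on the same CPU, and a max_active of 1 caps in-flight items. The same constructor in a generic, driver-style sketch (hypothetical names):

        #include <linux/workqueue.h>

        static struct workqueue_struct *example_wq;

        static int __init example_init(void)
        {
                example_wq = alloc_workqueue("example",
                                             WQ_MEM_RECLAIM | WQ_HIGHPRI |
                                             WQ_CPU_INTENSIVE, 1);
                if (!example_wq)
                        return -ENOMEM;
                return 0;
        }

        static void __exit example_exit(void)
        {
                destroy_workqueue(example_wq);
        }
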
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 771f23527010..333a7bb4cb9c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -433,7 +433,7 @@ static void init_once(void *foo)
433 INIT_LIST_HEAD(&bdev->bd_inodes); 433 INIT_LIST_HEAD(&bdev->bd_inodes);
434 INIT_LIST_HEAD(&bdev->bd_list); 434 INIT_LIST_HEAD(&bdev->bd_list);
435#ifdef CONFIG_SYSFS 435#ifdef CONFIG_SYSFS
436 INIT_LIST_HEAD(&bdev->bd_holder_list); 436 INIT_LIST_HEAD(&bdev->bd_holder_disks);
437#endif 437#endif
438 inode_init_once(&ei->vfs_inode); 438 inode_init_once(&ei->vfs_inode);
439 /* Initialize mutex for freeze. */ 439 /* Initialize mutex for freeze. */
@@ -473,7 +473,7 @@ static const struct super_operations bdev_sops = {
473static struct dentry *bd_mount(struct file_system_type *fs_type, 473static struct dentry *bd_mount(struct file_system_type *fs_type,
474 int flags, const char *dev_name, void *data) 474 int flags, const char *dev_name, void *data)
475{ 475{
476 return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576); 476 return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
477} 477}
478 478
479static struct file_system_type bd_type = { 479static struct file_system_type bd_type = {
@@ -669,7 +669,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
669 else if (bdev->bd_contains == bdev) 669 else if (bdev->bd_contains == bdev)
670 return true; /* is a whole device which isn't held */ 670 return true; /* is a whole device which isn't held */
671 671
672 else if (whole->bd_holder == bd_claim) 672 else if (whole->bd_holder == bd_may_claim)
673 return true; /* is a partition of a device that is being partitioned */ 673 return true; /* is a partition of a device that is being partitioned */
674 else if (whole->bd_holder != NULL) 674 else if (whole->bd_holder != NULL)
675 return false; /* is a partition of a held device */ 675 return false; /* is a partition of a held device */
@@ -781,439 +781,142 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
781 } 781 }
782} 782}
783 783
784/* releases bdev_lock */ 784#ifdef CONFIG_SYSFS
785static void __bd_abort_claiming(struct block_device *whole, void *holder) 785struct bd_holder_disk {
786{ 786 struct list_head list;
787 BUG_ON(whole->bd_claiming != holder); 787 struct gendisk *disk;
788 whole->bd_claiming = NULL; 788 int refcnt;
789 wake_up_bit(&whole->bd_claiming, 0); 789};
790
791 spin_unlock(&bdev_lock);
792 bdput(whole);
793}
794
795/**
796 * bd_abort_claiming - abort claiming a block device
797 * @whole: whole block device returned by bd_start_claiming()
798 * @holder: holder trying to claim @bdev
799 *
800 * Abort a claiming block started by bd_start_claiming(). Note that
801 * @whole is not the block device to be claimed but the whole device
802 * returned by bd_start_claiming().
803 *
804 * CONTEXT:
805 * Grabs and releases bdev_lock.
806 */
807static void bd_abort_claiming(struct block_device *whole, void *holder)
808{
809 spin_lock(&bdev_lock);
810 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
811}
812
813/* increment holders when we have a legitimate claim. requires bdev_lock */
814static void __bd_claim(struct block_device *bdev, struct block_device *whole,
815 void *holder)
816{
817 /* note that for a whole device bd_holders
818 * will be incremented twice, and bd_holder will
819 * be set to bd_claim before being set to holder
820 */
821 whole->bd_holders++;
822 whole->bd_holder = bd_claim;
823 bdev->bd_holders++;
824 bdev->bd_holder = holder;
825}
826
827/**
828 * bd_finish_claiming - finish claiming a block device
829 * @bdev: block device of interest (passed to bd_start_claiming())
830 * @whole: whole block device returned by bd_start_claiming()
831 * @holder: holder trying to claim @bdev
832 *
833 * Finish a claiming block started by bd_start_claiming().
834 *
835 * CONTEXT:
836 * Grabs and releases bdev_lock.
837 */
838static void bd_finish_claiming(struct block_device *bdev,
839 struct block_device *whole, void *holder)
840{
841 spin_lock(&bdev_lock);
842 BUG_ON(!bd_may_claim(bdev, whole, holder));
843 __bd_claim(bdev, whole, holder);
844 __bd_abort_claiming(whole, holder); /* not actually an abort */
845}
846 790
847/** 791static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
848 * bd_claim - claim a block device 792 struct gendisk *disk)
849 * @bdev: block device to claim
850 * @holder: holder trying to claim @bdev
851 *
852 * Try to claim @bdev which must have been opened successfully.
853 *
854 * CONTEXT:
855 * Might sleep.
856 *
857 * RETURNS:
858 * 0 if successful, -EBUSY if @bdev is already claimed.
859 */
860int bd_claim(struct block_device *bdev, void *holder)
861{ 793{
862 struct block_device *whole = bdev->bd_contains; 794 struct bd_holder_disk *holder;
863 int res;
864 795
865 might_sleep(); 796 list_for_each_entry(holder, &bdev->bd_holder_disks, list)
866 797 if (holder->disk == disk)
867 spin_lock(&bdev_lock); 798 return holder;
868 res = bd_prepare_to_claim(bdev, whole, holder); 799 return NULL;
869 if (res == 0)
870 __bd_claim(bdev, whole, holder);
871 spin_unlock(&bdev_lock);
872
873 return res;
874}
875EXPORT_SYMBOL(bd_claim);
876
877void bd_release(struct block_device *bdev)
878{
879 spin_lock(&bdev_lock);
880 if (!--bdev->bd_contains->bd_holders)
881 bdev->bd_contains->bd_holder = NULL;
882 if (!--bdev->bd_holders)
883 bdev->bd_holder = NULL;
884 spin_unlock(&bdev_lock);
885} 800}
886 801
887EXPORT_SYMBOL(bd_release);
888
889#ifdef CONFIG_SYSFS
890/*
891 * Functions for bd_claim_by_kobject / bd_release_from_kobject
892 *
893 * If a kobject is passed to bd_claim_by_kobject()
894 * and the kobject has a parent directory,
895 * following symlinks are created:
896 * o from the kobject to the claimed bdev
897 * o from "holders" directory of the bdev to the parent of the kobject
898 * bd_release_from_kobject() removes these symlinks.
899 *
900 * Example:
901 * If /dev/dm-0 maps to /dev/sda, kobject corresponding to
902 * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
903 * /sys/block/dm-0/slaves/sda --> /sys/block/sda
904 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
905 */
906
907static int add_symlink(struct kobject *from, struct kobject *to) 802static int add_symlink(struct kobject *from, struct kobject *to)
908{ 803{
909 if (!from || !to)
910 return 0;
911 return sysfs_create_link(from, to, kobject_name(to)); 804 return sysfs_create_link(from, to, kobject_name(to));
912} 805}
913 806
914static void del_symlink(struct kobject *from, struct kobject *to) 807static void del_symlink(struct kobject *from, struct kobject *to)
915{ 808{
916 if (!from || !to)
917 return;
918 sysfs_remove_link(from, kobject_name(to)); 809 sysfs_remove_link(from, kobject_name(to));
919} 810}
920 811
-/*
- * 'struct bd_holder' contains pointers to kobjects symlinked by
- * bd_claim_by_kobject.
- * It's connected to bd_holder_list which is protected by bdev->bd_sem.
- */
-struct bd_holder {
-        struct list_head list;  /* chain of holders of the bdev */
-        int count;              /* references from the holder */
-        struct kobject *sdir;   /* holder object, e.g. "/block/dm-0/slaves" */
-        struct kobject *hdev;   /* e.g. "/block/dm-0" */
-        struct kobject *hdir;   /* e.g. "/block/sda/holders" */
-        struct kobject *sdev;   /* e.g. "/block/sda" */
-};
-
-/*
- * Get references of related kobjects at once.
- * Returns 1 on success. 0 on failure.
- *
- * Should call bd_holder_release_dirs() after successful use.
- */
-static int bd_holder_grab_dirs(struct block_device *bdev,
-                               struct bd_holder *bo)
-{
-        if (!bdev || !bo)
-                return 0;
-
-        bo->sdir = kobject_get(bo->sdir);
-        if (!bo->sdir)
-                return 0;
-
-        bo->hdev = kobject_get(bo->sdir->parent);
-        if (!bo->hdev)
-                goto fail_put_sdir;
-
-        bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
-        if (!bo->sdev)
-                goto fail_put_hdev;
-
-        bo->hdir = kobject_get(bdev->bd_part->holder_dir);
-        if (!bo->hdir)
-                goto fail_put_sdev;
-
-        return 1;
-
-fail_put_sdev:
-        kobject_put(bo->sdev);
-fail_put_hdev:
-        kobject_put(bo->hdev);
-fail_put_sdir:
-        kobject_put(bo->sdir);
-
-        return 0;
-}
-
-/* Put references of related kobjects at once. */
-static void bd_holder_release_dirs(struct bd_holder *bo)
-{
-        kobject_put(bo->hdir);
-        kobject_put(bo->sdev);
-        kobject_put(bo->hdev);
-        kobject_put(bo->sdir);
-}
-
-static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
-{
-        struct bd_holder *bo;
-
-        bo = kzalloc(sizeof(*bo), GFP_KERNEL);
-        if (!bo)
-                return NULL;
-
-        bo->count = 1;
-        bo->sdir = kobj;
-
-        return bo;
-}
-
-static void free_bd_holder(struct bd_holder *bo)
-{
-        kfree(bo);
-}
-
-/**
- * find_bd_holder - find matching struct bd_holder from the block device
- *
- * @bdev:	struct block device to be searched
- * @bo:		target struct bd_holder
- *
- * Returns matching entry with @bo in @bdev->bd_holder_list.
- * If found, increment the reference count and return the pointer.
- * If not found, returns NULL.
- */
-static struct bd_holder *find_bd_holder(struct block_device *bdev,
-                                        struct bd_holder *bo)
-{
-        struct bd_holder *tmp;
-
-        list_for_each_entry(tmp, &bdev->bd_holder_list, list)
-                if (tmp->sdir == bo->sdir) {
-                        tmp->count++;
-                        return tmp;
-                }
-
-        return NULL;
-}
-
-/**
- * add_bd_holder - create sysfs symlinks for bd_claim() relationship
- *
- * @bdev:	block device to be bd_claimed
- * @bo:		preallocated and initialized by alloc_bd_holder()
- *
- * Add @bo to @bdev->bd_holder_list, create symlinks.
- *
- * Returns 0 if symlinks are created.
- * Returns -ve if something fails.
- */
-static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
-{
-        int err;
-
-        if (!bo)
-                return -EINVAL;
-
-        if (!bd_holder_grab_dirs(bdev, bo))
-                return -EBUSY;
-
-        err = add_symlink(bo->sdir, bo->sdev);
-        if (err)
-                return err;
-
-        err = add_symlink(bo->hdir, bo->hdev);
-        if (err) {
-                del_symlink(bo->sdir, bo->sdev);
-                return err;
-        }
-
-        list_add_tail(&bo->list, &bdev->bd_holder_list);
-        return 0;
-}
-
-/**
- * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
- *
- * @bdev:	block device to be bd_claimed
- * @kobj:	holder's kobject
- *
- * If there is matching entry with @kobj in @bdev->bd_holder_list
- * and no other bd_claim() from the same kobject,
- * remove the struct bd_holder from the list, delete symlinks for it.
- *
- * Returns a pointer to the struct bd_holder when it's removed from the list
- * and ready to be freed.
- * Returns NULL if matching claim isn't found or there is other bd_claim()
- * by the same kobject.
- */
-static struct bd_holder *del_bd_holder(struct block_device *bdev,
-                                       struct kobject *kobj)
-{
-        struct bd_holder *bo;
-
-        list_for_each_entry(bo, &bdev->bd_holder_list, list) {
-                if (bo->sdir == kobj) {
-                        bo->count--;
-                        BUG_ON(bo->count < 0);
-                        if (!bo->count) {
-                                list_del(&bo->list);
-                                del_symlink(bo->sdir, bo->sdev);
-                                del_symlink(bo->hdir, bo->hdev);
-                                bd_holder_release_dirs(bo);
-                                return bo;
-                        }
-                        break;
-                }
-        }
-
-        return NULL;
-}
-
-/**
- * bd_claim_by_kobject - bd_claim() with additional kobject signature
- *
- * @bdev:	block device to be claimed
- * @holder:	holder's signature
- * @kobj:	holder's kobject
- *
- * Do bd_claim() and if it succeeds, create sysfs symlinks between
- * the bdev and the holder's kobject.
- * Use bd_release_from_kobject() when releasing the claimed bdev.
- *
- * Returns 0 on success. (same as bd_claim())
- * Returns errno on failure.
- */
-static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
-                               struct kobject *kobj)
-{
-        int err;
-        struct bd_holder *bo, *found;
-
-        if (!kobj)
-                return -EINVAL;
-
-        bo = alloc_bd_holder(kobj);
-        if (!bo)
-                return -ENOMEM;
-
-        mutex_lock(&bdev->bd_mutex);
-
-        err = bd_claim(bdev, holder);
-        if (err)
-                goto fail;
-
-        found = find_bd_holder(bdev, bo);
-        if (found)
-                goto fail;
-
-        err = add_bd_holder(bdev, bo);
-        if (err)
-                bd_release(bdev);
-        else
-                bo = NULL;
-fail:
-        mutex_unlock(&bdev->bd_mutex);
-        free_bd_holder(bo);
-        return err;
-}
-
-/**
- * bd_release_from_kobject - bd_release() with additional kobject signature
- *
- * @bdev:	block device to be released
- * @kobj:	holder's kobject
- *
- * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject().
- */
-static void bd_release_from_kobject(struct block_device *bdev,
-                                    struct kobject *kobj)
-{
-        if (!kobj)
-                return;
-
-        mutex_lock(&bdev->bd_mutex);
-        bd_release(bdev);
-        free_bd_holder(del_bd_holder(bdev, kobj));
-        mutex_unlock(&bdev->bd_mutex);
-}
-
-/**
- * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
- *
- * @bdev:	block device to be claimed
- * @holder:	holder's signature
- * @disk:	holder's gendisk
- *
- * Call bd_claim_by_kobject() with getting @disk->slave_dir.
- */
-int bd_claim_by_disk(struct block_device *bdev, void *holder,
-                     struct gendisk *disk)
-{
-        return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
-}
-EXPORT_SYMBOL_GPL(bd_claim_by_disk);
-
-/**
- * bd_release_from_disk - wrapper function for bd_release_from_kobject()
- *
- * @bdev:	block device to be claimed
- * @disk:	holder's gendisk
- *
- * Call bd_release_from_kobject() and put @disk->slave_dir.
- */
-void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
-{
-        bd_release_from_kobject(bdev, disk->slave_dir);
-        kobject_put(disk->slave_dir);
-}
-EXPORT_SYMBOL_GPL(bd_release_from_disk);
-#endif
-
-/*
- * Tries to open block device by device number.  Use it ONLY if you
- * really do not have anything better - i.e. when you are behind a
- * truly sucky interface and all you are given is a device number.  _Never_
- * to be used for internal purposes.  If you ever need it - reconsider
- * your API.
- */
-struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
-{
-        struct block_device *bdev = bdget(dev);
-        int err = -ENOMEM;
-        if (bdev)
-                err = blkdev_get(bdev, mode);
-        return err ? ERR_PTR(err) : bdev;
-}
-
-EXPORT_SYMBOL(open_by_devnum);
+/**
+ * bd_link_disk_holder - create symlinks between holding disk and slave bdev
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
+ *
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
+ *
+ * This function creates the following sysfs symlinks.
+ *
+ * - from "slaves" directory of the holder @disk to the claimed @bdev
+ * - from "holders" directory of the @bdev to the holder @disk
+ *
+ * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
+ * passed to bd_link_disk_holder(), then:
+ *
+ *   /sys/block/dm-0/slaves/sda --> /sys/block/sda
+ *   /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
+ *
+ * The caller must have claimed @bdev before calling this function and
+ * ensure that both @bdev and @disk are valid during the creation and
+ * lifetime of these symlinks.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
+{
+        struct bd_holder_disk *holder;
+        int ret = 0;
+
+        mutex_lock(&bdev->bd_mutex);
+
+        WARN_ON_ONCE(!bdev->bd_holder);
+
+        /* FIXME: remove the following once add_disk() handles errors */
+        if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
+                goto out_unlock;
+
+        holder = bd_find_holder_disk(bdev, disk);
+        if (holder) {
+                holder->refcnt++;
+                goto out_unlock;
+        }
+
+        holder = kzalloc(sizeof(*holder), GFP_KERNEL);
+        if (!holder) {
+                ret = -ENOMEM;
+                goto out_unlock;
+        }
+
+        INIT_LIST_HEAD(&holder->list);
+        holder->disk = disk;
+        holder->refcnt = 1;
+
+        ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+        if (ret)
+                goto out_free;
+
+        ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
+        if (ret)
+                goto out_del;
+
+        list_add(&holder->list, &bdev->bd_holder_disks);
+        goto out_unlock;
+
+out_del:
+        del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+out_free:
+        kfree(holder);
+out_unlock:
+        mutex_unlock(&bdev->bd_mutex);
+        return ret;
+}
+EXPORT_SYMBOL_GPL(bd_link_disk_holder);
+
+/**
+ * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
+ *
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
+{
+        struct bd_holder_disk *holder;
+
+        mutex_lock(&bdev->bd_mutex);
+
+        holder = bd_find_holder_disk(bdev, disk);
+
+        if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
+                del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+                del_symlink(bdev->bd_part->holder_dir,
+                            &disk_to_dev(disk)->kobj);
+                list_del_init(&holder->list);
+                kfree(holder);
+        }
+
+        mutex_unlock(&bdev->bd_mutex);
+}
+EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
+#endif
 
 /**
  * flush_disk - invalidates all buffer-cache entries on a disk
@@ -1309,10 +1012,11 @@ int check_disk_change(struct block_device *bdev)
 {
         struct gendisk *disk = bdev->bd_disk;
         const struct block_device_operations *bdops = disk->fops;
+        unsigned int events;
 
-        if (!bdops->media_changed)
-                return 0;
-        if (!bdops->media_changed(bdev->bd_disk))
+        events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
+                                   DISK_EVENT_EJECT_REQUEST);
+        if (!(events & DISK_EVENT_MEDIA_CHANGE))
                 return 0;
 
         flush_disk(bdev);
@@ -1475,17 +1179,171 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
         return ret;
 }
 
-int blkdev_get(struct block_device *bdev, fmode_t mode)
+/**
+ * blkdev_get - open a block device
+ * @bdev: block_device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open @bdev with @mode.  If @mode includes %FMODE_EXCL, @bdev is
+ * opened with exclusive access.  Specifying %FMODE_EXCL with %NULL
+ * @holder is invalid.  Exclusive opens may nest for the same @holder.
+ *
+ * On success, the reference count of @bdev is unchanged.  On failure,
+ * @bdev is put.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 {
-        return __blkdev_get(bdev, mode, 0);
+        struct block_device *whole = NULL;
+        int res;
+
+        WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
+
+        if ((mode & FMODE_EXCL) && holder) {
+                whole = bd_start_claiming(bdev, holder);
+                if (IS_ERR(whole)) {
+                        bdput(bdev);
+                        return PTR_ERR(whole);
+                }
+        }
+
+        res = __blkdev_get(bdev, mode, 0);
+
+        /* __blkdev_get() may alter read only status, check it afterwards */
+        if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
+                __blkdev_put(bdev, mode, 0);
+                res = -EACCES;
+        }
+
+        if (whole) {
+                /* finish claiming */
+                mutex_lock(&bdev->bd_mutex);
+                spin_lock(&bdev_lock);
+
+                if (!res) {
+                        BUG_ON(!bd_may_claim(bdev, whole, holder));
+                        /*
+                         * Note that for a whole device bd_holders
+                         * will be incremented twice, and bd_holder
+                         * will be set to bd_may_claim before being
+                         * set to holder
+                         */
+                        whole->bd_holders++;
+                        whole->bd_holder = bd_may_claim;
+                        bdev->bd_holders++;
+                        bdev->bd_holder = holder;
+                }
+
+                /* tell others that we're done */
+                BUG_ON(whole->bd_claiming != holder);
+                whole->bd_claiming = NULL;
+                wake_up_bit(&whole->bd_claiming, 0);
+
+                spin_unlock(&bdev_lock);
+
+                /*
+                 * Block event polling for write claims.  Any write
+                 * holder makes the write_holder state stick until all
+                 * are released.  This is good enough and tracking
+                 * individual writeable reference is too fragile given
+                 * the way @mode is used in blkdev_get/put().
+                 */
+                if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+                        bdev->bd_write_holder = true;
+                        disk_block_events(bdev->bd_disk);
+                }
+
+                mutex_unlock(&bdev->bd_mutex);
+                bdput(whole);
+        }
+
+        return res;
 }
 EXPORT_SYMBOL(blkdev_get);
 
+/**
+ * blkdev_get_by_path - open a block device by name
+ * @path: path to the block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the blockdevice described by the device file at @path.  @mode
+ * and @holder are identical to blkdev_get().
+ *
+ * On success, the returned block_device has reference count of one.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
+                                        void *holder)
+{
+        struct block_device *bdev;
+        int err;
+
+        bdev = lookup_bdev(path);
+        if (IS_ERR(bdev))
+                return bdev;
+
+        err = blkdev_get(bdev, mode, holder);
+        if (err)
+                return ERR_PTR(err);
+
+        return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_path);
+
+/**
+ * blkdev_get_by_dev - open a block device by device number
+ * @dev: device number of block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the blockdevice described by device number @dev.  @mode and
+ * @holder are identical to blkdev_get().
+ *
+ * Use it ONLY if you really do not have anything better - i.e. when
+ * you are behind a truly sucky interface and all you are given is a
+ * device number.  _Never_ to be used for internal purposes.  If you
+ * ever need it - reconsider your API.
+ *
+ * On success, the returned block_device has reference count of one.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
+{
+        struct block_device *bdev;
+        int err;
+
+        bdev = bdget(dev);
+        if (!bdev)
+                return ERR_PTR(-ENOMEM);
+
+        err = blkdev_get(bdev, mode, holder);
+        if (err)
+                return ERR_PTR(err);
+
+        return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_dev);
+
 static int blkdev_open(struct inode * inode, struct file * filp)
 {
-        struct block_device *whole = NULL;
         struct block_device *bdev;
-        int res;
 
         /*
          * Preserve backwards compatibility and allow large file access
@@ -1506,26 +1364,9 @@ static int blkdev_open(struct inode * inode, struct file * filp)
         if (bdev == NULL)
                 return -ENOMEM;
 
-        if (filp->f_mode & FMODE_EXCL) {
-                whole = bd_start_claiming(bdev, filp);
-                if (IS_ERR(whole)) {
-                        bdput(bdev);
-                        return PTR_ERR(whole);
-                }
-        }
-
         filp->f_mapping = bdev->bd_inode->i_mapping;
 
-        res = blkdev_get(bdev, filp->f_mode);
-
-        if (whole) {
-                if (res == 0)
-                        bd_finish_claiming(bdev, whole, filp);
-                else
-                        bd_abort_claiming(whole, filp);
-        }
-
-        return res;
+        return blkdev_get(bdev, filp->f_mode, filp);
 }
 
 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
@@ -1539,6 +1380,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
                 bdev->bd_part_count--;
 
         if (!--bdev->bd_openers) {
+                WARN_ON_ONCE(bdev->bd_holders);
                 sync_blockdev(bdev);
                 kill_bdev(bdev);
         }
@@ -1569,6 +1411,44 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 
 int blkdev_put(struct block_device *bdev, fmode_t mode)
 {
+        if (mode & FMODE_EXCL) {
+                bool bdev_free;
+
+                /*
+                 * Release a claim on the device.  The holder fields
+                 * are protected with bdev_lock.  bd_mutex is to
+                 * synchronize disk_holder unlinking.
+                 */
+                mutex_lock(&bdev->bd_mutex);
+                spin_lock(&bdev_lock);
+
+                WARN_ON_ONCE(--bdev->bd_holders < 0);
+                WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
+
+                /* bd_contains might point to self, check in a separate step */
+                if ((bdev_free = !bdev->bd_holders))
+                        bdev->bd_holder = NULL;
+                if (!bdev->bd_contains->bd_holders)
+                        bdev->bd_contains->bd_holder = NULL;
+
+                spin_unlock(&bdev_lock);
+
+                /*
+                 * If this was the last claim, remove the holder link and
+                 * unblock event polling if it was a write holder.
+                 */
+                if (bdev_free) {
+                        if (bdev->bd_write_holder) {
+                                disk_unblock_events(bdev->bd_disk);
+                                bdev->bd_write_holder = false;
+                        } else
+                                disk_check_events(bdev->bd_disk);
+                }
+
+                mutex_unlock(&bdev->bd_mutex);
+        } else
+                disk_check_events(bdev->bd_disk);
+
         return __blkdev_put(bdev, mode, 0);
 }
 EXPORT_SYMBOL(blkdev_put);
@@ -1576,8 +1456,7 @@ EXPORT_SYMBOL(blkdev_put);
 static int blkdev_close(struct inode * inode, struct file * filp)
 {
         struct block_device *bdev = I_BDEV(filp->f_mapping->host);
-        if (bdev->bd_holder == filp)
-                bd_release(bdev);
+
         return blkdev_put(bdev, filp->f_mode);
 }
 
@@ -1722,67 +1601,6 @@ fail:
 }
 EXPORT_SYMBOL(lookup_bdev);
 
-/**
- * open_bdev_exclusive  -  open a block device by name and set it up for use
- *
- * @path:	special file representing the block device
- * @mode:	FMODE_... combination to be used
- * @holder:	owner for exclusion
- *
- * Open the blockdevice described by the special file at @path, claim it
- * for the @holder.
- */
-struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
-{
-        struct block_device *bdev, *whole;
-        int error;
-
-        bdev = lookup_bdev(path);
-        if (IS_ERR(bdev))
-                return bdev;
-
-        whole = bd_start_claiming(bdev, holder);
-        if (IS_ERR(whole)) {
-                bdput(bdev);
-                return whole;
-        }
-
-        error = blkdev_get(bdev, mode);
-        if (error)
-                goto out_abort_claiming;
-
-        error = -EACCES;
-        if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
-                goto out_blkdev_put;
-
-        bd_finish_claiming(bdev, whole, holder);
-        return bdev;
-
-out_blkdev_put:
-        blkdev_put(bdev, mode);
-out_abort_claiming:
-        bd_abort_claiming(whole, holder);
-        return ERR_PTR(error);
-}
-
-EXPORT_SYMBOL(open_bdev_exclusive);
-
-/**
- * close_bdev_exclusive  -  close a blockdevice opened by open_bdev_exclusive()
- *
- * @bdev:	blockdevice to close
- * @mode:	mode, must match that used to open.
- *
- * This is the counterpart to open_bdev_exclusive().
- */
-void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
-{
-        bd_release(bdev);
-        blkdev_put(bdev, mode);
-}
-
-EXPORT_SYMBOL(close_bdev_exclusive);
-
 int __invalidate_device(struct block_device *bdev)
 {
         struct super_block *sb = get_super(bdev);
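
With open_bdev_exclusive()/close_bdev_exclusive() and bd_claim()/bd_release() gone, exclusive opens travel through blkdev_get_by_path()/blkdev_put() with FMODE_EXCL in the mode mask plus a holder cookie. A minimal sketch of the replacement pattern, assuming a filesystem-side caller; the function names and the fs_holder token are illustrative, not part of this diff:

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/err.h>

/* any stable address works as the exclusive-holder cookie */
static void *fs_holder = &fs_holder;

static struct block_device *example_open_backing_dev(const char *path)
{
        /* was: open_bdev_exclusive(path, mode, holder) */
        return blkdev_get_by_path(path,
                                  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
                                  fs_holder);  /* ERR_PTR(-EBUSY) if claimed */
}

static void example_close_backing_dev(struct block_device *bdev)
{
        /* was: close_bdev_exclusive(); mode must match the open, incl. FMODE_EXCL */
        blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}

The same holder cookie must be passed for every nested exclusive open; blkdev_put() drops the claim when the last FMODE_EXCL reference goes away.
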
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 7bb3c020e570..ecb9fd3be143 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,8 @@ config BTRFS_FS
 	select LIBCRC32C
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a35eb36b32fd..31610ea73aec 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   export.o tree-log.o acl.o free-space-cache.o zlib.o \
+	   export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6ae2c8cac9d5..15b5ca2a2606 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,8 +60,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
         size = __btrfs_getxattr(inode, name, value, size);
         if (size > 0) {
                 acl = posix_acl_from_xattr(value, size);
-                if (IS_ERR(acl))
+                if (IS_ERR(acl)) {
+                        kfree(value);
                         return acl;
+                }
                 set_cached_acl(inode, type, acl);
         }
         kfree(value);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6ad63f17eca0..ccc991c542df 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -157,7 +157,7 @@ struct btrfs_inode {
         /*
          * always compress this one file
          */
-        unsigned force_compress:1;
+        unsigned force_compress:4;
 
         struct inode vfs_inode;
 };
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b50bc4bd5c56..f745287fbf2e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -62,6 +62,9 @@ struct compressed_bio {
         /* number of bytes on disk */
         unsigned long compressed_len;
 
+        /* the compression algorithm for this bio */
+        int compress_type;
+
         /* number of compressed pages in the array */
         unsigned long nr_pages;
 
@@ -173,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err)
         /* ok, we're the last bio for this extent, lets start
          * the decompression.
          */
-        ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
-                                        cb->start,
-                                        cb->orig_bio->bi_io_vec,
-                                        cb->orig_bio->bi_vcnt,
-                                        cb->compressed_len);
+        ret = btrfs_decompress_biovec(cb->compress_type,
+                                      cb->compressed_pages,
+                                      cb->start,
+                                      cb->orig_bio->bi_io_vec,
+                                      cb->orig_bio->bi_vcnt,
+                                      cb->compressed_len);
 csum_failed:
         if (ret)
                 cb->errors = 1;
@@ -588,6 +592,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
         cb->len = uncompressed_len;
         cb->compressed_len = compressed_len;
+        cb->compress_type = extent_compress_type(bio_flags);
         cb->orig_bio = bio;
 
         nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
@@ -677,3 +682,317 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
         bio_put(comp_bio);
         return 0;
 }
+
+static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
+static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
+static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
+static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
+static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+
+struct btrfs_compress_op *btrfs_compress_op[] = {
+        &btrfs_zlib_compress,
+        &btrfs_lzo_compress,
+};
+
+int __init btrfs_init_compress(void)
+{
+        int i;
+
+        for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+                INIT_LIST_HEAD(&comp_idle_workspace[i]);
+                spin_lock_init(&comp_workspace_lock[i]);
+                atomic_set(&comp_alloc_workspace[i], 0);
+                init_waitqueue_head(&comp_workspace_wait[i]);
+        }
+        return 0;
+}
+
+/*
+ * This finds an available workspace or allocates a new one.
+ * ERR_PTR is returned if things go bad.
+ */
+static struct list_head *find_workspace(int type)
+{
+        struct list_head *workspace;
+        int cpus = num_online_cpus();
+        int idx = type - 1;
+
+        struct list_head *idle_workspace        = &comp_idle_workspace[idx];
+        spinlock_t *workspace_lock              = &comp_workspace_lock[idx];
+        atomic_t *alloc_workspace               = &comp_alloc_workspace[idx];
+        wait_queue_head_t *workspace_wait       = &comp_workspace_wait[idx];
+        int *num_workspace                      = &comp_num_workspace[idx];
+again:
+        spin_lock(workspace_lock);
+        if (!list_empty(idle_workspace)) {
+                workspace = idle_workspace->next;
+                list_del(workspace);
+                (*num_workspace)--;
+                spin_unlock(workspace_lock);
+                return workspace;
+        }
+        if (atomic_read(alloc_workspace) > cpus) {
+                DEFINE_WAIT(wait);
+
+                spin_unlock(workspace_lock);
+                prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+                if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+                        schedule();
+                finish_wait(workspace_wait, &wait);
+                goto again;
+        }
+        atomic_inc(alloc_workspace);
+        spin_unlock(workspace_lock);
+
+        workspace = btrfs_compress_op[idx]->alloc_workspace();
+        if (IS_ERR(workspace)) {
+                atomic_dec(alloc_workspace);
+                wake_up(workspace_wait);
+        }
+        return workspace;
+}
+
+/*
+ * Put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around.
+ */
+static void free_workspace(int type, struct list_head *workspace)
+{
+        int idx = type - 1;
+        struct list_head *idle_workspace        = &comp_idle_workspace[idx];
+        spinlock_t *workspace_lock              = &comp_workspace_lock[idx];
+        atomic_t *alloc_workspace               = &comp_alloc_workspace[idx];
+        wait_queue_head_t *workspace_wait       = &comp_workspace_wait[idx];
+        int *num_workspace                      = &comp_num_workspace[idx];
+
+        spin_lock(workspace_lock);
+        if (*num_workspace < num_online_cpus()) {
+                list_add_tail(workspace, idle_workspace);
+                (*num_workspace)++;
+                spin_unlock(workspace_lock);
+                goto wake;
+        }
+        spin_unlock(workspace_lock);
+
+        btrfs_compress_op[idx]->free_workspace(workspace);
+        atomic_dec(alloc_workspace);
+wake:
+        if (waitqueue_active(workspace_wait))
+                wake_up(workspace_wait);
+}
+
+/*
+ * Cleanup function for module exit.
+ */
+static void free_workspaces(void)
+{
+        struct list_head *workspace;
+        int i;
+
+        for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+                while (!list_empty(&comp_idle_workspace[i])) {
+                        workspace = comp_idle_workspace[i].next;
+                        list_del(workspace);
+                        btrfs_compress_op[i]->free_workspace(workspace);
+                        atomic_dec(&comp_alloc_workspace[i]);
+                }
+        }
+}
+
+/*
+ * Given an address space and start/len, compress the bytes.
+ *
+ * Pages are allocated to hold the compressed result and stored
+ * in 'pages'.
+ *
+ * out_pages is used to return the number of pages allocated.  There
+ * may be pages allocated even if we return an error.
+ *
+ * total_in is used to return the number of bytes actually read.  It
+ * may be smaller than len if we had to exit early because we
+ * ran out of room in the pages array or because we crossed the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes.
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages.
+ */
+int btrfs_compress_pages(int type, struct address_space *mapping,
+                         u64 start, unsigned long len,
+                         struct page **pages,
+                         unsigned long nr_dest_pages,
+                         unsigned long *out_pages,
+                         unsigned long *total_in,
+                         unsigned long *total_out,
+                         unsigned long max_out)
+{
+        struct list_head *workspace;
+        int ret;
+
+        workspace = find_workspace(type);
+        if (IS_ERR(workspace))
+                return -1;
+
+        ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+                                                      start, len, pages,
+                                                      nr_dest_pages, out_pages,
+                                                      total_in, total_out,
+                                                      max_out);
+        free_workspace(type, workspace);
+        return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file.
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into.
+ *
+ * vcnt is the count of pages in the biovec.
+ *
+ * srclen is the number of bytes in pages_in.
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous.  They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+                            struct bio_vec *bvec, int vcnt, size_t srclen)
+{
+        struct list_head *workspace;
+        int ret;
+
+        workspace = find_workspace(type);
+        if (IS_ERR(workspace))
+                return -ENOMEM;
+
+        ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
+                                                         disk_start,
+                                                         bvec, vcnt, srclen);
+        free_workspace(type, workspace);
+        return ret;
+}
+
+/*
+ * A less complex decompression routine.  Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in.
+ */
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+                     unsigned long start_byte, size_t srclen, size_t destlen)
+{
+        struct list_head *workspace;
+        int ret;
+
+        workspace = find_workspace(type);
+        if (IS_ERR(workspace))
+                return -ENOMEM;
+
+        ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+                                                  dest_page, start_byte,
+                                                  srclen, destlen);
+
+        free_workspace(type, workspace);
+        return ret;
+}
+
+void __exit btrfs_exit_compress(void)
+{
+        free_workspaces();
+}
+
+/*
+ * Copy uncompressed data from working buffer to pages.
+ *
+ * buf_start is the byte offset of the start of our working buffer
+ * within the uncompressed data.
+ *
+ * total_out is the last byte of the buffer.
+ */
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+                              unsigned long total_out, u64 disk_start,
+                              struct bio_vec *bvec, int vcnt,
+                              unsigned long *page_index,
+                              unsigned long *pg_offset)
+{
+        unsigned long buf_offset;
+        unsigned long current_buf_start;
+        unsigned long start_byte;
+        unsigned long working_bytes = total_out - buf_start;
+        unsigned long bytes;
+        char *kaddr;
+        struct page *page_out = bvec[*page_index].bv_page;
+
+        /*
+         * start_byte is the first byte of the page we're currently
+         * copying into relative to the start of the compressed data.
+         */
+        start_byte = page_offset(page_out) - disk_start;
+
+        /* we haven't yet hit data corresponding to this page */
+        if (total_out <= start_byte)
+                return 1;
+
+        /*
+         * the start of the data we care about is offset into
+         * the middle of our working buffer
+         */
+        if (total_out > start_byte && buf_start < start_byte) {
+                buf_offset = start_byte - buf_start;
+                working_bytes -= buf_offset;
+        } else {
+                buf_offset = 0;
+        }
+        current_buf_start = buf_start;
+
+        /* copy bytes from the working buffer into the pages */
+        while (working_bytes > 0) {
+                bytes = min(PAGE_CACHE_SIZE - *pg_offset,
+                            PAGE_CACHE_SIZE - buf_offset);
+                bytes = min(bytes, working_bytes);
+                kaddr = kmap_atomic(page_out, KM_USER0);
+                memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+                kunmap_atomic(kaddr, KM_USER0);
+                flush_dcache_page(page_out);
+
+                *pg_offset += bytes;
+                buf_offset += bytes;
+                working_bytes -= bytes;
+                current_buf_start += bytes;
+
+                /* check if we need to pick another page */
+                if (*pg_offset == PAGE_CACHE_SIZE) {
+                        (*page_index)++;
+                        if (*page_index >= vcnt)
+                                return 0;
+
+                        page_out = bvec[*page_index].bv_page;
+                        *pg_offset = 0;
+                        start_byte = page_offset(page_out) - disk_start;
+
+                        /*
+                         * make sure our new page is covered by this
+                         * working buffer
+                         */
+                        if (total_out <= start_byte)
+                                return 1;
+
+                        /*
+                         * the next page in the biovec might not be adjacent
+                         * to the last page, but it might still be found
+                         * inside this working buffer.  bump our offset pointer
+                         */
+                        if (total_out > start_byte &&
+                            current_buf_start < start_byte) {
+                                buf_offset = start_byte - buf_start;
+                                working_bytes = total_out - start_byte;
+                                current_buf_start = buf_start + buf_offset;
+                        }
+                }
+        }
+
+        return 1;
+}
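
The old btrfs_zlib_* entry points are now reached through these type-dispatched wrappers. A hedged sketch of a caller; the function and its arguments are hypothetical, and only btrfs_compress_pages() plus the BTRFS_COMPRESS_LZO constant come from this diff:

#include "ctree.h"
#include "compression.h"

/* compress one range of an inode's pages with LZO (illustrative only) */
static int example_compress_range(struct inode *inode, u64 start,
                                  unsigned long len, struct page **pages,
                                  unsigned long nr_dest_pages)
{
        unsigned long out_pages, total_in, total_out;

        /* type - 1 indexes btrfs_compress_op[], so LZO picks btrfs_lzo_compress */
        return btrfs_compress_pages(BTRFS_COMPRESS_LZO, inode->i_mapping,
                                    start, len, pages, nr_dest_pages,
                                    &out_pages, &total_in, &total_out,
                                    len /* max_out */);
}
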
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 421f5b4aa715..51000174b9d7 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,24 +19,27 @@
 #ifndef __BTRFS_COMPRESSION_
 #define __BTRFS_COMPRESSION_
 
-int btrfs_zlib_decompress(unsigned char *data_in,
-                          struct page *dest_page,
-                          unsigned long start_byte,
-                          size_t srclen, size_t destlen);
-int btrfs_zlib_compress_pages(struct address_space *mapping,
-                              u64 start, unsigned long len,
-                              struct page **pages,
-                              unsigned long nr_dest_pages,
-                              unsigned long *out_pages,
-                              unsigned long *total_in,
-                              unsigned long *total_out,
-                              unsigned long max_out);
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
-                              u64 disk_start,
-                              struct bio_vec *bvec,
-                              int vcnt,
-                              size_t srclen);
-void btrfs_zlib_exit(void);
+int btrfs_init_compress(void);
+void btrfs_exit_compress(void);
+
+int btrfs_compress_pages(int type, struct address_space *mapping,
+                         u64 start, unsigned long len,
+                         struct page **pages,
+                         unsigned long nr_dest_pages,
+                         unsigned long *out_pages,
+                         unsigned long *total_in,
+                         unsigned long *total_out,
+                         unsigned long max_out);
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+                            struct bio_vec *bvec, int vcnt, size_t srclen);
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+                     unsigned long start_byte, size_t srclen, size_t destlen);
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+                              unsigned long total_out, u64 disk_start,
+                              struct bio_vec *bvec, int vcnt,
+                              unsigned long *page_index,
+                              unsigned long *pg_offset);
+
 int btrfs_submit_compressed_write(struct inode *inode, u64 start,
                                   unsigned long len, u64 disk_start,
                                   unsigned long compressed_len,
@@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
                                   unsigned long nr_pages);
 int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
                                  int mirror_num, unsigned long bio_flags);
+
+struct btrfs_compress_op {
+        struct list_head *(*alloc_workspace)(void);
+
+        void (*free_workspace)(struct list_head *workspace);
+
+        int (*compress_pages)(struct list_head *workspace,
+                              struct address_space *mapping,
+                              u64 start, unsigned long len,
+                              struct page **pages,
+                              unsigned long nr_dest_pages,
+                              unsigned long *out_pages,
+                              unsigned long *total_in,
+                              unsigned long *total_out,
+                              unsigned long max_out);
+
+        int (*decompress_biovec)(struct list_head *workspace,
+                                 struct page **pages_in,
+                                 u64 disk_start,
+                                 struct bio_vec *bvec,
+                                 int vcnt,
+                                 size_t srclen);
+
+        int (*decompress)(struct list_head *workspace,
+                          unsigned char *data_in,
+                          struct page *dest_page,
+                          unsigned long start_byte,
+                          size_t srclen, size_t destlen);
+};
+
+extern struct btrfs_compress_op btrfs_zlib_compress;
+extern struct btrfs_compress_op btrfs_lzo_compress;
+
 #endif
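
For reference, wiring in a further algorithm means filling this op table the way zlib.c and the new lzo.c do. A compile-only skeleton under the assumption of stub callbacks; none of these dummy_* names exist in the tree:

#include <linux/err.h>
#include "compression.h"

static struct list_head *dummy_alloc_workspace(void)
{
        /* real implementations allocate per-thread scratch buffers here */
        return ERR_PTR(-ENOMEM);
}

static void dummy_free_workspace(struct list_head *workspace)
{
}

static int dummy_compress_pages(struct list_head *workspace,
                                struct address_space *mapping,
                                u64 start, unsigned long len,
                                struct page **pages,
                                unsigned long nr_dest_pages,
                                unsigned long *out_pages,
                                unsigned long *total_in,
                                unsigned long *total_out,
                                unsigned long max_out)
{
        return -1;      /* nonzero signals "range was not compressed" */
}

static int dummy_decompress_biovec(struct list_head *workspace,
                                   struct page **pages_in, u64 disk_start,
                                   struct bio_vec *bvec, int vcnt,
                                   size_t srclen)
{
        return -EIO;
}

static int dummy_decompress(struct list_head *workspace,
                            unsigned char *data_in, struct page *dest_page,
                            unsigned long start_byte, size_t srclen,
                            size_t destlen)
{
        return -EIO;
}

struct btrfs_compress_op btrfs_dummy_compress = {
        .alloc_workspace        = dummy_alloc_workspace,
        .free_workspace         = dummy_free_workspace,
        .compress_pages         = dummy_compress_pages,
        .decompress_biovec      = dummy_decompress_biovec,
        .decompress             = dummy_decompress,
};

The table would also need an entry in btrfs_compress_op[] in compression.c and a new enum btrfs_compression_type value; both are shown in the hunks above.
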
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9ac171599258..b5baff0dccfe 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
+        if (!p)
+                return;
         btrfs_release_path(NULL, p);
         kmem_cache_free(btrfs_path_cachep, p);
 }
@@ -2514,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
         btrfs_assert_tree_locked(path->nodes[1]);
 
         right = read_node_slot(root, upper, slot + 1);
+        if (right == NULL)
+                return 1;
+
         btrfs_tree_lock(right);
         btrfs_set_lock_blocking(right);
 
@@ -2764,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
         btrfs_assert_tree_locked(path->nodes[1]);
 
         left = read_node_slot(root, path->nodes[1], slot - 1);
+        if (left == NULL)
+                return 1;
+
         btrfs_tree_lock(left);
         btrfs_set_lock_blocking(left);
 
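
The btrfs_free_path() NULL check in the first hunk above exists so error paths can free unconditionally; an illustrative (not in-tree) caller:

static int example_lookup(struct btrfs_root *root)
{
        struct btrfs_path *path = btrfs_alloc_path();
        int ret = 0;

        if (!path)
                ret = -ENOMEM;
        /* ... on success a tree search would go here ... */

        btrfs_free_path(path);  /* now a safe no-op when path == NULL */
        return ret;
}
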
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a142d204b526..2c98b3af6052 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
+#include <linux/kobject.h>
 #include <asm/kmap_types.h>
 #include "extent_io.h"
 #include "extent_map.h"
@@ -294,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 #define BTRFS_FSID_SIZE 16
 #define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
 #define BTRFS_HEADER_FLAG_RELOC	(1ULL << 1)
+
+/*
+ * File system states
+ */
+
+/* Errors detected */
+#define BTRFS_SUPER_FLAG_ERROR	(1ULL << 2)
+
 #define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
 #define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
 
@@ -398,13 +407,15 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
 #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS	(1ULL << 2)
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO	(1ULL << 3)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP			\
 	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
 	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
-	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
+	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -551,9 +562,11 @@ struct btrfs_timespec {
 } __attribute__ ((__packed__));
 
 enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE  = 0,
 	BTRFS_COMPRESS_ZLIB  = 1,
-	BTRFS_COMPRESS_LAST  = 2,
+	BTRFS_COMPRESS_LZO   = 2,
+	BTRFS_COMPRESS_TYPES = 2,
+	BTRFS_COMPRESS_LAST  = 3,
 };
 
 struct btrfs_inode_item {
@@ -597,6 +610,8 @@ struct btrfs_dir_item {
 	u8 type;
 } __attribute__ ((__packed__));
 
+#define BTRFS_ROOT_SUBVOL_RDONLY	(1ULL << 0)
+
 struct btrfs_root_item {
 	struct btrfs_inode_item inode;
 	__le64 generation;
@@ -895,7 +910,8 @@ struct btrfs_fs_info {
 	 */
 	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
-	unsigned long mount_opt:20;
+	unsigned long mount_opt:20;
+	unsigned long compress_type:4;
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
@@ -1050,6 +1066,9 @@ struct btrfs_fs_info {
 	unsigned metadata_ratio;
 
 	void *bdev_holder;
+
+	/* filesystem state */
+	u64 fs_state;
 };
 
 /*
@@ -1893,6 +1912,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
 BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
 			 last_snapshot, 64);
 
+static inline bool btrfs_root_readonly(struct btrfs_root *root)
+{
+	return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
+}
+
 /* struct btrfs_super_block */
 
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2145,6 +2169,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
@@ -2188,6 +2213,12 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 int btrfs_set_block_group_rw(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+int btrfs_error_unpin_extent_range(struct btrfs_root *root,
+				   u64 start, u64 end);
+int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+			       u64 num_bytes);
+
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -2541,6 +2572,14 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 /* super.c */
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+		       unsigned int line, int errno);
+
+#define btrfs_std_error(fs_info, errno)					\
+do {									\
+	if ((errno))							\
+		__btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+} while (0)
 
 /* acl.c */
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
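
A hypothetical call site for the new error macro; only btrfs_std_error() itself comes from this diff:

static void example_handle_commit_error(struct btrfs_fs_info *fs_info, int err)
{
        /* expands to __btrfs_std_error(fs_info, __func__, __LINE__, err)
         * and does nothing when err == 0 */
        btrfs_std_error(fs_info, err);
}
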
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 51d2e4de34eb..b531c36455d8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -44,6 +44,20 @@
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+                                    int read_only);
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+                                      struct btrfs_root *root);
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+                                        struct extent_io_tree *dirty_pages,
+                                        int mark);
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+                                       struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_root *root);
 
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
@@ -353,6 +367,10 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
         WARN_ON(len == 0);
 
         eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+        if (eb == NULL) {
+                WARN_ON(1);
+                goto out;
+        }
         ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
                                              btrfs_header_generation(eb));
         BUG_ON(ret);
@@ -427,6 +445,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
         WARN_ON(len == 0);
 
         eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+        if (eb == NULL) {
+                ret = -EIO;
+                goto out;
+        }
 
         found_start = btrfs_header_bytenr(eb);
         if (found_start != start) {
@@ -1145,6 +1167,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
         }
         btrfs_free_path(path);
         if (ret) {
+                kfree(root);
                 if (ret > 0)
                         ret = -ENOENT;
                 return ERR_PTR(ret);
@@ -1713,8 +1736,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                              fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
         bh = btrfs_read_dev_super(fs_devices->latest_bdev);
-        if (!bh)
+        if (!bh) {
+                err = -EINVAL;
                 goto fail_iput;
+        }
 
         memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
         memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1727,6 +1752,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         if (!btrfs_super_root(disk_super))
                 goto fail_iput;
 
+        /* check FS state, whether FS is broken. */
+        fs_info->fs_state |= btrfs_super_flags(disk_super);
+
+        btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+
         ret = btrfs_parse_options(tree_root, options);
         if (ret) {
                 err = ret;
@@ -1744,10 +1774,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         }
 
         features = btrfs_super_incompat_flags(disk_super);
-        if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
-                features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-                btrfs_set_super_incompat_flags(disk_super, features);
-        }
+        features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+        if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+                features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+        btrfs_set_super_incompat_flags(disk_super, features);
 
         features = btrfs_super_compat_ro_flags(disk_super) &
                 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
@@ -1957,7 +1987,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                 btrfs_set_opt(fs_info->mount_opt, SSD);
         }
 
-        if (btrfs_super_log_root(disk_super) != 0) {
+        /* do not make disk changes in broken FS */
+        if (btrfs_super_log_root(disk_super) != 0 &&
+            !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
                 u64 bytenr = btrfs_super_log_root(disk_super);
 
                 if (fs_devices->rw_devices == 0) {
@@ -2442,8 +2474,28 @@ int close_ctree(struct btrfs_root *root)
         smp_mb();
 
         btrfs_put_block_group_cache(fs_info);
+
+        /*
+         * Two situations arise when a broken btrfs flips readonly:
+         *
+         * 1. If btrfs flips readonly somewhere else before
+         *    btrfs_commit_super, sb->s_flags has the MS_RDONLY flag,
+         *    and btrfs will skip writing sb directly to keep the
+         *    ERROR state on disk.
+         *
+         * 2. If btrfs flips readonly just in btrfs_commit_super,
+         *    btrfs cannot write sb via btrfs_commit_super, and since
+         *    fs_state has the BTRFS_SUPER_FLAG_ERROR flag set, btrfs
+         *    will cleanup all FS resources first and write sb then.
+         */
         if (!(fs_info->sb->s_flags & MS_RDONLY)) {
                 ret = btrfs_commit_super(root);
+                if (ret)
+                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+        }
+
+        if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+                ret = btrfs_error_commit_super(root);
                 if (ret)
                         printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
         }
@@ -2619,6 +2671,352 @@ out:
2619 return 0; 2671 return 0;
2620} 2672}
2621 2673
2674static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
2675 int read_only)
2676{
2677 if (read_only)
2678 return;
2679
2680 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2681 printk(KERN_WARNING "warning: mount fs with errors, "
2682 "running btrfsck is recommended\n");
2683}
2684
2685int btrfs_error_commit_super(struct btrfs_root *root)
2686{
2687 int ret;
2688
2689 mutex_lock(&root->fs_info->cleaner_mutex);
2690 btrfs_run_delayed_iputs(root);
2691 mutex_unlock(&root->fs_info->cleaner_mutex);
2692
2693 down_write(&root->fs_info->cleanup_work_sem);
2694 up_write(&root->fs_info->cleanup_work_sem);
2695
2696 /* cleanup FS via transaction */
2697 btrfs_cleanup_transaction(root);
2698
2699 ret = write_ctree_super(NULL, root, 0);
2700
2701 return ret;
2702}
2703
2704static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
2705{
2706 struct btrfs_inode *btrfs_inode;
2707 struct list_head splice;
2708
2709 INIT_LIST_HEAD(&splice);
2710
2711 mutex_lock(&root->fs_info->ordered_operations_mutex);
2712 spin_lock(&root->fs_info->ordered_extent_lock);
2713
2714 list_splice_init(&root->fs_info->ordered_operations, &splice);
2715 while (!list_empty(&splice)) {
2716 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2717 ordered_operations);
2718
2719 list_del_init(&btrfs_inode->ordered_operations);
2720
2721 btrfs_invalidate_inodes(btrfs_inode->root);
2722 }
2723
2724 spin_unlock(&root->fs_info->ordered_extent_lock);
2725 mutex_unlock(&root->fs_info->ordered_operations_mutex);
2726
2727 return 0;
2728}
2729
2730static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
2731{
2732 struct list_head splice;
2733 struct btrfs_ordered_extent *ordered;
2734 struct inode *inode;
2735
2736 INIT_LIST_HEAD(&splice);
2737
2738 spin_lock(&root->fs_info->ordered_extent_lock);
2739
2740 list_splice_init(&root->fs_info->ordered_extents, &splice);
2741 while (!list_empty(&splice)) {
2742 ordered = list_entry(splice.next, struct btrfs_ordered_extent,
2743 root_extent_list);
2744
2745 list_del_init(&ordered->root_extent_list);
2746 atomic_inc(&ordered->refs);
2747
2748 /* the inode may be getting freed (in sys_unlink path). */
2749 inode = igrab(ordered->inode);
2750
2751 spin_unlock(&root->fs_info->ordered_extent_lock);
2752 if (inode)
2753 iput(inode);
2754
2755 atomic_set(&ordered->refs, 1);
2756 btrfs_put_ordered_extent(ordered);
2757
2758 spin_lock(&root->fs_info->ordered_extent_lock);
2759 }
2760
2761 spin_unlock(&root->fs_info->ordered_extent_lock);
2762
2763 return 0;
2764}
2765
2766static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
2767 struct btrfs_root *root)
2768{
2769 struct rb_node *node;
2770 struct btrfs_delayed_ref_root *delayed_refs;
2771 struct btrfs_delayed_ref_node *ref;
2772 int ret = 0;
2773
2774 delayed_refs = &trans->delayed_refs;
2775
2776 spin_lock(&delayed_refs->lock);
2777 if (delayed_refs->num_entries == 0) {
2778 printk(KERN_INFO "delayed_refs has NO entry\n");
2779 return ret;
2780 }
2781
2782 node = rb_first(&delayed_refs->root);
2783 while (node) {
2784 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2785 node = rb_next(node);
2786
2787 ref->in_tree = 0;
2788 rb_erase(&ref->rb_node, &delayed_refs->root);
2789 delayed_refs->num_entries--;
2790
2791 atomic_set(&ref->refs, 1);
2792 if (btrfs_delayed_ref_is_head(ref)) {
2793 struct btrfs_delayed_ref_head *head;
2794
2795 head = btrfs_delayed_node_to_head(ref);
2796 mutex_lock(&head->mutex);
2797 kfree(head->extent_op);
2798 delayed_refs->num_heads--;
2799 if (list_empty(&head->cluster))
2800 delayed_refs->num_heads_ready--;
2801 list_del_init(&head->cluster);
2802 mutex_unlock(&head->mutex);
2803 }
2804
2805 spin_unlock(&delayed_refs->lock);
2806 btrfs_put_delayed_ref(ref);
2807
2808 cond_resched();
2809 spin_lock(&delayed_refs->lock);
2810 }
2811
2812 spin_unlock(&delayed_refs->lock);
2813
2814 return ret;
2815}
2816
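btrfs_destroy_delayed_refs above follows a standard teardown pattern: detach one node from the rbtree while holding the spinlock, then drop the lock around the blocking put and the cond_resched(), and retake it before visiting the next node. Boiled down to its skeleton (illustrative fragment; tree, lock, node_to_entry and put_entry are placeholders, not btrfs symbols):

	spin_lock(&lock);
	while ((node = rb_first(&tree)) != NULL) {
		rb_erase(node, &tree);            /* detach under the lock    */
		spin_unlock(&lock);               /* blocking work unlocked   */
		put_entry(node_to_entry(node));   /* may free the entry       */
		cond_resched();                   /* be nice to the scheduler */
		spin_lock(&lock);                 /* retake for the next one  */
	}
	spin_unlock(&lock);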
2817static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
2818{
2819 struct btrfs_pending_snapshot *snapshot;
2820 struct list_head splice;
2821
2822 INIT_LIST_HEAD(&splice);
2823
2824 list_splice_init(&t->pending_snapshots, &splice);
2825
2826 while (!list_empty(&splice)) {
2827 snapshot = list_entry(splice.next,
2828 struct btrfs_pending_snapshot,
2829 list);
2830
2831 list_del_init(&snapshot->list);
2832
2833 kfree(snapshot);
2834 }
2835
2836 return 0;
2837}
2838
2839static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
2840{
2841 struct btrfs_inode *btrfs_inode;
2842 struct list_head splice;
2843
2844 INIT_LIST_HEAD(&splice);
2845
2846 list_splice_init(&root->fs_info->delalloc_inodes, &splice);
2847
2848 spin_lock(&root->fs_info->delalloc_lock);
2849
2850 while (!list_empty(&splice)) {
2851 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2852 delalloc_inodes);
2853
2854 list_del_init(&btrfs_inode->delalloc_inodes);
2855
2856 btrfs_invalidate_inodes(btrfs_inode->root);
2857 }
2858
2859 spin_unlock(&root->fs_info->delalloc_lock);
2860
2861 return 0;
2862}
2863
2864static int btrfs_destroy_marked_extents(struct btrfs_root *root,
2865 struct extent_io_tree *dirty_pages,
2866 int mark)
2867{
2868 int ret;
2869 struct page *page;
2870 struct inode *btree_inode = root->fs_info->btree_inode;
2871 struct extent_buffer *eb;
2872 u64 start = 0;
2873 u64 end;
2874 u64 offset;
2875 unsigned long index;
2876
2877 while (1) {
2878 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
2879 mark);
2880 if (ret)
2881 break;
2882
2883 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
2884 while (start <= end) {
2885 index = start >> PAGE_CACHE_SHIFT;
2886 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
2887 page = find_get_page(btree_inode->i_mapping, index);
2888 if (!page)
2889 continue;
2890 offset = page_offset(page);
2891
2892 spin_lock(&dirty_pages->buffer_lock);
2893 eb = radix_tree_lookup(
2894 &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
2895 offset >> PAGE_CACHE_SHIFT);
2896 spin_unlock(&dirty_pages->buffer_lock);
2897 if (eb) {
2898 ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
2899 &eb->bflags);
2900 atomic_set(&eb->refs, 1);
2901 }
2902 if (PageWriteback(page))
2903 end_page_writeback(page);
2904
2905 lock_page(page);
2906 if (PageDirty(page)) {
2907 clear_page_dirty_for_io(page);
2908 spin_lock_irq(&page->mapping->tree_lock);
2909 radix_tree_tag_clear(&page->mapping->page_tree,
2910 page_index(page),
2911 PAGECACHE_TAG_DIRTY);
2912 spin_unlock_irq(&page->mapping->tree_lock);
2913 }
2914
2915 page->mapping->a_ops->invalidatepage(page, 0);
2916 unlock_page(page);
2917 }
2918 }
2919
2920 return ret;
2921}
2922
2923static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
2924 struct extent_io_tree *pinned_extents)
2925{
2926 struct extent_io_tree *unpin;
2927 u64 start;
2928 u64 end;
2929 int ret;
2930
2931 unpin = pinned_extents;
2932 while (1) {
2933 ret = find_first_extent_bit(unpin, 0, &start, &end,
2934 EXTENT_DIRTY);
2935 if (ret)
2936 break;
2937
2938 /* opt_discard */
2939 ret = btrfs_error_discard_extent(root, start, end + 1 - start);
2940
2941 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2942 btrfs_error_unpin_extent_range(root, start, end);
2943 cond_resched();
2944 }
2945
2946 return 0;
2947}
2948
2949static int btrfs_cleanup_transaction(struct btrfs_root *root)
2950{
2951 struct btrfs_transaction *t;
2952 LIST_HEAD(list);
2953
2954 WARN_ON(1);
2955
2956 mutex_lock(&root->fs_info->trans_mutex);
2957 mutex_lock(&root->fs_info->transaction_kthread_mutex);
2958
2959 list_splice_init(&root->fs_info->trans_list, &list);
2960 while (!list_empty(&list)) {
2961 t = list_entry(list.next, struct btrfs_transaction, list);
2962 if (!t)
2963 break;
2964
2965 btrfs_destroy_ordered_operations(root);
2966
2967 btrfs_destroy_ordered_extents(root);
2968
2969 btrfs_destroy_delayed_refs(t, root);
2970
2971 btrfs_block_rsv_release(root,
2972 &root->fs_info->trans_block_rsv,
2973 t->dirty_pages.dirty_bytes);
2974
2975 /* FIXME: cleanup wait for commit */
2976 t->in_commit = 1;
2977 t->blocked = 1;
2978 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
2979 wake_up(&root->fs_info->transaction_blocked_wait);
2980
2981 t->blocked = 0;
2982 if (waitqueue_active(&root->fs_info->transaction_wait))
2983 wake_up(&root->fs_info->transaction_wait);
2984 mutex_unlock(&root->fs_info->trans_mutex);
2985
2986 mutex_lock(&root->fs_info->trans_mutex);
2987 t->commit_done = 1;
2988 if (waitqueue_active(&t->commit_wait))
2989 wake_up(&t->commit_wait);
2990 mutex_unlock(&root->fs_info->trans_mutex);
2991
2992 mutex_lock(&root->fs_info->trans_mutex);
2993
2994 btrfs_destroy_pending_snapshots(t);
2995
2996 btrfs_destroy_delalloc_inodes(root);
2997
2998 spin_lock(&root->fs_info->new_trans_lock);
2999 root->fs_info->running_transaction = NULL;
3000 spin_unlock(&root->fs_info->new_trans_lock);
3001
3002 btrfs_destroy_marked_extents(root, &t->dirty_pages,
3003 EXTENT_DIRTY);
3004
3005 btrfs_destroy_pinned_extent(root,
3006 root->fs_info->pinned_extents);
3007
3008 t->use_count = 0;
3009 list_del_init(&t->list);
3010 memset(t, 0, sizeof(*t));
3011 kmem_cache_free(btrfs_transaction_cachep, t);
3012 }
3013
3014 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3015 mutex_unlock(&root->fs_info->trans_mutex);
3016
3017 return 0;
3018}
3019
2622 3020	static struct extent_io_ops btree_extent_io_ops = {
2623 3021	.write_cache_pages_lock_hook = btree_lock_page_hook,
2624 3022	.readpage_end_io_hook = btree_readpage_end_io_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 88e825a0bf21..07b20dc2fd95 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
52 52	struct btrfs_root *root, int max_mirrors);
53 53	struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
54 54	int btrfs_commit_super(struct btrfs_root *root);
55int btrfs_error_commit_super(struct btrfs_root *root);
55 56	struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
56 57	u64 bytenr, u32 blocksize);
57 58	struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 0ccf9a8afcdf..9786963b07e5 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -65,7 +65,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
65 65	{
66 66	struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
67 67	struct btrfs_root *root;
68 struct dentry *dentry;
69 68	struct inode *inode;
70 69	struct btrfs_key key;
71 70	int index;
@@ -108,10 +107,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
108 107	return ERR_PTR(-ESTALE);
109 108	}
110 109
111 dentry = d_obtain_alias(inode);
112 if (!IS_ERR(dentry))
113 d_set_d_op(dentry, &btrfs_dentry_operations);
114 return dentry;
110	return d_obtain_alias(inode);
115 111	fail:
116 112	srcu_read_unlock(&fs_info->subvol_srcu, index);
117 113	return ERR_PTR(err);
@@ -166,7 +162,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
166 162	static struct dentry *btrfs_get_parent(struct dentry *child)
167 163	{
168 164	struct inode *dir = child->d_inode;
169 struct dentry *dentry;
170 165	struct btrfs_root *root = BTRFS_I(dir)->root;
171 166	struct btrfs_path *path;
172 167	struct extent_buffer *leaf;
@@ -223,10 +218,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
223 218
224 219	key.type = BTRFS_INODE_ITEM_KEY;
225 220	key.offset = 0;
226 dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
227 if (!IS_ERR(dentry))
228 d_set_d_op(dentry, &btrfs_dentry_operations);
229 return dentry;
221	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
230 222	fail:
231 223	btrfs_free_path(path);
232 224	return ERR_PTR(ret);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 227e5815d838..b55269340cec 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3089,7 +3089,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3089 3089	return btrfs_reduce_alloc_profile(root, flags);
3090 3090	}
3091 3091
3092	static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3092	u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3093 3093	{
3094 3094	u64 flags;
3095 3095
@@ -3161,8 +3161,12 @@ alloc:
3161 3161	bytes + 2 * 1024 * 1024,
3162 3162	alloc_target, 0);
3163 3163	btrfs_end_transaction(trans, root);
3164	if (ret < 0)
3165	return ret;
3164	if (ret < 0) {
3165	if (ret != -ENOSPC)
3166	return ret;
3167	else
3168	goto commit_trans;
3169	}
3166 3170
3167 3171	if (!data_sinfo) {
3168 3172	btrfs_set_inode_space_info(root, inode);
@@ -3173,6 +3177,7 @@ alloc:
3173 3177	spin_unlock(&data_sinfo->lock);
3174 3178
3175 3179	/* commit the current transaction and try again */
3180	commit_trans:
3176 3181	if (!committed && !root->fs_info->open_ioctl_trans) {
3177 3182	committed = 1;
3178 3183	trans = btrfs_join_transaction(root, 1);
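The hunk above relaxes the data-allocation failure path: -ENOSPC from chunk allocation no longer fails the caller outright but falls through to this commit-and-retry branch, since committing the running transaction can release pinned space back to the allocator. Condensed (illustrative, not the literal code; alloc_chunk stands in for the allocation call):

	ret = alloc_chunk(...);
	if (ret < 0) {
		if (ret != -ENOSPC)
			return ret;    /* hard error: propagate */
		goto commit_trans;     /* ENOSPC: a commit may free
					  pinned bytes, so retry */
	}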
@@ -3721,11 +3726,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3721 3726	return 0;
3722 3727	}
3723 3728
3724 WARN_ON(1);
3725 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3726 block_rsv->size, block_rsv->reserved,
3727 block_rsv->freed[0], block_rsv->freed[1]);
3728
3729 3729	return -ENOSPC;
3730 3730	}
3731 3731
@@ -7970,13 +7970,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7970 7970
7971 7971	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7972 7972	sinfo->bytes_may_use + sinfo->bytes_readonly +
7973	cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7973	cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
7974 7974	sinfo->bytes_readonly += num_bytes;
7975 7975	sinfo->bytes_reserved += cache->reserved_pinned;
7976 7976	cache->reserved_pinned = 0;
7977 7977	cache->ro = 1;
7978 7978	ret = 0;
7979 7979	}
7980
7980 7981	spin_unlock(&cache->lock);
7981 7982	spin_unlock(&sinfo->lock);
7982 7983	return ret;
@@ -8012,6 +8013,62 @@ out:
8012 8013	return ret;
8013 8014	}
8014 8015
8016/*
8017 * helper to account for the unused space of all the read-only block
8018 * groups in the list. takes mirrors into account.
8019 */
8020static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
8021{
8022 struct btrfs_block_group_cache *block_group;
8023 u64 free_bytes = 0;
8024 int factor;
8025
8026 list_for_each_entry(block_group, groups_list, list) {
8027 spin_lock(&block_group->lock);
8028
8029 if (!block_group->ro) {
8030 spin_unlock(&block_group->lock);
8031 continue;
8032 }
8033
8034 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
8035 BTRFS_BLOCK_GROUP_RAID10 |
8036 BTRFS_BLOCK_GROUP_DUP))
8037 factor = 2;
8038 else
8039 factor = 1;
8040
8041 free_bytes += (block_group->key.offset -
8042 btrfs_block_group_used(&block_group->item)) *
8043 factor;
8044
8045 spin_unlock(&block_group->lock);
8046 }
8047
8048 return free_bytes;
8049}
8050
8051/*
8052 * helper to account for the unused space of all the read-only block
8053 * groups in the space_info. takes mirrors into account.
8054 */
8055u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8056{
8057 int i;
8058 u64 free_bytes = 0;
8059
8060 spin_lock(&sinfo->lock);
8061
8062 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
8063 if (!list_empty(&sinfo->block_groups[i]))
8064 free_bytes += __btrfs_get_ro_block_group_free_space(
8065 &sinfo->block_groups[i]);
8066
8067 spin_unlock(&sinfo->lock);
8068
8069 return free_bytes;
8070}
8071
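As a worked example of the mirror factor: a read-only RAID1 block group whose key.offset is 1 GiB with 300 MiB recorded as used contributes (1024 - 300) MiB x 2 = 1448 MiB of raw free space, because every logical byte occupies two on-disk copies. In code form (hypothetical numbers):

	u64 group_size = 1024ULL << 20;	/* block_group->key.offset */
	u64 used = 300ULL << 20;	/* btrfs_block_group_used(&item) */
	int factor = 2;			/* RAID1/RAID10/DUP keep two copies */
	u64 free_raw = (group_size - used) * factor;	/* 1448 MiB */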
8015 8072	int btrfs_set_block_group_rw(struct btrfs_root *root,
8016 8073	struct btrfs_block_group_cache *cache)
8017 8074	{
@@ -8092,7 +8149,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8092 8149	mutex_lock(&root->fs_info->chunk_mutex);
8093 8150	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
8094 8151	u64 min_free = btrfs_block_group_used(&block_group->item);
8095	u64 dev_offset, max_avail;
8152	u64 dev_offset;
8096 8153
8097 8154	/*
8098 8155	 * check to make sure we can actually find a chunk with enough
@@ -8100,7 +8157,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8100 8157	 */
8101 8158	if (device->total_bytes > device->bytes_used + min_free) {
8102 8159	ret = find_free_dev_extent(NULL, device, min_free,
8103	&dev_offset, &max_avail);
8160	&dev_offset, NULL);
8104 8161	if (!ret)
8105 8162	break;
8106 8163	ret = -1;
@@ -8584,3 +8641,14 @@ out:
8584 8641	btrfs_free_path(path);
8585 8642	return ret;
8586 8643	}
8644
8645int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8646{
8647 return unpin_extent_range(root, start, end);
8648}
8649
8650int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8651 u64 num_bytes)
8652{
8653 return btrfs_discard_extent(root, bytenr, num_bytes);
8654}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3e86b9f36507..2e993cf1766e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2028,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2028 2028	BUG_ON(extent_map_end(em) <= cur);
2029 2029	BUG_ON(end < cur);
2030 2030
2031	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2031	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2032 2032	this_bio_flag = EXTENT_BIO_COMPRESSED;
2033 extent_set_compress_type(&this_bio_flag,
2034 em->compress_type);
2035 }
2033 2036
2034 2037	iosize = min(extent_map_end(em) - cur, end - cur + 1);
2035 2038	cur_end = min(extent_map_end(em) - 1, end);
@@ -3072,6 +3075,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3072 3075	#endif
3073 3076
3074 3077	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3078 if (eb == NULL)
3079 return NULL;
3075 3080	eb->start = start;
3076 3081	eb->len = len;
3077 3082	spin_lock_init(&eb->lock);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4183c8178f01..7083cfafd061 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,8 +20,12 @@
20 20	#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21 21	#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 22
23	/* flags for bio submission */
23	/*
24 * flags for bio submission. The high bits indicate the compression
25 * type for this bio
26 */
24 27	#define EXTENT_BIO_COMPRESSED 1
28#define EXTENT_BIO_FLAG_SHIFT 16
25 29
26 30	/* these are bit numbers for test/set bit */
27 31	#define EXTENT_BUFFER_UPTODATE 0
@@ -135,6 +139,17 @@ struct extent_buffer {
135 139	wait_queue_head_t lock_wq;
136 140	};
137 141
142static inline void extent_set_compress_type(unsigned long *bio_flags,
143 int compress_type)
144{
145 *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
146}
147
148static inline int extent_compress_type(unsigned long bio_flags)
149{
150 return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
151}
152
138 153	struct extent_map_tree;
139 154
140 155	static inline struct extent_state *extent_state_next(struct extent_state *state)
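The two inline helpers added above pack the compression type into the otherwise unused high bits of bio_flags, leaving the low EXTENT_BIO_FLAG_SHIFT bits for flags such as EXTENT_BIO_COMPRESSED. A round trip looks like this (illustrative fragment):

	unsigned long bio_flags = EXTENT_BIO_COMPRESSED;

	extent_set_compress_type(&bio_flags, BTRFS_COMPRESS_LZO);

	/* the low flag bits survive untouched ... */
	BUG_ON(!(bio_flags & EXTENT_BIO_COMPRESSED));
	/* ... and the type comes back out of the high bits */
	BUG_ON(extent_compress_type(bio_flags) != BTRFS_COMPRESS_LZO);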
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 23cb8da3ff66..b0e1fce12530 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,6 +3,7 @@
3 3	#include <linux/module.h>
4 4	#include <linux/spinlock.h>
5 5	#include <linux/hardirq.h>
6#include "ctree.h"
6 7	#include "extent_map.h"
7 8
8 9
@@ -54,6 +55,7 @@ struct extent_map *alloc_extent_map(gfp_t mask)
54 55	return em;
55 56	em->in_tree = 0;
56 57	em->flags = 0;
58 em->compress_type = BTRFS_COMPRESS_NONE;
57 59	atomic_set(&em->refs, 1);
58 60	return em;
59 61	}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ab6d74b6e647..28b44dbd1e35 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,7 +26,8 @@ struct extent_map {
26 26	unsigned long flags;
27 27	struct block_device *bdev;
28 28	atomic_t refs;
29	int in_tree;
29	unsigned int in_tree:1;
30 unsigned int compress_type:4;
30 31	};
31 32
32 33	struct extent_map_tree {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66836d85763b..c800d58f3013 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
24 24	#include <linux/string.h>
25 25	#include <linux/backing-dev.h>
26 26	#include <linux/mpage.h>
27#include <linux/falloc.h>
27 28	#include <linux/swap.h>
28 29	#include <linux/writeback.h>
29 30	#include <linux/statfs.h>
@@ -224,6 +225,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
224 225
225 226	split->bdev = em->bdev;
226 227	split->flags = flags;
228 split->compress_type = em->compress_type;
227 229	ret = add_extent_mapping(em_tree, split);
228 230	BUG_ON(ret);
229 231	free_extent_map(split);
@@ -238,6 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
238 240	split->len = em->start + em->len - (start + len);
239 241	split->bdev = em->bdev;
240 242	split->flags = flags;
243 split->compress_type = em->compress_type;
241 244
242 245	if (compressed) {
243 246	split->block_len = em->block_len;
@@ -890,6 +893,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
890 893	if (err)
891 894	goto out;
892 895
896 /*
897 * If btrfs has flipped read-only due to an unrecoverable error
898 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR set),
899 * we must refuse this write even though the file was opened
900 * writable, to keep the FS consistent.
901 */
902 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
903 err = -EROFS;
904 goto out;
905 }
906
893 907	file_update_time(file);
894 908	BTRFS_I(inode)->sequence++;
895 909
@@ -1237,6 +1251,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1237 1251	return 0;
1238 1252	}
1239 1253
1254static long btrfs_fallocate(struct file *file, int mode,
1255 loff_t offset, loff_t len)
1256{
1257 struct inode *inode = file->f_path.dentry->d_inode;
1258 struct extent_state *cached_state = NULL;
1259 u64 cur_offset;
1260 u64 last_byte;
1261 u64 alloc_start;
1262 u64 alloc_end;
1263 u64 alloc_hint = 0;
1264 u64 locked_end;
1265 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1266 struct extent_map *em;
1267 int ret;
1268
1269 alloc_start = offset & ~mask;
1270 alloc_end = (offset + len + mask) & ~mask;
1271
1272 /* We only support the FALLOC_FL_KEEP_SIZE mode */
1273 if (mode & ~FALLOC_FL_KEEP_SIZE)
1274 return -EOPNOTSUPP;
1275
1276 /*
1277 * wait for ordered IO before we have any locks. We'll loop again
1278 * below with the locks held.
1279 */
1280 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
1281
1282 mutex_lock(&inode->i_mutex);
1283 ret = inode_newsize_ok(inode, alloc_end);
1284 if (ret)
1285 goto out;
1286
1287 if (alloc_start > inode->i_size) {
1288 ret = btrfs_cont_expand(inode, alloc_start);
1289 if (ret)
1290 goto out;
1291 }
1292
1293 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1294 if (ret)
1295 goto out;
1296
1297 locked_end = alloc_end - 1;
1298 while (1) {
1299 struct btrfs_ordered_extent *ordered;
1300
1301 /* the extent lock is ordered inside the running
1302 * transaction
1303 */
1304 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
1305 locked_end, 0, &cached_state, GFP_NOFS);
1306 ordered = btrfs_lookup_first_ordered_extent(inode,
1307 alloc_end - 1);
1308 if (ordered &&
1309 ordered->file_offset + ordered->len > alloc_start &&
1310 ordered->file_offset < alloc_end) {
1311 btrfs_put_ordered_extent(ordered);
1312 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1313 alloc_start, locked_end,
1314 &cached_state, GFP_NOFS);
1315 /*
1316 * we can't wait on the range with the transaction
1317 * running or with the extent lock held
1318 */
1319 btrfs_wait_ordered_range(inode, alloc_start,
1320 alloc_end - alloc_start);
1321 } else {
1322 if (ordered)
1323 btrfs_put_ordered_extent(ordered);
1324 break;
1325 }
1326 }
1327
1328 cur_offset = alloc_start;
1329 while (1) {
1330 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
1331 alloc_end - cur_offset, 0);
1332 BUG_ON(IS_ERR(em) || !em);
1333 last_byte = min(extent_map_end(em), alloc_end);
1334 last_byte = (last_byte + mask) & ~mask;
1335 if (em->block_start == EXTENT_MAP_HOLE ||
1336 (cur_offset >= inode->i_size &&
1337 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1338 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1339 last_byte - cur_offset,
1340 1 << inode->i_blkbits,
1341 offset + len,
1342 &alloc_hint);
1343 if (ret < 0) {
1344 free_extent_map(em);
1345 break;
1346 }
1347 }
1348 free_extent_map(em);
1349
1350 cur_offset = last_byte;
1351 if (cur_offset >= alloc_end) {
1352 ret = 0;
1353 break;
1354 }
1355 }
1356 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1357 &cached_state, GFP_NOFS);
1358
1359 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1360out:
1361 mutex_unlock(&inode->i_mutex);
1362 return ret;
1363}
1364
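With the .fallocate hook wired into btrfs_file_operations below, preallocation reaches btrfs through the regular syscall; per the mode check above, FALLOC_FL_KEEP_SIZE is the only accepted flag. A minimal userspace caller (illustrative; the path is made up):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/btrfs/file", O_RDWR | O_CREAT, 0644);

		if (fd < 0)
			return 1;
		/* preallocate 16 MiB of extents without growing i_size */
		if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) != 0)
			perror("fallocate");
		close(fd);
		return 0;
	}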
1240 1365	const struct file_operations btrfs_file_operations = {
1241 1366	.llseek = generic_file_llseek,
1242 1367	.read = do_sync_read,
@@ -1248,6 +1373,7 @@ const struct file_operations btrfs_file_operations = {
1248 1373	.open = generic_file_open,
1249 1374	.release = btrfs_release_file,
1250 1375	.fsync = btrfs_sync_file,
1376 .fallocate = btrfs_fallocate,
1251 1377	.unlocked_ioctl = btrfs_ioctl,
1252 1378	#ifdef CONFIG_COMPAT
1253 1379	.compat_ioctl = btrfs_ioctl,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0ff46a47895..160b55b3e132 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -122,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
122 122	size_t cur_size = size;
123 123	size_t datasize;
124 124	unsigned long offset;
125	int use_compress = 0;
125	int compress_type = BTRFS_COMPRESS_NONE;
126 126
127 127	if (compressed_size && compressed_pages) {
128	use_compress = 1;
128	compress_type = root->fs_info->compress_type;
129 129	cur_size = compressed_size;
130 130	}
131 131
@@ -159,7 +159,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
159 159	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
160 160	ptr = btrfs_file_extent_inline_start(ei);
161 161
162	if (use_compress) {
162	if (compress_type != BTRFS_COMPRESS_NONE) {
163 163	struct page *cpage;
164 164	int i = 0;
165 165	while (compressed_size > 0) {
@@ -176,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
176 176	compressed_size -= cur_size;
177 177	}
178 178	btrfs_set_file_extent_compression(leaf, ei,
179	BTRFS_COMPRESS_ZLIB);
179	compress_type);
180 180	} else {
181 181	page = find_get_page(inode->i_mapping,
182 182	start >> PAGE_CACHE_SHIFT);
@@ -263,6 +263,7 @@ struct async_extent {
263 263	u64 compressed_size;
264 264	struct page **pages;
265 265	unsigned long nr_pages;
266 int compress_type;
266 267	struct list_head list;
267 268	};
268 269
@@ -280,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow,
280 281	u64 start, u64 ram_size,
281 282	u64 compressed_size,
282 283	struct page **pages,
283	unsigned long nr_pages)
284	unsigned long nr_pages,
285 int compress_type)
284 286	{
285 287	struct async_extent *async_extent;
286 288
@@ -290,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow,
290 292	async_extent->compressed_size = compressed_size;
291 293	async_extent->pages = pages;
292 294	async_extent->nr_pages = nr_pages;
295 async_extent->compress_type = compress_type;
293 296	list_add_tail(&async_extent->list, &cow->extents);
294 297	return 0;
295 298	}
@@ -332,6 +335,7 @@ static noinline int compress_file_range(struct inode *inode,
332 335	unsigned long max_uncompressed = 128 * 1024;
333 336	int i;
334 337	int will_compress;
338 int compress_type = root->fs_info->compress_type;
335 339
336 340	actual_end = min_t(u64, isize, end + 1);
337 341	again:
@@ -381,12 +385,16 @@ again:
381 385	WARN_ON(pages);
382 386	pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
383 387
384	ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
385	total_compressed, pages,
386	nr_pages, &nr_pages_ret,
387	&total_in,
388	&total_compressed,
389	max_compressed);
388	if (BTRFS_I(inode)->force_compress)
389	compress_type = BTRFS_I(inode)->force_compress;
390
391	ret = btrfs_compress_pages(compress_type,
392	inode->i_mapping, start,
393	total_compressed, pages,
394 nr_pages, &nr_pages_ret,
395 &total_in,
396 &total_compressed,
397 max_compressed);
390 398
391 399	if (!ret) {
392 400	unsigned long offset = total_compressed &
@@ -493,7 +501,8 @@ again:
493 501	* and will submit them to the elevator.
494 502	*/
495 503	add_async_extent(async_cow, start, num_bytes,
496	total_compressed, pages, nr_pages_ret);
504	total_compressed, pages, nr_pages_ret,
505 compress_type);
497 506
498 507	if (start + num_bytes < end) {
499 508	start += num_bytes;
@@ -515,7 +524,8 @@ cleanup_and_bail_uncompressed:
515 524	__set_page_dirty_nobuffers(locked_page);
516 525	/* unlocked later on in the async handlers */
517 526	}
518	add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
527	add_async_extent(async_cow, start, end - start + 1,
528 0, NULL, 0, BTRFS_COMPRESS_NONE);
519 529	*num_added += 1;
520 } 530 }
521 531
@@ -640,6 +650,7 @@ retry:
640 650	em->block_start = ins.objectid;
641 651	em->block_len = ins.offset;
642 652	em->bdev = root->fs_info->fs_devices->latest_bdev;
653 em->compress_type = async_extent->compress_type;
643 654	set_bit(EXTENT_FLAG_PINNED, &em->flags);
644 655	set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
645 656
@@ -656,11 +667,13 @@ retry:
656 667	async_extent->ram_size - 1, 0);
657 668	}
658 669
659	ret = btrfs_add_ordered_extent(inode, async_extent->start,
660	ins.objectid,
661	async_extent->ram_size,
662	ins.offset,
663	BTRFS_ORDERED_COMPRESSED);
670	ret = btrfs_add_ordered_extent_compress(inode,
671	async_extent->start,
672	ins.objectid,
673	async_extent->ram_size,
674	ins.offset,
675 BTRFS_ORDERED_COMPRESSED,
676 async_extent->compress_type);
664 677	BUG_ON(ret);
665 678
666 679	/*
@@ -1670,7 +1683,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1670 1683	struct btrfs_ordered_extent *ordered_extent = NULL;
1671 1684	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1672 1685	struct extent_state *cached_state = NULL;
1673	int compressed = 0;
1686	int compress_type = 0;
1674 1687	int ret;
1675 1688	bool nolock = false;
1676 1689
@@ -1711,9 +1724,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1711 1724	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1712 1725
1713 1726	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1714	compressed = 1;
1727	compress_type = ordered_extent->compress_type;
1715 1728	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1716	BUG_ON(compressed);
1729	BUG_ON(compress_type);
1717 1730	ret = btrfs_mark_extent_written(trans, inode,
1718 1731	ordered_extent->file_offset,
1719 1732	ordered_extent->file_offset +
@@ -1727,7 +1740,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1727 1740	ordered_extent->disk_len,
1728 1741	ordered_extent->len,
1729 1742	ordered_extent->len,
1730	compressed, 0, 0,
1743	compress_type, 0, 0,
1731 1744	BTRFS_FILE_EXTENT_REG);
1732 1745	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1733 1746	ordered_extent->file_offset,
@@ -1829,6 +1842,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1829 1842	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1830 1843	logical = em->block_start;
1831 1844	failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1845 extent_set_compress_type(&failrec->bio_flags,
1846 em->compress_type);
1832 1847	}
1833 1848	failrec->logical = logical;
1834 1849	free_extent_map(em);
@@ -3671,8 +3686,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3671 3686	static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3672 3687	{
3673 3688	struct inode *inode = dentry->d_inode;
3689 struct btrfs_root *root = BTRFS_I(inode)->root;
3674 3690	int err;
3675 3691
3692 if (btrfs_root_readonly(root))
3693 return -EROFS;
3694
3676 3695	err = inode_change_ok(inode, attr);
3677 3696	if (err)
3678 3697	return err;
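btrfs_root_readonly() itself is introduced elsewhere in this series; judging from the subvolume flag handling later in this patch, it amounts to testing BTRFS_ROOT_SUBVOL_RDONLY in the root item flags, roughly (an assumption, not shown in this diff):

	static inline int btrfs_root_readonly(struct btrfs_root *root)
	{
		return (btrfs_root_flags(&root->root_item) &
			BTRFS_ROOT_SUBVOL_RDONLY) != 0;
	}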
@@ -4084,8 +4103,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4084 4103	int index;
4085 4104	int ret;
4086 4105
4087 d_set_d_op(dentry, &btrfs_dentry_operations);
4088
4089 4106	if (dentry->d_name.len > BTRFS_NAME_LEN)
4090 4107	return ERR_PTR(-ENAMETOOLONG);
4091 4108
@@ -4930,8 +4947,10 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4930 4947	size_t max_size;
4931 4948	unsigned long inline_size;
4932 4949	unsigned long ptr;
4950 int compress_type;
4933 4951
4934 4952	WARN_ON(pg_offset != 0);
4953 compress_type = btrfs_file_extent_compression(leaf, item);
4935 4954	max_size = btrfs_file_extent_ram_bytes(leaf, item);
4936 4955	inline_size = btrfs_file_extent_inline_item_len(leaf,
4937 4956	btrfs_item_nr(leaf, path->slots[0]));
@@ -4941,8 +4960,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4941 4960	read_extent_buffer(leaf, tmp, ptr, inline_size);
4942 4961
4943 4962	max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
4944	ret = btrfs_zlib_decompress(tmp, page, extent_offset,
4945	inline_size, max_size);
4963	ret = btrfs_decompress(compress_type, tmp, page,
4964	extent_offset, inline_size, max_size);
4946 4965	if (ret) {
4947 4966	char *kaddr = kmap_atomic(page, KM_USER0);
4948 4967	unsigned long copy_size = min_t(u64,
@@ -4984,7 +5003,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4984 5003	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4985 5004	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4986 5005	struct btrfs_trans_handle *trans = NULL;
4987	int compressed;
5006	int compress_type;
4988 5007
4989 5008	again:
4990 5009	read_lock(&em_tree->lock);
@@ -5043,7 +5062,7 @@ again:
5043 5062
5044 5063	found_type = btrfs_file_extent_type(leaf, item);
5045 5064	extent_start = found_key.offset;
5046	compressed = btrfs_file_extent_compression(leaf, item);
5065	compress_type = btrfs_file_extent_compression(leaf, item);
5047 5066	if (found_type == BTRFS_FILE_EXTENT_REG ||
5048 5067	found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5049 5068	extent_end = extent_start +
@@ -5089,8 +5108,9 @@ again:
5089 5108	em->block_start = EXTENT_MAP_HOLE;
5090 5109	goto insert;
5091 5110	}
5092	if (compressed) {
5111	if (compress_type != BTRFS_COMPRESS_NONE) {
5093 5112	set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5113 em->compress_type = compress_type;
5094 5114	em->block_start = bytenr;
5095 5115	em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
5096 5116	item);
@@ -5124,12 +5144,14 @@ again:
5124 5144	em->len = (copy_size + root->sectorsize - 1) &
5125 5145	~((u64)root->sectorsize - 1);
5126 5146	em->orig_start = EXTENT_MAP_INLINE;
5127	if (compressed)
5147	if (compress_type) {
5128 5148	set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5149 em->compress_type = compress_type;
5150 }
5129 5151	ptr = btrfs_file_extent_inline_start(item) + extent_offset;
5130 5152	if (create == 0 && !PageUptodate(page)) {
5131	if (btrfs_file_extent_compression(leaf, item) ==
5132	BTRFS_COMPRESS_ZLIB) {
5153	if (btrfs_file_extent_compression(leaf, item) !=
5154	BTRFS_COMPRESS_NONE) {
5133 5155	ret = uncompress_inline(path, inode, page,
5134 5156	pg_offset,
5135 5157	extent_offset, item);
@@ -6479,7 +6501,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6479 6501	ei->ordered_data_close = 0;
6480 6502	ei->orphan_meta_reserved = 0;
6481 6503	ei->dummy_inode = 0;
6482	ei->force_compress = 0;
6504	ei->force_compress = BTRFS_COMPRESS_NONE;
6483 6505
6484 6506	inode = &ei->vfs_inode;
6485 6507	extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
@@ -7100,112 +7122,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
7100 7122	min_size, actual_len, alloc_hint, trans);
7101 7123	}
7102 7124
7103static long btrfs_fallocate(struct inode *inode, int mode,
7104 loff_t offset, loff_t len)
7105{
7106 struct extent_state *cached_state = NULL;
7107 u64 cur_offset;
7108 u64 last_byte;
7109 u64 alloc_start;
7110 u64 alloc_end;
7111 u64 alloc_hint = 0;
7112 u64 locked_end;
7113 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
7114 struct extent_map *em;
7115 int ret;
7116
7117 alloc_start = offset & ~mask;
7118 alloc_end = (offset + len + mask) & ~mask;
7119
7120 /*
7121 * wait for ordered IO before we have any locks. We'll loop again
7122 * below with the locks held.
7123 */
7124 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
7125
7126 mutex_lock(&inode->i_mutex);
7127 ret = inode_newsize_ok(inode, alloc_end);
7128 if (ret)
7129 goto out;
7130
7131 if (alloc_start > inode->i_size) {
7132 ret = btrfs_cont_expand(inode, alloc_start);
7133 if (ret)
7134 goto out;
7135 }
7136
7137 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
7138 if (ret)
7139 goto out;
7140
7141 locked_end = alloc_end - 1;
7142 while (1) {
7143 struct btrfs_ordered_extent *ordered;
7144
7145 /* the extent lock is ordered inside the running
7146 * transaction
7147 */
7148 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
7149 locked_end, 0, &cached_state, GFP_NOFS);
7150 ordered = btrfs_lookup_first_ordered_extent(inode,
7151 alloc_end - 1);
7152 if (ordered &&
7153 ordered->file_offset + ordered->len > alloc_start &&
7154 ordered->file_offset < alloc_end) {
7155 btrfs_put_ordered_extent(ordered);
7156 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
7157 alloc_start, locked_end,
7158 &cached_state, GFP_NOFS);
7159 /*
7160 * we can't wait on the range with the transaction
7161 * running or with the extent lock held
7162 */
7163 btrfs_wait_ordered_range(inode, alloc_start,
7164 alloc_end - alloc_start);
7165 } else {
7166 if (ordered)
7167 btrfs_put_ordered_extent(ordered);
7168 break;
7169 }
7170 }
7171
7172 cur_offset = alloc_start;
7173 while (1) {
7174 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
7175 alloc_end - cur_offset, 0);
7176 BUG_ON(IS_ERR(em) || !em);
7177 last_byte = min(extent_map_end(em), alloc_end);
7178 last_byte = (last_byte + mask) & ~mask;
7179 if (em->block_start == EXTENT_MAP_HOLE ||
7180 (cur_offset >= inode->i_size &&
7181 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
7182 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
7183 last_byte - cur_offset,
7184 1 << inode->i_blkbits,
7185 offset + len,
7186 &alloc_hint);
7187 if (ret < 0) {
7188 free_extent_map(em);
7189 break;
7190 }
7191 }
7192 free_extent_map(em);
7193
7194 cur_offset = last_byte;
7195 if (cur_offset >= alloc_end) {
7196 ret = 0;
7197 break;
7198 }
7199 }
7200 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
7201 &cached_state, GFP_NOFS);
7202
7203 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
7204out:
7205 mutex_unlock(&inode->i_mutex);
7206 return ret;
7207}
7208
7209 7125	static int btrfs_set_page_dirty(struct page *page)
7210 7126	{
7211 7127	return __set_page_dirty_nobuffers(page);
@@ -7213,6 +7129,10 @@ static int btrfs_set_page_dirty(struct page *page)
7213 7129
7214 7130	static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
7215 7131	{
7132 struct btrfs_root *root = BTRFS_I(inode)->root;
7133
7134 if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
7135 return -EROFS;
7216 7136	if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
7217 7137	return -EACCES;
7218 7138	return generic_permission(inode, mask, flags, btrfs_check_acl);
@@ -7308,7 +7228,6 @@ static const struct inode_operations btrfs_file_inode_operations = {
7308 7228	.listxattr = btrfs_listxattr,
7309 7229	.removexattr = btrfs_removexattr,
7310 7230	.permission = btrfs_permission,
7311 .fallocate = btrfs_fallocate,
7312 7231	.fiemap = btrfs_fiemap,
7313 7232	};
7314 7233	static const struct inode_operations btrfs_special_inode_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f87552a1d7ea..a506a22b522a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -147,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
147 147	unsigned int flags, oldflags;
148 148	int ret;
149 149
150 if (btrfs_root_readonly(root))
151 return -EROFS;
152
150 153	if (copy_from_user(&flags, arg, sizeof(flags)))
151 154	return -EFAULT;
152 155
@@ -360,7 +363,8 @@ fail:
360 363	}
361 364
362 365	static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
363	char *name, int namelen, u64 *async_transid)
366	char *name, int namelen, u64 *async_transid,
367 bool readonly)
364 368	{
365 369	struct inode *inode;
366 370	struct dentry *parent;
@@ -378,6 +382,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
378 382	btrfs_init_block_rsv(&pending_snapshot->block_rsv);
379 383	pending_snapshot->dentry = dentry;
380 384	pending_snapshot->root = root;
385 pending_snapshot->readonly = readonly;
381 386
382 387	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
383 388	if (IS_ERR(trans)) {
@@ -509,7 +514,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
509 514	static noinline int btrfs_mksubvol(struct path *parent,
510 515	char *name, int namelen,
511 516	struct btrfs_root *snap_src,
512	u64 *async_transid)
517	u64 *async_transid, bool readonly)
513 518	{
514 519	struct inode *dir = parent->dentry->d_inode;
515 520	struct dentry *dentry;
@@ -541,7 +546,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
541 546
542 547	if (snap_src) {
543 548	error = create_snapshot(snap_src, dentry,
544	name, namelen, async_transid);
549	name, namelen, async_transid, readonly);
545 550	} else {
546 551	error = create_subvol(BTRFS_I(dir)->root, dentry,
547 552	name, namelen, async_transid);
@@ -638,9 +643,11 @@ static int btrfs_defrag_file(struct file *file,
638 643	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
639 644	struct btrfs_ordered_extent *ordered;
640 645	struct page *page;
646 struct btrfs_super_block *disk_super;
641 647	unsigned long last_index;
642 648	unsigned long ra_pages = root->fs_info->bdi.ra_pages;
643 649	unsigned long total_read = 0;
650 u64 features;
644 651	u64 page_start;
645 652	u64 page_end;
646 653	u64 last_len = 0;
@@ -648,6 +655,14 @@ static int btrfs_defrag_file(struct file *file,
648 655	u64 defrag_end = 0;
649 656	unsigned long i;
650 657	int ret;
658 int compress_type = BTRFS_COMPRESS_ZLIB;
659
660 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
661 if (range->compress_type > BTRFS_COMPRESS_TYPES)
662 return -EINVAL;
663 if (range->compress_type)
664 compress_type = range->compress_type;
665 }
651 666
652 667	if (inode->i_size == 0)
653 668	return 0;
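Userspace picks the algorithm through the new compress_type member of btrfs_ioctl_defrag_range_args; values above BTRFS_COMPRESS_TYPES are rejected, and zlib stays the default. A sketch of a caller requesting LZO recompression (illustrative; the field names come from this patch):

	struct btrfs_ioctl_defrag_range_args range;

	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;			/* whole file */
	range.flags = BTRFS_DEFRAG_RANGE_COMPRESS;
	range.compress_type = BTRFS_COMPRESS_LZO;

	if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0)
		perror("BTRFS_IOC_DEFRAG_RANGE");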
@@ -683,7 +698,7 @@ static int btrfs_defrag_file(struct file *file,
683 698	total_read++;
684 699	mutex_lock(&inode->i_mutex);
685 700	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
686	BTRFS_I(inode)->force_compress = 1;
701	BTRFS_I(inode)->force_compress = compress_type;
687 702
688 703	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
689 704	if (ret)
@@ -781,10 +796,17 @@ loop_unlock:
781 796	atomic_dec(&root->fs_info->async_submit_draining);
782 797
783 798	mutex_lock(&inode->i_mutex);
784	BTRFS_I(inode)->force_compress = 0;
799	BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
785 800	mutex_unlock(&inode->i_mutex);
786 801	}
787 802
803 disk_super = &root->fs_info->super_copy;
804 features = btrfs_super_incompat_flags(disk_super);
805 if (range->compress_type == BTRFS_COMPRESS_LZO) {
806 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
807 btrfs_set_super_incompat_flags(disk_super, features);
808 }
809
788 810	return 0;
789 811
790 812	err_reservations:
@@ -901,7 +923,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
901 923	char *name,
902 924	unsigned long fd,
903 925	int subvol,
904	u64 *transid)
926	u64 *transid,
927 bool readonly)
905 928	{
906 929	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
907 930	struct file *src_file;
@@ -919,7 +942,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
919 942
920 943	if (subvol) {
921 944	ret = btrfs_mksubvol(&file->f_path, name, namelen,
922	NULL, transid);
945	NULL, transid, readonly);
923 946	} else {
924 947	struct inode *src_inode;
925 948	src_file = fget(fd);
@@ -938,7 +961,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
938 961	}
939 962	ret = btrfs_mksubvol(&file->f_path, name, namelen,
940 963	BTRFS_I(src_inode)->root,
941	transid);
964	transid, readonly);
942 965	fput(src_file);
943 966	}
944 967	out:
@@ -946,58 +969,139 @@ out:
946 969	}
947 970
948 971	static noinline int btrfs_ioctl_snap_create(struct file *file,
949	void __user *arg, int subvol,
950	int v2)
972	void __user *arg, int subvol)
951 973	{
952	struct btrfs_ioctl_vol_args *vol_args = NULL;
974	struct btrfs_ioctl_vol_args *vol_args;
953 struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL;
954 char *name;
955 u64 fd;
956 975	int ret;
957 976
958	if (v2) {
959	u64 transid = 0;
960	u64 *ptr = NULL;
977	vol_args = memdup_user(arg, sizeof(*vol_args));
978	if (IS_ERR(vol_args))
979	return PTR_ERR(vol_args);
980 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
961 981
962	vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2));
963	if (IS_ERR(vol_args_v2))
964	return PTR_ERR(vol_args_v2);
982	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
983	vol_args->fd, subvol,
984	NULL, false);
965 985
966	if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) {
967	ret = -EINVAL;
968	goto out;
986	kfree(vol_args);
987	return ret;
988	}
969 }
970
971 name = vol_args_v2->name;
972 fd = vol_args_v2->fd;
973 vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
974 989
975	if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC)
976	ptr = &transid;
990	static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
991	void __user *arg, int subvol)
992{
993 struct btrfs_ioctl_vol_args_v2 *vol_args;
994 int ret;
995 u64 transid = 0;
996 u64 *ptr = NULL;
997 bool readonly = false;
977 998
978	ret = btrfs_ioctl_snap_create_transid(file, name, fd,
979	subvol, ptr);
999	vol_args = memdup_user(arg, sizeof(*vol_args));
1000	if (IS_ERR(vol_args))
1001 return PTR_ERR(vol_args);
1002 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
980 1003
981	if (ret == 0 && ptr &&
982	copy_to_user(arg +
983	offsetof(struct btrfs_ioctl_vol_args_v2,
984	transid), ptr, sizeof(*ptr)))
1004	if (vol_args->flags &
1005	~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
1006	ret = -EOPNOTSUPP;
1007	goto out;
985 ret = -EFAULT;
986 } else {
987 vol_args = memdup_user(arg, sizeof(*vol_args));
988 if (IS_ERR(vol_args))
989 return PTR_ERR(vol_args);
990 name = vol_args->name;
991 fd = vol_args->fd;
992 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
993
994 ret = btrfs_ioctl_snap_create_transid(file, name, fd,
995	subvol, NULL);
996	}
1008	}
1009
1009
1010 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
1011 ptr = &transid;
1012 if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1013 readonly = true;
1014
1015 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1016 vol_args->fd, subvol,
1017 ptr, readonly);
1018
1019 if (ret == 0 && ptr &&
1020 copy_to_user(arg +
1021 offsetof(struct btrfs_ioctl_vol_args_v2,
1022 transid), ptr, sizeof(*ptr)))
1023 ret = -EFAULT;
997 1024	out:
998 1025	kfree(vol_args);
999	kfree(vol_args_v2);
1026	return ret;
1027}
1000 1028
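With BTRFS_SUBVOL_RDONLY accepted in the v2 flags, a read-only snapshot can be requested from userspace like so (illustrative; the struct fields are the ones handled above):

	struct btrfs_ioctl_vol_args_v2 args;

	memset(&args, 0, sizeof(args));
	args.fd = src_fd;			/* fd of the source subvolume */
	args.flags = BTRFS_SUBVOL_RDONLY;
	strncpy(args.name, "snap-ro", BTRFS_SUBVOL_NAME_MAX);

	if (ioctl(dest_dir_fd, BTRFS_IOC_SNAP_CREATE_V2, &args) < 0)
		perror("BTRFS_IOC_SNAP_CREATE_V2");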
1029static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
1030 void __user *arg)
1031{
1032 struct inode *inode = fdentry(file)->d_inode;
1033 struct btrfs_root *root = BTRFS_I(inode)->root;
1034 int ret = 0;
1035 u64 flags = 0;
1036
1037 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1038 return -EINVAL;
1039
1040 down_read(&root->fs_info->subvol_sem);
1041 if (btrfs_root_readonly(root))
1042 flags |= BTRFS_SUBVOL_RDONLY;
1043 up_read(&root->fs_info->subvol_sem);
1044
1045 if (copy_to_user(arg, &flags, sizeof(flags)))
1046 ret = -EFAULT;
1047
1048 return ret;
1049}
1050
1051static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1052 void __user *arg)
1053{
1054 struct inode *inode = fdentry(file)->d_inode;
1055 struct btrfs_root *root = BTRFS_I(inode)->root;
1056 struct btrfs_trans_handle *trans;
1057 u64 root_flags;
1058 u64 flags;
1059 int ret = 0;
1060
1061 if (root->fs_info->sb->s_flags & MS_RDONLY)
1062 return -EROFS;
1063
1064 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1065 return -EINVAL;
1066
1067 if (copy_from_user(&flags, arg, sizeof(flags)))
1068 return -EFAULT;
1069
1070 if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC)
1071 return -EINVAL;
1072
1073 if (flags & ~BTRFS_SUBVOL_RDONLY)
1074 return -EOPNOTSUPP;
1075
1076 down_write(&root->fs_info->subvol_sem);
1077
1078 /* nothing to do */
1079 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
1080 goto out;
1081
1082 root_flags = btrfs_root_flags(&root->root_item);
1083 if (flags & BTRFS_SUBVOL_RDONLY)
1084 btrfs_set_root_flags(&root->root_item,
1085 root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
1086 else
1087 btrfs_set_root_flags(&root->root_item,
1088 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
1089
1090 trans = btrfs_start_transaction(root, 1);
1091 if (IS_ERR(trans)) {
1092 ret = PTR_ERR(trans);
1093 goto out_reset;
1094 }
1095
1096 ret = btrfs_update_root(trans, root,
1097 &root->root_key, &root->root_item);
1098
1099 btrfs_commit_transaction(trans, root);
1100out_reset:
1101 if (ret)
1102 btrfs_set_root_flags(&root->root_item, root_flags);
1103out:
1104 up_write(&root->fs_info->subvol_sem);
1001 return ret; 1105 return ret;
1002} 1106}
1003 1107
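The pair of ioctls above forms a read-modify-write cycle over a single u64 of flags. For illustration, a minimal user-space sketch (not part of the patch; per the checks above, the descriptor must refer to the root of a subvolume, and the command encodings mirror the header below, even though GETFLAGS copies the value back out):

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, uint64_t)
#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, uint64_t)

/* toggle the read-only bit of the subvolume rooted at path */
int set_subvol_readonly(const char *path, int readonly)
{
	uint64_t flags;
	int ret;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;
	/* the kernel side copy_to_user()s the current flags into 'flags' */
	ret = ioctl(fd, BTRFS_IOC_SUBVOL_GETFLAGS, &flags);
	if (ret == 0) {
		if (readonly)
			flags |= BTRFS_SUBVOL_RDONLY;
		else
			flags &= ~BTRFS_SUBVOL_RDONLY;
		ret = ioctl(fd, BTRFS_IOC_SUBVOL_SETFLAGS, &flags);
	}
	close(fd);
	return ret;
}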
@@ -1509,6 +1613,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1509 struct btrfs_ioctl_defrag_range_args *range; 1613 struct btrfs_ioctl_defrag_range_args *range;
1510 int ret; 1614 int ret;
1511 1615
1616 if (btrfs_root_readonly(root))
1617 return -EROFS;
1618
1512 ret = mnt_want_write(file->f_path.mnt); 1619 ret = mnt_want_write(file->f_path.mnt);
1513 if (ret) 1620 if (ret)
1514 return ret; 1621 return ret;
@@ -1637,6 +1744,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1637 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) 1744 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
1638 return -EINVAL; 1745 return -EINVAL;
1639 1746
1747 if (btrfs_root_readonly(root))
1748 return -EROFS;
1749
1640 ret = mnt_want_write(file->f_path.mnt); 1750 ret = mnt_want_write(file->f_path.mnt);
1641 if (ret) 1751 if (ret)
1642 return ret; 1752 return ret;
@@ -1958,6 +2068,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
1958 if (file->private_data) 2068 if (file->private_data)
1959 goto out; 2069 goto out;
1960 2070
2071 ret = -EROFS;
2072 if (btrfs_root_readonly(root))
2073 goto out;
2074
1961 ret = mnt_want_write(file->f_path.mnt); 2075 ret = mnt_want_write(file->f_path.mnt);
1962 if (ret) 2076 if (ret)
1963 goto out; 2077 goto out;
@@ -2257,13 +2371,17 @@ long btrfs_ioctl(struct file *file, unsigned int
2257 case FS_IOC_GETVERSION: 2371 case FS_IOC_GETVERSION:
2258 return btrfs_ioctl_getversion(file, argp); 2372 return btrfs_ioctl_getversion(file, argp);
2259 case BTRFS_IOC_SNAP_CREATE: 2373 case BTRFS_IOC_SNAP_CREATE:
2260 return btrfs_ioctl_snap_create(file, argp, 0, 0);
2374 return btrfs_ioctl_snap_create(file, argp, 0);
2261 case BTRFS_IOC_SNAP_CREATE_V2: 2375 case BTRFS_IOC_SNAP_CREATE_V2:
2262 return btrfs_ioctl_snap_create(file, argp, 0, 1);
2376 return btrfs_ioctl_snap_create_v2(file, argp, 0);
2263 case BTRFS_IOC_SUBVOL_CREATE: 2377 case BTRFS_IOC_SUBVOL_CREATE:
2264 return btrfs_ioctl_snap_create(file, argp, 1, 0);
2378 return btrfs_ioctl_snap_create(file, argp, 1);
2265 case BTRFS_IOC_SNAP_DESTROY: 2379 case BTRFS_IOC_SNAP_DESTROY:
2266 return btrfs_ioctl_snap_destroy(file, argp); 2380 return btrfs_ioctl_snap_destroy(file, argp);
2381 case BTRFS_IOC_SUBVOL_GETFLAGS:
2382 return btrfs_ioctl_subvol_getflags(file, argp);
2383 case BTRFS_IOC_SUBVOL_SETFLAGS:
2384 return btrfs_ioctl_subvol_setflags(file, argp);
2267 case BTRFS_IOC_DEFAULT_SUBVOL: 2385 case BTRFS_IOC_DEFAULT_SUBVOL:
2268 return btrfs_ioctl_default_subvol(file, argp); 2386 return btrfs_ioctl_default_subvol(file, argp);
2269 case BTRFS_IOC_DEFRAG: 2387 case BTRFS_IOC_DEFRAG:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index c344d12c646b..8fb382167b13 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -31,6 +31,7 @@ struct btrfs_ioctl_vol_args {
31}; 31};
32 32
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
34 35
35#define BTRFS_SUBVOL_NAME_MAX 4039 36#define BTRFS_SUBVOL_NAME_MAX 4039
36struct btrfs_ioctl_vol_args_v2 { 37struct btrfs_ioctl_vol_args_v2 {
@@ -133,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args {
133 */ 134 */
134 __u32 extent_thresh; 135 __u32 extent_thresh;
135 136
137 /*
138 * which compression method to use if turning on compression
139 * for this defrag operation. If unspecified, zlib will
140 * be used
141 */
142 __u32 compress_type;
143
136 /* spare for later */ 144 /* spare for later */
137 __u32 unused[5];
145 __u32 unused[4];
138}; 146};
139 147
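For illustration, a user-space defragmentation request that opts in to LZO recompression through the new field could look like the sketch below; BTRFS_IOC_DEFRAG_RANGE and BTRFS_DEFRAG_RANGE_COMPRESS are declared elsewhere in this header, and the numeric value of BTRFS_COMPRESS_LZO is an assumption taken from ctree.h in this series:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include "ioctl.h"		/* this header */

#define BTRFS_COMPRESS_LZO 2	/* assumed value, from ctree.h */

int defrag_file_lzo(int fd)
{
	struct btrfs_ioctl_defrag_range_args range;

	memset(&range, 0, sizeof(range));
	range.len = (uint64_t)-1;	/* the whole file */
	range.flags = BTRFS_DEFRAG_RANGE_COMPRESS;
	range.compress_type = BTRFS_COMPRESS_LZO;

	return ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
}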
140struct btrfs_ioctl_space_info { 148struct btrfs_ioctl_space_info {
@@ -193,4 +201,6 @@ struct btrfs_ioctl_space_args {
193#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) 201#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
194#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ 202#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
195 struct btrfs_ioctl_vol_args_v2) 203 struct btrfs_ioctl_vol_args_v2)
204#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
205#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
196#endif 206#endif
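Together with BTRFS_IOC_WAIT_SYNC above, the v2 interface allows fire-and-forget snapshots: request BTRFS_SUBVOL_CREATE_ASYNC, read back the transaction id that btrfs_ioctl_snap_create_v2() copies out, and wait on it only when durability actually matters. A user-space sketch under those assumptions (dirfd names the directory that will contain the snapshot, fd the subvolume being snapshotted):

#include <string.h>
#include <sys/ioctl.h>
#include "ioctl.h"		/* this header */

int snapshot_async_readonly(int fd, int dirfd, const char *name)
{
	struct btrfs_ioctl_vol_args_v2 args;

	memset(&args, 0, sizeof(args));
	args.fd = fd;
	args.flags = BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY;
	strncpy(args.name, name, BTRFS_SUBVOL_NAME_MAX);

	if (ioctl(dirfd, BTRFS_IOC_SNAP_CREATE_V2, &args) < 0)
		return -1;

	/* the kernel wrote the transaction id into args.transid */
	return ioctl(dirfd, BTRFS_IOC_WAIT_SYNC, &args.transid);
}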
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644
index 000000000000..cc9b450399df
--- /dev/null
+++ b/fs/btrfs/lzo.c
@@ -0,0 +1,420 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/slab.h>
21#include <linux/vmalloc.h>
22#include <linux/init.h>
23#include <linux/err.h>
24#include <linux/sched.h>
25#include <linux/pagemap.h>
26#include <linux/bio.h>
27#include <linux/lzo.h>
28#include "compression.h"
29
30#define LZO_LEN 4
31
32struct workspace {
33 void *mem;
34 void *buf; /* where decompressed data goes */
35 void *cbuf; /* where compressed data goes */
36 struct list_head list;
37};
38
39static void lzo_free_workspace(struct list_head *ws)
40{
41 struct workspace *workspace = list_entry(ws, struct workspace, list);
42
43 vfree(workspace->buf);
44 vfree(workspace->cbuf);
45 vfree(workspace->mem);
46 kfree(workspace);
47}
48
49static struct list_head *lzo_alloc_workspace(void)
50{
51 struct workspace *workspace;
52
53 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
54 if (!workspace)
55 return ERR_PTR(-ENOMEM);
56
57 workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
58 workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
59 workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
60 if (!workspace->mem || !workspace->buf || !workspace->cbuf)
61 goto fail;
62
63 INIT_LIST_HEAD(&workspace->list);
64
65 return &workspace->list;
66fail:
67 lzo_free_workspace(&workspace->list);
68 return ERR_PTR(-ENOMEM);
69}
70
71static inline void write_compress_length(char *buf, size_t len)
72{
73 __le32 dlen;
74
75 dlen = cpu_to_le32(len);
76 memcpy(buf, &dlen, LZO_LEN);
77}
78
79static inline size_t read_compress_length(char *buf)
80{
81 __le32 dlen;
82
83 memcpy(&dlen, buf, LZO_LEN);
84 return le32_to_cpu(dlen);
85}
86
87static int lzo_compress_pages(struct list_head *ws,
88 struct address_space *mapping,
89 u64 start, unsigned long len,
90 struct page **pages,
91 unsigned long nr_dest_pages,
92 unsigned long *out_pages,
93 unsigned long *total_in,
94 unsigned long *total_out,
95 unsigned long max_out)
96{
97 struct workspace *workspace = list_entry(ws, struct workspace, list);
98 int ret = 0;
99 char *data_in;
100 char *cpage_out;
101 int nr_pages = 0;
102 struct page *in_page = NULL;
103 struct page *out_page = NULL;
104 unsigned long bytes_left;
105
106 size_t in_len;
107 size_t out_len;
108 char *buf;
109 unsigned long tot_in = 0;
110 unsigned long tot_out = 0;
111 unsigned long pg_bytes_left;
112 unsigned long out_offset;
113 unsigned long bytes;
114
115 *out_pages = 0;
116 *total_out = 0;
117 *total_in = 0;
118
119 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
120 data_in = kmap(in_page);
121
122 /*
123 * store the size of all chunks of compressed data in
124 * the first 4 bytes
125 */
126 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
127 if (out_page == NULL) {
128 ret = -ENOMEM;
129 goto out;
130 }
131 cpage_out = kmap(out_page);
132 out_offset = LZO_LEN;
133 tot_out = LZO_LEN;
134 pages[0] = out_page;
135 nr_pages = 1;
136 pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
137
138 /* compress at most one page of data each time */
139 in_len = min(len, PAGE_CACHE_SIZE);
140 while (tot_in < len) {
141 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
142 &out_len, workspace->mem);
143 if (ret != LZO_E_OK) {
144 printk(KERN_DEBUG "btrfs lzo compress in loop returned %d\n",
145 ret);
146 ret = -1;
147 goto out;
148 }
149
150 /* store the size of this chunk of compressed data */
151 write_compress_length(cpage_out + out_offset, out_len);
152 tot_out += LZO_LEN;
153 out_offset += LZO_LEN;
154 pg_bytes_left -= LZO_LEN;
155
156 tot_in += in_len;
157 tot_out += out_len;
158
159 /* copy bytes from the working buffer into the pages */
160 buf = workspace->cbuf;
161 while (out_len) {
162 bytes = min_t(unsigned long, pg_bytes_left, out_len);
163
164 memcpy(cpage_out + out_offset, buf, bytes);
165
166 out_len -= bytes;
167 pg_bytes_left -= bytes;
168 buf += bytes;
169 out_offset += bytes;
170
171 /*
172 * we need another page for writing out.
173 *
174 * Note if there's less than 4 bytes left, we just
175 * skip to a new page.
176 */
177 if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
178 pg_bytes_left == 0) {
179 if (pg_bytes_left) {
180 memset(cpage_out + out_offset, 0,
181 pg_bytes_left);
182 tot_out += pg_bytes_left;
183 }
184
185 /* we're done, don't allocate new page */
186 if (out_len == 0 && tot_in >= len)
187 break;
188
189 kunmap(out_page);
190 if (nr_pages == nr_dest_pages) {
191 out_page = NULL;
192 ret = -1;
193 goto out;
194 }
195
196 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
197 if (out_page == NULL) {
198 ret = -ENOMEM;
199 goto out;
200 }
201 cpage_out = kmap(out_page);
202 pages[nr_pages++] = out_page;
203
204 pg_bytes_left = PAGE_CACHE_SIZE;
205 out_offset = 0;
206 }
207 }
208
209 /* we're making it bigger, give up */
210 if (tot_in > 8192 && tot_in < tot_out)
211 goto out;
212
213 /* we're all done */
214 if (tot_in >= len)
215 break;
216
217 if (tot_out > max_out)
218 break;
219
220 bytes_left = len - tot_in;
221 kunmap(in_page);
222 page_cache_release(in_page);
223
224 start += PAGE_CACHE_SIZE;
225 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
226 data_in = kmap(in_page);
227 in_len = min(bytes_left, PAGE_CACHE_SIZE);
228 }
229
230 if (tot_out > tot_in)
231 goto out;
232
233 /* store the size of all chunks of compressed data */
234 cpage_out = kmap(pages[0]);
235 write_compress_length(cpage_out, tot_out);
236
237 kunmap(pages[0]);
238
239 ret = 0;
240 *total_out = tot_out;
241 *total_in = tot_in;
242out:
243 *out_pages = nr_pages;
244 if (out_page)
245 kunmap(out_page);
246
247 if (in_page) {
248 kunmap(in_page);
249 page_cache_release(in_page);
250 }
251
252 return ret;
253}
254
255static int lzo_decompress_biovec(struct list_head *ws,
256 struct page **pages_in,
257 u64 disk_start,
258 struct bio_vec *bvec,
259 int vcnt,
260 size_t srclen)
261{
262 struct workspace *workspace = list_entry(ws, struct workspace, list);
263 int ret = 0, ret2;
264 char *data_in;
265 unsigned long page_in_index = 0;
266 unsigned long page_out_index = 0;
267 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
268 PAGE_CACHE_SIZE;
269 unsigned long buf_start;
270 unsigned long buf_offset = 0;
271 unsigned long bytes;
272 unsigned long working_bytes;
273 unsigned long pg_offset;
274
275 size_t in_len;
276 size_t out_len;
277 unsigned long in_offset;
278 unsigned long in_page_bytes_left;
279 unsigned long tot_in;
280 unsigned long tot_out;
281 unsigned long tot_len;
282 char *buf;
283
284 data_in = kmap(pages_in[0]);
285 tot_len = read_compress_length(data_in);
286
287 tot_in = LZO_LEN;
288 in_offset = LZO_LEN;
289 tot_len = min_t(size_t, srclen, tot_len);
290 in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
291
292 tot_out = 0;
293 pg_offset = 0;
294
295 while (tot_in < tot_len) {
296 in_len = read_compress_length(data_in + in_offset);
297 in_page_bytes_left -= LZO_LEN;
298 in_offset += LZO_LEN;
299 tot_in += LZO_LEN;
300
301 tot_in += in_len;
302 working_bytes = in_len;
303
304 /* fast path: avoid using the working buffer */
305 if (in_page_bytes_left >= in_len) {
306 buf = data_in + in_offset;
307 bytes = in_len;
308 goto cont;
309 }
310
311 /* copy bytes from the pages into the working buffer */
312 buf = workspace->cbuf;
313 buf_offset = 0;
314 while (working_bytes) {
315 bytes = min(working_bytes, in_page_bytes_left);
316
317 memcpy(buf + buf_offset, data_in + in_offset, bytes);
318 buf_offset += bytes;
319cont:
320 working_bytes -= bytes;
321 in_page_bytes_left -= bytes;
322 in_offset += bytes;
323
324 /* check if we need to pick another page */
325 if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
326 || in_page_bytes_left == 0) {
327 tot_in += in_page_bytes_left;
328
329 if (working_bytes == 0 && tot_in >= tot_len)
330 break;
331
332 kunmap(pages_in[page_in_index]);
333 page_in_index++;
334 if (page_in_index >= total_pages_in) {
335 ret = -1;
336 data_in = NULL;
337 goto done;
338 }
339 data_in = kmap(pages_in[page_in_index]);
340
341 in_page_bytes_left = PAGE_CACHE_SIZE;
342 in_offset = 0;
343 }
344 }
345
346 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
347 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
348 &out_len);
349 if (ret != LZO_E_OK) {
350 printk(KERN_WARNING "btrfs decompress failed\n");
351 ret = -1;
352 break;
353 }
354
355 buf_start = tot_out;
356 tot_out += out_len;
357
358 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
359 tot_out, disk_start,
360 bvec, vcnt,
361 &page_out_index, &pg_offset);
362 if (ret2 == 0)
363 break;
364 }
365done:
366 if (data_in)
367 kunmap(pages_in[page_in_index]);
368 return ret;
369}
370
371static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
372 struct page *dest_page,
373 unsigned long start_byte,
374 size_t srclen, size_t destlen)
375{
376 struct workspace *workspace = list_entry(ws, struct workspace, list);
377 size_t in_len;
378 size_t out_len;
379 size_t tot_len;
380 int ret = 0;
381 char *kaddr;
382 unsigned long bytes;
383
384 BUG_ON(srclen < LZO_LEN);
385
386 tot_len = read_compress_length(data_in);
387 data_in += LZO_LEN;
388
389 in_len = read_compress_length(data_in);
390 data_in += LZO_LEN;
391
392 out_len = PAGE_CACHE_SIZE;
393 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
394 if (ret != LZO_E_OK) {
395 printk(KERN_WARNING "btrfs decompress failed!\n");
396 ret = -1;
397 goto out;
398 }
399
400 if (out_len < start_byte) {
401 ret = -1;
402 goto out;
403 }
404
405 bytes = min_t(unsigned long, destlen, out_len - start_byte);
406
407 kaddr = kmap_atomic(dest_page, KM_USER0);
408 memcpy(kaddr, workspace->buf + start_byte, bytes);
409 kunmap_atomic(kaddr, KM_USER0);
410out:
411 return ret;
412}
413
414struct btrfs_compress_op btrfs_lzo_compress = {
415 .alloc_workspace = lzo_alloc_workspace,
416 .free_workspace = lzo_free_workspace,
417 .compress_pages = lzo_compress_pages,
418 .decompress_biovec = lzo_decompress_biovec,
419 .decompress = lzo_decompress,
420};
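The framing that lzo_compress_pages() writes and both decompress paths parse: a 4-byte little-endian total length (headers and padding included) opens the extent, then each segment is a 4-byte length followed by that many compressed bytes; a length header never straddles a page, so when fewer than LZO_LEN bytes remain in a page they are zero padding and the next header starts at the following page boundary. A user-space model of that walk, for illustration only:

#include <stdint.h>
#include <string.h>

#define LZO_LEN   4
#define PAGE_SIZE 4096

static uint32_t read_le32(const uint8_t *p)
{
	uint32_t v;

	memcpy(&v, p, LZO_LEN);
	return v;	/* little-endian host assumed, like le32_to_cpu() */
}

/* count the compressed segments of one extent framed as described above */
int count_lzo_segments(const uint8_t *stream)
{
	uint32_t tot_len = read_le32(stream);
	uint32_t off = LZO_LEN;
	int nr = 0;

	while (off + LZO_LEN <= tot_len) {
		uint32_t room = PAGE_SIZE - (off % PAGE_SIZE);

		/* fewer than LZO_LEN bytes left in this page: zero padding */
		if (room < LZO_LEN) {
			off += room;
			continue;
		}
		off += LZO_LEN + read_le32(stream + off); /* header + payload */
		nr++;
	}
	return nr;
}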
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ae7737e352c9..2b61e1ddcd99 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
172 */ 172 */
173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 u64 start, u64 len, u64 disk_len, 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio)
175 int type, int dio, int compress_type)
176{ 176{
177 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
178 struct rb_node *node; 178 struct rb_node *node;
@@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
189 entry->disk_len = disk_len; 189 entry->disk_len = disk_len;
190 entry->bytes_left = len; 190 entry->bytes_left = len;
191 entry->inode = inode; 191 entry->inode = inode;
192 entry->compress_type = compress_type;
192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 193 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
193 set_bit(type, &entry->flags); 194 set_bit(type, &entry->flags);
194 195
@@ -220,14 +221,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type) 221 u64 start, u64 len, u64 disk_len, int type)
221{ 222{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 223 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0);
224 disk_len, type, 0,
225 BTRFS_COMPRESS_NONE);
224} 226}
225 227
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 228int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type) 229 u64 start, u64 len, u64 disk_len, int type)
228{ 230{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 231 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1);
232 disk_len, type, 1,
233 BTRFS_COMPRESS_NONE);
234}
235
236int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
237 u64 start, u64 len, u64 disk_len,
238 int type, int compress_type)
239{
240 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
241 disk_len, type, 0,
242 compress_type);
231} 243}
232 244
233/* 245/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 61dca83119dd..ff1f69aa1883 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -68,7 +68,7 @@ struct btrfs_ordered_sum {
68 68
69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ 69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
70 70
71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
@@ -93,6 +93,9 @@ struct btrfs_ordered_extent {
93 /* flags (described above) */ 93 /* flags (described above) */
94 unsigned long flags; 94 unsigned long flags;
95 95
96 /* compression algorithm */
97 int compress_type;
98
96 /* reference count */ 99 /* reference count */
97 atomic_t refs; 100 atomic_t refs;
98 101
@@ -148,6 +151,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
148 u64 start, u64 len, u64 disk_len, int type); 151 u64 start, u64 len, u64 disk_len, int type);
149int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 152int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
150 u64 start, u64 len, u64 disk_len, int type); 153 u64 start, u64 len, u64 disk_len, int type);
154int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
155 u64 start, u64 len, u64 disk_len,
156 int type, int compress_type);
151int btrfs_add_ordered_sum(struct inode *inode, 157int btrfs_add_ordered_sum(struct inode *inode,
152 struct btrfs_ordered_extent *entry, 158 struct btrfs_ordered_extent *entry,
153 struct btrfs_ordered_sum *sum); 159 struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 883c6fa1367e..b2130c46fdb5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -54,6 +54,90 @@
54 54
55static const struct super_operations btrfs_super_ops; 55static const struct super_operations btrfs_super_ops;
56 56
57static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
58 char nbuf[16])
59{
60 char *errstr = NULL;
61
62 switch (errno) {
63 case -EIO:
64 errstr = "IO failure";
65 break;
66 case -ENOMEM:
67 errstr = "Out of memory";
68 break;
69 case -EROFS:
70 errstr = "Readonly filesystem";
71 break;
72 default:
73 if (nbuf) {
74 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
75 errstr = nbuf;
76 }
77 break;
78 }
79
80 return errstr;
81}
82
83static void __save_error_info(struct btrfs_fs_info *fs_info)
84{
85 /*
86 * today we only save the error info into ram. Long term we'll
87 * also send it down to the disk
88 */
89 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
90}
91
92/* NOTE:
93 * We move the write_super work to umount time in order to avoid a
94 * deadlock, since umount holds all the locks.
95 */
96static void save_error_info(struct btrfs_fs_info *fs_info)
97{
98 __save_error_info(fs_info);
99}
100
101/* btrfs handles errors by forcing the filesystem read-only */
102static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
103{
104 struct super_block *sb = fs_info->sb;
105
106 if (sb->s_flags & MS_RDONLY)
107 return;
108
109 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
110 sb->s_flags |= MS_RDONLY;
111 printk(KERN_INFO "btrfs is forced readonly\n");
112 }
113}
114
115/*
116 * __btrfs_std_error decodes expected errors from the caller and
117 * invokes the appropriate error response.
118 */
119void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
120 unsigned int line, int errno)
121{
122 struct super_block *sb = fs_info->sb;
123 char nbuf[16];
124 const char *errstr;
125
126 /*
127 * Special case: if the error is EROFS, and we're already
128 * under MS_RDONLY, then it is safe here.
129 */
130 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
131 return;
132
133 errstr = btrfs_decode_error(fs_info, errno, nbuf);
134 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
135 sb->s_id, function, line, errstr);
136 save_error_info(fs_info);
137
138 btrfs_handle_error(fs_info);
139}
140
57static void btrfs_put_super(struct super_block *sb) 141static void btrfs_put_super(struct super_block *sb)
58{ 142{
59 struct btrfs_root *root = btrfs_sb(sb); 143 struct btrfs_root *root = btrfs_sb(sb);
@@ -69,9 +153,9 @@ enum {
69 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, 153 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
70 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, 154 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
71 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 155 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
72 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
73 Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
74 Opt_user_subvol_rm_allowed,
156 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
157 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
158 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
75}; 159};
76 160
77static match_table_t tokens = { 161static match_table_t tokens = {
@@ -86,7 +170,9 @@ static match_table_t tokens = {
86 {Opt_alloc_start, "alloc_start=%s"}, 170 {Opt_alloc_start, "alloc_start=%s"},
87 {Opt_thread_pool, "thread_pool=%d"}, 171 {Opt_thread_pool, "thread_pool=%d"},
88 {Opt_compress, "compress"}, 172 {Opt_compress, "compress"},
173 {Opt_compress_type, "compress=%s"},
89 {Opt_compress_force, "compress-force"}, 174 {Opt_compress_force, "compress-force"},
175 {Opt_compress_force_type, "compress-force=%s"},
90 {Opt_ssd, "ssd"}, 176 {Opt_ssd, "ssd"},
91 {Opt_ssd_spread, "ssd_spread"}, 177 {Opt_ssd_spread, "ssd_spread"},
92 {Opt_nossd, "nossd"}, 178 {Opt_nossd, "nossd"},
@@ -112,6 +198,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
112 char *p, *num, *orig; 198 char *p, *num, *orig;
113 int intarg; 199 int intarg;
114 int ret = 0; 200 int ret = 0;
201 char *compress_type;
202 bool compress_force = false;
115 203
116 if (!options) 204 if (!options)
117 return 0; 205 return 0;
@@ -154,14 +242,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
154 btrfs_set_opt(info->mount_opt, NODATACOW); 242 btrfs_set_opt(info->mount_opt, NODATACOW);
155 btrfs_set_opt(info->mount_opt, NODATASUM); 243 btrfs_set_opt(info->mount_opt, NODATASUM);
156 break; 244 break;
157 case Opt_compress:
158 printk(KERN_INFO "btrfs: use compression\n");
159 btrfs_set_opt(info->mount_opt, COMPRESS);
160 break;
161 case Opt_compress_force: 245 case Opt_compress_force:
162 printk(KERN_INFO "btrfs: forcing compression\n");
163 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
246 case Opt_compress_force_type:
247 compress_force = true;
248 case Opt_compress:
249 case Opt_compress_type:
250 if (token == Opt_compress ||
251 token == Opt_compress_force ||
252 strcmp(args[0].from, "zlib") == 0) {
253 compress_type = "zlib";
254 info->compress_type = BTRFS_COMPRESS_ZLIB;
255 } else if (strcmp(args[0].from, "lzo") == 0) {
256 compress_type = "lzo";
257 info->compress_type = BTRFS_COMPRESS_LZO;
258 } else {
259 ret = -EINVAL;
260 goto out;
261 }
262
164 btrfs_set_opt(info->mount_opt, COMPRESS); 263 btrfs_set_opt(info->mount_opt, COMPRESS);
264 if (compress_force) {
265 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
266 pr_info("btrfs: force %s compression\n",
267 compress_type);
268 } else
269 pr_info("btrfs: use %s compression\n",
270 compress_type);
165 break; 271 break;
166 case Opt_ssd: 272 case Opt_ssd:
167 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 273 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
@@ -460,6 +566,7 @@ static int btrfs_fill_super(struct super_block *sb,
460 sb->s_maxbytes = MAX_LFS_FILESIZE; 566 sb->s_maxbytes = MAX_LFS_FILESIZE;
461 sb->s_magic = BTRFS_SUPER_MAGIC; 567 sb->s_magic = BTRFS_SUPER_MAGIC;
462 sb->s_op = &btrfs_super_ops; 568 sb->s_op = &btrfs_super_ops;
569 sb->s_d_op = &btrfs_dentry_operations;
463 sb->s_export_op = &btrfs_export_ops; 570 sb->s_export_op = &btrfs_export_ops;
464 sb->s_xattr = btrfs_xattr_handlers; 571 sb->s_xattr = btrfs_xattr_handlers;
465 sb->s_time_gran = 1; 572 sb->s_time_gran = 1;
@@ -752,6 +859,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
752 return 0; 859 return 0;
753} 860}
754 861
862/*
863 * Helper to calculate the free space on the devices that can be used to store
864 * file data.
865 */
866static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
867{
868 struct btrfs_fs_info *fs_info = root->fs_info;
869 struct btrfs_device_info *devices_info;
870 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
871 struct btrfs_device *device;
872 u64 skip_space;
873 u64 type;
874 u64 avail_space;
875 u64 used_space;
876 u64 min_stripe_size;
877 int min_stripes = 1;
878 int i = 0, nr_devices;
879 int ret;
880
881 nr_devices = fs_info->fs_devices->rw_devices;
882 BUG_ON(!nr_devices);
883
884 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
885 GFP_NOFS);
886 if (!devices_info)
887 return -ENOMEM;
888
889 /* calc min stripe number for data space allocation */
890 type = btrfs_get_alloc_profile(root, 1);
891 if (type & BTRFS_BLOCK_GROUP_RAID0)
892 min_stripes = 2;
893 else if (type & BTRFS_BLOCK_GROUP_RAID1)
894 min_stripes = 2;
895 else if (type & BTRFS_BLOCK_GROUP_RAID10)
896 min_stripes = 4;
897
898 if (type & BTRFS_BLOCK_GROUP_DUP)
899 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
900 else
901 min_stripe_size = BTRFS_STRIPE_LEN;
902
903 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
904 if (!device->in_fs_metadata)
905 continue;
906
907 avail_space = device->total_bytes - device->bytes_used;
908
909 /* align with stripe_len */
910 do_div(avail_space, BTRFS_STRIPE_LEN);
911 avail_space *= BTRFS_STRIPE_LEN;
912
913 /*
914 * In order to avoid overwriting the superblock on the drive,
915 * btrfs starts at an offset of at least 1MB when doing chunk
916 * allocation.
917 */
918 skip_space = 1024 * 1024;
919
920 /* user can set the offset in fs_info->alloc_start. */
921 if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
922 device->total_bytes)
923 skip_space = max(fs_info->alloc_start, skip_space);
924
925 /*
926 * btrfs can not use the free space in [0, skip_space - 1],
927 * we must subtract it from the total. In order to implement
928 * it, we account the used space in this range first.
929 */
930 ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
931 &used_space);
932 if (ret) {
933 kfree(devices_info);
934 return ret;
935 }
936
937 /* calc the free space in [0, skip_space - 1] */
938 skip_space -= used_space;
939
940 /*
941 * we can not use the free space in [0, skip_space - 1], so we
942 * subtract it from the total.
943 */
944 if (avail_space && avail_space >= skip_space)
945 avail_space -= skip_space;
946 else
947 avail_space = 0;
948
949 if (avail_space < min_stripe_size)
950 continue;
951
952 devices_info[i].dev = device;
953 devices_info[i].max_avail = avail_space;
954
955 i++;
956 }
957
958 nr_devices = i;
959
960 btrfs_descending_sort_devices(devices_info, nr_devices);
961
962 i = nr_devices - 1;
963 avail_space = 0;
964 while (nr_devices >= min_stripes) {
965 if (devices_info[i].max_avail >= min_stripe_size) {
966 int j;
967 u64 alloc_size;
968
969 avail_space += devices_info[i].max_avail * min_stripes;
970 alloc_size = devices_info[i].max_avail;
971 for (j = i + 1 - min_stripes; j <= i; j++)
972 devices_info[j].max_avail -= alloc_size;
973 }
974 i--;
975 nr_devices--;
976 }
977
978 kfree(devices_info);
979 *free_bytes = avail_space;
980 return 0;
981}
982
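A worked example of the greedy pairing loop above, with assumed sizes: three writable devices holding 10, 8 and 3 GiB of stripe-aligned free space, RAID1 data (min_stripes = 2). After the descending sort the walk starts from the smallest device: the 3 GiB device pairs with the 8 GiB one (avail_space += 3 * 2, both shrink by 3, leaving 10/5/0), then the remaining 5 GiB pairs against the 10 GiB device (avail_space += 5 * 2). The estimate is therefore 16 GiB of raw space (8 GiB of mirrored file data), and the 5 GiB tail that has no partner is correctly left out.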
755static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 983static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
756{ 984{
757 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 985 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
@@ -759,17 +987,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
759 struct list_head *head = &root->fs_info->space_info; 987 struct list_head *head = &root->fs_info->space_info;
760 struct btrfs_space_info *found; 988 struct btrfs_space_info *found;
761 u64 total_used = 0; 989 u64 total_used = 0;
762 u64 total_used_data = 0;
990 u64 total_free_data = 0;
763 int bits = dentry->d_sb->s_blocksize_bits; 991 int bits = dentry->d_sb->s_blocksize_bits;
764 __be32 *fsid = (__be32 *)root->fs_info->fsid; 992 __be32 *fsid = (__be32 *)root->fs_info->fsid;
993 int ret;
765 994
995 /* holding chunk_mutex to avoid allocating new chunks */
996 mutex_lock(&root->fs_info->chunk_mutex);
766 rcu_read_lock(); 997 rcu_read_lock();
767 list_for_each_entry_rcu(found, head, list) { 998 list_for_each_entry_rcu(found, head, list) {
768 if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
769 BTRFS_BLOCK_GROUP_SYSTEM))
770 total_used_data += found->disk_total;
771 else
772 total_used_data += found->disk_used;
999 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
1000 total_free_data += found->disk_total - found->disk_used;
1001 total_free_data -=
1002 btrfs_account_ro_block_groups_free_space(found);
1003 }
1004
773 total_used += found->disk_used; 1005 total_used += found->disk_used;
774 } 1006 }
775 rcu_read_unlock(); 1007 rcu_read_unlock();
@@ -777,9 +1009,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
777 buf->f_namelen = BTRFS_NAME_LEN; 1009 buf->f_namelen = BTRFS_NAME_LEN;
778 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 1010 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
779 buf->f_bfree = buf->f_blocks - (total_used >> bits); 1011 buf->f_bfree = buf->f_blocks - (total_used >> bits);
780 buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
781 buf->f_bsize = dentry->d_sb->s_blocksize; 1012 buf->f_bsize = dentry->d_sb->s_blocksize;
782 buf->f_type = BTRFS_SUPER_MAGIC; 1013 buf->f_type = BTRFS_SUPER_MAGIC;
1014 buf->f_bavail = total_free_data;
1015 ret = btrfs_calc_avail_data_space(root, &total_free_data);
1016 if (ret) {
1017 mutex_unlock(&root->fs_info->chunk_mutex);
1018 return ret;
1019 }
1020 buf->f_bavail += total_free_data;
1021 buf->f_bavail = buf->f_bavail >> bits;
1022 mutex_unlock(&root->fs_info->chunk_mutex);
783 1023
784 /* We treat it as constant endianness (it doesn't matter _which_) 1024 /* We treat it as constant endianness (it doesn't matter _which_)
785 because we want the fsid to come out the same whether mounted 1025 because we want the fsid to come out the same whether mounted
@@ -896,10 +1136,14 @@ static int __init init_btrfs_fs(void)
896 if (err) 1136 if (err)
897 return err; 1137 return err;
898 1138
899 err = btrfs_init_cachep();
1139 err = btrfs_init_compress();
900 if (err) 1140 if (err)
901 goto free_sysfs; 1141 goto free_sysfs;
902 1142
1143 err = btrfs_init_cachep();
1144 if (err)
1145 goto free_compress;
1146
903 err = extent_io_init(); 1147 err = extent_io_init();
904 if (err) 1148 if (err)
905 goto free_cachep; 1149 goto free_cachep;
@@ -927,6 +1171,8 @@ free_extent_io:
927 extent_io_exit(); 1171 extent_io_exit();
928free_cachep: 1172free_cachep:
929 btrfs_destroy_cachep(); 1173 btrfs_destroy_cachep();
1174free_compress:
1175 btrfs_exit_compress();
930free_sysfs: 1176free_sysfs:
931 btrfs_exit_sysfs(); 1177 btrfs_exit_sysfs();
932 return err; 1178 return err;
@@ -941,7 +1187,7 @@ static void __exit exit_btrfs_fs(void)
941 unregister_filesystem(&btrfs_fs_type); 1187 unregister_filesystem(&btrfs_fs_type);
942 btrfs_exit_sysfs(); 1188 btrfs_exit_sysfs();
943 btrfs_cleanup_fs_uuids(); 1189 btrfs_cleanup_fs_uuids();
944 btrfs_zlib_exit();
1190 btrfs_exit_compress();
945} 1191}
946 1192
947module_init(init_btrfs_fs) 1193module_init(init_btrfs_fs)
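The resulting option grammar: bare "compress" and "compress-force" keep their old meaning and select zlib, while "compress=zlib", "compress=lzo", "compress-force=zlib" and "compress-force=lzo" name the algorithm explicitly; any other name fails the mount with -EINVAL. For illustration (device and mount point are made up):

#include <sys/mount.h>

int mount_with_lzo(void)
{
	return mount("/dev/sdb1", "/mnt/scratch", "btrfs", 0,
		     "compress=lzo");
}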
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f50e931fc217..bae5c7b8bbe2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -181,6 +181,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
181 struct btrfs_trans_handle *h; 181 struct btrfs_trans_handle *h;
182 struct btrfs_transaction *cur_trans; 182 struct btrfs_transaction *cur_trans;
183 int ret; 183 int ret;
184
185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
186 return ERR_PTR(-EROFS);
184again: 187again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 188 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h) 189 if (!h)
@@ -910,6 +913,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
910 u64 to_reserve = 0; 913 u64 to_reserve = 0;
911 u64 index = 0; 914 u64 index = 0;
912 u64 objectid; 915 u64 objectid;
916 u64 root_flags;
913 917
914 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 918 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
915 if (!new_root_item) { 919 if (!new_root_item) {
@@ -967,6 +971,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
967 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 971 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
968 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 972 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
969 973
974 root_flags = btrfs_root_flags(new_root_item);
975 if (pending->readonly)
976 root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
977 else
978 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
979 btrfs_set_root_flags(new_root_item, root_flags);
980
970 old = btrfs_lock_root_node(root); 981 old = btrfs_lock_root_node(root);
971 btrfs_cow_block(trans, root, old, NULL, 0, &old); 982 btrfs_cow_block(trans, root, old, NULL, 0, &old);
972 btrfs_set_lock_blocking(old); 983 btrfs_set_lock_blocking(old);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index f104b57ad4ef..229a594cacd5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -62,6 +62,7 @@ struct btrfs_pending_snapshot {
62 struct btrfs_block_rsv block_rsv; 62 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reseration for relocation */ 63 /* extra metadata reseration for relocation */
64 int error; 64 int error;
65 bool readonly;
65 struct list_head list; 66 struct list_head list;
66}; 67};
67 68
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6b9884507837..d158530233b7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -22,6 +22,7 @@
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include "compat.h" 27#include "compat.h"
27#include "ctree.h" 28#include "ctree.h"
@@ -493,7 +494,7 @@ again:
493 continue; 494 continue;
494 495
495 if (device->bdev) { 496 if (device->bdev) {
496 close_bdev_exclusive(device->bdev, device->mode);
497 blkdev_put(device->bdev, device->mode);
497 device->bdev = NULL; 498 device->bdev = NULL;
498 fs_devices->open_devices--; 499 fs_devices->open_devices--;
499 } 500 }
@@ -527,7 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
527 528
528 list_for_each_entry(device, &fs_devices->devices, dev_list) { 529 list_for_each_entry(device, &fs_devices->devices, dev_list) {
529 if (device->bdev) { 530 if (device->bdev) {
530 close_bdev_exclusive(device->bdev, device->mode);
531 blkdev_put(device->bdev, device->mode);
531 fs_devices->open_devices--; 532 fs_devices->open_devices--;
532 } 533 }
533 if (device->writeable) { 534 if (device->writeable) {
@@ -584,13 +585,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
584 int seeding = 1; 585 int seeding = 1;
585 int ret = 0; 586 int ret = 0;
586 587
588 flags |= FMODE_EXCL;
589
587 list_for_each_entry(device, head, dev_list) { 590 list_for_each_entry(device, head, dev_list) {
588 if (device->bdev) 591 if (device->bdev)
589 continue; 592 continue;
590 if (!device->name) 593 if (!device->name)
591 continue; 594 continue;
592 595
593 bdev = open_bdev_exclusive(device->name, flags, holder);
596 bdev = blkdev_get_by_path(device->name, flags, holder);
594 if (IS_ERR(bdev)) { 597 if (IS_ERR(bdev)) {
595 printk(KERN_INFO "open %s failed\n", device->name); 598 printk(KERN_INFO "open %s failed\n", device->name);
596 goto error; 599 goto error;
@@ -598,8 +601,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
598 set_blocksize(bdev, 4096); 601 set_blocksize(bdev, 4096);
599 602
600 bh = btrfs_read_dev_super(bdev); 603 bh = btrfs_read_dev_super(bdev);
601 if (!bh)
604 if (!bh) {
605 ret = -EINVAL;
602 goto error_close; 606 goto error_close;
607 }
603 608
604 disk_super = (struct btrfs_super_block *)bh->b_data; 609 disk_super = (struct btrfs_super_block *)bh->b_data;
605 devid = btrfs_stack_device_id(&disk_super->dev_item); 610 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -642,7 +647,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
642error_brelse: 647error_brelse:
643 brelse(bh); 648 brelse(bh);
644error_close: 649error_close:
645 close_bdev_exclusive(bdev, FMODE_READ);
650 blkdev_put(bdev, flags);
646error: 651error:
647 continue; 652 continue;
648 } 653 }
@@ -688,7 +693,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
688 693
689 mutex_lock(&uuid_mutex); 694 mutex_lock(&uuid_mutex);
690 695
691 bdev = open_bdev_exclusive(path, flags, holder);
696 flags |= FMODE_EXCL;
697 bdev = blkdev_get_by_path(path, flags, holder);
692 698
693 if (IS_ERR(bdev)) { 699 if (IS_ERR(bdev)) {
694 ret = PTR_ERR(bdev); 700 ret = PTR_ERR(bdev);
@@ -700,7 +706,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
700 goto error_close; 706 goto error_close;
701 bh = btrfs_read_dev_super(bdev); 707 bh = btrfs_read_dev_super(bdev);
702 if (!bh) { 708 if (!bh) {
703 ret = -EIO;
709 ret = -EINVAL;
704 goto error_close; 710 goto error_close;
705 } 711 }
706 disk_super = (struct btrfs_super_block *)bh->b_data; 712 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -720,65 +726,173 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
720 726
721 brelse(bh); 727 brelse(bh);
722error_close: 728error_close:
723 close_bdev_exclusive(bdev, flags);
729 blkdev_put(bdev, flags);
724error: 730error:
725 mutex_unlock(&uuid_mutex); 731 mutex_unlock(&uuid_mutex);
726 return ret; 732 return ret;
727} 733}
728 734
735/* helper to account the used device space in the range */
736int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
737 u64 end, u64 *length)
738{
739 struct btrfs_key key;
740 struct btrfs_root *root = device->dev_root;
741 struct btrfs_dev_extent *dev_extent;
742 struct btrfs_path *path;
743 u64 extent_end;
744 int ret;
745 int slot;
746 struct extent_buffer *l;
747
748 *length = 0;
749
750 if (start >= device->total_bytes)
751 return 0;
752
753 path = btrfs_alloc_path();
754 if (!path)
755 return -ENOMEM;
756 path->reada = 2;
757
758 key.objectid = device->devid;
759 key.offset = start;
760 key.type = BTRFS_DEV_EXTENT_KEY;
761
762 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
763 if (ret < 0)
764 goto out;
765 if (ret > 0) {
766 ret = btrfs_previous_item(root, path, key.objectid, key.type);
767 if (ret < 0)
768 goto out;
769 }
770
771 while (1) {
772 l = path->nodes[0];
773 slot = path->slots[0];
774 if (slot >= btrfs_header_nritems(l)) {
775 ret = btrfs_next_leaf(root, path);
776 if (ret == 0)
777 continue;
778 if (ret < 0)
779 goto out;
780
781 break;
782 }
783 btrfs_item_key_to_cpu(l, &key, slot);
784
785 if (key.objectid < device->devid)
786 goto next;
787
788 if (key.objectid > device->devid)
789 break;
790
791 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
792 goto next;
793
794 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
795 extent_end = key.offset + btrfs_dev_extent_length(l,
796 dev_extent);
797 if (key.offset <= start && extent_end > end) {
798 *length = end - start + 1;
799 break;
800 } else if (key.offset <= start && extent_end > start)
801 *length += extent_end - start;
802 else if (key.offset > start && extent_end <= end)
803 *length += extent_end - key.offset;
804 else if (key.offset > start && key.offset <= end) {
805 *length += end - key.offset + 1;
806 break;
807 } else if (key.offset > end)
808 break;
809
810next:
811 path->slots[0]++;
812 }
813 ret = 0;
814out:
815 btrfs_free_path(path);
816 return ret;
817}
818
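The four overlap cases in the loop above, worked on an assumed range [start, end] = [0, 1MiB - 1]: an extent covering the whole range sets *length to the full 1MiB and stops; one that begins before start contributes only the part past start; one lying wholly inside contributes its own length; and one that begins inside but runs past end contributes end - key.offset + 1 and stops. A single dev extent at [512KiB, 2MiB), for instance, yields used_space = 512KiB, so the caller's reserved area still contains 512KiB of genuinely free space to subtract.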
729/* 819/*
820 * find_free_dev_extent - find free space in the specified device
821 * @trans: transaction handler
822 * @device: the device which we search the free space in
823 * @num_bytes: the size of the free space that we need
824 * @start: store the start of the free space.
825 * @len: the size of the free space that we find, or the size of the max
826 * free space if we don't find suitable free space
827 *
730 * this uses a pretty simple search, the expectation is that it is 828 * this uses a pretty simple search, the expectation is that it is
731 * called very infrequently and that a given device has a small number 829 * called very infrequently and that a given device has a small number
732 * of extents 830 * of extents
831 *
832 * @start is used to store the start of the free space if we find one. But if we
833 * don't find suitable free space, it will be used to store the start position
834 * of the max free space.
835 *
836 * @len is used to store the size of the free space that we find.
837 * But if we don't find suitable free space, it is used to store the size of
838 * the max free space.
733 */ 839 */
734int find_free_dev_extent(struct btrfs_trans_handle *trans, 840int find_free_dev_extent(struct btrfs_trans_handle *trans,
735 struct btrfs_device *device, u64 num_bytes, 841 struct btrfs_device *device, u64 num_bytes,
736 u64 *start, u64 *max_avail)
842 u64 *start, u64 *len)
737{ 843{
738 struct btrfs_key key; 844 struct btrfs_key key;
739 struct btrfs_root *root = device->dev_root; 845 struct btrfs_root *root = device->dev_root;
740 struct btrfs_dev_extent *dev_extent = NULL;
846 struct btrfs_dev_extent *dev_extent;
741 struct btrfs_path *path; 847 struct btrfs_path *path;
742 u64 hole_size = 0;
743 u64 last_byte = 0;
744 u64 search_start = 0;
848 u64 hole_size;
849 u64 max_hole_start;
850 u64 max_hole_size;
851 u64 extent_end;
852 u64 search_start;
745 u64 search_end = device->total_bytes; 853 u64 search_end = device->total_bytes;
746 int ret; 854 int ret;
747 int slot = 0;
855 int slot;
748 int start_found;
749 struct extent_buffer *l; 856 struct extent_buffer *l;
750 857
751 path = btrfs_alloc_path();
752 if (!path)
753 return -ENOMEM;
754 path->reada = 2;
755 start_found = 0;
756
757 /* FIXME use last free of some kind */ 858 /* FIXME use last free of some kind */
758 859
759 /* we don't want to overwrite the superblock on the drive, 860 /* we don't want to overwrite the superblock on the drive,
760 * so we make sure to start at an offset of at least 1MB 861 * so we make sure to start at an offset of at least 1MB
761 */ 862 */
762 search_start = max((u64)1024 * 1024, search_start);
863 search_start = 1024 * 1024;
763 864
764 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
865 if (root->fs_info->alloc_start + num_bytes <= search_end)
765 search_start = max(root->fs_info->alloc_start, search_start); 866 search_start = max(root->fs_info->alloc_start, search_start);
766 867
868 max_hole_start = search_start;
869 max_hole_size = 0;
870
871 if (search_start >= search_end) {
872 ret = -ENOSPC;
873 goto error;
874 }
875
876 path = btrfs_alloc_path();
877 if (!path) {
878 ret = -ENOMEM;
879 goto error;
880 }
881 path->reada = 2;
882
767 key.objectid = device->devid; 883 key.objectid = device->devid;
768 key.offset = search_start; 884 key.offset = search_start;
769 key.type = BTRFS_DEV_EXTENT_KEY; 885 key.type = BTRFS_DEV_EXTENT_KEY;
886
770 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 887 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
771 if (ret < 0) 888 if (ret < 0)
772 goto error;
889 goto out;
773 if (ret > 0) { 890 if (ret > 0) {
774 ret = btrfs_previous_item(root, path, key.objectid, key.type); 891 ret = btrfs_previous_item(root, path, key.objectid, key.type);
775 if (ret < 0) 892 if (ret < 0)
776 goto error;
893 goto out;
777 if (ret > 0)
778 start_found = 1;
779 } 894 }
780 l = path->nodes[0];
781 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
895
782 while (1) { 896 while (1) {
783 l = path->nodes[0]; 897 l = path->nodes[0];
784 slot = path->slots[0]; 898 slot = path->slots[0];
@@ -787,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
787 if (ret == 0) 901 if (ret == 0)
788 continue; 902 continue;
789 if (ret < 0) 903 if (ret < 0)
790 goto error;
791no_more_items:
792 if (!start_found) {
904 goto out;
905
906 break;
793 if (search_start >= search_end) {
794 ret = -ENOSPC;
795 goto error;
796 }
797 *start = search_start;
798 start_found = 1;
799 goto check_pending;
800 }
801 *start = last_byte > search_start ?
802 last_byte : search_start;
803 if (search_end <= *start) {
804 ret = -ENOSPC;
805 goto error;
806 }
807 goto check_pending;
808 } 907 }
809 btrfs_item_key_to_cpu(l, &key, slot); 908 btrfs_item_key_to_cpu(l, &key, slot);
810 909
@@ -812,48 +911,62 @@ no_more_items:
812 goto next; 911 goto next;
813 912
814 if (key.objectid > device->devid) 913 if (key.objectid > device->devid)
815 goto no_more_items;
914 break;
816 915
817 if (key.offset >= search_start && key.offset > last_byte &&
818 start_found) {
916 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
917 goto next;
819 if (last_byte < search_start)
820 last_byte = search_start;
821 hole_size = key.offset - last_byte;
822 918
823 if (hole_size > *max_avail)
824 *max_avail = hole_size;
919 if (key.offset > search_start) {
920 hole_size = key.offset - search_start;
825 921
826 if (key.offset > last_byte &&
827 hole_size >= num_bytes) {
828 *start = last_byte;
829 goto check_pending;
922 if (hole_size > max_hole_size) {
923 max_hole_start = search_start;
924 max_hole_size = hole_size;
925 }
926
927 /*
928 * If this free space is greater than which we need,
929 * it must be the max free space that we have found
930 * until now, so max_hole_start must point to the start
931 * of this free space and the length of this free space
932 * is stored in max_hole_size. Thus, we return
933 * max_hole_start and max_hole_size and go back to the
934 * caller.
935 */
936 if (hole_size >= num_bytes) {
937 ret = 0;
938 goto out;
830 } 939 }
831 } 940 }
832 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
833 goto next;
834 941
835 start_found = 1;
836 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 942 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
837 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
943 extent_end = key.offset + btrfs_dev_extent_length(l,
944 dev_extent);
945 if (extent_end > search_start)
946 search_start = extent_end;
838next: 947next:
839 path->slots[0]++; 948 path->slots[0]++;
840 cond_resched(); 949 cond_resched();
841 } 950 }
842check_pending:
843 /* we have to make sure we didn't find an extent that has already
844 * been allocated by the map tree or the original allocation
845 */
846 BUG_ON(*start < search_start);
847 951
848 if (*start + num_bytes > search_end) {
849 ret = -ENOSPC;
850 goto error;
952 hole_size = search_end - search_start;
953 if (hole_size > max_hole_size) {
954 max_hole_start = search_start;
955 max_hole_size = hole_size;
851 } 956 }
852 /* check for pending inserts here */
853 ret = 0;
854 957
855error:
958 /* See above. */
959 if (hole_size < num_bytes)
960 ret = -ENOSPC;
961 else
962 ret = 0;
963
964out:
856 btrfs_free_path(path); 965 btrfs_free_path(path);
966error:
967 *start = max_hole_start;
968 if (len)
969 *len = max_hole_size;
857 return ret; 970 return ret;
858} 971}
859 972
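The rewrite changes the function's contract: instead of failing outright it now always reports the best hole it saw through *start and *len, and only the return value says whether that hole is big enough. A compact user-space model over a pre-sorted array of allocated extents, for illustration only:

#include <stdint.h>

struct extent { uint64_t start, len; };

/* returns 0 with a hole of at least num_bytes, else -1 with the max hole */
int model_find_free(const struct extent *e, int n, uint64_t search_start,
		    uint64_t search_end, uint64_t num_bytes,
		    uint64_t *start, uint64_t *len)
{
	uint64_t max_hole_start = search_start, max_hole_size = 0;

	for (int i = 0; i < n; i++) {
		if (e[i].start > search_start) {
			uint64_t hole = e[i].start - search_start;

			if (hole > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole;
			}
			if (hole >= num_bytes)	/* big enough, stop early */
				goto out;
		}
		if (e[i].start + e[i].len > search_start)
			search_start = e[i].start + e[i].len;
	}
	/* the trailing hole between the last extent and the device end */
	if (search_end - search_start > max_hole_size) {
		max_hole_start = search_start;
		max_hole_size = search_end - search_start;
	}
out:
	*start = max_hole_start;
	*len = max_hole_size;
	return max_hole_size >= num_bytes ? 0 : -1;
}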
@@ -1183,8 +1296,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1183 goto out; 1296 goto out;
1184 } 1297 }
1185 } else { 1298 } else {
1186 bdev = open_bdev_exclusive(device_path, FMODE_READ,
1187 root->fs_info->bdev_holder);
1299 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1300 root->fs_info->bdev_holder);
1188 if (IS_ERR(bdev)) { 1301 if (IS_ERR(bdev)) {
1189 ret = PTR_ERR(bdev); 1302 ret = PTR_ERR(bdev);
1190 goto out; 1303 goto out;
@@ -1193,7 +1306,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1193 set_blocksize(bdev, 4096); 1306 set_blocksize(bdev, 4096);
1194 bh = btrfs_read_dev_super(bdev); 1307 bh = btrfs_read_dev_super(bdev);
1195 if (!bh) { 1308 if (!bh) {
1196 ret = -EIO;
1309 ret = -EINVAL;
1197 goto error_close; 1310 goto error_close;
1198 } 1311 }
1199 disk_super = (struct btrfs_super_block *)bh->b_data; 1312 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1251,7 +1364,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1251 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1364 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1252 1365
1253 if (device->bdev) { 1366 if (device->bdev) {
1254 close_bdev_exclusive(device->bdev, device->mode);
1367 blkdev_put(device->bdev, device->mode);
1255 device->bdev = NULL; 1368 device->bdev = NULL;
1256 device->fs_devices->open_devices--; 1369 device->fs_devices->open_devices--;
1257 } 1370 }
@@ -1294,7 +1407,7 @@ error_brelse:
1294 brelse(bh); 1407 brelse(bh);
1295error_close: 1408error_close:
1296 if (bdev) 1409 if (bdev)
1297 close_bdev_exclusive(bdev, FMODE_READ);
1410 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1298out: 1411out:
1299 mutex_unlock(&root->fs_info->volume_mutex); 1412 mutex_unlock(&root->fs_info->volume_mutex);
1300 mutex_unlock(&uuid_mutex); 1413 mutex_unlock(&uuid_mutex);
@@ -1446,7 +1559,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1446 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1559 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1447 return -EINVAL; 1560 return -EINVAL;
1448 1561
1449 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
1562 bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
1563 root->fs_info->bdev_holder);
1450 if (IS_ERR(bdev)) 1564 if (IS_ERR(bdev))
1451 return PTR_ERR(bdev); 1565 return PTR_ERR(bdev);
1452 1566
@@ -1572,7 +1686,7 @@ out:
1572 mutex_unlock(&root->fs_info->volume_mutex); 1686 mutex_unlock(&root->fs_info->volume_mutex);
1573 return ret; 1687 return ret;
1574error: 1688error:
1575 close_bdev_exclusive(bdev, 0);
1689 blkdev_put(bdev, FMODE_EXCL);
1576 if (seeding_dev) { 1690 if (seeding_dev) {
1577 mutex_unlock(&uuid_mutex); 1691 mutex_unlock(&uuid_mutex);
1578 up_write(&sb->s_umount); 1692 up_write(&sb->s_umount);
@@ -1912,6 +2026,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
1912 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2026 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1913 return -EROFS; 2027 return -EROFS;
1914 2028
2029 if (!capable(CAP_SYS_ADMIN))
2030 return -EPERM;
2031
1915 mutex_lock(&dev_root->fs_info->volume_mutex); 2032 mutex_lock(&dev_root->fs_info->volume_mutex);
1916 dev_root = dev_root->fs_info->dev_root; 2033 dev_root = dev_root->fs_info->dev_root;
1917 2034
@@ -2150,66 +2267,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
2150 return calc_size * num_stripes; 2267 return calc_size * num_stripes;
2151} 2268}
2152 2269
2153static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2154 struct btrfs_root *extent_root,
2155 struct map_lookup **map_ret,
2156 u64 *num_bytes, u64 *stripe_size,
2157 u64 start, u64 type)
2158{
2159 struct btrfs_fs_info *info = extent_root->fs_info;
2160 struct btrfs_device *device = NULL;
2161 struct btrfs_fs_devices *fs_devices = info->fs_devices;
2162 struct list_head *cur;
2163 struct map_lookup *map = NULL;
2164 struct extent_map_tree *em_tree;
2165 struct extent_map *em;
2166 struct list_head private_devs;
2167 int min_stripe_size = 1 * 1024 * 1024;
2270/* Used to sort the devices by max_avail (descending sort) */
2271int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
2272{
2273 if (((struct btrfs_device_info *)dev_info1)->max_avail >
2274 ((struct btrfs_device_info *)dev_info2)->max_avail)
2275 return -1;
2276 else if (((struct btrfs_device_info *)dev_info1)->max_avail <
2277 ((struct btrfs_device_info *)dev_info2)->max_avail)
2278 return 1;
2279 else
2280 return 0;
2281}
2168 u64 calc_size = 1024 * 1024 * 1024;
2169 u64 max_chunk_size = calc_size;
2170 u64 min_free;
2171 u64 avail;
2172 u64 max_avail = 0;
2173 u64 dev_offset;
2174 int num_stripes = 1;
2175 int min_stripes = 1;
2176 int sub_stripes = 0;
2177 int looped = 0;
2178 int ret;
2179 int index;
2180 int stripe_len = 64 * 1024;
2181 2282
2182 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2183 (type & BTRFS_BLOCK_GROUP_DUP)) {
2184 WARN_ON(1);
2185 type &= ~BTRFS_BLOCK_GROUP_DUP;
2186 }
2187 if (list_empty(&fs_devices->alloc_list))
2188 return -ENOSPC;
2283static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
2284 int *num_stripes, int *min_stripes,
2285 int *sub_stripes)
2286{
2287 *num_stripes = 1;
2288 *min_stripes = 1;
2289 *sub_stripes = 0;
2189 2290
2190 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2291 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2191 num_stripes = fs_devices->rw_devices;
2192 min_stripes = 2;
2292 *num_stripes = fs_devices->rw_devices;
2293 *min_stripes = 2;
2193 } 2294 }
2194 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2295 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2195 num_stripes = 2;
2196 min_stripes = 2;
2296 *num_stripes = 2;
2297 *min_stripes = 2;
2197 } 2298 }
2198 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2299 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2199 if (fs_devices->rw_devices < 2) 2300 if (fs_devices->rw_devices < 2)
2200 return -ENOSPC; 2301 return -ENOSPC;
2201 num_stripes = 2;
2202 min_stripes = 2;
2302 *num_stripes = 2;
2303 *min_stripes = 2;
2203 } 2304 }
2204 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2305 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2205 num_stripes = fs_devices->rw_devices;
2206 if (num_stripes < 4)
2306 *num_stripes = fs_devices->rw_devices;
2307 if (*num_stripes < 4)
2207 return -ENOSPC; 2308 return -ENOSPC;
2208 num_stripes &= ~(u32)1;
2209 sub_stripes = 2;
2210 min_stripes = 4;
2309 *num_stripes &= ~(u32)1;
2310 *sub_stripes = 2;
2311 *min_stripes = 4;
2211 } 2312 }
2212 2313
2314 return 0;
2315}
2316
2317static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
2318 u64 proposed_size, u64 type,
2319 int num_stripes, int small_stripe)
2320{
2321 int min_stripe_size = 1 * 1024 * 1024;
2322 u64 calc_size = proposed_size;
2323 u64 max_chunk_size = calc_size;
2324 int ncopies = 1;
2325
2326 if (type & (BTRFS_BLOCK_GROUP_RAID1 |
2327 BTRFS_BLOCK_GROUP_DUP |
2328 BTRFS_BLOCK_GROUP_RAID10))
2329 ncopies = 2;
2330
2213 if (type & BTRFS_BLOCK_GROUP_DATA) { 2331 if (type & BTRFS_BLOCK_GROUP_DATA) {
2214 max_chunk_size = 10 * calc_size; 2332 max_chunk_size = 10 * calc_size;
2215 min_stripe_size = 64 * 1024 * 1024; 2333 min_stripe_size = 64 * 1024 * 1024;
@@ -2226,51 +2344,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2226 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2344 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2227 max_chunk_size); 2345 max_chunk_size);
2228 2346
2229again: 2347 if (calc_size * num_stripes > max_chunk_size * ncopies) {
2230 max_avail = 0; 2348 calc_size = max_chunk_size * ncopies;
2231 if (!map || map->num_stripes != num_stripes) {
2232 kfree(map);
2233 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2234 if (!map)
2235 return -ENOMEM;
2236 map->num_stripes = num_stripes;
2237 }
2238
2239 if (calc_size * num_stripes > max_chunk_size) {
2240 calc_size = max_chunk_size;
2241 do_div(calc_size, num_stripes); 2349 do_div(calc_size, num_stripes);
2242 do_div(calc_size, stripe_len); 2350 do_div(calc_size, BTRFS_STRIPE_LEN);
2243 calc_size *= stripe_len; 2351 calc_size *= BTRFS_STRIPE_LEN;
2244 } 2352 }
2245 2353
2246 /* we don't want tiny stripes */ 2354 /* we don't want tiny stripes */
2247 if (!looped) 2355 if (!small_stripe)
2248 calc_size = max_t(u64, min_stripe_size, calc_size); 2356 calc_size = max_t(u64, min_stripe_size, calc_size);
2249 2357
2250 /* 2358 /*
2251 * we're about to do_div by the stripe_len so let's make sure 2359 * we're about to do_div by the BTRFS_STRIPE_LEN so let's make sure
2252 * we end up with something bigger than a stripe 2360 * we end up with something bigger than a stripe
2253 */ 2361 */
2254 calc_size = max_t(u64, calc_size, stripe_len * 4); 2362 calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
2363
2364 do_div(calc_size, BTRFS_STRIPE_LEN);
2365 calc_size *= BTRFS_STRIPE_LEN;
2366
2367 return calc_size;
2368}
2369
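The net effect of the helper's tail is to clamp calc_size and align it down to a whole number of stripes. A standalone sketch of that rounding step, using plain 64-bit division where the kernel needs do_div():

    #include <stdio.h>
    #include <stdint.h>

    #define STRIPE_LEN (64 * 1024ULL)   /* same value as BTRFS_STRIPE_LEN */

    /* Align a proposed size down to a multiple of the stripe length. */
    static uint64_t align_to_stripe(uint64_t calc_size)
    {
        calc_size /= STRIPE_LEN;   /* do_div(calc_size, BTRFS_STRIPE_LEN) */
        calc_size *= STRIPE_LEN;
        return calc_size;
    }

    int main(void)
    {
        /* 100000 bytes rounds down to one 64KiB stripe: prints 65536 */
        printf("%llu\n", (unsigned long long)align_to_stripe(100000));
        return 0;
    }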
2370static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
2371 int num_stripes)
2372{
2373 struct map_lookup *new;
2374 size_t len = map_lookup_size(num_stripes);
2375
2376 BUG_ON(map->num_stripes < num_stripes);
2377
2378 if (map->num_stripes == num_stripes)
2379 return map;
2380
2381 new = kmalloc(len, GFP_NOFS);
2382 if (!new) {
2383 /* just change map->num_stripes */
2384 map->num_stripes = num_stripes;
2385 return map;
2386 }
2387
2388 memcpy(new, map, len);
2389 new->num_stripes = num_stripes;
2390 kfree(map);
2391 return new;
2392}
2393
2394/*
2395 * helper to allocate device space from btrfs_device_info, in which we store
2396 * the max free space of each device. It is used when we cannot allocate
2397 * chunks at the default size.
2398 *
2399 * With this helper, we can allocate a chunk as large as possible.
2400 */
2401static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
2402 struct btrfs_fs_devices *fs_devices,
2403 struct btrfs_device_info *devices,
2404 int nr_device, u64 type,
2405 struct map_lookup **map_lookup,
2406 int min_stripes, u64 *stripe_size)
2407{
2408 int i, index, sort_again = 0;
2409 int min_devices = min_stripes;
2410 u64 max_avail, min_free;
2411 struct map_lookup *map = *map_lookup;
2412 int ret;
2413
2414 if (nr_device < min_stripes)
2415 return -ENOSPC;
2416
2417 btrfs_descending_sort_devices(devices, nr_device);
2255 2418
2256 do_div(calc_size, stripe_len); 2419 max_avail = devices[0].max_avail;
2257 calc_size *= stripe_len; 2420 if (!max_avail)
2421 return -ENOSPC;
2422
2423 for (i = 0; i < nr_device; i++) {
2424 /*
2425 * if dev_offset == 0, the free space of this device is less
2426 * than what we need, and we haven't yet searched for the max
2427 * free extent on this device, so do it now.
2428 */
2429 if (!devices[i].dev_offset) {
2430 ret = find_free_dev_extent(trans, devices[i].dev,
2431 max_avail,
2432 &devices[i].dev_offset,
2433 &devices[i].max_avail);
2434 if (ret != 0 && ret != -ENOSPC)
2435 return ret;
2436 sort_again = 1;
2437 }
2438 }
2439
2440 /* we updated the max free extent of each device, so sort again */
2441 if (sort_again)
2442 btrfs_descending_sort_devices(devices, nr_device);
2443
2444 if (type & BTRFS_BLOCK_GROUP_DUP)
2445 min_devices = 1;
2446
2447 if (!devices[min_devices - 1].max_avail)
2448 return -ENOSPC;
2449
2450 max_avail = devices[min_devices - 1].max_avail;
2451 if (type & BTRFS_BLOCK_GROUP_DUP)
2452 do_div(max_avail, 2);
2453
2454 max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
2455 min_stripes, 1);
2456 if (type & BTRFS_BLOCK_GROUP_DUP)
2457 min_free = max_avail * 2;
2458 else
2459 min_free = max_avail;
2460
2461 if (min_free > devices[min_devices - 1].max_avail)
2462 return -ENOSPC;
2463
2464 map = __shrink_map_lookup_stripes(map, min_stripes);
2465 *stripe_size = max_avail;
2466
2467 index = 0;
2468 for (i = 0; i < min_stripes; i++) {
2469 map->stripes[i].dev = devices[index].dev;
2470 map->stripes[i].physical = devices[index].dev_offset;
2471 if (type & BTRFS_BLOCK_GROUP_DUP) {
2472 i++;
2473 map->stripes[i].dev = devices[index].dev;
2474 map->stripes[i].physical = devices[index].dev_offset +
2475 max_avail;
2476 }
2477 index++;
2478 }
2479 *map_lookup = map;
2480
2481 return 0;
2482}
2483
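In the DUP case both copies of the chunk live on one device, which is why min_devices drops to 1, the candidate stripe size is that device's free space halved, and min_free is then doubled again for the final check. A small sketch of the arithmetic (sizes invented, BTRFS_STRIPE_LEN alignment omitted):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t dev_free = 512ULL * 1024 * 1024; /* one device, 512MiB free */
        uint64_t stripe   = dev_free / 2;         /* per-copy stripe size */
        uint64_t min_free = stripe * 2;           /* both copies on one device */

        printf("stripe=%llu min_free=%llu fits=%d\n",
               (unsigned long long)stripe,
               (unsigned long long)min_free,
               min_free <= dev_free);             /* fits=1 */
        return 0;
    }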
2484static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2485 struct btrfs_root *extent_root,
2486 struct map_lookup **map_ret,
2487 u64 *num_bytes, u64 *stripe_size,
2488 u64 start, u64 type)
2489{
2490 struct btrfs_fs_info *info = extent_root->fs_info;
2491 struct btrfs_device *device = NULL;
2492 struct btrfs_fs_devices *fs_devices = info->fs_devices;
2493 struct list_head *cur;
2494 struct map_lookup *map;
2495 struct extent_map_tree *em_tree;
2496 struct extent_map *em;
2497 struct btrfs_device_info *devices_info;
2498 struct list_head private_devs;
2499 u64 calc_size = 1024 * 1024 * 1024;
2500 u64 min_free;
2501 u64 avail;
2502 u64 dev_offset;
2503 int num_stripes;
2504 int min_stripes;
2505 int sub_stripes;
2506 int min_devices; /* the min number of devices we need */
2507 int i;
2508 int ret;
2509 int index;
2510
2511 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2512 (type & BTRFS_BLOCK_GROUP_DUP)) {
2513 WARN_ON(1);
2514 type &= ~BTRFS_BLOCK_GROUP_DUP;
2515 }
2516 if (list_empty(&fs_devices->alloc_list))
2517 return -ENOSPC;
2518
2519 ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
2520 &min_stripes, &sub_stripes);
2521 if (ret)
2522 return ret;
2523
2524 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2525 GFP_NOFS);
2526 if (!devices_info)
2527 return -ENOMEM;
2528
2529 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2530 if (!map) {
2531 ret = -ENOMEM;
2532 goto error;
2533 }
2534 map->num_stripes = num_stripes;
2258 2535
2259 cur = fs_devices->alloc_list.next; 2536 cur = fs_devices->alloc_list.next;
2260 index = 0; 2537 index = 0;
2538 i = 0;
2261 2539
2262 if (type & BTRFS_BLOCK_GROUP_DUP) 2540 calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
2541 num_stripes, 0);
2542
2543 if (type & BTRFS_BLOCK_GROUP_DUP) {
2263 min_free = calc_size * 2; 2544 min_free = calc_size * 2;
2264 else 2545 min_devices = 1;
2546 } else {
2265 min_free = calc_size; 2547 min_free = calc_size;
2266 2548 min_devices = min_stripes;
2267 /* 2549 }
2268 * we add 1MB because we never use the first 1MB of the device, unless
2269 * we've looped, then we are likely allocating the maximum amount of
2270 * space left already
2271 */
2272 if (!looped)
2273 min_free += 1024 * 1024;
2274 2550
2275 INIT_LIST_HEAD(&private_devs); 2551 INIT_LIST_HEAD(&private_devs);
2276 while (index < num_stripes) { 2552 while (index < num_stripes) {
@@ -2283,27 +2559,39 @@ again:
2283 cur = cur->next; 2559 cur = cur->next;
2284 2560
2285 if (device->in_fs_metadata && avail >= min_free) { 2561 if (device->in_fs_metadata && avail >= min_free) {
2286 ret = find_free_dev_extent(trans, device, 2562 ret = find_free_dev_extent(trans, device, min_free,
2287 min_free, &dev_offset, 2563 &devices_info[i].dev_offset,
2288 &max_avail); 2564 &devices_info[i].max_avail);
2289 if (ret == 0) { 2565 if (ret == 0) {
2290 list_move_tail(&device->dev_alloc_list, 2566 list_move_tail(&device->dev_alloc_list,
2291 &private_devs); 2567 &private_devs);
2292 map->stripes[index].dev = device; 2568 map->stripes[index].dev = device;
2293 map->stripes[index].physical = dev_offset; 2569 map->stripes[index].physical =
2570 devices_info[i].dev_offset;
2294 index++; 2571 index++;
2295 if (type & BTRFS_BLOCK_GROUP_DUP) { 2572 if (type & BTRFS_BLOCK_GROUP_DUP) {
2296 map->stripes[index].dev = device; 2573 map->stripes[index].dev = device;
2297 map->stripes[index].physical = 2574 map->stripes[index].physical =
2298 dev_offset + calc_size; 2575 devices_info[i].dev_offset +
2576 calc_size;
2299 index++; 2577 index++;
2300 } 2578 }
2301 } 2579 } else if (ret != -ENOSPC)
2302 } else if (device->in_fs_metadata && avail > max_avail) 2580 goto error;
2303 max_avail = avail; 2581
2582 devices_info[i].dev = device;
2583 i++;
2584 } else if (device->in_fs_metadata &&
2585 avail >= BTRFS_STRIPE_LEN) {
2586 devices_info[i].dev = device;
2587 devices_info[i].max_avail = avail;
2588 i++;
2589 }
2590
2304 if (cur == &fs_devices->alloc_list) 2591 if (cur == &fs_devices->alloc_list)
2305 break; 2592 break;
2306 } 2593 }
2594
2307 list_splice(&private_devs, &fs_devices->alloc_list); 2595 list_splice(&private_devs, &fs_devices->alloc_list);
2308 if (index < num_stripes) { 2596 if (index < num_stripes) {
2309 if (index >= min_stripes) { 2597 if (index >= min_stripes) {
@@ -2312,34 +2600,36 @@ again:
2312 num_stripes /= sub_stripes; 2600 num_stripes /= sub_stripes;
2313 num_stripes *= sub_stripes; 2601 num_stripes *= sub_stripes;
2314 } 2602 }
2315 looped = 1; 2603
2316 goto again; 2604 map = __shrink_map_lookup_stripes(map, num_stripes);
2317 } 2605 } else if (i >= min_devices) {
2318 if (!looped && max_avail > 0) { 2606 ret = __btrfs_alloc_tiny_space(trans, fs_devices,
2319 looped = 1; 2607 devices_info, i, type,
2320 calc_size = max_avail; 2608 &map, min_stripes,
2321 goto again; 2609 &calc_size);
2610 if (ret)
2611 goto error;
2612 } else {
2613 ret = -ENOSPC;
2614 goto error;
2322 } 2615 }
2323 kfree(map);
2324 return -ENOSPC;
2325 } 2616 }
2326 map->sector_size = extent_root->sectorsize; 2617 map->sector_size = extent_root->sectorsize;
2327 map->stripe_len = stripe_len; 2618 map->stripe_len = BTRFS_STRIPE_LEN;
2328 map->io_align = stripe_len; 2619 map->io_align = BTRFS_STRIPE_LEN;
2329 map->io_width = stripe_len; 2620 map->io_width = BTRFS_STRIPE_LEN;
2330 map->type = type; 2621 map->type = type;
2331 map->num_stripes = num_stripes;
2332 map->sub_stripes = sub_stripes; 2622 map->sub_stripes = sub_stripes;
2333 2623
2334 *map_ret = map; 2624 *map_ret = map;
2335 *stripe_size = calc_size; 2625 *stripe_size = calc_size;
2336 *num_bytes = chunk_bytes_by_type(type, calc_size, 2626 *num_bytes = chunk_bytes_by_type(type, calc_size,
2337 num_stripes, sub_stripes); 2627 map->num_stripes, sub_stripes);
2338 2628
2339 em = alloc_extent_map(GFP_NOFS); 2629 em = alloc_extent_map(GFP_NOFS);
2340 if (!em) { 2630 if (!em) {
2341 kfree(map); 2631 ret = -ENOMEM;
2342 return -ENOMEM; 2632 goto error;
2343 } 2633 }
2344 em->bdev = (struct block_device *)map; 2634 em->bdev = (struct block_device *)map;
2345 em->start = start; 2635 em->start = start;
@@ -2372,7 +2662,13 @@ again:
2372 index++; 2662 index++;
2373 } 2663 }
2374 2664
2665 kfree(devices_info);
2375 return 0; 2666 return 0;
2667
2668error:
2669 kfree(map);
2670 kfree(devices_info);
2671 return ret;
2376} 2672}
2377 2673
2378static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2674static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2740db49eb04..7fb59d45fe8c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,8 +20,11 @@
20#define __BTRFS_VOLUMES_ 20#define __BTRFS_VOLUMES_
21 21
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h>
23#include "async-thread.h" 24#include "async-thread.h"
24 25
26#define BTRFS_STRIPE_LEN (64 * 1024)
27
25struct buffer_head; 28struct buffer_head;
26struct btrfs_pending_bios { 29struct btrfs_pending_bios {
27 struct bio *head; 30 struct bio *head;
@@ -50,7 +53,7 @@ struct btrfs_device {
50 53
51 struct block_device *bdev; 54 struct block_device *bdev;
52 55
53 /* the mode sent to open_bdev_exclusive */ 56 /* the mode sent to blkdev_get */
54 fmode_t mode; 57 fmode_t mode;
55 58
56 char *name; 59 char *name;
@@ -136,6 +139,30 @@ struct btrfs_multi_bio {
136 struct btrfs_bio_stripe stripes[]; 139 struct btrfs_bio_stripe stripes[];
137}; 140};
138 141
142struct btrfs_device_info {
143 struct btrfs_device *dev;
144 u64 dev_offset;
145 u64 max_avail;
146};
147
148/* Used to sort the devices by max_avail (descending sort) */
149int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
150
151/*
152 * sort the devices by max_avail, in which the max free extent size of each
153 * device is stored (descending sort)
154 */
155static inline void btrfs_descending_sort_devices(
156 struct btrfs_device_info *devices,
157 size_t nr_devices)
158{
159 sort(devices, nr_devices, sizeof(struct btrfs_device_info),
160 btrfs_cmp_device_free_bytes, NULL);
161}
162
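The comparator above is written for the kernel's sort(), but the same shape drops straight into userspace qsort(), which makes the descending order easy to verify. A runnable sketch (the struct and values are illustrative):

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    struct dev_info { uint64_t max_avail; };

    /* Same comparator shape as btrfs_cmp_device_free_bytes:
     * larger max_avail sorts first. */
    static int cmp_desc(const void *a, const void *b)
    {
        uint64_t x = ((const struct dev_info *)a)->max_avail;
        uint64_t y = ((const struct dev_info *)b)->max_avail;

        if (x > y)
            return -1;
        if (x < y)
            return 1;
        return 0;
    }

    int main(void)
    {
        struct dev_info devs[] = { {10}, {30}, {20} };
        size_t i;

        qsort(devs, 3, sizeof(devs[0]), cmp_desc);
        for (i = 0; i < 3; i++)
            printf("%llu ", (unsigned long long)devs[i].max_avail);
        printf("\n");   /* prints: 30 20 10 */
        return 0;
    }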
163int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
164 u64 end, u64 *length);
165
139#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 166#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
140 (sizeof(struct btrfs_bio_stripe) * (n))) 167 (sizeof(struct btrfs_bio_stripe) * (n)))
141 168
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 698fdd2c739c..a5776531dc2b 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -316,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
316int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, 316int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
317 size_t size, int flags) 317 size_t size, int flags)
318{ 318{
319 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
320
321 /*
322 * The permission on security.* and system.* is not checked
323 * in permission().
324 */
325 if (btrfs_root_readonly(root))
326 return -EROFS;
327
319 /* 328 /*
320 * If this is a request for a synthetic attribute in the system.* 329 * If this is a request for a synthetic attribute in the system.*
321 * namespace use the generic infrastructure to resolve a handler 330 * namespace use the generic infrastructure to resolve a handler
@@ -336,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
336 345
337int btrfs_removexattr(struct dentry *dentry, const char *name) 346int btrfs_removexattr(struct dentry *dentry, const char *name)
338{ 347{
348 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
349
350 /*
351 * The permission on security.* and system.* is not checked
352 * in permission().
353 */
354 if (btrfs_root_readonly(root))
355 return -EROFS;
356
339 /* 357 /*
340 * If this is a request for a synthetic attribute in the system.* 358 * If this is a request for a synthetic attribute in the system.*
341 * namespace use the generic infrastructure to resolve a handler 359 * namespace use the generic infrastructure to resolve a handler
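With these guards in place, xattr writes on a read-only subvolume fail with EROFS even in the security.* and system.* namespaces that permission() does not cover. A hypothetical userspace probe (the mount path is made up):

    #include <stdio.h>
    #include <errno.h>
    #include <string.h>
    #include <sys/xattr.h>

    int main(void)
    {
        /* Expect EROFS if /mnt/ro-subvol is a read-only btrfs subvolume. */
        if (setxattr("/mnt/ro-subvol/file", "user.test", "x", 1, 0) == -1 &&
            errno == EROFS)
            printf("read-only subvolume: %s\n", strerror(errno));
        return 0;
    }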
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b9cd5445f71c..f5ec2d44150d 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -32,15 +32,6 @@
32#include <linux/bio.h> 32#include <linux/bio.h>
33#include "compression.h" 33#include "compression.h"
34 34
35/* Plan: call deflate() with avail_in == *sourcelen,
36 avail_out = *dstlen - 12 and flush == Z_FINISH.
37 If it doesn't manage to finish, call it again with
38 avail_in == 0 and avail_out set to the remaining 12
39 bytes for it to clean up.
40 Q: Is 12 bytes sufficient?
41*/
42#define STREAM_END_SPACE 12
43
44struct workspace { 35struct workspace {
45 z_stream inf_strm; 36 z_stream inf_strm;
46 z_stream def_strm; 37 z_stream def_strm;
@@ -48,152 +39,51 @@ struct workspace {
48 struct list_head list; 39 struct list_head list;
49}; 40};
50 41
51static LIST_HEAD(idle_workspace); 42static void zlib_free_workspace(struct list_head *ws)
52static DEFINE_SPINLOCK(workspace_lock); 43{
53static unsigned long num_workspace; 44 struct workspace *workspace = list_entry(ws, struct workspace, list);
54static atomic_t alloc_workspace = ATOMIC_INIT(0);
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
56 45
57/* 46 vfree(workspace->def_strm.workspace);
58 * this finds an available zlib workspace or allocates a new one 47 vfree(workspace->inf_strm.workspace);
59 * NULL or an ERR_PTR is returned if things go bad. 48 kfree(workspace->buf);
60 */ 49 kfree(workspace);
61static struct workspace *find_zlib_workspace(void) 50}
51
52static struct list_head *zlib_alloc_workspace(void)
62{ 53{
63 struct workspace *workspace; 54 struct workspace *workspace;
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76 55
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS); 56 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) { 57 if (!workspace)
90 ret = -ENOMEM; 58 return ERR_PTR(-ENOMEM);
91 goto fail;
92 }
93 59
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 61 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) { 63 if (!workspace->def_strm.workspace ||
106 ret = -ENOMEM; 64 !workspace->inf_strm.workspace || !workspace->buf)
107 goto fail_kmalloc; 65 goto fail;
108 }
109 return workspace;
110
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf);
141 kfree(workspace);
142 66
143 atomic_dec(&alloc_workspace); 67 INIT_LIST_HEAD(&workspace->list);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147}
148 68
149/* 69 return &workspace->list;
150 * cleanup function for module exit 70fail:
151 */ 71 zlib_free_workspace(&workspace->list);
152static void free_workspaces(void) 72 return ERR_PTR(-ENOMEM);
153{
154 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) {
156 workspace = list_entry(idle_workspace.next, struct workspace,
157 list);
158 list_del(&workspace->list);
159 vfree(workspace->def_strm.workspace);
160 vfree(workspace->inf_strm.workspace);
161 kfree(workspace->buf);
162 kfree(workspace);
163 atomic_dec(&alloc_workspace);
164 }
165} 73}
166 74
167/* 75static int zlib_compress_pages(struct list_head *ws,
168 * given an address space and start/len, compress the bytes. 76 struct address_space *mapping,
169 * 77 u64 start, unsigned long len,
170 * pages are allocated to hold the compressed result and stored 78 struct page **pages,
171 * in 'pages' 79 unsigned long nr_dest_pages,
172 * 80 unsigned long *out_pages,
173 * out_pages is used to return the number of pages allocated. There 81 unsigned long *total_in,
174 * may be pages allocated even if we return an error 82 unsigned long *total_out,
175 * 83 unsigned long max_out)
176 * total_in is used to return the number of bytes actually read. It
177 * may be smaller than len if we had to exit early because we
178 * ran out of room in the pages array or because we cross the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{ 84{
85 struct workspace *workspace = list_entry(ws, struct workspace, list);
195 int ret; 86 int ret;
196 struct workspace *workspace;
197 char *data_in; 87 char *data_in;
198 char *cpage_out; 88 char *cpage_out;
199 int nr_pages = 0; 89 int nr_pages = 0;
@@ -205,10 +95,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
205 *total_out = 0; 95 *total_out = 0;
206 *total_in = 0; 96 *total_in = 0;
207 97
208 workspace = find_zlib_workspace();
209 if (IS_ERR(workspace))
210 return -1;
211
212 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 98 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
213 printk(KERN_WARNING "deflateInit failed\n"); 99 printk(KERN_WARNING "deflateInit failed\n");
214 ret = -1; 100 ret = -1;
@@ -222,6 +108,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
222 data_in = kmap(in_page); 108 data_in = kmap(in_page);
223 109
224 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 110 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
111 if (out_page == NULL) {
112 ret = -1;
113 goto out;
114 }
225 cpage_out = kmap(out_page); 115 cpage_out = kmap(out_page);
226 pages[0] = out_page; 116 pages[0] = out_page;
227 nr_pages = 1; 117 nr_pages = 1;
@@ -260,6 +150,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
260 goto out; 150 goto out;
261 } 151 }
262 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 152 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
153 if (out_page == NULL) {
154 ret = -1;
155 goto out;
156 }
263 cpage_out = kmap(out_page); 157 cpage_out = kmap(out_page);
264 pages[nr_pages] = out_page; 158 pages[nr_pages] = out_page;
265 nr_pages++; 159 nr_pages++;
@@ -314,55 +208,26 @@ out:
314 kunmap(in_page); 208 kunmap(in_page);
315 page_cache_release(in_page); 209 page_cache_release(in_page);
316 } 210 }
317 free_workspace(workspace);
318 return ret; 211 return ret;
319} 212}
320 213
321/* 214static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
322 * pages_in is an array of pages with compressed data. 215 u64 disk_start,
323 * 216 struct bio_vec *bvec,
324 * disk_start is the starting logical offset of this array in the file 217 int vcnt,
325 * 218 size_t srclen)
326 * bvec is a bio_vec of pages from the file that we want to decompress into
327 *
328 * vcnt is the count of pages in the biovec
329 *
330 * srclen is the number of bytes in pages_in
331 *
332 * The basic idea is that we have a bio that was created by readpages.
333 * The pages in the bio are for the uncompressed data, and they may not
334 * be contiguous. They all correspond to the range of bytes covered by
335 * the compressed extent.
336 */
337int btrfs_zlib_decompress_biovec(struct page **pages_in,
338 u64 disk_start,
339 struct bio_vec *bvec,
340 int vcnt,
341 size_t srclen)
342{ 219{
343 int ret = 0; 220 struct workspace *workspace = list_entry(ws, struct workspace, list);
221 int ret = 0, ret2;
344 int wbits = MAX_WBITS; 222 int wbits = MAX_WBITS;
345 struct workspace *workspace;
346 char *data_in; 223 char *data_in;
347 size_t total_out = 0; 224 size_t total_out = 0;
348 unsigned long page_bytes_left;
349 unsigned long page_in_index = 0; 225 unsigned long page_in_index = 0;
350 unsigned long page_out_index = 0; 226 unsigned long page_out_index = 0;
351 struct page *page_out;
352 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 227 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
353 PAGE_CACHE_SIZE; 228 PAGE_CACHE_SIZE;
354 unsigned long buf_start; 229 unsigned long buf_start;
355 unsigned long buf_offset;
356 unsigned long bytes;
357 unsigned long working_bytes;
358 unsigned long pg_offset; 230 unsigned long pg_offset;
359 unsigned long start_byte;
360 unsigned long current_buf_start;
361 char *kaddr;
362
363 workspace = find_zlib_workspace();
364 if (IS_ERR(workspace))
365 return -ENOMEM;
366 231
367 data_in = kmap(pages_in[page_in_index]); 232 data_in = kmap(pages_in[page_in_index]);
368 workspace->inf_strm.next_in = data_in; 233 workspace->inf_strm.next_in = data_in;
@@ -372,8 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
372 workspace->inf_strm.total_out = 0; 237 workspace->inf_strm.total_out = 0;
373 workspace->inf_strm.next_out = workspace->buf; 238 workspace->inf_strm.next_out = workspace->buf;
374 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 239 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
375 page_out = bvec[page_out_index].bv_page;
376 page_bytes_left = PAGE_CACHE_SIZE;
377 pg_offset = 0; 240 pg_offset = 0;
378 241
379 /* If it's deflate, and it's got no preset dictionary, then 242 /* If it's deflate, and it's got no preset dictionary, then
@@ -389,107 +252,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
389 252
390 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 253 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
391 printk(KERN_WARNING "inflateInit failed\n"); 254 printk(KERN_WARNING "inflateInit failed\n");
392 ret = -1; 255 return -1;
393 goto out;
394 } 256 }
395 while (workspace->inf_strm.total_in < srclen) { 257 while (workspace->inf_strm.total_in < srclen) {
396 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 258 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
397 if (ret != Z_OK && ret != Z_STREAM_END) 259 if (ret != Z_OK && ret != Z_STREAM_END)
398 break; 260 break;
399 /*
400 * buf start is the byte offset we're of the start of
401 * our workspace buffer
402 */
403 buf_start = total_out;
404 261
405 /* total_out is the last byte of the workspace buffer */ 262 buf_start = total_out;
406 total_out = workspace->inf_strm.total_out; 263 total_out = workspace->inf_strm.total_out;
407 264
408 working_bytes = total_out - buf_start; 265 /* we didn't make progress in this inflate call, we're done */
409 266 if (buf_start == total_out)
410 /*
411 * start byte is the first byte of the page we're currently
412 * copying into relative to the start of the compressed data.
413 */
414 start_byte = page_offset(page_out) - disk_start;
415
416 if (working_bytes == 0) {
417 /* we didn't make progress in this inflate
418 * call, we're done
419 */
420 if (ret != Z_STREAM_END)
421 ret = -1;
422 break; 267 break;
423 }
424 268
425 /* we haven't yet hit data corresponding to this page */ 269 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
426 if (total_out <= start_byte) 270 total_out, disk_start,
427 goto next; 271 bvec, vcnt,
428 272 &page_out_index, &pg_offset);
429 /* 273 if (ret2 == 0) {
430 * the start of the data we care about is offset into 274 ret = 0;
431 * the middle of our working buffer 275 goto done;
432 */
433 if (total_out > start_byte && buf_start < start_byte) {
434 buf_offset = start_byte - buf_start;
435 working_bytes -= buf_offset;
436 } else {
437 buf_offset = 0;
438 }
439 current_buf_start = buf_start;
440
441 /* copy bytes from the working buffer into the pages */
442 while (working_bytes > 0) {
443 bytes = min(PAGE_CACHE_SIZE - pg_offset,
444 PAGE_CACHE_SIZE - buf_offset);
445 bytes = min(bytes, working_bytes);
446 kaddr = kmap_atomic(page_out, KM_USER0);
447 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
448 bytes);
449 kunmap_atomic(kaddr, KM_USER0);
450 flush_dcache_page(page_out);
451
452 pg_offset += bytes;
453 page_bytes_left -= bytes;
454 buf_offset += bytes;
455 working_bytes -= bytes;
456 current_buf_start += bytes;
457
458 /* check if we need to pick another page */
459 if (page_bytes_left == 0) {
460 page_out_index++;
461 if (page_out_index >= vcnt) {
462 ret = 0;
463 goto done;
464 }
465
466 page_out = bvec[page_out_index].bv_page;
467 pg_offset = 0;
468 page_bytes_left = PAGE_CACHE_SIZE;
469 start_byte = page_offset(page_out) - disk_start;
470
471 /*
472 * make sure our new page is covered by this
473 * working buffer
474 */
475 if (total_out <= start_byte)
476 goto next;
477
478 /* the next page in the biovec might not
479 * be adjacent to the last page, but it
480 * might still be found inside this working
481 * buffer. bump our offset pointer
482 */
483 if (total_out > start_byte &&
484 current_buf_start < start_byte) {
485 buf_offset = start_byte - buf_start;
486 working_bytes = total_out - start_byte;
487 current_buf_start = buf_start +
488 buf_offset;
489 }
490 }
491 } 276 }
492next: 277
493 workspace->inf_strm.next_out = workspace->buf; 278 workspace->inf_strm.next_out = workspace->buf;
494 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 279 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
495 280
@@ -516,35 +301,21 @@ done:
516 zlib_inflateEnd(&workspace->inf_strm); 301 zlib_inflateEnd(&workspace->inf_strm);
517 if (data_in) 302 if (data_in)
518 kunmap(pages_in[page_in_index]); 303 kunmap(pages_in[page_in_index]);
519out:
520 free_workspace(workspace);
521 return ret; 304 return ret;
522} 305}
523 306
524/* 307static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
525 * a less complex decompression routine. Our compressed data fits in a 308 struct page *dest_page,
526 * single page, and we want to read a single page out of it. 309 unsigned long start_byte,
527 * start_byte tells us the offset into the compressed data we're interested in 310 size_t srclen, size_t destlen)
528 */
529int btrfs_zlib_decompress(unsigned char *data_in,
530 struct page *dest_page,
531 unsigned long start_byte,
532 size_t srclen, size_t destlen)
533{ 311{
312 struct workspace *workspace = list_entry(ws, struct workspace, list);
534 int ret = 0; 313 int ret = 0;
535 int wbits = MAX_WBITS; 314 int wbits = MAX_WBITS;
536 struct workspace *workspace;
537 unsigned long bytes_left = destlen; 315 unsigned long bytes_left = destlen;
538 unsigned long total_out = 0; 316 unsigned long total_out = 0;
539 char *kaddr; 317 char *kaddr;
540 318
541 if (destlen > PAGE_CACHE_SIZE)
542 return -ENOMEM;
543
544 workspace = find_zlib_workspace();
545 if (IS_ERR(workspace))
546 return -ENOMEM;
547
548 workspace->inf_strm.next_in = data_in; 319 workspace->inf_strm.next_in = data_in;
549 workspace->inf_strm.avail_in = srclen; 320 workspace->inf_strm.avail_in = srclen;
550 workspace->inf_strm.total_in = 0; 321 workspace->inf_strm.total_in = 0;
@@ -565,8 +336,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
565 336
566 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 337 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
567 printk(KERN_WARNING "inflateInit failed\n"); 338 printk(KERN_WARNING "inflateInit failed\n");
568 ret = -1; 339 return -1;
569 goto out;
570 } 340 }
571 341
572 while (bytes_left > 0) { 342 while (bytes_left > 0) {
@@ -616,12 +386,13 @@ next:
616 ret = 0; 386 ret = 0;
617 387
618 zlib_inflateEnd(&workspace->inf_strm); 388 zlib_inflateEnd(&workspace->inf_strm);
619out:
620 free_workspace(workspace);
621 return ret; 389 return ret;
622} 390}
623 391
624void btrfs_zlib_exit(void) 392struct btrfs_compress_op btrfs_zlib_compress = {
625{ 393 .alloc_workspace = zlib_alloc_workspace,
626 free_workspaces(); 394 .free_workspace = zlib_free_workspace,
627} 395 .compress_pages = zlib_compress_pages,
396 .decompress_biovec = zlib_decompress_biovec,
397 .decompress = zlib_decompress,
398};
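These ops let the generic btrfs compression code drive zlib (or the new lzo backend) without knowing workspace internals. A rough, non-authoritative sketch of the dispatch pattern, with the ops struct trimmed to the two members the sketch uses; the real caller lives in compression.c and checks IS_ERR() on the returned workspace:

    struct list_head;

    /* Trimmed view of the ops table; the full struct also carries
     * compress_pages, decompress_biovec, and decompress. */
    struct compress_ops_sketch {
        struct list_head *(*alloc_workspace)(void);
        void (*free_workspace)(struct list_head *ws);
    };

    static int run_with_workspace(const struct compress_ops_sketch *op)
    {
        struct list_head *ws = op->alloc_workspace();

        if (!ws)                /* real code: if (IS_ERR(ws)) ... */
            return -1;
        /* ... hand ws to op->compress_pages() or op->decompress() ... */
        op->free_workspace(ws);
        return 0;
    }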
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 9e6c4f2e8ff1..bd352125e829 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -2,31 +2,10 @@
2# Makefile for CEPH filesystem. 2# Makefile for CEPH filesystem.
3# 3#
4 4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o 5obj-$(CONFIG_CEPH_FS) += ceph.o
8 6
9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 7ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \ 8 export.o caps.o snap.o xattr.o \
11 mds_client.o mdsmap.o strings.o ceph_frag.o \ 9 mds_client.o mdsmap.o strings.o ceph_frag.o \
12 debugfs.o 10 debugfs.o
13 11
14else
15#Otherwise we were called directly from the command
16# line; invoke the kernel build system.
17
18KERNELDIR ?= /lib/modules/$(shell uname -r)/build
19PWD := $(shell pwd)
20
21default: all
22
23all:
24 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
25
26modules_install:
27 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
28
29clean:
30 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
31
32endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 7ae1b3d55b58..08f65faac112 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -60,10 +60,13 @@ static int mdsc_show(struct seq_file *s, void *p)
60 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { 60 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
61 req = rb_entry(rp, struct ceph_mds_request, r_node); 61 req = rb_entry(rp, struct ceph_mds_request, r_node);
62 62
63 if (req->r_request) 63 if (req->r_request && req->r_session)
64 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds); 64 seq_printf(s, "%lld\tmds%d\t", req->r_tid,
65 else 65 req->r_session->s_mds);
66 else if (!req->r_request)
66 seq_printf(s, "%lld\t(no request)\t", req->r_tid); 67 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
68 else
69 seq_printf(s, "%lld\t(no session)\t", req->r_tid);
67 70
68 seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); 71 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
69 72
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index fa7ca04ee816..0bc68de8edd7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1224,6 +1224,26 @@ void ceph_dentry_lru_del(struct dentry *dn)
1224 } 1224 }
1225} 1225}
1226 1226
1227/*
1228 * Return name hash for a given dentry. This is dependent on
1229 * the parent directory's hash function.
1230 */
1231unsigned ceph_dentry_hash(struct dentry *dn)
1232{
1233 struct inode *dir = dn->d_parent->d_inode;
1234 struct ceph_inode_info *dci = ceph_inode(dir);
1235
1236 switch (dci->i_dir_layout.dl_dir_hash) {
1237 case 0: /* for backward compat */
1238 case CEPH_STR_HASH_LINUX:
1239 return dn->d_name.hash;
1240
1241 default:
1242 return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
1243 dn->d_name.name, dn->d_name.len);
1244 }
1245}
1246
1227const struct file_operations ceph_dir_fops = { 1247const struct file_operations ceph_dir_fops = {
1228 .read = ceph_read_dir, 1248 .read = ceph_read_dir,
1229 .readdir = ceph_readdir, 1249 .readdir = ceph_readdir,
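ceph_dentry_hash() keeps the VFS-computed hash for the legacy (0) and Linux algorithms and recomputes with the server's algorithm otherwise. A toy sketch of the same dispatch shape; the constants and hash functions here are placeholders, not the real ceph_str_hash():

    #include <stdio.h>

    enum { HASH_COMPAT = 0, HASH_LINUX = 1 };   /* placeholder values */

    static unsigned cached_vfs_hash(void) { return 0x1234; }
    static unsigned server_hash(int alg)  { return 0x5678u + alg; }

    static unsigned pick_hash(int dl_dir_hash)
    {
        switch (dl_dir_hash) {
        case HASH_COMPAT:        /* backward compat: old MDS sends 0 */
        case HASH_LINUX:
            return cached_vfs_hash();
        default:                 /* recompute with the server's algorithm */
            return server_hash(dl_dir_hash);
        }
    }

    int main(void)
    {
        printf("%x %x\n", pick_hash(0), pick_hash(2));  /* 1234 567a */
        return 0;
    }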
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 2297d9426992..e41056174bf8 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -59,7 +59,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
59 dout("encode_fh %p connectable\n", dentry); 59 dout("encode_fh %p connectable\n", dentry);
60 cfh->ino = ceph_ino(dentry->d_inode); 60 cfh->ino = ceph_ino(dentry->d_inode);
61 cfh->parent_ino = ceph_ino(parent->d_inode); 61 cfh->parent_ino = ceph_ino(parent->d_inode);
62 cfh->parent_name_hash = parent->d_name.hash; 62 cfh->parent_name_hash = ceph_dentry_hash(parent);
63 *max_len = connected_handle_length; 63 *max_len = connected_handle_length;
64 type = 2; 64 type = 2;
65 } else if (*max_len >= handle_length) { 65 } else if (*max_len >= handle_length) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e61de4f7b99d..e835eff551e3 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -297,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
297 ci->i_release_count = 0; 297 ci->i_release_count = 0;
298 ci->i_symlink = NULL; 298 ci->i_symlink = NULL;
299 299
300 memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
301
300 ci->i_fragtree = RB_ROOT; 302 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex); 303 mutex_init(&ci->i_fragtree_mutex);
302 304
@@ -689,6 +691,8 @@ static int fill_inode(struct inode *inode,
689 inode->i_op = &ceph_dir_iops; 691 inode->i_op = &ceph_dir_iops;
690 inode->i_fop = &ceph_dir_fops; 692 inode->i_fop = &ceph_dir_fops;
691 693
694 ci->i_dir_layout = iinfo->dir_layout;
695
692 ci->i_files = le64_to_cpu(info->files); 696 ci->i_files = le64_to_cpu(info->files);
693 ci->i_subdirs = le64_to_cpu(info->subdirs); 697 ci->i_subdirs = le64_to_cpu(info->subdirs);
694 ci->i_rbytes = le64_to_cpu(info->rbytes); 698 ci->i_rbytes = le64_to_cpu(info->rbytes);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a50fca1e03be..1e30d194a8e3 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -60,7 +60,8 @@ static const struct ceph_connection_operations mds_con_ops;
60 * parse individual inode info 60 * parse individual inode info
61 */ 61 */
62static int parse_reply_info_in(void **p, void *end, 62static int parse_reply_info_in(void **p, void *end,
63 struct ceph_mds_reply_info_in *info) 63 struct ceph_mds_reply_info_in *info,
64 int features)
64{ 65{
65 int err = -EIO; 66 int err = -EIO;
66 67
@@ -74,6 +75,12 @@ static int parse_reply_info_in(void **p, void *end,
74 info->symlink = *p; 75 info->symlink = *p;
75 *p += info->symlink_len; 76 *p += info->symlink_len;
76 77
78 if (features & CEPH_FEATURE_DIRLAYOUTHASH)
79 ceph_decode_copy_safe(p, end, &info->dir_layout,
80 sizeof(info->dir_layout), bad);
81 else
82 memset(&info->dir_layout, 0, sizeof(info->dir_layout));
83
77 ceph_decode_32_safe(p, end, info->xattr_len, bad); 84 ceph_decode_32_safe(p, end, info->xattr_len, bad);
78 ceph_decode_need(p, end, info->xattr_len, bad); 85 ceph_decode_need(p, end, info->xattr_len, bad);
79 info->xattr_data = *p; 86 info->xattr_data = *p;
@@ -88,12 +95,13 @@ bad:
88 * target inode. 95 * target inode.
89 */ 96 */
90static int parse_reply_info_trace(void **p, void *end, 97static int parse_reply_info_trace(void **p, void *end,
91 struct ceph_mds_reply_info_parsed *info) 98 struct ceph_mds_reply_info_parsed *info,
99 int features)
92{ 100{
93 int err; 101 int err;
94 102
95 if (info->head->is_dentry) { 103 if (info->head->is_dentry) {
96 err = parse_reply_info_in(p, end, &info->diri); 104 err = parse_reply_info_in(p, end, &info->diri, features);
97 if (err < 0) 105 if (err < 0)
98 goto out_bad; 106 goto out_bad;
99 107
@@ -114,7 +122,7 @@ static int parse_reply_info_trace(void **p, void *end,
114 } 122 }
115 123
116 if (info->head->is_target) { 124 if (info->head->is_target) {
117 err = parse_reply_info_in(p, end, &info->targeti); 125 err = parse_reply_info_in(p, end, &info->targeti, features);
118 if (err < 0) 126 if (err < 0)
119 goto out_bad; 127 goto out_bad;
120 } 128 }
@@ -134,7 +142,8 @@ out_bad:
134 * parse readdir results 142 * parse readdir results
135 */ 143 */
136static int parse_reply_info_dir(void **p, void *end, 144static int parse_reply_info_dir(void **p, void *end,
137 struct ceph_mds_reply_info_parsed *info) 145 struct ceph_mds_reply_info_parsed *info,
146 int features)
138{ 147{
139 u32 num, i = 0; 148 u32 num, i = 0;
140 int err; 149 int err;
@@ -182,7 +191,7 @@ static int parse_reply_info_dir(void **p, void *end,
182 *p += sizeof(struct ceph_mds_reply_lease); 191 *p += sizeof(struct ceph_mds_reply_lease);
183 192
184 /* inode */ 193 /* inode */
185 err = parse_reply_info_in(p, end, &info->dir_in[i]); 194 err = parse_reply_info_in(p, end, &info->dir_in[i], features);
186 if (err < 0) 195 if (err < 0)
187 goto out_bad; 196 goto out_bad;
188 i++; 197 i++;
@@ -205,7 +214,8 @@ out_bad:
205 * parse fcntl F_GETLK results 214 * parse fcntl F_GETLK results
206 */ 215 */
207static int parse_reply_info_filelock(void **p, void *end, 216static int parse_reply_info_filelock(void **p, void *end,
208 struct ceph_mds_reply_info_parsed *info) 217 struct ceph_mds_reply_info_parsed *info,
218 int features)
209{ 219{
210 if (*p + sizeof(*info->filelock_reply) > end) 220 if (*p + sizeof(*info->filelock_reply) > end)
211 goto bad; 221 goto bad;
@@ -225,19 +235,21 @@ bad:
225 * parse extra results 235 * parse extra results
226 */ 236 */
227static int parse_reply_info_extra(void **p, void *end, 237static int parse_reply_info_extra(void **p, void *end,
228 struct ceph_mds_reply_info_parsed *info) 238 struct ceph_mds_reply_info_parsed *info,
239 int features)
229{ 240{
230 if (info->head->op == CEPH_MDS_OP_GETFILELOCK) 241 if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
231 return parse_reply_info_filelock(p, end, info); 242 return parse_reply_info_filelock(p, end, info, features);
232 else 243 else
233 return parse_reply_info_dir(p, end, info); 244 return parse_reply_info_dir(p, end, info, features);
234} 245}
235 246
236/* 247/*
237 * parse entire mds reply 248 * parse entire mds reply
238 */ 249 */
239static int parse_reply_info(struct ceph_msg *msg, 250static int parse_reply_info(struct ceph_msg *msg,
240 struct ceph_mds_reply_info_parsed *info) 251 struct ceph_mds_reply_info_parsed *info,
252 int features)
241{ 253{
242 void *p, *end; 254 void *p, *end;
243 u32 len; 255 u32 len;
@@ -250,7 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg,
250 /* trace */ 262 /* trace */
251 ceph_decode_32_safe(&p, end, len, bad); 263 ceph_decode_32_safe(&p, end, len, bad);
252 if (len > 0) { 264 if (len > 0) {
253 err = parse_reply_info_trace(&p, p+len, info); 265 err = parse_reply_info_trace(&p, p+len, info, features);
254 if (err < 0) 266 if (err < 0)
255 goto out_bad; 267 goto out_bad;
256 } 268 }
@@ -258,7 +270,7 @@ static int parse_reply_info(struct ceph_msg *msg,
258 /* extra */ 270 /* extra */
259 ceph_decode_32_safe(&p, end, len, bad); 271 ceph_decode_32_safe(&p, end, len, bad);
260 if (len > 0) { 272 if (len > 0) {
261 err = parse_reply_info_extra(&p, p+len, info); 273 err = parse_reply_info_extra(&p, p+len, info, features);
262 if (err < 0) 274 if (err < 0)
263 goto out_bad; 275 goto out_bad;
264 } 276 }
@@ -654,7 +666,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
654 } else { 666 } else {
655 /* dir + name */ 667 /* dir + name */
656 inode = dir; 668 inode = dir;
657 hash = req->r_dentry->d_name.hash; 669 hash = ceph_dentry_hash(req->r_dentry);
658 is_hash = true; 670 is_hash = true;
659 } 671 }
660 } 672 }
@@ -1693,7 +1705,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1693 struct ceph_msg *msg; 1705 struct ceph_msg *msg;
1694 int flags = 0; 1706 int flags = 0;
1695 1707
1696 req->r_mds = mds;
1697 req->r_attempts++; 1708 req->r_attempts++;
1698 if (req->r_inode) { 1709 if (req->r_inode) {
1699 struct ceph_cap *cap = 1710 struct ceph_cap *cap =
@@ -1780,6 +1791,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
1780 goto finish; 1791 goto finish;
1781 } 1792 }
1782 1793
1794 put_request_session(req);
1795
1783 mds = __choose_mds(mdsc, req); 1796 mds = __choose_mds(mdsc, req);
1784 if (mds < 0 || 1797 if (mds < 0 ||
1785 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { 1798 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
@@ -1797,6 +1810,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
1797 goto finish; 1810 goto finish;
1798 } 1811 }
1799 } 1812 }
1813 req->r_session = get_session(session);
1814
1800 dout("do_request mds%d session %p state %s\n", mds, session, 1815 dout("do_request mds%d session %p state %s\n", mds, session,
1801 session_state_name(session->s_state)); 1816 session_state_name(session->s_state));
1802 if (session->s_state != CEPH_MDS_SESSION_OPEN && 1817 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
@@ -1809,7 +1824,6 @@ static int __do_request(struct ceph_mds_client *mdsc,
1809 } 1824 }
1810 1825
1811 /* send request */ 1826 /* send request */
1812 req->r_session = get_session(session);
1813 req->r_resend_mds = -1; /* forget any previous mds hint */ 1827 req->r_resend_mds = -1; /* forget any previous mds hint */
1814 1828
1815 if (req->r_request_started == 0) /* note request start time */ 1829 if (req->r_request_started == 0) /* note request start time */
@@ -1863,7 +1877,6 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
1863 if (req->r_session && 1877 if (req->r_session &&
1864 req->r_session->s_mds == mds) { 1878 req->r_session->s_mds == mds) {
1865 dout(" kicking tid %llu\n", req->r_tid); 1879 dout(" kicking tid %llu\n", req->r_tid);
1866 put_request_session(req);
1867 __do_request(mdsc, req); 1880 __do_request(mdsc, req);
1868 } 1881 }
1869 } 1882 }
@@ -2056,8 +2069,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2056 goto out; 2069 goto out;
2057 } else { 2070 } else {
2058 struct ceph_inode_info *ci = ceph_inode(req->r_inode); 2071 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
2059 struct ceph_cap *cap = 2072 struct ceph_cap *cap = NULL;
2060 ceph_get_cap_for_mds(ci, req->r_mds);; 2073
2074 if (req->r_session)
2075 cap = ceph_get_cap_for_mds(ci,
2076 req->r_session->s_mds);
2061 2077
2062 dout("already using auth"); 2078 dout("already using auth");
2063 if ((!cap || cap != ci->i_auth_cap) || 2079 if ((!cap || cap != ci->i_auth_cap) ||
@@ -2101,7 +2117,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2101 2117
2102 dout("handle_reply tid %lld result %d\n", tid, result); 2118 dout("handle_reply tid %lld result %d\n", tid, result);
2103 rinfo = &req->r_reply_info; 2119 rinfo = &req->r_reply_info;
2104 err = parse_reply_info(msg, rinfo); 2120 err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
2105 mutex_unlock(&mdsc->mutex); 2121 mutex_unlock(&mdsc->mutex);
2106 2122
2107 mutex_lock(&session->s_mutex); 2123 mutex_lock(&session->s_mutex);
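Threading the session's peer feature bits into the parser lets a wire field be decoded only when the MDS actually sent it, and zeroed otherwise. A self-contained sketch of that feature-gated decode pattern; the feature bit, struct layout, and bounds check are illustrative, not the real ceph wire format:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    #define FEAT_DIRLAYOUTHASH (1 << 7)     /* placeholder bit value */

    struct dir_layout { uint8_t dl_dir_hash; uint8_t pad[3]; };

    static int decode_in(const uint8_t **p, const uint8_t *end,
                         int features, struct dir_layout *out)
    {
        if (features & FEAT_DIRLAYOUTHASH) {
            if ((size_t)(end - *p) < sizeof(*out))
                return -1;                  /* truncated reply */
            memcpy(out, *p, sizeof(*out));
            *p += sizeof(*out);
        } else {
            memset(out, 0, sizeof(*out));   /* old peer: assume defaults */
        }
        return 0;
    }

    int main(void)
    {
        uint8_t buf[4] = { 1, 0, 0, 0 };
        const uint8_t *p = buf;
        struct dir_layout dl;

        decode_in(&p, buf + sizeof(buf), FEAT_DIRLAYOUTHASH, &dl);
        printf("hash alg %u\n", dl.dl_dir_hash);    /* hash alg 1 */
        return 0;
    }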
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index aabe563b54db..4e3a9cc0bba6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -35,6 +35,7 @@ struct ceph_cap;
35 */ 35 */
36struct ceph_mds_reply_info_in { 36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in; 37 struct ceph_mds_reply_inode *in;
38 struct ceph_dir_layout dir_layout;
38 u32 symlink_len; 39 u32 symlink_len;
39 char *symlink; 40 char *symlink;
40 u32 xattr_len; 41 u32 xattr_len;
@@ -165,7 +166,6 @@ struct ceph_mds_request {
165 struct ceph_mds_client *r_mdsc; 166 struct ceph_mds_client *r_mdsc;
166 167
167 int r_op; /* mds op code */ 168 int r_op; /* mds op code */
168 int r_mds;
169 169
170 /* operation on what? */ 170 /* operation on what? */
171 struct inode *r_inode; /* arg1 */ 171 struct inode *r_inode; /* arg1 */
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 08b460ae0539..bf6f0f34082a 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -428,7 +428,8 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
428 goto fail; 428 goto fail;
429 } 429 }
430 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 430 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
431 fsc->client->supported_features |= CEPH_FEATURE_FLOCK; 431 fsc->client->supported_features |= CEPH_FEATURE_FLOCK |
432 CEPH_FEATURE_DIRLAYOUTHASH;
432 fsc->client->monc.want_mdsmap = 1; 433 fsc->client->monc.want_mdsmap = 1;
433 434
434 fsc->mount_options = fsopt; 435 fsc->mount_options = fsopt;
@@ -443,13 +444,17 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
443 goto fail_client; 444 goto fail_client;
444 445
445 err = -ENOMEM; 446 err = -ENOMEM;
446 fsc->wb_wq = create_workqueue("ceph-writeback"); 447 /*
448 * The number of concurrent work items can be high, but they don't
449 * need to be processed in parallel; limit concurrency.
450 */
451 fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
447 if (fsc->wb_wq == NULL) 452 if (fsc->wb_wq == NULL)
448 goto fail_bdi; 453 goto fail_bdi;
449 fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); 454 fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
450 if (fsc->pg_inv_wq == NULL) 455 if (fsc->pg_inv_wq == NULL)
451 goto fail_wb_wq; 456 goto fail_wb_wq;
452 fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc"); 457 fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
453 if (fsc->trunc_wq == NULL) 458 if (fsc->trunc_wq == NULL)
454 goto fail_pg_inv_wq; 459 goto fail_pg_inv_wq;
455 460
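alloc_workqueue() with max_active = 1 preserves the serialization that create_singlethread_workqueue() gave, without pinning a kernel thread per queue. A minimal kernel-style sketch of the conversion pattern (module boilerplate omitted):

    #include <linux/workqueue.h>
    #include <linux/errno.h>

    static struct workqueue_struct *example_wq;

    static int example_init(void)
    {
        /* was: example_wq = create_singlethread_workqueue("example"); */
        example_wq = alloc_workqueue("example", 0, 1);
        if (!example_wq)
            return -ENOMEM;
        return 0;
    }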
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 4553d8829edb..20b907d76ae2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -239,6 +239,7 @@ struct ceph_inode_info {
239 unsigned i_ceph_flags; 239 unsigned i_ceph_flags;
240 unsigned long i_release_count; 240 unsigned long i_release_count;
241 241
242 struct ceph_dir_layout i_dir_layout;
242 struct ceph_file_layout i_layout; 243 struct ceph_file_layout i_layout;
243 char *i_symlink; 244 char *i_symlink;
244 245
@@ -768,6 +769,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn);
768extern void ceph_dentry_lru_touch(struct dentry *dn); 769extern void ceph_dentry_lru_touch(struct dentry *dn);
769extern void ceph_dentry_lru_del(struct dentry *dn); 770extern void ceph_dentry_lru_del(struct dentry *dn);
770extern void ceph_invalidate_dentry_lease(struct dentry *dentry); 771extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
772extern unsigned ceph_dentry_hash(struct dentry *dn);
771 773
772/* 774/*
773 * our d_ops vary depending on whether the inode is live, 775 * our d_ops vary depending on whether the inode is live,
diff --git a/fs/char_dev.c b/fs/char_dev.c
index e5b9df993b93..dca9e5e0f73b 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -59,7 +59,7 @@ static struct char_device_struct {
59} *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; 59} *chrdevs[CHRDEV_MAJOR_HASH_SIZE];
60 60
61/* index in the above */ 61/* index in the above */
62static inline int major_to_index(int major) 62static inline int major_to_index(unsigned major)
63{ 63{
64 return major % CHRDEV_MAJOR_HASH_SIZE; 64 return major % CHRDEV_MAJOR_HASH_SIZE;
65} 65}
@@ -417,18 +417,6 @@ static int chrdev_open(struct inode *inode, struct file *filp)
417 return ret; 417 return ret;
418} 418}
419 419
420int cdev_index(struct inode *inode)
421{
422 int idx;
423 struct kobject *kobj;
424
425 kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
426 if (!kobj)
427 return -1;
428 kobject_put(kobj);
429 return idx;
430}
431
432void cd_forget(struct inode *inode) 420void cd_forget(struct inode *inode)
433{ 421{
434 spin_lock(&cdev_lock); 422 spin_lock(&cdev_lock);
@@ -582,7 +570,6 @@ EXPORT_SYMBOL(cdev_init);
582EXPORT_SYMBOL(cdev_alloc); 570EXPORT_SYMBOL(cdev_alloc);
583EXPORT_SYMBOL(cdev_del); 571EXPORT_SYMBOL(cdev_del);
584EXPORT_SYMBOL(cdev_add); 572EXPORT_SYMBOL(cdev_add);
585EXPORT_SYMBOL(cdev_index);
586EXPORT_SYMBOL(__register_chrdev); 573EXPORT_SYMBOL(__register_chrdev);
587EXPORT_SYMBOL(__unregister_chrdev); 574EXPORT_SYMBOL(__unregister_chrdev);
588EXPORT_SYMBOL(directly_mappable_cdev_bdi); 575EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 224d7bbd1fcc..e654dfd092c3 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -64,7 +64,9 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
64 void *buffer, uint16_t maxbuf) 64 void *buffer, uint16_t maxbuf)
65{ 65{
66 const struct TCP_Server_Info *server = cookie_netfs_data; 66 const struct TCP_Server_Info *server = cookie_netfs_data;
67 const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr; 67 const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr;
68 const struct sockaddr_in *addr = (struct sockaddr_in *) sa;
69 const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa;
68 struct cifs_server_key *key = buffer; 70 struct cifs_server_key *key = buffer;
69 uint16_t key_len = sizeof(struct cifs_server_key); 71 uint16_t key_len = sizeof(struct cifs_server_key);
70 72
@@ -76,16 +78,16 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
76 */ 78 */
77 switch (sa->sa_family) { 79 switch (sa->sa_family) {
78 case AF_INET: 80 case AF_INET:
79 key->family = server->addr.sockAddr.sin_family; 81 key->family = sa->sa_family;
80 key->port = server->addr.sockAddr.sin_port; 82 key->port = addr->sin_port;
81 key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr; 83 key->addr[0].ipv4_addr = addr->sin_addr;
82 key_len += sizeof(key->addr[0].ipv4_addr); 84 key_len += sizeof(key->addr[0].ipv4_addr);
83 break; 85 break;
84 86
85 case AF_INET6: 87 case AF_INET6:
86 key->family = server->addr.sockAddr6.sin6_family; 88 key->family = sa->sa_family;
87 key->port = server->addr.sockAddr6.sin6_port; 89 key->port = addr6->sin6_port;
88 key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr; 90 key->addr[0].ipv6_addr = addr6->sin6_addr;
89 key_len += sizeof(key->addr[0].ipv6_addr); 91 key_len += sizeof(key->addr[0].ipv6_addr);
90 break; 92 break;
91 93
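The key-generation code now goes through a single generic sockaddr pointer and casts per address family instead of keeping separately typed fields. A runnable sketch of the same casting pattern:

    #include <stdio.h>
    #include <netinet/in.h>
    #include <sys/socket.h>

    static unsigned short peer_port(const struct sockaddr *sa)
    {
        switch (sa->sa_family) {
        case AF_INET:
            return ntohs(((const struct sockaddr_in *)sa)->sin_port);
        case AF_INET6:
            return ntohs(((const struct sockaddr_in6 *)sa)->sin6_port);
        default:
            return 0;
        }
    }

    int main(void)
    {
        struct sockaddr_in v4 = { .sin_family = AF_INET,
                                  .sin_port   = htons(445) };

        printf("%u\n", peer_port((struct sockaddr *)&v4));  /* 445 */
        return 0;
    }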
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 103ab8b605b0..65829d32128c 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -79,11 +79,11 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
79 spin_lock(&GlobalMid_Lock); 79 spin_lock(&GlobalMid_Lock);
80 list_for_each(tmp, &server->pending_mid_q) { 80 list_for_each(tmp, &server->pending_mid_q) {
81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
82 cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", 82 cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d",
83 mid_entry->midState, 83 mid_entry->midState,
84 (int)mid_entry->command, 84 (int)mid_entry->command,
85 mid_entry->pid, 85 mid_entry->pid,
86 mid_entry->tsk, 86 mid_entry->callback_data,
87 mid_entry->mid); 87 mid_entry->mid);
88#ifdef CONFIG_CIFS_STATS2 88#ifdef CONFIG_CIFS_STATS2
89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld", 89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
@@ -119,29 +119,27 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
119 "Display Internal CIFS Data Structures for Debugging\n" 119 "Display Internal CIFS Data Structures for Debugging\n"
120 "---------------------------------------------------\n"); 120 "---------------------------------------------------\n");
121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); 121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
122 seq_printf(m, "Features: "); 122 seq_printf(m, "Features:");
123#ifdef CONFIG_CIFS_DFS_UPCALL 123#ifdef CONFIG_CIFS_DFS_UPCALL
124 seq_printf(m, "dfs"); 124 seq_printf(m, " dfs");
125 seq_putc(m, ' ');
126#endif 125#endif
127#ifdef CONFIG_CIFS_FSCACHE 126#ifdef CONFIG_CIFS_FSCACHE
128 seq_printf(m, "fscache"); 127 seq_printf(m, " fscache");
129 seq_putc(m, ' ');
130#endif 128#endif
131#ifdef CONFIG_CIFS_WEAK_PW_HASH 129#ifdef CONFIG_CIFS_WEAK_PW_HASH
132 seq_printf(m, "lanman"); 130 seq_printf(m, " lanman");
133 seq_putc(m, ' ');
134#endif 131#endif
135#ifdef CONFIG_CIFS_POSIX 132#ifdef CONFIG_CIFS_POSIX
136 seq_printf(m, "posix"); 133 seq_printf(m, " posix");
137 seq_putc(m, ' ');
138#endif 134#endif
139#ifdef CONFIG_CIFS_UPCALL 135#ifdef CONFIG_CIFS_UPCALL
140 seq_printf(m, "spnego"); 136 seq_printf(m, " spnego");
141 seq_putc(m, ' ');
142#endif 137#endif
143#ifdef CONFIG_CIFS_XATTR 138#ifdef CONFIG_CIFS_XATTR
144 seq_printf(m, "xattr"); 139 seq_printf(m, " xattr");
140#endif
141#ifdef CONFIG_CIFS_ACL
142 seq_printf(m, " acl");
145#endif 143#endif
146 seq_putc(m, '\n'); 144 seq_putc(m, '\n');
147 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); 145 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
@@ -220,11 +218,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
220 mid_entry = list_entry(tmp3, struct mid_q_entry, 218 mid_entry = list_entry(tmp3, struct mid_q_entry,
221 qhead); 219 qhead);
222 seq_printf(m, "\tState: %d com: %d pid:" 220 seq_printf(m, "\tState: %d com: %d pid:"
223 " %d tsk: %p mid %d\n", 221 " %d cbdata: %p mid %d\n",
224 mid_entry->midState, 222 mid_entry->midState,
225 (int)mid_entry->command, 223 (int)mid_entry->command,
226 mid_entry->pid, 224 mid_entry->pid,
227 mid_entry->tsk, 225 mid_entry->callback_data,
228 mid_entry->mid); 226 mid_entry->mid);
229 } 227 }
230 spin_unlock(&GlobalMid_Lock); 228 spin_unlock(&GlobalMid_Lock);
@@ -333,7 +331,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
333 atomic_read(&totSmBufAllocCount)); 331 atomic_read(&totSmBufAllocCount));
334#endif /* CONFIG_CIFS_STATS2 */ 332#endif /* CONFIG_CIFS_STATS2 */
335 333
336 seq_printf(m, "Operations (MIDs): %d\n", midCount.counter); 334 seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount));
337 seq_printf(m, 335 seq_printf(m,
338 "\n%d session %d share reconnects\n", 336 "\n%d session %d share reconnects\n",
339 tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); 337 tcpSesReconnectCount.counter, tconInfoReconnectCount.counter);
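
The midCount change above swaps a read of the private .counter field for the atomic_read() accessor; the neighboring reconnect counters are left on the raw field by this patch. A minimal sketch of the accessor pattern, with a variable local to the example:

	#include <linux/atomic.h>

	static atomic_t example_count = ATOMIC_INIT(0);

	/* atomic_inc()/atomic_read() are the supported API; touching
	 * example_count.counter directly relies on internal layout. */
	static int example_snapshot(void)
	{
		atomic_inc(&example_count);
		return atomic_read(&example_count);
	}
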
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index c68a056f27fd..7ed36536e754 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -255,35 +255,6 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
255 255
256} 256}
257 257
258static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
259 struct list_head *mntlist)
260{
261 /* stolen from afs code */
262 int err;
263
264 mntget(newmnt);
265 err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist);
266 switch (err) {
267 case 0:
268 path_put(&nd->path);
269 nd->path.mnt = newmnt;
270 nd->path.dentry = dget(newmnt->mnt_root);
271 schedule_delayed_work(&cifs_dfs_automount_task,
272 cifs_dfs_mountpoint_expiry_timeout);
273 break;
274 case -EBUSY:
275 /* someone else made a mount here whilst we were busy */
276 while (d_mountpoint(nd->path.dentry) &&
277 follow_down(&nd->path))
278 ;
279 err = 0;
280 default:
281 mntput(newmnt);
282 break;
283 }
284 return err;
285}
286
287static void dump_referral(const struct dfs_info3_param *ref) 258static void dump_referral(const struct dfs_info3_param *ref)
288{ 259{
289 cFYI(1, "DFS: ref path: %s", ref->path_name); 260 cFYI(1, "DFS: ref path: %s", ref->path_name);
@@ -293,45 +264,43 @@ static void dump_referral(const struct dfs_info3_param *ref)
293 ref->path_consumed); 264 ref->path_consumed);
294} 265}
295 266
296 267/*
297static void* 268 * Create a vfsmount that we can automount
298cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) 269 */
270static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
299{ 271{
300 struct dfs_info3_param *referrals = NULL; 272 struct dfs_info3_param *referrals = NULL;
301 unsigned int num_referrals = 0; 273 unsigned int num_referrals = 0;
302 struct cifs_sb_info *cifs_sb; 274 struct cifs_sb_info *cifs_sb;
303 struct cifsSesInfo *ses; 275 struct cifsSesInfo *ses;
304 char *full_path = NULL; 276 char *full_path;
305 int xid, i; 277 int xid, i;
306 int rc = 0; 278 int rc;
307 struct vfsmount *mnt = ERR_PTR(-ENOENT); 279 struct vfsmount *mnt;
308 struct tcon_link *tlink; 280 struct tcon_link *tlink;
309 281
310 cFYI(1, "in %s", __func__); 282 cFYI(1, "in %s", __func__);
311 BUG_ON(IS_ROOT(dentry)); 283 BUG_ON(IS_ROOT(mntpt));
312 284
313 xid = GetXid(); 285 xid = GetXid();
314 286
315 dput(nd->path.dentry);
316 nd->path.dentry = dget(dentry);
317
318 /* 287 /*
319 * The MSDFS spec states that paths in DFS referral requests and 288 * The MSDFS spec states that paths in DFS referral requests and
320 * responses must be prefixed by a single '\' character instead of 289 * responses must be prefixed by a single '\' character instead of
321 * the double backslashes usually used in the UNC. This function 290 * the double backslashes usually used in the UNC. This function
322 * gives us the latter, so we must adjust the result. 291 * gives us the latter, so we must adjust the result.
323 */ 292 */
324 full_path = build_path_from_dentry(dentry); 293 mnt = ERR_PTR(-ENOMEM);
325 if (full_path == NULL) { 294 full_path = build_path_from_dentry(mntpt);
326 rc = -ENOMEM; 295 if (full_path == NULL)
327 goto out_err; 296 goto free_xid;
328 }
329 297
330 cifs_sb = CIFS_SB(dentry->d_inode->i_sb); 298 cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
331 tlink = cifs_sb_tlink(cifs_sb); 299 tlink = cifs_sb_tlink(cifs_sb);
300 mnt = ERR_PTR(-EINVAL);
332 if (IS_ERR(tlink)) { 301 if (IS_ERR(tlink)) {
333 rc = PTR_ERR(tlink); 302 mnt = ERR_CAST(tlink);
334 goto out_err; 303 goto free_full_path;
335 } 304 }
336 ses = tlink_tcon(tlink)->ses; 305 ses = tlink_tcon(tlink)->ses;
337 306
@@ -341,46 +310,63 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
341 310
342 cifs_put_tlink(tlink); 311 cifs_put_tlink(tlink);
343 312
313 mnt = ERR_PTR(-ENOENT);
344 for (i = 0; i < num_referrals; i++) { 314 for (i = 0; i < num_referrals; i++) {
345 int len; 315 int len;
346 dump_referral(referrals+i); 316 dump_referral(referrals + i);
347 /* connect to a node */ 317 /* connect to a node */
348 len = strlen(referrals[i].node_name); 318 len = strlen(referrals[i].node_name);
349 if (len < 2) { 319 if (len < 2) {
350 cERROR(1, "%s: Net Address path too short: %s", 320 cERROR(1, "%s: Net Address path too short: %s",
351 __func__, referrals[i].node_name); 321 __func__, referrals[i].node_name);
352 rc = -EINVAL; 322 mnt = ERR_PTR(-EINVAL);
353 goto out_err; 323 break;
354 } 324 }
355 mnt = cifs_dfs_do_refmount(cifs_sb, 325 mnt = cifs_dfs_do_refmount(cifs_sb,
356 full_path, referrals + i); 326 full_path, referrals + i);
357 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, 327 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
358 referrals[i].node_name, mnt); 328 referrals[i].node_name, mnt);
359
360 /* complete mount procedure if we accured submount */
361 if (!IS_ERR(mnt)) 329 if (!IS_ERR(mnt))
362 break; 330 goto success;
363 } 331 }
364 332
365 /* we need it cause for() above could exit without valid submount */ 333 /* no valid submounts were found; return error from get_dfs_path() by
366 rc = PTR_ERR(mnt); 334 * preference */
367 if (IS_ERR(mnt)) 335 if (rc != 0)
368 goto out_err; 336 mnt = ERR_PTR(rc);
369
370 rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list);
371 337
372out: 338success:
373 FreeXid(xid);
374 free_dfs_info_array(referrals, num_referrals); 339 free_dfs_info_array(referrals, num_referrals);
340free_full_path:
375 kfree(full_path); 341 kfree(full_path);
342free_xid:
343 FreeXid(xid);
376 cFYI(1, "leaving %s" , __func__); 344 cFYI(1, "leaving %s" , __func__);
377 return ERR_PTR(rc); 345 return mnt;
378out_err: 346}
379 path_put(&nd->path); 347
380 goto out; 348/*
349 * Attempt to automount the referral
350 */
351struct vfsmount *cifs_dfs_d_automount(struct path *path)
352{
353 struct vfsmount *newmnt;
354
355 cFYI(1, "in %s", __func__);
356
357 newmnt = cifs_dfs_do_automount(path->dentry);
358 if (IS_ERR(newmnt)) {
359 cFYI(1, "leaving %s [automount failed]" , __func__);
360 return newmnt;
361 }
362
363 mntget(newmnt); /* prevent immediate expiration */
364 mnt_set_expiry(newmnt, &cifs_dfs_automount_list);
365 schedule_delayed_work(&cifs_dfs_automount_task,
366 cifs_dfs_mountpoint_expiry_timeout);
367 cFYI(1, "leaving %s [ok]" , __func__);
368 return newmnt;
381} 369}
382 370
383const struct inode_operations cifs_dfs_referral_inode_operations = { 371const struct inode_operations cifs_dfs_referral_inode_operations = {
384 .follow_link = cifs_dfs_follow_mountpoint,
385}; 372};
386
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 7852cd677051..ac51cd2d33ae 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -40,6 +40,7 @@
40#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ 40#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */
41#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */ 41#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */
42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ 42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */
43#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */
43 44
44struct cifs_sb_info { 45struct cifs_sb_info {
45 struct rb_root tlink_tree; 46 struct rb_root tlink_tree;
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 87044906cd1f..4dfba8283165 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -98,6 +98,8 @@ struct key *
98cifs_get_spnego_key(struct cifsSesInfo *sesInfo) 98cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
99{ 99{
100 struct TCP_Server_Info *server = sesInfo->server; 100 struct TCP_Server_Info *server = sesInfo->server;
101 struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
102 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
101 char *description, *dp; 103 char *description, *dp;
102 size_t desc_len; 104 size_t desc_len;
103 struct key *spnego_key; 105 struct key *spnego_key;
@@ -127,10 +129,10 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
127 dp = description + strlen(description); 129 dp = description + strlen(description);
128 130
129 /* add the server address */ 131 /* add the server address */
130 if (server->addr.sockAddr.sin_family == AF_INET) 132 if (server->dstaddr.ss_family == AF_INET)
131 sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); 133 sprintf(dp, "ip4=%pI4", &sa->sin_addr);
132 else if (server->addr.sockAddr.sin_family == AF_INET6) 134 else if (server->dstaddr.ss_family == AF_INET6)
133 sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr); 135 sprintf(dp, "ip6=%pI6", &sa6->sin6_addr);
134 else 136 else
135 goto out; 137 goto out;
136 138
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 430f510a1720..fc0fd4fde306 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,10 +44,14 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
44 int charlen, outlen = 0; 44 int charlen, outlen = 0;
45 int maxwords = maxbytes / 2; 45 int maxwords = maxbytes / 2;
46 char tmp[NLS_MAX_CHARSET_SIZE]; 46 char tmp[NLS_MAX_CHARSET_SIZE];
47 __u16 ftmp;
47 48
48 for (i = 0; i < maxwords && from[i]; i++) { 49 for (i = 0; i < maxwords; i++) {
49 charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, 50 ftmp = get_unaligned_le16(&from[i]);
50 NLS_MAX_CHARSET_SIZE); 51 if (ftmp == 0)
52 break;
53
54 charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
51 if (charlen > 0) 55 if (charlen > 0)
52 outlen += charlen; 56 outlen += charlen;
53 else 57 else
@@ -58,9 +62,9 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
58} 62}
59 63
60/* 64/*
61 * cifs_mapchar - convert a little-endian char to proper char in codepage 65 * cifs_mapchar - convert a host-endian char to proper char in codepage
62 * @target - where converted character should be copied 66 * @target - where converted character should be copied
63 * @src_char - 2 byte little-endian source character 67 * @src_char - 2 byte host-endian source character
64 * @cp - codepage to which character should be converted 68 * @cp - codepage to which character should be converted
65 * @mapchar - should character be mapped according to mapchars mount option? 69 * @mapchar - should character be mapped according to mapchars mount option?
66 * 70 *
@@ -69,7 +73,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
69 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). 73 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
70 */ 74 */
71static int 75static int
72cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp, 76cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
73 bool mapchar) 77 bool mapchar)
74{ 78{
75 int len = 1; 79 int len = 1;
@@ -82,7 +86,7 @@ cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
82 * build_path_from_dentry are modified, as they use slash as 86 * build_path_from_dentry are modified, as they use slash as
83 * separator. 87 * separator.
84 */ 88 */
85 switch (le16_to_cpu(src_char)) { 89 switch (src_char) {
86 case UNI_COLON: 90 case UNI_COLON:
87 *target = ':'; 91 *target = ':';
88 break; 92 break;
@@ -109,8 +113,7 @@ out:
109 return len; 113 return len;
110 114
111cp_convert: 115cp_convert:
112 len = cp->uni2char(le16_to_cpu(src_char), target, 116 len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
113 NLS_MAX_CHARSET_SIZE);
114 if (len <= 0) { 117 if (len <= 0) {
115 *target = '?'; 118 *target = '?';
116 len = 1; 119 len = 1;
@@ -149,6 +152,7 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
149 int nullsize = nls_nullsize(codepage); 152 int nullsize = nls_nullsize(codepage);
150 int fromwords = fromlen / 2; 153 int fromwords = fromlen / 2;
151 char tmp[NLS_MAX_CHARSET_SIZE]; 154 char tmp[NLS_MAX_CHARSET_SIZE];
155 __u16 ftmp;
152 156
153 /* 157 /*
154 * because the chars can be of varying widths, we need to take care 158 * because the chars can be of varying widths, we need to take care
@@ -158,19 +162,23 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
158 */ 162 */
159 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); 163 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
160 164
161 for (i = 0; i < fromwords && from[i]; i++) { 165 for (i = 0; i < fromwords; i++) {
166 ftmp = get_unaligned_le16(&from[i]);
167 if (ftmp == 0)
168 break;
169
162 /* 170 /*
163 * check to see if converting this character might make the 171 * check to see if converting this character might make the
164 * conversion bleed into the null terminator 172 * conversion bleed into the null terminator
165 */ 173 */
166 if (outlen >= safelen) { 174 if (outlen >= safelen) {
167 charlen = cifs_mapchar(tmp, from[i], codepage, mapchar); 175 charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar);
168 if ((outlen + charlen) > (tolen - nullsize)) 176 if ((outlen + charlen) > (tolen - nullsize))
169 break; 177 break;
170 } 178 }
171 179
172 /* put converted char into 'to' buffer */ 180 /* put converted char into 'to' buffer */
173 charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar); 181 charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
174 outlen += charlen; 182 outlen += charlen;
175 } 183 }
176 184
@@ -193,24 +201,21 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
193{ 201{
194 int charlen; 202 int charlen;
195 int i; 203 int i;
196 wchar_t *wchar_to = (wchar_t *)to; /* needed to quiet sparse */ 204 wchar_t wchar_to; /* needed to quiet sparse */
197 205
198 for (i = 0; len && *from; i++, from += charlen, len -= charlen) { 206 for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
199 207 charlen = codepage->char2uni(from, len, &wchar_to);
200 /* works for 2.4.0 kernel or later */
201 charlen = codepage->char2uni(from, len, &wchar_to[i]);
202 if (charlen < 1) { 208 if (charlen < 1) {
203 cERROR(1, "strtoUCS: char2uni of %d returned %d", 209 cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
204 (int)*from, charlen); 210 *from, charlen);
205 /* A question mark */ 211 /* A question mark */
206 to[i] = cpu_to_le16(0x003f); 212 wchar_to = 0x003f;
207 charlen = 1; 213 charlen = 1;
208 } else 214 }
209 to[i] = cpu_to_le16(wchar_to[i]); 215 put_unaligned_le16(wchar_to, &to[i]);
210
211 } 216 }
212 217
213 to[i] = 0; 218 put_unaligned_le16(0, &to[i]);
214 return i; 219 return i;
215} 220}
216 221
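
The rewrite above routes every 16-bit load and store through get_unaligned_le16()/put_unaligned_le16(), since wire buffers give no alignment guarantee. A userspace model of what those helpers do, with illustrative names:

	#include <stdint.h>
	#include <string.h>

	/* Model of put_unaligned_le16(): store a 16-bit little-endian value
	 * at any address via byte copies, so strict-alignment CPUs never
	 * take an alignment fault. */
	static inline void model_put_unaligned_le16(uint16_t val, void *p)
	{
		unsigned char b[2] = { val & 0xff, val >> 8 };

		memcpy(p, b, sizeof(b));
	}

	/* Model of get_unaligned_le16(): the matching unaligned load. */
	static inline uint16_t model_get_unaligned_le16(const void *p)
	{
		unsigned char b[2];

		memcpy(b, p, sizeof(b));
		return (uint16_t)(b[0] | (b[1] << 8));
	}
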
@@ -252,3 +257,79 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
252 return dst; 257 return dst;
253} 258}
254 259
260/*
261 * Convert 16 bit Unicode pathname to wire format from string in current code
262 * page. Conversion may involve remapping the six characters that are
263 * only legal in a POSIX-like OS (if they are present in the string). Path
264 * names are little-endian 16-bit Unicode on the wire.
265 */
266int
267cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
268 const struct nls_table *cp, int mapChars)
269{
270 int i, j, charlen;
271 int len_remaining = maxlen;
272 char src_char;
273 __u16 temp;
274
275 if (!mapChars)
276 return cifs_strtoUCS(target, source, PATH_MAX, cp);
277
278 for (i = 0, j = 0; i < maxlen; j++) {
279 src_char = source[i];
280 switch (src_char) {
281 case 0:
282 put_unaligned_le16(0, &target[j]);
283 goto ctoUCS_out;
284 case ':':
285 temp = UNI_COLON;
286 break;
287 case '*':
288 temp = UNI_ASTERIK;
289 break;
290 case '?':
291 temp = UNI_QUESTION;
292 break;
293 case '<':
294 temp = UNI_LESSTHAN;
295 break;
296 case '>':
297 temp = UNI_GRTRTHAN;
298 break;
299 case '|':
300 temp = UNI_PIPE;
301 break;
302 /*
303 * FIXME: We can not handle remapping backslash (UNI_SLASH)
304 * until all the calls to build_path_from_dentry are modified,
305 * as they use backslash as separator.
306 */
307 default:
308 charlen = cp->char2uni(source+i, len_remaining,
309 &temp);
310 /*
311 * if no match, use question mark, which at least in
312 * some cases serves as wild card
313 */
314 if (charlen < 1) {
315 temp = 0x003f;
316 charlen = 1;
317 }
318 len_remaining -= charlen;
319 /*
320 * character may take more than one byte in the source
321 * string, but will take exactly two bytes in the
322 * target string
323 */
324 i += charlen;
325 continue;
326 }
327 put_unaligned_le16(temp, &target[j]);
328 i++; /* move to next char in source string */
329 len_remaining--;
330 }
331
332ctoUCS_out:
333 return i;
334}
335
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index a437ec391a01..1e7636b145a8 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -41,9 +41,12 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
41; 41;
42 42
43 43
44/* security id for everyone */ 44/* security id for everyone/world system group */
45static const struct cifs_sid sid_everyone = { 45static const struct cifs_sid sid_everyone = {
46 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; 46 1, 1, {0, 0, 0, 0, 0, 1}, {0} };
47/* security id for Authenticated Users system group */
48static const struct cifs_sid sid_authusers = {
49 1, 1, {0, 0, 0, 0, 0, 5}, {11} };
47/* group users */ 50/* group users */
48static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; 51static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
49 52
@@ -365,7 +368,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
365 if (num_aces > 0) { 368 if (num_aces > 0) {
366 umode_t user_mask = S_IRWXU; 369 umode_t user_mask = S_IRWXU;
367 umode_t group_mask = S_IRWXG; 370 umode_t group_mask = S_IRWXG;
368 umode_t other_mask = S_IRWXO; 371 umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
369 372
370 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), 373 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
371 GFP_KERNEL); 374 GFP_KERNEL);
@@ -390,6 +393,12 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
390 ppace[i]->type, 393 ppace[i]->type,
391 &fattr->cf_mode, 394 &fattr->cf_mode,
392 &other_mask); 395 &other_mask);
396 if (compare_sids(&(ppace[i]->sid), &sid_authusers))
397 access_flags_to_mode(ppace[i]->access_req,
398 ppace[i]->type,
399 &fattr->cf_mode,
400 &other_mask);
401
393 402
394/* memcpy((void *)(&(cifscred->aces[i])), 403/* memcpy((void *)(&(cifscred->aces[i])),
395 (void *)ppace[i], 404 (void *)ppace[i],
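
For reference, the initializer added above encodes the well-known SID S-1-5-11 (Authenticated Users): revision 1, one subauthority, 48-bit authority 5, subauthority 11; sid_everyone reads as S-1-1-0 the same way. A small userspace sketch that renders such a struct in the usual S-r-a-s notation (field names are illustrative):

	#include <stdint.h>
	#include <stdio.h>

	struct example_sid {
		uint8_t		revision;
		uint8_t		num_subauth;
		uint8_t		authority[6];	/* 48-bit big-endian authority */
		uint32_t	sub_auth[5];
	};

	static void example_print_sid(const struct example_sid *sid)
	{
		uint64_t auth = 0;
		int i;

		for (i = 0; i < 6; i++)
			auth = (auth << 8) | sid->authority[i];
		printf("S-%u-%llu", (unsigned)sid->revision,
		       (unsigned long long)auth);
		for (i = 0; i < sid->num_subauth; i++)
			printf("-%lu", (unsigned long)sid->sub_auth[i]);
		printf("\n");
	}

	int main(void)
	{
		/* mirrors sid_authusers above; prints S-1-5-11 */
		struct example_sid authusers = { 1, 1, {0, 0, 0, 0, 0, 5}, {11} };

		example_print_sid(&authusers);
		return 0;
	}
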
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index f856732161ab..66f3d50d0676 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -72,6 +72,7 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
72 return 0; 72 return 0;
73} 73}
74 74
75/* must be called with server->srv_mutex held */
75int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, 76int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
76 __u32 *pexpected_response_sequence_number) 77 __u32 *pexpected_response_sequence_number)
77{ 78{
@@ -84,14 +85,12 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
84 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 85 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
85 return rc; 86 return rc;
86 87
87 spin_lock(&GlobalMid_Lock);
88 cifs_pdu->Signature.Sequence.SequenceNumber = 88 cifs_pdu->Signature.Sequence.SequenceNumber =
89 cpu_to_le32(server->sequence_number); 89 cpu_to_le32(server->sequence_number);
90 cifs_pdu->Signature.Sequence.Reserved = 0; 90 cifs_pdu->Signature.Sequence.Reserved = 0;
91 91
92 *pexpected_response_sequence_number = server->sequence_number++; 92 *pexpected_response_sequence_number = server->sequence_number++;
93 server->sequence_number++; 93 server->sequence_number++;
94 spin_unlock(&GlobalMid_Lock);
95 94
96 rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); 95 rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
97 if (rc) 96 if (rc)
@@ -149,6 +148,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
149 return rc; 148 return rc;
150} 149}
151 150
151/* must be called with server->srv_mutex held */
152int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, 152int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
153 __u32 *pexpected_response_sequence_number) 153 __u32 *pexpected_response_sequence_number)
154{ 154{
@@ -162,14 +162,12 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
162 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 162 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
163 return rc; 163 return rc;
164 164
165 spin_lock(&GlobalMid_Lock);
166 cifs_pdu->Signature.Sequence.SequenceNumber = 165 cifs_pdu->Signature.Sequence.SequenceNumber =
167 cpu_to_le32(server->sequence_number); 166 cpu_to_le32(server->sequence_number);
168 cifs_pdu->Signature.Sequence.Reserved = 0; 167 cifs_pdu->Signature.Sequence.Reserved = 0;
169 168
170 *pexpected_response_sequence_number = server->sequence_number++; 169 *pexpected_response_sequence_number = server->sequence_number++;
171 server->sequence_number++; 170 server->sequence_number++;
172 spin_unlock(&GlobalMid_Lock);
173 171
174 rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); 172 rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
175 if (rc) 173 if (rc)
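
The two hunks above drop the GlobalMid_Lock pair because the sequence number is now serialized by srv_mutex, which the caller must hold across both signing and the socket write so signatures and wire order stay in lockstep. A hedged sketch of that caller-side contract (the wrapper and the smb_send() shape are assumptions, not code from this patch):

	/* Sketch only: sign and send under one srv_mutex hold. */
	static int example_send_signed(struct TCP_Server_Info *server,
				       struct smb_hdr *buf, __u32 *expected_seq)
	{
		int rc;

		mutex_lock(&server->srv_mutex);
		/* bumps server->sequence_number; safe under srv_mutex */
		rc = cifs_sign_smb(buf, server, expected_seq);
		if (!rc)
			rc = smb_send(server, buf, buf->smb_buf_length);
		mutex_unlock(&server->srv_mutex);
		return rc;
	}
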
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8e21e0fe65d5..a8323f1dc1c4 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -77,7 +77,11 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ;
77module_param(cifs_max_pending, int, 0); 77module_param(cifs_max_pending, int, 0);
78MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " 78MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
79 "Default: 50 Range: 2 to 256"); 79 "Default: 50 Range: 2 to 256");
80 80unsigned short echo_retries = 5;
81module_param(echo_retries, ushort, 0644);
82MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up "
83 "and reconnecting to the server. Default: 5. "
84 "0 means never reconnect.");
81extern mempool_t *cifs_sm_req_poolp; 85extern mempool_t *cifs_sm_req_poolp;
82extern mempool_t *cifs_req_poolp; 86extern mempool_t *cifs_req_poolp;
83extern mempool_t *cifs_mid_poolp; 87extern mempool_t *cifs_mid_poolp;
@@ -174,6 +178,12 @@ cifs_read_super(struct super_block *sb, void *data,
174 goto out_no_root; 178 goto out_no_root;
175 } 179 }
176 180
181 /* do that *after* d_alloc_root() - we want NULL ->d_op for root here */
182 if (cifs_sb_master_tcon(cifs_sb)->nocase)
183 sb->s_d_op = &cifs_ci_dentry_ops;
184 else
185 sb->s_d_op = &cifs_dentry_ops;
186
177#ifdef CONFIG_CIFS_EXPERIMENTAL 187#ifdef CONFIG_CIFS_EXPERIMENTAL
178 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 188 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
179 cFYI(1, "export ops supported"); 189 cFYI(1, "export ops supported");
@@ -329,6 +339,8 @@ cifs_alloc_inode(struct super_block *sb)
329 cifs_inode->invalid_mapping = false; 339 cifs_inode->invalid_mapping = false;
330 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 340 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
331 cifs_inode->server_eof = 0; 341 cifs_inode->server_eof = 0;
342 cifs_inode->uniqueid = 0;
343 cifs_inode->createtime = 0;
332 344
333 /* Can not set i_flags here - they get immediately overwritten 345 /* Can not set i_flags here - they get immediately overwritten
334 to zero by the VFS */ 346 to zero by the VFS */
@@ -361,18 +373,19 @@ cifs_evict_inode(struct inode *inode)
361static void 373static void
362cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) 374cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
363{ 375{
376 struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
377 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
378
364 seq_printf(s, ",addr="); 379 seq_printf(s, ",addr=");
365 380
366 switch (server->addr.sockAddr.sin_family) { 381 switch (server->dstaddr.ss_family) {
367 case AF_INET: 382 case AF_INET:
368 seq_printf(s, "%pI4", &server->addr.sockAddr.sin_addr.s_addr); 383 seq_printf(s, "%pI4", &sa->sin_addr.s_addr);
369 break; 384 break;
370 case AF_INET6: 385 case AF_INET6:
371 seq_printf(s, "%pI6", 386 seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr);
372 &server->addr.sockAddr6.sin6_addr.s6_addr); 387 if (sa6->sin6_scope_id)
373 if (server->addr.sockAddr6.sin6_scope_id) 388 seq_printf(s, "%%%u", sa6->sin6_scope_id);
374 seq_printf(s, "%%%u",
375 server->addr.sockAddr6.sin6_scope_id);
376 break; 389 break;
377 default: 390 default:
378 seq_printf(s, "(unknown)"); 391 seq_printf(s, "(unknown)");
@@ -720,6 +733,25 @@ const struct file_operations cifs_file_ops = {
720 .setlease = cifs_setlease, 733 .setlease = cifs_setlease,
721}; 734};
722 735
736const struct file_operations cifs_file_strict_ops = {
737 .read = do_sync_read,
738 .write = do_sync_write,
739 .aio_read = cifs_strict_readv,
740 .aio_write = cifs_file_aio_write,
741 .open = cifs_open,
742 .release = cifs_close,
743 .lock = cifs_lock,
744 .fsync = cifs_strict_fsync,
745 .flush = cifs_flush,
746 .mmap = cifs_file_strict_mmap,
747 .splice_read = generic_file_splice_read,
748 .llseek = cifs_llseek,
749#ifdef CONFIG_CIFS_POSIX
750 .unlocked_ioctl = cifs_ioctl,
751#endif /* CONFIG_CIFS_POSIX */
752 .setlease = cifs_setlease,
753};
754
723const struct file_operations cifs_file_direct_ops = { 755const struct file_operations cifs_file_direct_ops = {
724 /* no aio, no readv - 756 /* no aio, no readv -
725 BB reevaluate whether they can be done with directio, no cache */ 757 BB reevaluate whether they can be done with directio, no cache */
@@ -738,6 +770,7 @@ const struct file_operations cifs_file_direct_ops = {
738 .llseek = cifs_llseek, 770 .llseek = cifs_llseek,
739 .setlease = cifs_setlease, 771 .setlease = cifs_setlease,
740}; 772};
773
741const struct file_operations cifs_file_nobrl_ops = { 774const struct file_operations cifs_file_nobrl_ops = {
742 .read = do_sync_read, 775 .read = do_sync_read,
743 .write = do_sync_write, 776 .write = do_sync_write,
@@ -756,6 +789,24 @@ const struct file_operations cifs_file_nobrl_ops = {
756 .setlease = cifs_setlease, 789 .setlease = cifs_setlease,
757}; 790};
758 791
792const struct file_operations cifs_file_strict_nobrl_ops = {
793 .read = do_sync_read,
794 .write = do_sync_write,
795 .aio_read = cifs_strict_readv,
796 .aio_write = cifs_file_aio_write,
797 .open = cifs_open,
798 .release = cifs_close,
799 .fsync = cifs_strict_fsync,
800 .flush = cifs_flush,
801 .mmap = cifs_file_strict_mmap,
802 .splice_read = generic_file_splice_read,
803 .llseek = cifs_llseek,
804#ifdef CONFIG_CIFS_POSIX
805 .unlocked_ioctl = cifs_ioctl,
806#endif /* CONFIG_CIFS_POSIX */
807 .setlease = cifs_setlease,
808};
809
759const struct file_operations cifs_file_direct_nobrl_ops = { 810const struct file_operations cifs_file_direct_nobrl_ops = {
760 /* no mmap, no aio, no readv - 811 /* no mmap, no aio, no readv -
761 BB reevaluate whether they can be done with directio, no cache */ 812 BB reevaluate whether they can be done with directio, no cache */
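
The new strict-I/O op vectors above pair with the CIFS_MOUNT_STRICT_IO flag added in cifs_fs_sb.h. A hedged sketch of how the six vectors might be selected at open time (the helper is illustrative; the real selection sits in the inode/open paths):

	static const struct file_operations *
	example_select_file_ops(struct cifs_sb_info *cifs_sb, bool brl_ok)
	{
		unsigned int flags = cifs_sb->mnt_cifs_flags;

		if (flags & CIFS_MOUNT_DIRECT_IO)	/* no client caching */
			return brl_ok ? &cifs_file_direct_ops
				      : &cifs_file_direct_nobrl_ops;
		if (flags & CIFS_MOUNT_STRICT_IO)	/* cache only under oplock */
			return brl_ok ? &cifs_file_strict_ops
				      : &cifs_file_strict_nobrl_ops;
		return brl_ok ? &cifs_file_ops : &cifs_file_nobrl_ops;
	}
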
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 897b2b2b28b5..f23206d46531 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,6 +61,7 @@ extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
61 struct dentry *); 61 struct dentry *);
62extern int cifs_revalidate_file(struct file *filp); 62extern int cifs_revalidate_file(struct file *filp);
63extern int cifs_revalidate_dentry(struct dentry *); 63extern int cifs_revalidate_dentry(struct dentry *);
64extern void cifs_invalidate_mapping(struct inode *inode);
64extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 65extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
65extern int cifs_setattr(struct dentry *, struct iattr *); 66extern int cifs_setattr(struct dentry *, struct iattr *);
66 67
@@ -72,19 +73,25 @@ extern const struct inode_operations cifs_dfs_referral_inode_operations;
72/* Functions related to files and directories */ 73/* Functions related to files and directories */
73extern const struct file_operations cifs_file_ops; 74extern const struct file_operations cifs_file_ops;
74extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */ 75extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */
75extern const struct file_operations cifs_file_nobrl_ops; 76extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */
76extern const struct file_operations cifs_file_direct_nobrl_ops; /* no brlocks */ 77extern const struct file_operations cifs_file_nobrl_ops; /* no brlocks */
78extern const struct file_operations cifs_file_direct_nobrl_ops;
79extern const struct file_operations cifs_file_strict_nobrl_ops;
77extern int cifs_open(struct inode *inode, struct file *file); 80extern int cifs_open(struct inode *inode, struct file *file);
78extern int cifs_close(struct inode *inode, struct file *file); 81extern int cifs_close(struct inode *inode, struct file *file);
79extern int cifs_closedir(struct inode *inode, struct file *file); 82extern int cifs_closedir(struct inode *inode, struct file *file);
80extern ssize_t cifs_user_read(struct file *file, char __user *read_data, 83extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
81 size_t read_size, loff_t *poffset); 84 size_t read_size, loff_t *poffset);
85extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
86 unsigned long nr_segs, loff_t pos);
82extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 87extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
83 size_t write_size, loff_t *poffset); 88 size_t write_size, loff_t *poffset);
84extern int cifs_lock(struct file *, int, struct file_lock *); 89extern int cifs_lock(struct file *, int, struct file_lock *);
85extern int cifs_fsync(struct file *, int); 90extern int cifs_fsync(struct file *, int);
91extern int cifs_strict_fsync(struct file *, int);
86extern int cifs_flush(struct file *, fl_owner_t id); 92extern int cifs_flush(struct file *, fl_owner_t id);
87extern int cifs_file_mmap(struct file * , struct vm_area_struct *); 93extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
94extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
88extern const struct file_operations cifs_dir_ops; 95extern const struct file_operations cifs_dir_ops;
89extern int cifs_dir_open(struct inode *inode, struct file *file); 96extern int cifs_dir_open(struct inode *inode, struct file *file);
90extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); 97extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
@@ -93,6 +100,12 @@ extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
93extern const struct dentry_operations cifs_dentry_ops; 100extern const struct dentry_operations cifs_dentry_ops;
94extern const struct dentry_operations cifs_ci_dentry_ops; 101extern const struct dentry_operations cifs_ci_dentry_ops;
95 102
103#ifdef CONFIG_CIFS_DFS_UPCALL
104extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
105#else
106#define cifs_dfs_d_automount NULL
107#endif
108
96/* Functions related to symlinks */ 109/* Functions related to symlinks */
97extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); 110extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
98extern void cifs_put_link(struct dentry *direntry, 111extern void cifs_put_link(struct dentry *direntry,
@@ -112,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
112extern const struct export_operations cifs_export_ops; 125extern const struct export_operations cifs_export_ops;
113#endif /* EXPERIMENTAL */ 126#endif /* EXPERIMENTAL */
114 127
115#define CIFS_VERSION "1.68" 128#define CIFS_VERSION "1.69"
116#endif /* _CIFSFS_H */ 129#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 7136c0c3e2f9..5bfb75346cb0 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -161,35 +161,24 @@ struct TCP_Server_Info {
161 int srv_count; /* reference counter */ 161 int srv_count; /* reference counter */
162 /* 15 character server name + 0x20 16th byte indicating type = srv */ 162 /* 15 character server name + 0x20 16th byte indicating type = srv */
163 char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 163 char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
164 enum statusEnum tcpStatus; /* what we think the status is */
164 char *hostname; /* hostname portion of UNC string */ 165 char *hostname; /* hostname portion of UNC string */
165 struct socket *ssocket; 166 struct socket *ssocket;
166 union { 167 struct sockaddr_storage dstaddr;
167 struct sockaddr_in sockAddr;
168 struct sockaddr_in6 sockAddr6;
169 } addr;
170 struct sockaddr_storage srcaddr; /* locally bind to this IP */ 168 struct sockaddr_storage srcaddr; /* locally bind to this IP */
171 wait_queue_head_t response_q; 169 wait_queue_head_t response_q;
172 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ 170 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
173 struct list_head pending_mid_q; 171 struct list_head pending_mid_q;
174 void *Server_NlsInfo; /* BB - placeholder for future NLS info */
175 unsigned short server_codepage; /* codepage for the server */
176 enum protocolEnum protocolType;
177 char versionMajor;
178 char versionMinor;
179 bool svlocal:1; /* local server or remote */
180 bool noblocksnd; /* use blocking sendmsg */ 172 bool noblocksnd; /* use blocking sendmsg */
181 bool noautotune; /* do not autotune send buf sizes */ 173 bool noautotune; /* do not autotune send buf sizes */
182 bool tcp_nodelay; 174 bool tcp_nodelay;
183 atomic_t inFlight; /* number of requests on the wire to server */ 175 atomic_t inFlight; /* number of requests on the wire to server */
184#ifdef CONFIG_CIFS_STATS2
185 atomic_t inSend; /* requests trying to send */
186 atomic_t num_waiters; /* blocked waiting to get in sendrecv */
187#endif
188 enum statusEnum tcpStatus; /* what we think the status is */
189 struct mutex srv_mutex; 176 struct mutex srv_mutex;
190 struct task_struct *tsk; 177 struct task_struct *tsk;
191 char server_GUID[16]; 178 char server_GUID[16];
192 char secMode; 179 char secMode;
180 bool session_estab; /* mark when very first sess is established */
181 u16 dialect; /* dialect index that server chose */
193 enum securityEnum secType; 182 enum securityEnum secType;
194 unsigned int maxReq; /* Clients should submit no more */ 183 unsigned int maxReq; /* Clients should submit no more */
195 /* than maxReq distinct unanswered SMBs to the server when using */ 184 /* than maxReq distinct unanswered SMBs to the server when using */
@@ -202,28 +191,29 @@ struct TCP_Server_Info {
202 unsigned int max_vcs; /* maximum number of smb sessions, at least 191 unsigned int max_vcs; /* maximum number of smb sessions, at least
203 those that can be specified uniquely with 192 those that can be specified uniquely with
204 vcnumbers */ 193 vcnumbers */
205 char sessid[4]; /* unique token id for this session */
206 /* (returned on Negotiate */
207 int capabilities; /* allow selective disabling of caps by smb sess */ 194 int capabilities; /* allow selective disabling of caps by smb sess */
208 int timeAdj; /* Adjust for difference in server time zone in sec */ 195 int timeAdj; /* Adjust for difference in server time zone in sec */
209 __u16 CurrentMid; /* multiplex id - rotating counter */ 196 __u16 CurrentMid; /* multiplex id - rotating counter */
210 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ 197 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
211 /* 16th byte of RFC1001 workstation name is always null */ 198 /* 16th byte of RFC1001 workstation name is always null */
212 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 199 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
213 __u32 sequence_number; /* needed for CIFS PDU signature */ 200 __u32 sequence_number; /* for signing, protected by srv_mutex */
214 struct session_key session_key; 201 struct session_key session_key;
215 unsigned long lstrp; /* when we got last response from this server */ 202 unsigned long lstrp; /* when we got last response from this server */
216 u16 dialect; /* dialect index that server chose */
217 struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ 203 struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
218 /* extended security flavors that server supports */ 204 /* extended security flavors that server supports */
205 bool sec_ntlmssp; /* supports NTLMSSP */
206 bool sec_kerberosu2u; /* supports U2U Kerberos */
219 bool sec_kerberos; /* supports plain Kerberos */ 207 bool sec_kerberos; /* supports plain Kerberos */
220 bool sec_mskerberos; /* supports legacy MS Kerberos */ 208 bool sec_mskerberos; /* supports legacy MS Kerberos */
221 bool sec_kerberosu2u; /* supports U2U Kerberos */ 209 struct delayed_work echo; /* echo ping workqueue job */
222 bool sec_ntlmssp; /* supports NTLMSSP */
223 bool session_estab; /* mark when very first sess is established */
224#ifdef CONFIG_CIFS_FSCACHE 210#ifdef CONFIG_CIFS_FSCACHE
225 struct fscache_cookie *fscache; /* client index cache cookie */ 211 struct fscache_cookie *fscache; /* client index cache cookie */
226#endif 212#endif
213#ifdef CONFIG_CIFS_STATS2
214 atomic_t inSend; /* requests trying to send */
215 atomic_t num_waiters; /* blocked waiting to get in sendrecv */
216#endif
227}; 217};
228 218
229/* 219/*
@@ -449,13 +439,14 @@ struct cifsInodeInfo {
449 /* BB add in lists for dirty pages i.e. write caching info for oplock */ 439 /* BB add in lists for dirty pages i.e. write caching info for oplock */
450 struct list_head openFileList; 440 struct list_head openFileList;
451 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ 441 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
452 unsigned long time; /* jiffies of last update/check of inode */ 442 bool clientCanCacheRead; /* read oplock */
453 bool clientCanCacheRead:1; /* read oplock */ 443 bool clientCanCacheAll; /* read and writebehind oplock */
454 bool clientCanCacheAll:1; /* read and writebehind oplock */ 444 bool delete_pending; /* DELETE_ON_CLOSE is set */
455 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 445 bool invalid_mapping; /* pagecache is invalid */
456 bool invalid_mapping:1; /* pagecache is invalid */ 446 unsigned long time; /* jiffies of last update of inode */
457 u64 server_eof; /* current file size on server */ 447 u64 server_eof; /* current file size on server */
458 u64 uniqueid; /* server inode number */ 448 u64 uniqueid; /* server inode number */
449 u64 createtime; /* creation time on server */
459#ifdef CONFIG_CIFS_FSCACHE 450#ifdef CONFIG_CIFS_FSCACHE
460 struct fscache_cookie *fscache; 451 struct fscache_cookie *fscache;
461#endif 452#endif
@@ -510,6 +501,18 @@ static inline void cifs_stats_bytes_read(struct cifsTconInfo *tcon,
510 501
511#endif 502#endif
512 503
504struct mid_q_entry;
505
506/*
507 * This is the prototype for the mid callback function. When creating one,
508 * take special care to avoid deadlocks. Things to bear in mind:
509 *
510 * - it will be called by cifsd
511 * - the GlobalMid_Lock will be held
512 * - the mid will be removed from the pending_mid_q list
513 */
514typedef void (mid_callback_t)(struct mid_q_entry *mid);
515
513/* one of these for every pending CIFS request to the server */ 516/* one of these for every pending CIFS request to the server */
514struct mid_q_entry { 517struct mid_q_entry {
515 struct list_head qhead; /* mids waiting on reply from this server */ 518 struct list_head qhead; /* mids waiting on reply from this server */
@@ -521,7 +524,8 @@ struct mid_q_entry {
521 unsigned long when_sent; /* time when smb send finished */ 524 unsigned long when_sent; /* time when smb send finished */
522 unsigned long when_received; /* when demux complete (taken off wire) */ 525 unsigned long when_received; /* when demux complete (taken off wire) */
523#endif 526#endif
524 struct task_struct *tsk; /* task waiting for response */ 527 mid_callback_t *callback; /* call completion callback */
528 void *callback_data; /* general purpose pointer for callback */
525 struct smb_hdr *resp_buf; /* response buffer */ 529 struct smb_hdr *resp_buf; /* response buffer */
526 int midState; /* wish this were enum but can not pass to wait_event */ 530 int midState; /* wish this were enum but can not pass to wait_event */
527 __u8 command; /* smb command code */ 531 __u8 command; /* smb command code */
@@ -576,6 +580,7 @@ struct cifs_fattr {
576 u64 cf_uniqueid; 580 u64 cf_uniqueid;
577 u64 cf_eof; 581 u64 cf_eof;
578 u64 cf_bytes; 582 u64 cf_bytes;
583 u64 cf_createtime;
579 uid_t cf_uid; 584 uid_t cf_uid;
580 gid_t cf_gid; 585 gid_t cf_gid;
581 umode_t cf_mode; 586 umode_t cf_mode;
@@ -623,12 +628,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
623#define CIFS_IOVEC 4 /* array of response buffers */ 628#define CIFS_IOVEC 4 /* array of response buffers */
624 629
625/* Type of Request to SendReceive2 */ 630/* Type of Request to SendReceive2 */
626#define CIFS_STD_OP 0 /* normal request timeout */ 631#define CIFS_BLOCKING_OP 1 /* operation can block */
627#define CIFS_LONG_OP 1 /* long op (up to 45 sec, oplock time) */ 632#define CIFS_ASYNC_OP 2 /* do not wait for response */
628#define CIFS_VLONG_OP 2 /* sloow op - can take up to 180 seconds */ 633#define CIFS_TIMEOUT_MASK 0x003 /* only one of above set in req */
629#define CIFS_BLOCKING_OP 4 /* operation can block */
630#define CIFS_ASYNC_OP 8 /* do not wait for response */
631#define CIFS_TIMEOUT_MASK 0x00F /* only one of 5 above set in req */
632#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */ 634#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */
633#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */ 635#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */
634#define CIFS_NO_RESP 0x040 /* no response buffer required */ 636#define CIFS_NO_RESP 0x040 /* no response buffer required */
@@ -791,6 +793,9 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
791GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ 793GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
792GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ 794GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
793 795
796/* reconnect after this many failed echo attempts */
797GLOBAL_EXTERN unsigned short echo_retries;
798
794void cifs_oplock_break(struct work_struct *work); 799void cifs_oplock_break(struct work_struct *work);
795void cifs_oplock_break_get(struct cifsFileInfo *cfile); 800void cifs_oplock_break_get(struct cifsFileInfo *cfile);
796void cifs_oplock_break_put(struct cifsFileInfo *cfile); 801void cifs_oplock_break_put(struct cifsFileInfo *cfile);
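
The mid_callback_t machinery added above is what the echo code later in this patch builds on. A hedged sketch of the submit/complete split (names are illustrative; compare cifs_echo_callback() in cifssmb.c below):

	/* Runs in cifsd context under GlobalMid_Lock, per the comment above:
	 * no sleeping, no retaking GlobalMid_Lock, mid already off the queue. */
	static void example_mid_callback(struct mid_q_entry *mid)
	{
		struct TCP_Server_Info *server = mid->callback_data;

		DeleteMidQEntry(mid);		/* response consumed; free the mid */
		atomic_dec(&server->inFlight);	/* give the request slot back */
		wake_up(&server->request_q);	/* unblock a waiting sender */
	}

	static int example_submit(struct TCP_Server_Info *server,
				  struct smb_hdr *buf)
	{
		/* cifsd invokes example_mid_callback when the reply arrives */
		return cifs_call_async(server, buf, example_mid_callback, server);
	}
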
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index de36b09763a8..b5c8cc5d7a7f 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -23,6 +23,7 @@
23#define _CIFSPDU_H 23#define _CIFSPDU_H
24 24
25#include <net/sock.h> 25#include <net/sock.h>
26#include <asm/unaligned.h>
26#include "smbfsctl.h" 27#include "smbfsctl.h"
27 28
28#ifdef CONFIG_CIFS_WEAK_PW_HASH 29#ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -50,6 +51,7 @@
50#define SMB_COM_SETATTR 0x09 /* trivial response */ 51#define SMB_COM_SETATTR 0x09 /* trivial response */
51#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */ 52#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */
52#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/ 53#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/
54#define SMB_COM_ECHO 0x2B /* echo request */
53#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */ 55#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */
54#define SMB_COM_READ_ANDX 0x2E 56#define SMB_COM_READ_ANDX 0x2E
55#define SMB_COM_WRITE_ANDX 0x2F 57#define SMB_COM_WRITE_ANDX 0x2F
@@ -425,11 +427,49 @@ struct smb_hdr {
425 __u16 Mid; 427 __u16 Mid;
426 __u8 WordCount; 428 __u8 WordCount;
427} __attribute__((packed)); 429} __attribute__((packed));
428/* given a pointer to an smb_hdr retrieve the value of byte count */ 430
429#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) 431/* given a pointer to an smb_hdr retrieve a char pointer to the byte count */
430#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) 432#define BCC(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + \
433 (2 * (smb_var)->WordCount))
434
431/* given a pointer to an smb_hdr retrieve the pointer to the byte area */ 435/* given a pointer to an smb_hdr retrieve the pointer to the byte area */
432#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2) 436#define pByteArea(smb_var) (BCC(smb_var) + 2)
437
438/* get the converted ByteCount for a SMB packet and return it */
439static inline __u16
440get_bcc(struct smb_hdr *hdr)
441{
442 __u16 *bc_ptr = (__u16 *)BCC(hdr);
443
444 return get_unaligned(bc_ptr);
445}
446
447/* get the unconverted ByteCount for a SMB packet and return it */
448static inline __u16
449get_bcc_le(struct smb_hdr *hdr)
450{
451 __le16 *bc_ptr = (__le16 *)BCC(hdr);
452
453 return get_unaligned_le16(bc_ptr);
454}
455
456/* set the ByteCount for a SMB packet in host-byte order */
457static inline void
458put_bcc(__u16 count, struct smb_hdr *hdr)
459{
460 __u16 *bc_ptr = (__u16 *)BCC(hdr);
461
462 put_unaligned(count, bc_ptr);
463}
464
465/* set the ByteCount for a SMB packet in little-endian */
466static inline void
467put_bcc_le(__u16 count, struct smb_hdr *hdr)
468{
469 __le16 *bc_ptr = (__le16 *)BCC(hdr);
470
471 put_unaligned_le16(count, bc_ptr);
472}
433 473
434/* 474/*
435 * Computer Name Length (since Netbios name was length 16 with last byte 0x20) 475 * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
@@ -760,6 +800,20 @@ typedef struct smb_com_tconx_rsp_ext {
760 * 800 *
761 */ 801 */
762 802
803typedef struct smb_com_echo_req {
804 struct smb_hdr hdr;
805 __le16 EchoCount;
806 __le16 ByteCount;
807 char Data[1];
808} __attribute__((packed)) ECHO_REQ;
809
810typedef struct smb_com_echo_rsp {
811 struct smb_hdr hdr;
812 __le16 SequenceNumber;
813 __le16 ByteCount;
814 char Data[1];
815} __attribute__((packed)) ECHO_RSP;
816
763typedef struct smb_com_logoff_andx_req { 817typedef struct smb_com_logoff_andx_req {
764 struct smb_hdr hdr; /* wct = 2 */ 818 struct smb_hdr hdr; /* wct = 2 */
765 __u8 AndXCommand; 819 __u8 AndXCommand;
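
The BCC() rework above turns the old lvalue macros into pointer arithmetic plus get_unaligned()/put_unaligned() accessors: the byte count sits right after a variable-length word block, so nothing guarantees it is 16-bit aligned. A hedged usage sketch of the conversion step the helpers enable (the fixup helper is illustrative; historically the receive path converted the wire-order BCC to host order in place):

	/* Sketch only: read the little-endian wire value, store it back in
	 * host order so later code can use get_bcc() everywhere. */
	static void example_fixup_bcc(struct smb_hdr *hdr)
	{
		__u16 bcc = get_bcc_le(hdr);	/* unconverted, off the wire */

		put_bcc(bcc, hdr);		/* host byte order from here on */
	}
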
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index e6d1481b16c1..982895fa7615 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -61,6 +61,12 @@ extern char *cifs_compose_mount_options(const char *sb_mountdata,
61 const char *fullpath, const struct dfs_info3_param *ref, 61 const char *fullpath, const struct dfs_info3_param *ref,
62 char **devname); 62 char **devname);
63/* extern void renew_parental_timestamps(struct dentry *direntry);*/ 63/* extern void renew_parental_timestamps(struct dentry *direntry);*/
64extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
65 struct TCP_Server_Info *server);
66extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
67extern int cifs_call_async(struct TCP_Server_Info *server,
68 struct smb_hdr *in_buf, mid_callback_t *callback,
69 void *cbdata);
64extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *, 70extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
65 struct smb_hdr * /* input */ , 71 struct smb_hdr * /* input */ ,
66 struct smb_hdr * /* out */ , 72 struct smb_hdr * /* out */ ,
@@ -347,12 +353,13 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
347 const __u16 netfid, const __u64 len, 353 const __u16 netfid, const __u64 len,
348 const __u64 offset, const __u32 numUnlock, 354 const __u64 offset, const __u32 numUnlock,
349 const __u32 numLock, const __u8 lockType, 355 const __u32 numLock, const __u8 lockType,
350 const bool waitFlag); 356 const bool waitFlag, const __u8 oplock_level);
351extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, 357extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
352 const __u16 smb_file_id, const int get_flag, 358 const __u16 smb_file_id, const int get_flag,
353 const __u64 len, struct file_lock *, 359 const __u64 len, struct file_lock *,
354 const __u16 lock_type, const bool waitFlag); 360 const __u16 lock_type, const bool waitFlag);
355extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon); 361extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon);
362extern int CIFSSMBEcho(struct TCP_Server_Info *server);
356extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses); 363extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses);
357 364
358extern struct cifsSesInfo *sesInfoAlloc(void); 365extern struct cifsSesInfo *sesInfoAlloc(void);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 67acfb3acad2..3106f5e5c633 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -331,37 +331,35 @@ smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
331 331
332static int validate_t2(struct smb_t2_rsp *pSMB) 332static int validate_t2(struct smb_t2_rsp *pSMB)
333{ 333{
334 int rc = -EINVAL; 334 unsigned int total_size;
335 int total_size; 335
336 char *pBCC; 336 /* check for plausible wct */
337 if (pSMB->hdr.WordCount < 10)
338 goto vt2_err;
337 339
338 /* check for plausible wct, bcc and t2 data and parm sizes */
339 /* check for parm and data offset going beyond end of smb */ 340 /* check for parm and data offset going beyond end of smb */
340 if (pSMB->hdr.WordCount >= 10) { 341 if (get_unaligned_le16(&pSMB->t2_rsp.ParameterOffset) > 1024 ||
341 if ((le16_to_cpu(pSMB->t2_rsp.ParameterOffset) <= 1024) && 342 get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024)
342 (le16_to_cpu(pSMB->t2_rsp.DataOffset) <= 1024)) { 343 goto vt2_err;
343 /* check that bcc is at least as big as parms + data */ 344
344 /* check that bcc is less than negotiated smb buffer */ 345 /* check that bcc is at least as big as parms + data */
345 total_size = le16_to_cpu(pSMB->t2_rsp.ParameterCount); 346 /* check that bcc is less than negotiated smb buffer */
346 if (total_size < 512) { 347 total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount);
347 total_size += 348 if (total_size >= 512)
348 le16_to_cpu(pSMB->t2_rsp.DataCount); 349 goto vt2_err;
349 /* BCC le converted in SendReceive */ 350
350 pBCC = (pSMB->hdr.WordCount * 2) + 351 total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount);
351 sizeof(struct smb_hdr) + 352 if (total_size > get_bcc(&pSMB->hdr) ||
352 (char *)pSMB; 353 total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)
353 if ((total_size <= (*(u16 *)pBCC)) && 354 goto vt2_err;
354 (total_size < 355
355 CIFSMaxBufSize+MAX_CIFS_HDR_SIZE)) { 356 return 0;
356 return 0; 357vt2_err:
357 }
358 }
359 }
360 }
361 cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB, 358 cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB,
362 sizeof(struct smb_t2_rsp) + 16); 359 sizeof(struct smb_t2_rsp) + 16);
363 return rc; 360 return -EINVAL;
364} 361}
362
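
The validate_t2() rewrite above flattens three levels of nesting into early-exit gotos and reads every 16-bit field through get_unaligned_le16(). A userspace model of the resulting size logic, with illustrative parameters:

	#include <stdint.h>

	static int example_t2_sizes_ok(uint16_t parm_off, uint16_t data_off,
				       uint16_t parm_count, uint16_t data_count,
				       uint16_t bcc, unsigned int max_buf)
	{
		unsigned int total;

		if (parm_off > 1024 || data_off > 1024)
			return 0;	/* offsets beyond any sane SMB */
		total = parm_count;
		if (total >= 512)
			return 0;	/* implausibly large parm block */
		total += data_count;
		if (total > bcc || total >= max_buf)
			return 0;	/* overruns byte area or buffer */
		return 1;
	}
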
365int 363int
366CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) 364CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
367{ 365{
@@ -401,15 +399,12 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
401 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { 399 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
402 cFYI(1, "Kerberos only mechanism, enable extended security"); 400 cFYI(1, "Kerberos only mechanism, enable extended security");
403 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 401 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
404 } 402 } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
405#ifdef CONFIG_CIFS_EXPERIMENTAL
406 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
407 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 403 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
408 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { 404 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
409 cFYI(1, "NTLMSSP only mechanism, enable extended security"); 405 cFYI(1, "NTLMSSP only mechanism, enable extended security");
410 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 406 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
411 } 407 }
412#endif
413 408
414 count = 0; 409 count = 0;
415 for (i = 0; i < CIFS_NUM_PROT; i++) { 410 for (i = 0; i < CIFS_NUM_PROT; i++) {
@@ -455,7 +450,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
455 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), 450 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
456 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 451 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
457 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); 452 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
458 GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
459 /* even though we do not use raw we might as well set this 453 /* even though we do not use raw we might as well set this
460 accurately, in case we ever find a need for it */ 454 accurately, in case we ever find a need for it */
461 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { 455 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
@@ -569,7 +563,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
569 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 563 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
570 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); 564 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
571 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); 565 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
572 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
573 server->capabilities = le32_to_cpu(pSMBr->Capabilities); 566 server->capabilities = le32_to_cpu(pSMBr->Capabilities);
574 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); 567 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
575 server->timeAdj *= 60; 568 server->timeAdj *= 60;
@@ -709,6 +702,53 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
709 return rc; 702 return rc;
710} 703}
711 704
705/*
706 * This is a no-op for now. We're not really interested in the reply, but
707 * rather in the fact that the server sent one and that server->lstrp
708 * gets updated.
709 *
710 * FIXME: consider checking that the reply matches the request?
711 */
712static void
713cifs_echo_callback(struct mid_q_entry *mid)
714{
715 struct TCP_Server_Info *server = mid->callback_data;
716
717 DeleteMidQEntry(mid);
718 atomic_dec(&server->inFlight);
719 wake_up(&server->request_q);
720}
721
722int
723CIFSSMBEcho(struct TCP_Server_Info *server)
724{
725 ECHO_REQ *smb;
726 int rc = 0;
727
728 cFYI(1, "In echo request");
729
730 rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb);
731 if (rc)
732 return rc;
733
734 /* set up echo request */
735 smb->hdr.Tid = cpu_to_le16(0xffff);
736 smb->hdr.WordCount = 1;
737 put_unaligned_le16(1, &smb->EchoCount);
738 put_bcc_le(1, &smb->hdr);
739 smb->Data[0] = 'a';
740 smb->hdr.smb_buf_length += 3;
741
742 rc = cifs_call_async(server, (struct smb_hdr *)smb,
743 cifs_echo_callback, server);
744 if (rc)
745 cFYI(1, "Echo request failed: %d", rc);
746
747 cifs_small_buf_release(smb);
748
749 return rc;
750}
751
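
The echo is fire-and-forget: cifs_call_async() queues the MID and returns, and cifs_echo_callback() later frees the entry and releases the in-flight slot so blocked senders can proceed. A minimal user-space sketch of that slot-accounting pattern (the mutex, counter and condition variable below are stand-ins for the kernel's atomic inFlight counter and request_q wait queue, not CIFS code):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t request_q = PTHREAD_COND_INITIALIZER;
static int in_flight;

/* what cifs_echo_callback() does: release the slot, wake waiters */
static void request_done(void)
{
    pthread_mutex_lock(&lock);
    in_flight--;
    pthread_cond_broadcast(&request_q);
    pthread_mutex_unlock(&lock);
}

/* senders block until a slot frees up, roughly what the transport's
 * wait-for-free-request path does before issuing a new SMB */
static void take_slot(int max_in_flight)
{
    pthread_mutex_lock(&lock);
    while (in_flight >= max_in_flight)
        pthread_cond_wait(&request_q, &lock);
    in_flight++;
    pthread_mutex_unlock(&lock);
}
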
712int 752int
713CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses) 753CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
714{ 754{
@@ -1196,7 +1236,7 @@ OldOpenRetry:
1196 pSMB->ByteCount = cpu_to_le16(count); 1236 pSMB->ByteCount = cpu_to_le16(count);
1197 /* long_op set to 1 to allow for oplock break timeouts */ 1237 /* long_op set to 1 to allow for oplock break timeouts */
1198 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1238 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1199 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1239 (struct smb_hdr *)pSMBr, &bytes_returned, 0);
1200 cifs_stats_inc(&tcon->num_opens); 1240 cifs_stats_inc(&tcon->num_opens);
1201 if (rc) { 1241 if (rc) {
1202 cFYI(1, "Error in Open = %d", rc); 1242 cFYI(1, "Error in Open = %d", rc);
@@ -1309,7 +1349,7 @@ openRetry:
1309 pSMB->ByteCount = cpu_to_le16(count); 1349 pSMB->ByteCount = cpu_to_le16(count);
1310 /* long_op set to 1 to allow for oplock break timeouts */ 1350 /* long_op set to 1 to allow for oplock break timeouts */
1311 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1351 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1312 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1352 (struct smb_hdr *)pSMBr, &bytes_returned, 0);
1313 cifs_stats_inc(&tcon->num_opens); 1353 cifs_stats_inc(&tcon->num_opens);
1314 if (rc) { 1354 if (rc) {
1315 cFYI(1, "Error in Open = %d", rc); 1355 cFYI(1, "Error in Open = %d", rc);
@@ -1391,7 +1431,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1391 iov[0].iov_base = (char *)pSMB; 1431 iov[0].iov_base = (char *)pSMB;
1392 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; 1432 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
1393 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, 1433 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
1394 &resp_buf_type, CIFS_STD_OP | CIFS_LOG_ERROR); 1434 &resp_buf_type, CIFS_LOG_ERROR);
1395 cifs_stats_inc(&tcon->num_reads); 1435 cifs_stats_inc(&tcon->num_reads);
1396 pSMBr = (READ_RSP *)iov[0].iov_base; 1436 pSMBr = (READ_RSP *)iov[0].iov_base;
1397 if (rc) { 1437 if (rc) {
@@ -1666,7 +1706,8 @@ int
1666CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, 1706CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1667 const __u16 smb_file_id, const __u64 len, 1707 const __u16 smb_file_id, const __u64 len,
1668 const __u64 offset, const __u32 numUnlock, 1708 const __u64 offset, const __u32 numUnlock,
1669 const __u32 numLock, const __u8 lockType, const bool waitFlag) 1709 const __u32 numLock, const __u8 lockType,
1710 const bool waitFlag, const __u8 oplock_level)
1670{ 1711{
1671 int rc = 0; 1712 int rc = 0;
1672 LOCK_REQ *pSMB = NULL; 1713 LOCK_REQ *pSMB = NULL;
@@ -1694,6 +1735,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1694 pSMB->NumberOfLocks = cpu_to_le16(numLock); 1735 pSMB->NumberOfLocks = cpu_to_le16(numLock);
1695 pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock); 1736 pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock);
1696 pSMB->LockType = lockType; 1737 pSMB->LockType = lockType;
1738 pSMB->OplockLevel = oplock_level;
1697 pSMB->AndXCommand = 0xFF; /* none */ 1739 pSMB->AndXCommand = 0xFF; /* none */
1698 pSMB->Fid = smb_file_id; /* netfid stays le */ 1740 pSMB->Fid = smb_file_id; /* netfid stays le */
1699 1741
@@ -3090,7 +3132,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3090 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; 3132 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
3091 3133
3092 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type, 3134 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
3093 CIFS_STD_OP); 3135 0);
3094 cifs_stats_inc(&tcon->num_acl_get); 3136 cifs_stats_inc(&tcon->num_acl_get);
3095 if (rc) { 3137 if (rc) {
3096 cFYI(1, "Send error in QuerySecDesc = %d", rc); 3138 cFYI(1, "Send error in QuerySecDesc = %d", rc);
@@ -5565,7 +5607,7 @@ QAllEAsRetry:
5565 } 5607 }
5566 5608
5567 /* make sure list_len doesn't go past end of SMB */ 5609 /* make sure list_len doesn't go past end of SMB */
5568 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr); 5610 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr);
5569 if ((char *)ea_response_data + list_len > end_of_smb) { 5611 if ((char *)ea_response_data + list_len > end_of_smb) {
5570 cFYI(1, "EA list appears to go beyond SMB"); 5612 cFYI(1, "EA list appears to go beyond SMB");
5571 rc = -EIO; 5613 rc = -EIO;
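
The last hunk above is a defensive bounds check: list_len comes off the wire, so it must be validated against the end of the received SMB before being walked, with get_bcc() supplying the byte count. A generic user-space sketch of the same idiom (check_field() and its names are illustrative, not a CIFS helper):

#include <stddef.h>

/* reject any wire-supplied length that would run past the end of
 * the received buffer (cf. the -EIO return above) */
static int check_field(const char *buf, size_t buf_len,
                       const char *field, size_t field_len)
{
    const char *end_of_buf = buf + buf_len;

    if (field < buf || field > end_of_buf)
        return -1;              /* field pointer out of range */
    if (field_len > (size_t)(end_of_buf - field))
        return -1;              /* length overruns the buffer */
    return 0;
}
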
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index cc1a8604a790..18d3c7724d6e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -52,6 +52,9 @@
52#define CIFS_PORT 445 52#define CIFS_PORT 445
53#define RFC1001_PORT 139 53#define RFC1001_PORT 139
54 54
55/* SMB echo "timeout" -- FIXME: tunable? */
56#define SMB_ECHO_INTERVAL (60 * HZ)
57
55extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, 58extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
56 unsigned char *p24); 59 unsigned char *p24);
57 60
@@ -64,8 +67,8 @@ struct smb_vol {
64 char *UNC; 67 char *UNC;
65 char *UNCip; 68 char *UNCip;
66 char *iocharset; /* local code page for mapping to and from Unicode */ 69 char *iocharset; /* local code page for mapping to and from Unicode */
67 char source_rfc1001_name[16]; /* netbios name of client */ 70 char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
68 char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ 71 char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
69 uid_t cred_uid; 72 uid_t cred_uid;
70 uid_t linux_uid; 73 uid_t linux_uid;
71 gid_t linux_gid; 74 gid_t linux_gid;
@@ -115,8 +118,8 @@ struct smb_vol {
115#define TLINK_ERROR_EXPIRE (1 * HZ) 118#define TLINK_ERROR_EXPIRE (1 * HZ)
116#define TLINK_IDLE_EXPIRE (600 * HZ) 119#define TLINK_IDLE_EXPIRE (600 * HZ)
117 120
118static int ipv4_connect(struct TCP_Server_Info *server); 121static int ip_connect(struct TCP_Server_Info *server);
119static int ipv6_connect(struct TCP_Server_Info *server); 122static int generic_ip_connect(struct TCP_Server_Info *server);
120static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); 123static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
121static void cifs_prune_tlinks(struct work_struct *work); 124static void cifs_prune_tlinks(struct work_struct *work);
122 125
@@ -152,6 +155,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
152 155
153 /* before reconnecting the tcp session, mark the smb session (uid) 156 /* before reconnecting the tcp session, mark the smb session (uid)
154 and the tid bad so they are not used until reconnected */ 157 and the tid bad so they are not used until reconnected */
158 cFYI(1, "%s: marking sessions and tcons for reconnect", __func__);
155 spin_lock(&cifs_tcp_ses_lock); 159 spin_lock(&cifs_tcp_ses_lock);
156 list_for_each(tmp, &server->smb_ses_list) { 160 list_for_each(tmp, &server->smb_ses_list) {
157 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 161 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
@@ -163,7 +167,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
163 } 167 }
164 } 168 }
165 spin_unlock(&cifs_tcp_ses_lock); 169 spin_unlock(&cifs_tcp_ses_lock);
170
166 /* do not want to be sending data on a socket we are freeing */ 171 /* do not want to be sending data on a socket we are freeing */
172 cFYI(1, "%s: tearing down socket", __func__);
167 mutex_lock(&server->srv_mutex); 173 mutex_lock(&server->srv_mutex);
168 if (server->ssocket) { 174 if (server->ssocket) {
169 cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state, 175 cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
@@ -180,30 +186,27 @@ cifs_reconnect(struct TCP_Server_Info *server)
180 kfree(server->session_key.response); 186 kfree(server->session_key.response);
181 server->session_key.response = NULL; 187 server->session_key.response = NULL;
182 server->session_key.len = 0; 188 server->session_key.len = 0;
189 server->lstrp = jiffies;
190 mutex_unlock(&server->srv_mutex);
183 191
192 /* mark submitted MIDs for retry and issue callback */
193 cFYI(1, "%s: issuing mid callbacks", __func__);
184 spin_lock(&GlobalMid_Lock); 194 spin_lock(&GlobalMid_Lock);
185 list_for_each(tmp, &server->pending_mid_q) { 195 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
186 mid_entry = list_entry(tmp, struct 196 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
187 mid_q_entry, 197 if (mid_entry->midState == MID_REQUEST_SUBMITTED)
188 qhead);
189 if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
190 /* Mark other intransit requests as needing
191 retry so we do not immediately mark the
192 session bad again (ie after we reconnect
193 below) as they timeout too */
194 mid_entry->midState = MID_RETRY_NEEDED; 198 mid_entry->midState = MID_RETRY_NEEDED;
195 } 199 list_del_init(&mid_entry->qhead);
200 mid_entry->callback(mid_entry);
196 } 201 }
197 spin_unlock(&GlobalMid_Lock); 202 spin_unlock(&GlobalMid_Lock);
198 mutex_unlock(&server->srv_mutex);
199 203
200 while ((server->tcpStatus != CifsExiting) && 204 while ((server->tcpStatus != CifsExiting) &&
201 (server->tcpStatus != CifsGood)) { 205 (server->tcpStatus != CifsGood)) {
202 try_to_freeze(); 206 try_to_freeze();
203 if (server->addr.sockAddr6.sin6_family == AF_INET6) 207
204 rc = ipv6_connect(server); 208 /* we should try only the port we connected to before */
205 else 209 rc = generic_ip_connect(server);
206 rc = ipv4_connect(server);
207 if (rc) { 210 if (rc) {
208 cFYI(1, "reconnect error %d", rc); 211 cFYI(1, "reconnect error %d", rc);
209 msleep(3000); 212 msleep(3000);
@@ -213,10 +216,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
213 if (server->tcpStatus != CifsExiting) 216 if (server->tcpStatus != CifsExiting)
214 server->tcpStatus = CifsGood; 217 server->tcpStatus = CifsGood;
215 spin_unlock(&GlobalMid_Lock); 218 spin_unlock(&GlobalMid_Lock);
216 /* atomic_set(&server->inFlight,0);*/
217 wake_up(&server->response_q);
218 } 219 }
219 } 220 }
221
220 return rc; 222 return rc;
221} 223}
222 224
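
cifs_reconnect() now walks pending_mid_q with list_for_each_safe() because the loop body unlinks the current entry with list_del_init() before invoking its callback. The plain list_for_each() would read the next pointer after the entry has been unlinked (and possibly freed); the _safe variant caches the successor first. Its definition in linux/list.h is essentially:

struct list_head {
    struct list_head *next, *prev;
};

/* tmp2 holds the successor before the body runs, so the body may
 * safely delete (or free) the node that tmp points at */
#define list_for_each_safe(tmp, tmp2, head) \
    for (tmp = (head)->next, tmp2 = tmp->next; \
         tmp != (head); \
         tmp = tmp2, tmp2 = tmp->next)
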
@@ -230,9 +232,8 @@ cifs_reconnect(struct TCP_Server_Info *server)
230static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) 232static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
231{ 233{
232 struct smb_t2_rsp *pSMBt; 234 struct smb_t2_rsp *pSMBt;
233 int total_data_size;
234 int data_in_this_rsp;
235 int remaining; 235 int remaining;
236 __u16 total_data_size, data_in_this_rsp;
236 237
237 if (pSMB->Command != SMB_COM_TRANSACTION2) 238 if (pSMB->Command != SMB_COM_TRANSACTION2)
238 return 0; 239 return 0;
@@ -246,8 +247,8 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
246 247
247 pSMBt = (struct smb_t2_rsp *)pSMB; 248 pSMBt = (struct smb_t2_rsp *)pSMB;
248 249
249 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); 250 total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
250 data_in_this_rsp = le16_to_cpu(pSMBt->t2_rsp.DataCount); 251 data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
251 252
252 remaining = total_data_size - data_in_this_rsp; 253 remaining = total_data_size - data_in_this_rsp;
253 254
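
This hunk and coalesce_t2() below switch from le16_to_cpu() on a struct member to get_unaligned_le16() on its address: the transaction2 response fields sit at arbitrary offsets inside the SMB buffer, so 16-bit loads through the struct may be misaligned and can fault on strict-alignment architectures. A portable sketch of what the unaligned accessors do, byte by byte:

#include <stdint.h>

/* assemble/store 16-bit little-endian values bytewise so that no
 * aligned 16-bit load or store is ever issued */
static uint16_t get_unaligned_le16(const void *p)
{
    const uint8_t *b = p;

    return (uint16_t)(b[0] | (b[1] << 8));
}

static void put_unaligned_le16(uint16_t v, void *p)
{
    uint8_t *b = p;

    b[0] = v & 0xff;
    b[1] = v >> 8;
}
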
@@ -273,21 +274,18 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
273{ 274{
274 struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond; 275 struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond;
275 struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB; 276 struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB;
276 int total_data_size;
277 int total_in_buf;
278 int remaining;
279 int total_in_buf2;
280 char *data_area_of_target; 277 char *data_area_of_target;
281 char *data_area_of_buf2; 278 char *data_area_of_buf2;
282 __u16 byte_count; 279 int remaining;
280 __u16 byte_count, total_data_size, total_in_buf, total_in_buf2;
283 281
284 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); 282 total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
285 283
286 if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) { 284 if (total_data_size !=
285 get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount))
287 cFYI(1, "total data size of primary and secondary t2 differ"); 286 cFYI(1, "total data size of primary and secondary t2 differ");
288 }
289 287
290 total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount); 288 total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
291 289
292 remaining = total_data_size - total_in_buf; 290 remaining = total_data_size - total_in_buf;
293 291
@@ -297,28 +295,28 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
297 if (remaining == 0) /* nothing to do, ignore */ 295 if (remaining == 0) /* nothing to do, ignore */
298 return 0; 296 return 0;
299 297
300 total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount); 298 total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount);
301 if (remaining < total_in_buf2) { 299 if (remaining < total_in_buf2) {
302 cFYI(1, "transact2 2nd response contains too much data"); 300 cFYI(1, "transact2 2nd response contains too much data");
303 } 301 }
304 302
305 /* find end of first SMB data area */ 303 /* find end of first SMB data area */
306 data_area_of_target = (char *)&pSMBt->hdr.Protocol + 304 data_area_of_target = (char *)&pSMBt->hdr.Protocol +
307 le16_to_cpu(pSMBt->t2_rsp.DataOffset); 305 get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
308 /* validate target area */ 306 /* validate target area */
309 307
310 data_area_of_buf2 = (char *) &pSMB2->hdr.Protocol + 308 data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol +
311 le16_to_cpu(pSMB2->t2_rsp.DataOffset); 309 get_unaligned_le16(&pSMB2->t2_rsp.DataOffset);
312 310
313 data_area_of_target += total_in_buf; 311 data_area_of_target += total_in_buf;
314 312
315 /* copy second buffer into end of first buffer */ 313 /* copy second buffer into end of first buffer */
316 memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2); 314 memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
317 total_in_buf += total_in_buf2; 315 total_in_buf += total_in_buf2;
318 pSMBt->t2_rsp.DataCount = cpu_to_le16(total_in_buf); 316 put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
319 byte_count = le16_to_cpu(BCC_LE(pTargetSMB)); 317 byte_count = get_bcc_le(pTargetSMB);
320 byte_count += total_in_buf2; 318 byte_count += total_in_buf2;
321 BCC_LE(pTargetSMB) = cpu_to_le16(byte_count); 319 put_bcc_le(byte_count, pTargetSMB);
322 320
323 byte_count = pTargetSMB->smb_buf_length; 321 byte_count = pTargetSMB->smb_buf_length;
324 byte_count += total_in_buf2; 322 byte_count += total_in_buf2;
@@ -332,7 +330,26 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
332 return 0; /* we are done */ 330 return 0; /* we are done */
333 } else /* more responses to go */ 331 } else /* more responses to go */
334 return 1; 332 return 1;
333}
335 334
335static void
336cifs_echo_request(struct work_struct *work)
337{
338 int rc;
339 struct TCP_Server_Info *server = container_of(work,
340 struct TCP_Server_Info, echo.work);
341
342 /* no need to ping if we got a response recently */
343 if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
344 goto requeue_echo;
345
346 rc = CIFSSMBEcho(server);
347 if (rc)
348 cFYI(1, "Unable to send echo request to server: %s",
349 server->hostname);
350
351requeue_echo:
352 queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL);
336} 353}
337 354
338static int 355static int
@@ -346,8 +363,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
346 struct msghdr smb_msg; 363 struct msghdr smb_msg;
347 struct kvec iov; 364 struct kvec iov;
348 struct socket *csocket = server->ssocket; 365 struct socket *csocket = server->ssocket;
349 struct list_head *tmp; 366 struct list_head *tmp, *tmp2;
350 struct cifsSesInfo *ses;
351 struct task_struct *task_to_wake = NULL; 367 struct task_struct *task_to_wake = NULL;
352 struct mid_q_entry *mid_entry; 368 struct mid_q_entry *mid_entry;
353 char temp; 369 char temp;
@@ -400,7 +416,20 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
400 smb_msg.msg_control = NULL; 416 smb_msg.msg_control = NULL;
401 smb_msg.msg_controllen = 0; 417 smb_msg.msg_controllen = 0;
402 pdu_length = 4; /* enough to get RFC1001 header */ 418 pdu_length = 4; /* enough to get RFC1001 header */
419
403incomplete_rcv: 420incomplete_rcv:
421 if (echo_retries > 0 &&
422 time_after(jiffies, server->lstrp +
423 (echo_retries * SMB_ECHO_INTERVAL))) {
424 cERROR(1, "Server %s has not responded in %d seconds. "
425 "Reconnecting...", server->hostname,
426 (echo_retries * SMB_ECHO_INTERVAL / HZ));
427 cifs_reconnect(server);
428 csocket = server->ssocket;
429 wake_up(&server->response_q);
430 continue;
431 }
432
404 length = 433 length =
405 kernel_recvmsg(csocket, &smb_msg, 434 kernel_recvmsg(csocket, &smb_msg,
406 &iov, 1, pdu_length, 0 /* BB other flags? */); 435 &iov, 1, pdu_length, 0 /* BB other flags? */);
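
Together, cifs_echo_request() above and this receive-loop check form a keepalive: the ping is skipped while the server has answered within the last interval, and the session is torn down and reconnected once echo_retries intervals pass with no response. The jiffies comparisons stay correct across counter wraparound because they reduce to a signed subtraction. A self-contained model (HZ, the constants and need_echo()/server_stale() are stand-ins, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define HZ 1000                         /* stand-in tick rate */
#define SMB_ECHO_INTERVAL (60 * HZ)

/* wraparound-safe jiffies comparisons, as in linux/jiffies.h */
static bool time_before(unsigned long a, unsigned long b)
{
    return (long)(a - b) < 0;
}

static bool time_after(unsigned long a, unsigned long b)
{
    return (long)(b - a) < 0;
}

/* ping only if the server has been quiet for a full interval,
 * less one tick of scheduling slack (cf. cifs_echo_request) */
static bool need_echo(unsigned long now, unsigned long lstrp)
{
    return !time_before(now, lstrp + SMB_ECHO_INTERVAL - HZ);
}

/* declare the server dead after echo_retries silent intervals */
static bool server_stale(unsigned long now, unsigned long lstrp,
                         int echo_retries)
{
    return echo_retries > 0 &&
           time_after(now, lstrp + echo_retries * SMB_ECHO_INTERVAL);
}

int main(void)
{
    printf("%d\n", need_echo(58 * HZ, 0));        /* 0: answered recently */
    printf("%d\n", need_echo(61 * HZ, 0));        /* 1: interval elapsed */
    printf("%d\n", server_stale(301 * HZ, 0, 5)); /* 1: five intervals idle */
    return 0;
}
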
@@ -477,7 +506,7 @@ incomplete_rcv:
477 * initialize frame) 506 * initialize frame)
478 */ 507 */
479 cifs_set_port((struct sockaddr *) 508 cifs_set_port((struct sockaddr *)
480 &server->addr.sockAddr, CIFS_PORT); 509 &server->dstaddr, CIFS_PORT);
481 cifs_reconnect(server); 510 cifs_reconnect(server);
482 csocket = server->ssocket; 511 csocket = server->ssocket;
483 wake_up(&server->response_q); 512 wake_up(&server->response_q);
@@ -560,10 +589,11 @@ incomplete_rcv:
560 continue; 589 continue;
561 } 590 }
562 591
592 mid_entry = NULL;
593 server->lstrp = jiffies;
563 594
564 task_to_wake = NULL;
565 spin_lock(&GlobalMid_Lock); 595 spin_lock(&GlobalMid_Lock);
566 list_for_each(tmp, &server->pending_mid_q) { 596 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
567 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 597 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
568 598
569 if ((mid_entry->mid == smb_buffer->Mid) && 599 if ((mid_entry->mid == smb_buffer->Mid) &&
@@ -604,20 +634,19 @@ incomplete_rcv:
604 mid_entry->resp_buf = smb_buffer; 634 mid_entry->resp_buf = smb_buffer;
605 mid_entry->largeBuf = isLargeBuf; 635 mid_entry->largeBuf = isLargeBuf;
606multi_t2_fnd: 636multi_t2_fnd:
607 task_to_wake = mid_entry->tsk;
608 mid_entry->midState = MID_RESPONSE_RECEIVED; 637 mid_entry->midState = MID_RESPONSE_RECEIVED;
638 list_del_init(&mid_entry->qhead);
639 mid_entry->callback(mid_entry);
609#ifdef CONFIG_CIFS_STATS2 640#ifdef CONFIG_CIFS_STATS2
610 mid_entry->when_received = jiffies; 641 mid_entry->when_received = jiffies;
611#endif 642#endif
612 /* so we do not time out requests to server
613 which is still responding (since server could
614 be busy but not dead) */
615 server->lstrp = jiffies;
616 break; 643 break;
617 } 644 }
645 mid_entry = NULL;
618 } 646 }
619 spin_unlock(&GlobalMid_Lock); 647 spin_unlock(&GlobalMid_Lock);
620 if (task_to_wake) { 648
649 if (mid_entry != NULL) {
621 /* Was previous buf put in mpx struct for multi-rsp? */ 650 /* Was previous buf put in mpx struct for multi-rsp? */
622 if (!isMultiRsp) { 651 if (!isMultiRsp) {
623 /* smb buffer will be freed by user thread */ 652 /* smb buffer will be freed by user thread */
@@ -626,11 +655,10 @@ multi_t2_fnd:
626 else 655 else
627 smallbuf = NULL; 656 smallbuf = NULL;
628 } 657 }
629 wake_up_process(task_to_wake);
630 } else if (!is_valid_oplock_break(smb_buffer, server) && 658 } else if (!is_valid_oplock_break(smb_buffer, server) &&
631 !isMultiRsp) { 659 !isMultiRsp) {
632 cERROR(1, "No task to wake, unknown frame received! " 660 cERROR(1, "No task to wake, unknown frame received! "
633 "NumMids %d", midCount.counter); 661 "NumMids %d", atomic_read(&midCount));
634 cifs_dump_mem("Received Data is: ", (char *)smb_buffer, 662 cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
635 sizeof(struct smb_hdr)); 663 sizeof(struct smb_hdr));
636#ifdef CONFIG_CIFS_DEBUG2 664#ifdef CONFIG_CIFS_DEBUG2
@@ -678,44 +706,16 @@ multi_t2_fnd:
678 if (smallbuf) /* no sense logging a debug message if NULL */ 706 if (smallbuf) /* no sense logging a debug message if NULL */
679 cifs_small_buf_release(smallbuf); 707 cifs_small_buf_release(smallbuf);
680 708
681 /* 709 if (!list_empty(&server->pending_mid_q)) {
682 * BB: we shouldn't have to do any of this. It shouldn't be
683 * possible to exit from the thread with active SMB sessions
684 */
685 spin_lock(&cifs_tcp_ses_lock);
686 if (list_empty(&server->pending_mid_q)) {
687 /* loop through server session structures attached to this and
688 mark them dead */
689 list_for_each(tmp, &server->smb_ses_list) {
690 ses = list_entry(tmp, struct cifsSesInfo,
691 smb_ses_list);
692 ses->status = CifsExiting;
693 ses->server = NULL;
694 }
695 spin_unlock(&cifs_tcp_ses_lock);
696 } else {
697 /* although we cannot zero the server struct pointer yet,
698 since there are active requests which may depend on them,
699 mark the corresponding SMB sessions as exiting too */
700 list_for_each(tmp, &server->smb_ses_list) {
701 ses = list_entry(tmp, struct cifsSesInfo,
702 smb_ses_list);
703 ses->status = CifsExiting;
704 }
705
706 spin_lock(&GlobalMid_Lock); 710 spin_lock(&GlobalMid_Lock);
707 list_for_each(tmp, &server->pending_mid_q) { 711 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
708 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 712 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
709 if (mid_entry->midState == MID_REQUEST_SUBMITTED) { 713 cFYI(1, "Clearing Mid 0x%x - issuing callback",
710 cFYI(1, "Clearing Mid 0x%x - waking up ",
711 mid_entry->mid); 714 mid_entry->mid);
712 task_to_wake = mid_entry->tsk; 715 list_del_init(&mid_entry->qhead);
713 if (task_to_wake) 716 mid_entry->callback(mid_entry);
714 wake_up_process(task_to_wake);
715 }
716 } 717 }
717 spin_unlock(&GlobalMid_Lock); 718 spin_unlock(&GlobalMid_Lock);
718 spin_unlock(&cifs_tcp_ses_lock);
719 /* 1/8th of sec is more than enough time for them to exit */ 719 /* 1/8th of sec is more than enough time for them to exit */
720 msleep(125); 720 msleep(125);
721 } 721 }
@@ -733,18 +733,6 @@ multi_t2_fnd:
733 coming home not much else we can do but free the memory */ 733 coming home not much else we can do but free the memory */
734 } 734 }
735 735
736 /* last chance to mark ses pointers invalid
737 if there are any pointing to this (e.g
738 if a crazy root user tried to kill cifsd
739 kernel thread explicitly this might happen) */
740 /* BB: This shouldn't be necessary, see above */
741 spin_lock(&cifs_tcp_ses_lock);
742 list_for_each(tmp, &server->smb_ses_list) {
743 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
744 ses->server = NULL;
745 }
746 spin_unlock(&cifs_tcp_ses_lock);
747
748 kfree(server->hostname); 736 kfree(server->hostname);
749 task_to_wake = xchg(&server->tsk, NULL); 737 task_to_wake = xchg(&server->tsk, NULL);
750 kfree(server); 738 kfree(server);
@@ -817,11 +805,11 @@ cifs_parse_mount_options(char *options, const char *devname,
817 * informational, only used for servers that do not support 805 * informational, only used for servers that do not support
818 * port 445 and it can be overridden at mount time 806 * port 445 and it can be overridden at mount time
819 */ 807 */
820 memset(vol->source_rfc1001_name, 0x20, 15); 808 memset(vol->source_rfc1001_name, 0x20, RFC1001_NAME_LEN);
821 for (i = 0; i < strnlen(nodename, 15); i++) 809 for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++)
822 vol->source_rfc1001_name[i] = toupper(nodename[i]); 810 vol->source_rfc1001_name[i] = toupper(nodename[i]);
823 811
824 vol->source_rfc1001_name[15] = 0; 812 vol->source_rfc1001_name[RFC1001_NAME_LEN] = 0;
825 /* null target name indicates to use *SMBSERVR default called name 813 /* null target name indicates to use *SMBSERVR default called name
826 if we end up sending RFC1001 session initialize */ 814 if we end up sending RFC1001 session initialize */
827 vol->target_rfc1001_name[0] = 0; 815 vol->target_rfc1001_name[0] = 0;
@@ -985,13 +973,11 @@ cifs_parse_mount_options(char *options, const char *devname,
985 return 1; 973 return 1;
986 } else if (strnicmp(value, "krb5", 4) == 0) { 974 } else if (strnicmp(value, "krb5", 4) == 0) {
987 vol->secFlg |= CIFSSEC_MAY_KRB5; 975 vol->secFlg |= CIFSSEC_MAY_KRB5;
988#ifdef CONFIG_CIFS_EXPERIMENTAL
989 } else if (strnicmp(value, "ntlmsspi", 8) == 0) { 976 } else if (strnicmp(value, "ntlmsspi", 8) == 0) {
990 vol->secFlg |= CIFSSEC_MAY_NTLMSSP | 977 vol->secFlg |= CIFSSEC_MAY_NTLMSSP |
991 CIFSSEC_MUST_SIGN; 978 CIFSSEC_MUST_SIGN;
992 } else if (strnicmp(value, "ntlmssp", 7) == 0) { 979 } else if (strnicmp(value, "ntlmssp", 7) == 0) {
993 vol->secFlg |= CIFSSEC_MAY_NTLMSSP; 980 vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
994#endif
995 } else if (strnicmp(value, "ntlmv2i", 7) == 0) { 981 } else if (strnicmp(value, "ntlmv2i", 7) == 0) {
996 vol->secFlg |= CIFSSEC_MAY_NTLMV2 | 982 vol->secFlg |= CIFSSEC_MAY_NTLMV2 |
997 CIFSSEC_MUST_SIGN; 983 CIFSSEC_MUST_SIGN;
@@ -1116,6 +1102,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1116 } else if (!strnicmp(data, "uid", 3) && value && *value) { 1102 } else if (!strnicmp(data, "uid", 3) && value && *value) {
1117 vol->linux_uid = simple_strtoul(value, &value, 0); 1103 vol->linux_uid = simple_strtoul(value, &value, 0);
1118 uid_specified = true; 1104 uid_specified = true;
1105 } else if (!strnicmp(data, "cruid", 5) && value && *value) {
1106 vol->cred_uid = simple_strtoul(value, &value, 0);
1119 } else if (!strnicmp(data, "forceuid", 8)) { 1107 } else if (!strnicmp(data, "forceuid", 8)) {
1120 override_uid = 1; 1108 override_uid = 1;
1121 } else if (!strnicmp(data, "noforceuid", 10)) { 1109 } else if (!strnicmp(data, "noforceuid", 10)) {
@@ -1168,22 +1156,22 @@ cifs_parse_mount_options(char *options, const char *devname,
1168 if (!value || !*value || (*value == ' ')) { 1156 if (!value || !*value || (*value == ' ')) {
1169 cFYI(1, "invalid (empty) netbiosname"); 1157 cFYI(1, "invalid (empty) netbiosname");
1170 } else { 1158 } else {
1171 memset(vol->source_rfc1001_name, 0x20, 15); 1159 memset(vol->source_rfc1001_name, 0x20,
1172 for (i = 0; i < 15; i++) { 1160 RFC1001_NAME_LEN);
1173 /* BB are there cases in which a comma can be 1161 /*
1174 valid in this workstation netbios name (and need 1162 * FIXME: are there cases in which a comma can
1175 special handling)? */ 1163 * be valid in workstation netbios name (and
1176 1164 * need special handling)?
1177 /* We do not uppercase netbiosname for user */ 1165 */
1166 for (i = 0; i < RFC1001_NAME_LEN; i++) {
1167 /* don't ucase netbiosname for user */
1178 if (value[i] == 0) 1168 if (value[i] == 0)
1179 break; 1169 break;
1180 else 1170 vol->source_rfc1001_name[i] = value[i];
1181 vol->source_rfc1001_name[i] =
1182 value[i];
1183 } 1171 }
1184 /* The string's 16th byte is still zero from the 1172 /* The string's 16th byte is still zero from the
1185 initialization at the top of the function */ 1173 initialization at the top of the function */
1186 if ((i == 15) && (value[i] != 0)) 1174 if (i == RFC1001_NAME_LEN && value[i] != 0)
1187 printk(KERN_WARNING "CIFS: netbiosname" 1175 printk(KERN_WARNING "CIFS: netbiosname"
1188 " longer than 15 truncated.\n"); 1176 " longer than 15 truncated.\n");
1189 } 1177 }
@@ -1193,7 +1181,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1193 cFYI(1, "empty server netbiosname specified"); 1181 cFYI(1, "empty server netbiosname specified");
1194 } else { 1182 } else {
1195 /* last byte, type, is 0x20 for server type */ 1183 /* last byte, type, is 0x20 for server type */
1196 memset(vol->target_rfc1001_name, 0x20, 16); 1184 memset(vol->target_rfc1001_name, 0x20,
1185 RFC1001_NAME_LEN_WITH_NULL);
1197 1186
1198 for (i = 0; i < 15; i++) { 1187 for (i = 0; i < 15; i++) {
1199 /* BB are there cases in which a comma can be 1188 /* BB are there cases in which a comma can be
@@ -1210,7 +1199,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1210 } 1199 }
1211 /* The string's 16th byte is still zero from the 1200 /* The string's 16th byte is still zero from the
1212 initialization at the top of the function */ 1201 initialization at the top of the function */
1213 if ((i == 15) && (value[i] != 0)) 1202 if (i == RFC1001_NAME_LEN && value[i] != 0)
1214 printk(KERN_WARNING "CIFS: server net" 1203 printk(KERN_WARNING "CIFS: server net"
1215 "biosname longer than 15 truncated.\n"); 1204 "biosname longer than 15 truncated.\n");
1216 } 1205 }
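
Both netbiosname branches follow the same RFC1001 convention: space-pad the name to 15 bytes, copy at most 15 characters (uppercased only for the client's own name), and keep byte 16 as the terminator. A condensed model of that parsing (set_rfc1001_name() is illustrative, not a kernel helper):

#include <ctype.h>
#include <string.h>

#define RFC1001_NAME_LEN 15

/* NetBIOS names are space-padded to 15 bytes, uppercased only for
 * the client's own name, and always null-terminated in byte 16 */
static void set_rfc1001_name(char dst[RFC1001_NAME_LEN + 1],
                             const char *src, int ucase)
{
    size_t i;

    memset(dst, 0x20, RFC1001_NAME_LEN);
    for (i = 0; i < RFC1001_NAME_LEN && src[i] != 0; i++)
        dst[i] = ucase ? toupper((unsigned char)src[i]) : src[i];
    dst[RFC1001_NAME_LEN] = 0;
}
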
@@ -1341,10 +1330,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1341 vol->no_psx_acl = 0; 1330 vol->no_psx_acl = 0;
1342 } else if (strnicmp(data, "noacl", 5) == 0) { 1331 } else if (strnicmp(data, "noacl", 5) == 0) {
1343 vol->no_psx_acl = 1; 1332 vol->no_psx_acl = 1;
1344#ifdef CONFIG_CIFS_EXPERIMENTAL
1345 } else if (strnicmp(data, "locallease", 6) == 0) { 1333 } else if (strnicmp(data, "locallease", 6) == 0) {
1346 vol->local_lease = 1; 1334 vol->local_lease = 1;
1347#endif
1348 } else if (strnicmp(data, "sign", 4) == 0) { 1335 } else if (strnicmp(data, "sign", 4) == 0) {
1349 vol->secFlg |= CIFSSEC_MUST_SIGN; 1336 vol->secFlg |= CIFSSEC_MUST_SIGN;
1350 } else if (strnicmp(data, "seal", 4) == 0) { 1337 } else if (strnicmp(data, "seal", 4) == 0) {
@@ -1454,35 +1441,71 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
1454 } 1441 }
1455} 1442}
1456 1443
1444/*
1445 * If no port is specified in the addr structure, try to match port 445
1446 * first and port 139 if that fails. This should be called only if the
1447 * address families of server and addr are the same.
1448 */
1449static bool
1450match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
1451{
1452 unsigned short int port, *sport;
1453
1454 switch (addr->sa_family) {
1455 case AF_INET:
1456 sport = &((struct sockaddr_in *) &server->dstaddr)->sin_port;
1457 port = ((struct sockaddr_in *) addr)->sin_port;
1458 break;
1459 case AF_INET6:
1460 sport = &((struct sockaddr_in6 *) &server->dstaddr)->sin6_port;
1461 port = ((struct sockaddr_in6 *) addr)->sin6_port;
1462 break;
1463 default:
1464 WARN_ON(1);
1465 return false;
1466 }
1467
1468 if (!port) {
1469 port = htons(CIFS_PORT);
1470 if (port == *sport)
1471 return true;
1472
1473 port = htons(RFC1001_PORT);
1474 }
1475
1476 return port == *sport;
1477}
1457 1478
1458static bool 1479static bool
1459match_address(struct TCP_Server_Info *server, struct sockaddr *addr, 1480match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
1460 struct sockaddr *srcaddr) 1481 struct sockaddr *srcaddr)
1461{ 1482{
1462 struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
1463 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
1464
1465 switch (addr->sa_family) { 1483 switch (addr->sa_family) {
1466 case AF_INET: 1484 case AF_INET: {
1467 if (addr4->sin_addr.s_addr != 1485 struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
1468 server->addr.sockAddr.sin_addr.s_addr) 1486 struct sockaddr_in *srv_addr4 =
1469 return false; 1487 (struct sockaddr_in *)&server->dstaddr;
1470 if (addr4->sin_port && 1488
1471 addr4->sin_port != server->addr.sockAddr.sin_port) 1489 if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr)
1472 return false; 1490 return false;
1473 break; 1491 break;
1474 case AF_INET6: 1492 }
1493 case AF_INET6: {
1494 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
1495 struct sockaddr_in6 *srv_addr6 =
1496 (struct sockaddr_in6 *)&server->dstaddr;
1497
1475 if (!ipv6_addr_equal(&addr6->sin6_addr, 1498 if (!ipv6_addr_equal(&addr6->sin6_addr,
1476 &server->addr.sockAddr6.sin6_addr)) 1499 &srv_addr6->sin6_addr))
1477 return false;
1478 if (addr6->sin6_scope_id !=
1479 server->addr.sockAddr6.sin6_scope_id)
1480 return false; 1500 return false;
1481 if (addr6->sin6_port && 1501 if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id)
1482 addr6->sin6_port != server->addr.sockAddr6.sin6_port)
1483 return false; 1502 return false;
1484 break; 1503 break;
1485 } 1504 }
1505 default:
1506 WARN_ON(1);
1507 return false; /* don't expect to be here */
1508 }
1486 1509
1487 if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr)) 1510 if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr))
1488 return false; 1511 return false;
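
match_port() treats a zero (unspecified) port in the candidate address as a wildcard over the two well-known SMB ports, and compares everything in network byte order. A compact user-space model (port_matches() is illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <arpa/inet.h>

#define CIFS_PORT 445
#define RFC1001_PORT 139

/* a zero (unspecified) port matches a server on either well-known
 * port; both values are kept and compared in network byte order */
static bool port_matches(uint16_t want_be, uint16_t have_be)
{
    if (want_be == 0)
        return have_be == htons(CIFS_PORT) ||
               have_be == htons(RFC1001_PORT);
    return want_be == have_be;
}
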
@@ -1549,6 +1572,9 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
1549 (struct sockaddr *)&vol->srcaddr)) 1572 (struct sockaddr *)&vol->srcaddr))
1550 continue; 1573 continue;
1551 1574
1575 if (!match_port(server, addr))
1576 continue;
1577
1552 if (!match_security(server, vol)) 1578 if (!match_security(server, vol))
1553 continue; 1579 continue;
1554 1580
@@ -1575,6 +1601,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
1575 list_del_init(&server->tcp_ses_list); 1601 list_del_init(&server->tcp_ses_list);
1576 spin_unlock(&cifs_tcp_ses_lock); 1602 spin_unlock(&cifs_tcp_ses_lock);
1577 1603
1604 cancel_delayed_work_sync(&server->echo);
1605
1578 spin_lock(&GlobalMid_Lock); 1606 spin_lock(&GlobalMid_Lock);
1579 server->tcpStatus = CifsExiting; 1607 server->tcpStatus = CifsExiting;
1580 spin_unlock(&GlobalMid_Lock); 1608 spin_unlock(&GlobalMid_Lock);
@@ -1664,8 +1692,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1664 volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); 1692 volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1665 tcp_ses->session_estab = false; 1693 tcp_ses->session_estab = false;
1666 tcp_ses->sequence_number = 0; 1694 tcp_ses->sequence_number = 0;
1695 tcp_ses->lstrp = jiffies;
1667 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); 1696 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
1668 INIT_LIST_HEAD(&tcp_ses->smb_ses_list); 1697 INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
1698 INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
1669 1699
1670 /* 1700 /*
1671 * at this point we are the only ones with the pointer 1701 * at this point we are the only ones with the pointer
@@ -1681,14 +1711,13 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1681 cFYI(1, "attempting ipv6 connect"); 1711 cFYI(1, "attempting ipv6 connect");
1682 /* BB should we allow ipv6 on port 139? */ 1712 /* BB should we allow ipv6 on port 139? */
1683 /* no other OS observed in the wild doing 139 with v6 */ 1713 /* no other OS observed in the wild doing 139 with v6 */
1684 memcpy(&tcp_ses->addr.sockAddr6, sin_server6, 1714 memcpy(&tcp_ses->dstaddr, sin_server6,
1685 sizeof(struct sockaddr_in6)); 1715 sizeof(struct sockaddr_in6));
1686 rc = ipv6_connect(tcp_ses); 1716 } else
1687 } else { 1717 memcpy(&tcp_ses->dstaddr, sin_server,
1688 memcpy(&tcp_ses->addr.sockAddr, sin_server, 1718 sizeof(struct sockaddr_in));
1689 sizeof(struct sockaddr_in)); 1719
1690 rc = ipv4_connect(tcp_ses); 1720 rc = ip_connect(tcp_ses);
1691 }
1692 if (rc < 0) { 1721 if (rc < 0) {
1693 cERROR(1, "Error connecting to socket. Aborting operation"); 1722 cERROR(1, "Error connecting to socket. Aborting operation");
1694 goto out_err_crypto_release; 1723 goto out_err_crypto_release;
@@ -1715,6 +1744,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1715 1744
1716 cifs_fscache_get_client_cookie(tcp_ses); 1745 cifs_fscache_get_client_cookie(tcp_ses);
1717 1746
1747 /* queue echo request delayed work */
1748 queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
1749
1718 return tcp_ses; 1750 return tcp_ses;
1719 1751
1720out_err_crypto_release: 1752out_err_crypto_release:
@@ -1793,6 +1825,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1793{ 1825{
1794 int rc = -ENOMEM, xid; 1826 int rc = -ENOMEM, xid;
1795 struct cifsSesInfo *ses; 1827 struct cifsSesInfo *ses;
1828 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
1829 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
1796 1830
1797 xid = GetXid(); 1831 xid = GetXid();
1798 1832
@@ -1836,12 +1870,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1836 1870
1837 /* new SMB session uses our server ref */ 1871 /* new SMB session uses our server ref */
1838 ses->server = server; 1872 ses->server = server;
1839 if (server->addr.sockAddr6.sin6_family == AF_INET6) 1873 if (server->dstaddr.ss_family == AF_INET6)
1840 sprintf(ses->serverName, "%pI6", 1874 sprintf(ses->serverName, "%pI6", &addr6->sin6_addr);
1841 &server->addr.sockAddr6.sin6_addr);
1842 else 1875 else
1843 sprintf(ses->serverName, "%pI4", 1876 sprintf(ses->serverName, "%pI4", &addr->sin_addr);
1844 &server->addr.sockAddr.sin_addr.s_addr);
1845 1877
1846 if (volume_info->username) 1878 if (volume_info->username)
1847 strncpy(ses->userName, volume_info->username, 1879 strncpy(ses->userName, volume_info->username,
@@ -2136,19 +2168,106 @@ bind_socket(struct TCP_Server_Info *server)
2136} 2168}
2137 2169
2138static int 2170static int
2139ipv4_connect(struct TCP_Server_Info *server) 2171ip_rfc1001_connect(struct TCP_Server_Info *server)
2172{
2173 int rc = 0;
2174 /*
2175	 * some servers require an RFC1001 sessinit before sending
2176	 * negprot - BB check reconnection in the case where a second
2177	 * sessinit is sent but no second negprot
2178 */
2179 struct rfc1002_session_packet *ses_init_buf;
2180 struct smb_hdr *smb_buf;
2181 ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
2182 GFP_KERNEL);
2183 if (ses_init_buf) {
2184 ses_init_buf->trailer.session_req.called_len = 32;
2185
2186 if (server->server_RFC1001_name &&
2187 server->server_RFC1001_name[0] != 0)
2188 rfc1002mangle(ses_init_buf->trailer.
2189 session_req.called_name,
2190 server->server_RFC1001_name,
2191 RFC1001_NAME_LEN_WITH_NULL);
2192 else
2193 rfc1002mangle(ses_init_buf->trailer.
2194 session_req.called_name,
2195 DEFAULT_CIFS_CALLED_NAME,
2196 RFC1001_NAME_LEN_WITH_NULL);
2197
2198 ses_init_buf->trailer.session_req.calling_len = 32;
2199
2200 /*
2201 * calling name ends in null (byte 16) from old smb
2202 * convention.
2203 */
2204 if (server->workstation_RFC1001_name &&
2205 server->workstation_RFC1001_name[0] != 0)
2206 rfc1002mangle(ses_init_buf->trailer.
2207 session_req.calling_name,
2208 server->workstation_RFC1001_name,
2209 RFC1001_NAME_LEN_WITH_NULL);
2210 else
2211 rfc1002mangle(ses_init_buf->trailer.
2212 session_req.calling_name,
2213 "LINUX_CIFS_CLNT",
2214 RFC1001_NAME_LEN_WITH_NULL);
2215
2216 ses_init_buf->trailer.session_req.scope1 = 0;
2217 ses_init_buf->trailer.session_req.scope2 = 0;
2218 smb_buf = (struct smb_hdr *)ses_init_buf;
2219
2220 /* sizeof RFC1002_SESSION_REQUEST with no scope */
2221 smb_buf->smb_buf_length = 0x81000044;
2222 rc = smb_send(server, smb_buf, 0x44);
2223 kfree(ses_init_buf);
2224 /*
2225		 * The RFC1001 layer in at least one server
2226		 * requires a very short break before negprot,
2227		 * presumably because it is not expecting negprot
2228		 * to follow so quickly. This is a simple
2229		 * solution that works without
2230		 * complicating the code and causes no
2231		 * significant slowdown on mount
2232		 * for everyone else.
2233 */
2234 usleep_range(1000, 2000);
2235 }
2236 /*
2237	 * if the allocation failed, the negprot may
2238	 * still work without the RFC1001 session request
2239 */
2240
2241 return rc;
2242}
2243
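
The 0x81000044 written to smb_buf_length in ip_rfc1001_connect() is the 4-byte RFC 1002 session-service header: type 0x81 (SESSION REQUEST), a zero flags byte, and a 16-bit payload length of 0x44. The length follows from the trailer layout: two encoded NetBIOS names, each a length byte (32), 32 encoded bytes, and a null scope byte. A small sketch of that arithmetic (my reading of the constant, not a kernel helper):

#include <stdint.h>

/* RFC 1002 header: TYPE (1 byte), FLAGS (1 byte), LENGTH (2 bytes) */
static uint32_t rfc1002_session_request_hdr(void)
{
    uint16_t name = 1 + 32 + 1;     /* length byte + encoded name + null */
    uint16_t payload = 2 * name;    /* called + calling = 68 = 0x44 */

    return (0x81u << 24) | payload; /* == 0x81000044 */
}
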
2244static int
2245generic_ip_connect(struct TCP_Server_Info *server)
2140{ 2246{
2141 int rc = 0; 2247 int rc = 0;
2142 int val; 2248 unsigned short int sport;
2143 bool connected = false; 2249 int slen, sfamily;
2144 __be16 orig_port = 0;
2145 struct socket *socket = server->ssocket; 2250 struct socket *socket = server->ssocket;
2251 struct sockaddr *saddr;
2252
2253 saddr = (struct sockaddr *) &server->dstaddr;
2254
2255 if (server->dstaddr.ss_family == AF_INET6) {
2256 sport = ((struct sockaddr_in6 *) saddr)->sin6_port;
2257 slen = sizeof(struct sockaddr_in6);
2258 sfamily = AF_INET6;
2259 } else {
2260 sport = ((struct sockaddr_in *) saddr)->sin_port;
2261 slen = sizeof(struct sockaddr_in);
2262 sfamily = AF_INET;
2263 }
2146 2264
2147 if (socket == NULL) { 2265 if (socket == NULL) {
2148 rc = sock_create_kern(PF_INET, SOCK_STREAM, 2266 rc = sock_create_kern(sfamily, SOCK_STREAM,
2149 IPPROTO_TCP, &socket); 2267 IPPROTO_TCP, &socket);
2150 if (rc < 0) { 2268 if (rc < 0) {
2151 cERROR(1, "Error %d creating socket", rc); 2269 cERROR(1, "Error %d creating socket", rc);
2270 server->ssocket = NULL;
2152 return rc; 2271 return rc;
2153 } 2272 }
2154 2273
@@ -2156,63 +2275,28 @@ ipv4_connect(struct TCP_Server_Info *server)
2156 cFYI(1, "Socket created"); 2275 cFYI(1, "Socket created");
2157 server->ssocket = socket; 2276 server->ssocket = socket;
2158 socket->sk->sk_allocation = GFP_NOFS; 2277 socket->sk->sk_allocation = GFP_NOFS;
2159 cifs_reclassify_socket4(socket); 2278 if (sfamily == AF_INET6)
2279 cifs_reclassify_socket6(socket);
2280 else
2281 cifs_reclassify_socket4(socket);
2160 } 2282 }
2161 2283
2162 rc = bind_socket(server); 2284 rc = bind_socket(server);
2163 if (rc < 0) 2285 if (rc < 0)
2164 return rc; 2286 return rc;
2165 2287
2166 /* user overrode default port */ 2288 rc = socket->ops->connect(socket, saddr, slen, 0);
2167 if (server->addr.sockAddr.sin_port) { 2289 if (rc < 0) {
2168 rc = socket->ops->connect(socket, (struct sockaddr *) 2290 cFYI(1, "Error %d connecting to server", rc);
2169 &server->addr.sockAddr,
2170 sizeof(struct sockaddr_in), 0);
2171 if (rc >= 0)
2172 connected = true;
2173 }
2174
2175 if (!connected) {
2176 /* save original port so we can retry user specified port
2177 later if fall back ports fail this time */
2178 orig_port = server->addr.sockAddr.sin_port;
2179
2180 /* do not retry on the same port we just failed on */
2181 if (server->addr.sockAddr.sin_port != htons(CIFS_PORT)) {
2182 server->addr.sockAddr.sin_port = htons(CIFS_PORT);
2183 rc = socket->ops->connect(socket,
2184 (struct sockaddr *)
2185 &server->addr.sockAddr,
2186 sizeof(struct sockaddr_in), 0);
2187 if (rc >= 0)
2188 connected = true;
2189 }
2190 }
2191 if (!connected) {
2192 server->addr.sockAddr.sin_port = htons(RFC1001_PORT);
2193 rc = socket->ops->connect(socket, (struct sockaddr *)
2194 &server->addr.sockAddr,
2195 sizeof(struct sockaddr_in), 0);
2196 if (rc >= 0)
2197 connected = true;
2198 }
2199
2200 /* give up here - unless we want to retry on different
2201 protocol families some day */
2202 if (!connected) {
2203 if (orig_port)
2204 server->addr.sockAddr.sin_port = orig_port;
2205 cFYI(1, "Error %d connecting to server via ipv4", rc);
2206 sock_release(socket); 2291 sock_release(socket);
2207 server->ssocket = NULL; 2292 server->ssocket = NULL;
2208 return rc; 2293 return rc;
2209 } 2294 }
2210 2295
2211
2212 /* 2296 /*
2213 * Eventually check for other socket options to change from 2297 * Eventually check for other socket options to change from
2214 * the default. sock_setsockopt not used because it expects 2298 * the default. sock_setsockopt not used because it expects
2215 * user space buffer 2299 * user space buffer
2216 */ 2300 */
2217 socket->sk->sk_rcvtimeo = 7 * HZ; 2301 socket->sk->sk_rcvtimeo = 7 * HZ;
2218 socket->sk->sk_sndtimeo = 5 * HZ; 2302 socket->sk->sk_sndtimeo = 5 * HZ;
@@ -2226,7 +2310,7 @@ ipv4_connect(struct TCP_Server_Info *server)
2226 } 2310 }
2227 2311
2228 if (server->tcp_nodelay) { 2312 if (server->tcp_nodelay) {
2229 val = 1; 2313 int val = 1;
2230 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, 2314 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2231 (char *)&val, sizeof(val)); 2315 (char *)&val, sizeof(val));
2232 if (rc) 2316 if (rc)
@@ -2237,161 +2321,39 @@ ipv4_connect(struct TCP_Server_Info *server)
2237 socket->sk->sk_sndbuf, 2321 socket->sk->sk_sndbuf,
2238 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); 2322 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
2239 2323
2240 /* send RFC1001 sessinit */ 2324 if (sport == htons(RFC1001_PORT))
2241 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) { 2325 rc = ip_rfc1001_connect(server);
2242 /* some servers require RFC1001 sessinit before sending
2243 negprot - BB check reconnection in case where second
2244 sessinit is sent but no second negprot */
2245 struct rfc1002_session_packet *ses_init_buf;
2246 struct smb_hdr *smb_buf;
2247 ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
2248 GFP_KERNEL);
2249 if (ses_init_buf) {
2250 ses_init_buf->trailer.session_req.called_len = 32;
2251 if (server->server_RFC1001_name &&
2252 server->server_RFC1001_name[0] != 0)
2253 rfc1002mangle(ses_init_buf->trailer.
2254 session_req.called_name,
2255 server->server_RFC1001_name,
2256 RFC1001_NAME_LEN_WITH_NULL);
2257 else
2258 rfc1002mangle(ses_init_buf->trailer.
2259 session_req.called_name,
2260 DEFAULT_CIFS_CALLED_NAME,
2261 RFC1001_NAME_LEN_WITH_NULL);
2262
2263 ses_init_buf->trailer.session_req.calling_len = 32;
2264
2265 /* calling name ends in null (byte 16) from old smb
2266 convention. */
2267 if (server->workstation_RFC1001_name &&
2268 server->workstation_RFC1001_name[0] != 0)
2269 rfc1002mangle(ses_init_buf->trailer.
2270 session_req.calling_name,
2271 server->workstation_RFC1001_name,
2272 RFC1001_NAME_LEN_WITH_NULL);
2273 else
2274 rfc1002mangle(ses_init_buf->trailer.
2275 session_req.calling_name,
2276 "LINUX_CIFS_CLNT",
2277 RFC1001_NAME_LEN_WITH_NULL);
2278
2279 ses_init_buf->trailer.session_req.scope1 = 0;
2280 ses_init_buf->trailer.session_req.scope2 = 0;
2281 smb_buf = (struct smb_hdr *)ses_init_buf;
2282 /* sizeof RFC1002_SESSION_REQUEST with no scope */
2283 smb_buf->smb_buf_length = 0x81000044;
2284 rc = smb_send(server, smb_buf, 0x44);
2285 kfree(ses_init_buf);
2286 msleep(1); /* RFC1001 layer in at least one server
2287 requires very short break before negprot
2288 presumably because not expecting negprot
2289 to follow so fast. This is a simple
2290 solution that works without
2291 complicating the code and causes no
2292 significant slowing down on mount
2293 for everyone else */
2294 }
2295 /* else the negprot may still work without this
2296 even though malloc failed */
2297
2298 }
2299 2326
2300 return rc; 2327 return rc;
2301} 2328}
2302 2329
2303static int 2330static int
2304ipv6_connect(struct TCP_Server_Info *server) 2331ip_connect(struct TCP_Server_Info *server)
2305{ 2332{
2306 int rc = 0; 2333 unsigned short int *sport;
2307 int val; 2334 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
2308 bool connected = false; 2335 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
2309 __be16 orig_port = 0;
2310 struct socket *socket = server->ssocket;
2311 2336
2312 if (socket == NULL) { 2337 if (server->dstaddr.ss_family == AF_INET6)
2313 rc = sock_create_kern(PF_INET6, SOCK_STREAM, 2338 sport = &addr6->sin6_port;
2314 IPPROTO_TCP, &socket); 2339 else
2315 if (rc < 0) { 2340 sport = &addr->sin_port;
2316 cERROR(1, "Error %d creating ipv6 socket", rc);
2317 socket = NULL;
2318 return rc;
2319 }
2320 2341
2321 /* BB other socket options to set KEEPALIVE, NODELAY? */ 2342 if (*sport == 0) {
2322 cFYI(1, "ipv6 Socket created"); 2343 int rc;
2323 server->ssocket = socket;
2324 socket->sk->sk_allocation = GFP_NOFS;
2325 cifs_reclassify_socket6(socket);
2326 }
2327 2344
2328 rc = bind_socket(server); 2345 /* try with 445 port at first */
2329 if (rc < 0) 2346 *sport = htons(CIFS_PORT);
2330 return rc;
2331 2347
2332 /* user overrode default port */ 2348 rc = generic_ip_connect(server);
2333 if (server->addr.sockAddr6.sin6_port) {
2334 rc = socket->ops->connect(socket,
2335 (struct sockaddr *) &server->addr.sockAddr6,
2336 sizeof(struct sockaddr_in6), 0);
2337 if (rc >= 0) 2349 if (rc >= 0)
2338 connected = true; 2350 return rc;
2339 }
2340
2341 if (!connected) {
2342 /* save original port so we can retry user specified port
2343 later if fall back ports fail this time */
2344
2345 orig_port = server->addr.sockAddr6.sin6_port;
2346 /* do not retry on the same port we just failed on */
2347 if (server->addr.sockAddr6.sin6_port != htons(CIFS_PORT)) {
2348 server->addr.sockAddr6.sin6_port = htons(CIFS_PORT);
2349 rc = socket->ops->connect(socket, (struct sockaddr *)
2350 &server->addr.sockAddr6,
2351 sizeof(struct sockaddr_in6), 0);
2352 if (rc >= 0)
2353 connected = true;
2354 }
2355 }
2356 if (!connected) {
2357 server->addr.sockAddr6.sin6_port = htons(RFC1001_PORT);
2358 rc = socket->ops->connect(socket, (struct sockaddr *)
2359 &server->addr.sockAddr6,
2360 sizeof(struct sockaddr_in6), 0);
2361 if (rc >= 0)
2362 connected = true;
2363 }
2364
2365 /* give up here - unless we want to retry on different
2366 protocol families some day */
2367 if (!connected) {
2368 if (orig_port)
2369 server->addr.sockAddr6.sin6_port = orig_port;
2370 cFYI(1, "Error %d connecting to server via ipv6", rc);
2371 sock_release(socket);
2372 server->ssocket = NULL;
2373 return rc;
2374 }
2375
2376 /*
2377 * Eventually check for other socket options to change from
2378 * the default. sock_setsockopt not used because it expects
2379 * user space buffer
2380 */
2381 socket->sk->sk_rcvtimeo = 7 * HZ;
2382 socket->sk->sk_sndtimeo = 5 * HZ;
2383 2351
2384 if (server->tcp_nodelay) { 2352 /* if it failed, try with 139 port */
2385 val = 1; 2353 *sport = htons(RFC1001_PORT);
2386 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2387 (char *)&val, sizeof(val));
2388 if (rc)
2389 cFYI(1, "set TCP_NODELAY socket option error %d", rc);
2390 } 2354 }
2391 2355
2392 server->ssocket = socket; 2356 return generic_ip_connect(server);
2393
2394 return rc;
2395} 2357}
2396 2358
2397void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon, 2359void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
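
ip_connect() now holds the fallback policy that ipv4_connect() and ipv6_connect() used to duplicate: an explicitly requested port is used as-is, otherwise port 445 is tried first and 139 second, with generic_ip_connect() doing the family-independent work. A compact model of the policy (connect_fn stands in for generic_ip_connect()):

#include <stdint.h>
#include <arpa/inet.h>

typedef int (*connect_fn)(void *server);

/* honor an explicit port; otherwise try 445, then fall back to 139 */
static int connect_with_fallback(void *server, uint16_t *sport_be,
                                 connect_fn do_connect)
{
    int rc;

    if (*sport_be != 0)
        return do_connect(server);  /* the user picked the port */

    *sport_be = htons(445);
    rc = do_connect(server);
    if (rc >= 0)
        return rc;

    *sport_be = htons(139);         /* second and last attempt */
    return do_connect(server);
}
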
@@ -2970,8 +2932,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2970 TCONX_RSP *pSMBr; 2932 TCONX_RSP *pSMBr;
2971 unsigned char *bcc_ptr; 2933 unsigned char *bcc_ptr;
2972 int rc = 0; 2934 int rc = 0;
2973 int length, bytes_left; 2935 int length;
2974 __u16 count; 2936 __u16 bytes_left, count;
2975 2937
2976 if (ses == NULL) 2938 if (ses == NULL)
2977 return -EIO; 2939 return -EIO;
@@ -2999,7 +2961,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2999 bcc_ptr++; /* skip password */ 2961 bcc_ptr++; /* skip password */
3000 /* already aligned so no need to do it below */ 2962 /* already aligned so no need to do it below */
3001 } else { 2963 } else {
3002 pSMB->PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); 2964 pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
3003 /* BB FIXME add code to fail this if NTLMv2 or Kerberos 2965 /* BB FIXME add code to fail this if NTLMv2 or Kerberos
3004 specified as required (when that support is added to 2966 specified as required (when that support is added to
3005 the vfs in the future) as only NTLM or the much 2967 the vfs in the future) as only NTLM or the much
@@ -3017,7 +2979,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3017#endif /* CIFS_WEAK_PW_HASH */ 2979#endif /* CIFS_WEAK_PW_HASH */
3018 SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr); 2980 SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr);
3019 2981
3020 bcc_ptr += CIFS_SESS_KEY_SIZE; 2982 bcc_ptr += CIFS_AUTH_RESP_SIZE;
3021 if (ses->capabilities & CAP_UNICODE) { 2983 if (ses->capabilities & CAP_UNICODE) {
3022 /* must align unicode strings */ 2984 /* must align unicode strings */
3023 *bcc_ptr = 0; /* null byte password */ 2985 *bcc_ptr = 0; /* null byte password */
@@ -3055,7 +3017,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3055 pSMB->ByteCount = cpu_to_le16(count); 3017 pSMB->ByteCount = cpu_to_le16(count);
3056 3018
3057 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, 3019 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
3058 CIFS_STD_OP); 3020 0);
3059 3021
3060 /* above now done in SendReceive */ 3022 /* above now done in SendReceive */
3061 if ((rc == 0) && (tcon != NULL)) { 3023 if ((rc == 0) && (tcon != NULL)) {
@@ -3065,7 +3027,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3065 tcon->need_reconnect = false; 3027 tcon->need_reconnect = false;
3066 tcon->tid = smb_buffer_response->Tid; 3028 tcon->tid = smb_buffer_response->Tid;
3067 bcc_ptr = pByteArea(smb_buffer_response); 3029 bcc_ptr = pByteArea(smb_buffer_response);
3068 bytes_left = BCC(smb_buffer_response); 3030 bytes_left = get_bcc(smb_buffer_response);
3069 length = strnlen(bcc_ptr, bytes_left - 2); 3031 length = strnlen(bcc_ptr, bytes_left - 2);
3070 if (smb_buffer->Flags2 & SMBFLG2_UNICODE) 3032 if (smb_buffer->Flags2 & SMBFLG2_UNICODE)
3071 is_unicode = true; 3033 is_unicode = true;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index db2a58c00f7b..dd5f22918c33 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -130,17 +130,6 @@ cifs_bp_rename_retry:
130 return full_path; 130 return full_path;
131} 131}
132 132
133static void setup_cifs_dentry(struct cifsTconInfo *tcon,
134 struct dentry *direntry,
135 struct inode *newinode)
136{
137 if (tcon->nocase)
138 d_set_d_op(direntry, &cifs_ci_dentry_ops);
139 else
140 d_set_d_op(direntry, &cifs_dentry_ops);
141 d_instantiate(direntry, newinode);
142}
143
144/* Inode operations in similar order to how they appear in Linux file fs.h */ 133/* Inode operations in similar order to how they appear in Linux file fs.h */
 
 int
@@ -293,10 +282,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 			args.uid = NO_CHANGE_64;
 			args.gid = NO_CHANGE_64;
 		}
-		CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
-				       cifs_sb->local_nls,
-				       cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		CIFSSMBUnixSetFileInfo(xid, tcon, &args, fileHandle,
+				       current->tgid);
 	} else {
 		/* BB implement mode setting via Windows security
 		   descriptors e.g. */
@@ -329,7 +316,7 @@ cifs_create_get_file_info:
 
 cifs_create_set_dentry:
 	if (rc == 0)
-		setup_cifs_dentry(tcon, direntry, newinode);
+		d_instantiate(direntry, newinode);
 	else
 		cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
 
@@ -420,10 +407,6 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 
 		rc = cifs_get_inode_info_unix(&newinode, full_path,
 						inode->i_sb, xid);
-		if (pTcon->nocase)
-			d_set_d_op(direntry, &cifs_ci_dentry_ops);
-		else
-			d_set_d_op(direntry, &cifs_dentry_ops);
 
 		if (rc == 0)
 			d_instantiate(direntry, newinode);
@@ -603,10 +586,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 			parent_dir_inode->i_sb, xid, NULL);
 
 	if ((rc == 0) && (newInode != NULL)) {
-		if (pTcon->nocase)
-			d_set_d_op(direntry, &cifs_ci_dentry_ops);
-		else
-			d_set_d_op(direntry, &cifs_dentry_ops);
 		d_add(direntry, newInode);
 		if (posix_open) {
 			filp = lookup_instantiate_filp(nd, direntry,
@@ -633,10 +612,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	} else if (rc == -ENOENT) {
 		rc = 0;
 		direntry->d_time = jiffies;
-		if (pTcon->nocase)
-			d_set_d_op(direntry, &cifs_ci_dentry_ops);
-		else
-			d_set_d_op(direntry, &cifs_dentry_ops);
 		d_add(direntry, NULL);
 	/*	if it was once a directory (but how can we tell?) we could do
 		shrink_dcache_parent(direntry); */
@@ -700,6 +675,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 
 const struct dentry_operations cifs_dentry_ops = {
 	.d_revalidate = cifs_d_revalidate,
+	.d_automount = cifs_dfs_d_automount,
 /* d_delete:       cifs_d_delete,      */ /* not needed except for debugging */
 };
 
@@ -736,4 +712,5 @@ const struct dentry_operations cifs_ci_dentry_ops = {
 	.d_revalidate = cifs_d_revalidate,
 	.d_hash = cifs_ci_hash,
 	.d_compare = cifs_ci_compare,
+	.d_automount = cifs_dfs_d_automount,
 };
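
The two hunks above wire DFS referrals into the 2.6.38 automount infrastructure instead of per-dentry operation selection: any dentry using these tables can trigger cifs_dfs_d_automount when a path walk reaches an inode marked S_AUTOMOUNT (set in the fs/cifs/inode.c hunks below). A minimal sketch, assuming the 2.6.38 d_automount contract:

	/* Sketch only: the VFS calls this hook when a walk lands on a
	 * dentry whose inode has S_AUTOMOUNT set; the handler returns
	 * the vfsmount to splice in (here, the DFS target share). */
	struct vfsmount *cifs_dfs_d_automount(struct path *path);
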
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5a28660ca2b5..d7d65a70678e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -104,53 +104,6 @@ static inline int cifs_get_disposition(unsigned int flags)
 		return FILE_OPEN;
 }
 
-static inline int cifs_open_inode_helper(struct inode *inode,
-	struct cifsTconInfo *pTcon, __u32 oplock, FILE_ALL_INFO *buf,
-	char *full_path, int xid)
-{
-	struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
-	struct timespec temp;
-	int rc;
-
-	if (pCifsInode->clientCanCacheRead) {
-		/* we have the inode open somewhere else
-		   no need to discard cache data */
-		goto client_can_cache;
-	}
-
-	/* BB need same check in cifs_create too? */
-	/* if not oplocked, invalidate inode pages if mtime or file
-	   size changed */
-	temp = cifs_NTtimeToUnix(buf->LastWriteTime);
-	if (timespec_equal(&inode->i_mtime, &temp) &&
-	   (inode->i_size ==
-	    (loff_t)le64_to_cpu(buf->EndOfFile))) {
-		cFYI(1, "inode unchanged on server");
-	} else {
-		if (inode->i_mapping) {
-			/* BB no need to lock inode until after invalidate
-			   since namei code should already have it locked? */
-			rc = filemap_write_and_wait(inode->i_mapping);
-			mapping_set_error(inode->i_mapping, rc);
-		}
-		cFYI(1, "invalidating remote inode since open detected it "
-			"changed");
-		invalidate_remote_inode(inode);
-	}
-
-client_can_cache:
-	if (pTcon->unix_ext)
-		rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
-			xid);
-	else
-		rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
-			 xid, NULL);
-
-	cifs_set_oplock_level(pCifsInode, oplock);
-
-	return rc;
-}
-
 int cifs_posix_open(char *full_path, struct inode **pinode,
 			struct super_block *sb, int mode, unsigned int f_flags,
 			__u32 *poplock, __u16 *pnetfid, int xid)
@@ -213,6 +166,76 @@ posix_open_ret:
 	return rc;
 }
 
+static int
+cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
+	     struct cifsTconInfo *tcon, unsigned int f_flags, __u32 *poplock,
+	     __u16 *pnetfid, int xid)
+{
+	int rc;
+	int desiredAccess;
+	int disposition;
+	FILE_ALL_INFO *buf;
+
+	desiredAccess = cifs_convert_flags(f_flags);
+
+/*********************************************************************
+ *  open flag mapping table:
+ *
+ *	POSIX Flag            CIFS Disposition
+ *	----------            ----------------
+ *	O_CREAT               FILE_OPEN_IF
+ *	O_CREAT | O_EXCL      FILE_CREATE
+ *	O_CREAT | O_TRUNC     FILE_OVERWRITE_IF
+ *	O_TRUNC               FILE_OVERWRITE
+ *	none of the above     FILE_OPEN
+ *
+ *	Note that there is not a direct match between disposition
+ *	FILE_SUPERSEDE (ie create whether or not file exists although
+ *	O_CREAT | O_TRUNC is similar but truncates the existing
+ *	file rather than creating a new file as FILE_SUPERSEDE does
+ *	(which uses the attributes / metadata passed in on open call)
+ *?
+ *?  O_SYNC is a reasonable match to CIFS writethrough flag
+ *?  and the read write flags match reasonably.  O_LARGEFILE
+ *?  is irrelevant because largefile support is always used
+ *?  by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
+ *	 O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
+ *********************************************************************/
+
+	disposition = cifs_get_disposition(f_flags);
+
+	/* BB pass O_SYNC flag through on file attributes .. BB */
+
+	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (tcon->ses->capabilities & CAP_NT_SMBS)
+		rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
+			 desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
+			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
+				 & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	else
+		rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
+			desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
+			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
+				& CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	if (rc)
+		goto out;
+
+	if (tcon->unix_ext)
+		rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
+					      xid);
+	else
+		rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
+					 xid, pnetfid);
+
+out:
+	kfree(buf);
+	return rc;
+}
+
 struct cifsFileInfo *
 cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 		  struct tcon_link *tlink, __u32 oplock)
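
For reference, the mapping table in the comment above can be read as straight-line code. A minimal sketch (the helper name here is illustrative; the real implementation is cifs_get_disposition(), defined earlier in fs/cifs/file.c and unchanged by this patch):

	static inline int disposition_from_flags(unsigned int flags)
	{
		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
			return FILE_CREATE;		/* create, fail if it exists */
		else if ((flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC))
			return FILE_OVERWRITE_IF;	/* create or truncate */
		else if (flags & O_CREAT)
			return FILE_OPEN_IF;		/* open, create if missing */
		else if (flags & O_TRUNC)
			return FILE_OVERWRITE;		/* truncate existing */
		else
			return FILE_OPEN;		/* plain open */
	}
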
@@ -264,6 +287,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 	struct inode *inode = cifs_file->dentry->d_inode;
 	struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifsLockInfo *li, *tmp;
 
 	spin_lock(&cifs_file_list_lock);
@@ -279,6 +303,13 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 	if (list_empty(&cifsi->openFileList)) {
 		cFYI(1, "closing last open instance for inode %p",
 			cifs_file->dentry->d_inode);
+
+		/* in strict cache mode we need to invalidate the mapping on
+		   the last close because it may cause an error when we open
+		   this file again and get at least a level II oplock */
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
+			CIFS_I(inode)->invalid_mapping = true;
+
 		cifs_set_oplock_level(cifsi, 0);
 	}
 	spin_unlock(&cifs_file_list_lock);
@@ -317,10 +348,8 @@ int cifs_open(struct inode *inode, struct file *file)
 	struct cifsFileInfo *pCifsFile = NULL;
 	struct cifsInodeInfo *pCifsInode;
 	char *full_path = NULL;
-	int desiredAccess;
-	int disposition;
+	bool posix_open_ok = false;
 	__u16 netfid;
-	FILE_ALL_INFO *buf = NULL;
 
 	xid = GetXid();
 
@@ -358,17 +387,7 @@ int cifs_open(struct inode *inode, struct file *file)
 				file->f_flags, &oplock, &netfid, xid);
 		if (rc == 0) {
 			cFYI(1, "posix open succeeded");
-
-			pCifsFile = cifs_new_fileinfo(netfid, file, tlink,
-						      oplock);
-			if (pCifsFile == NULL) {
-				CIFSSMBClose(xid, tcon, netfid);
-				rc = -ENOMEM;
-			}
-
-			cifs_fscache_set_inode_cookie(inode, file);
-
-			goto out;
+			posix_open_ok = true;
 		} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			if (tcon->ses->serverNOS)
 				cERROR(1, "server %s of type %s returned"
@@ -385,103 +404,39 @@ int cifs_open(struct inode *inode, struct file *file)
 			   or DFS errors */
 	}
 
-	desiredAccess = cifs_convert_flags(file->f_flags);
-
-/*********************************************************************
- *  open flag mapping table:
- *
- *	POSIX Flag            CIFS Disposition
- *	----------            ----------------
- *	O_CREAT               FILE_OPEN_IF
- *	O_CREAT | O_EXCL      FILE_CREATE
- *	O_CREAT | O_TRUNC     FILE_OVERWRITE_IF
- *	O_TRUNC               FILE_OVERWRITE
- *	none of the above     FILE_OPEN
- *
- *	Note that there is not a direct match between disposition
- *	FILE_SUPERSEDE (ie create whether or not file exists although
- *	O_CREAT | O_TRUNC is similar but truncates the existing
- *	file rather than creating a new file as FILE_SUPERSEDE does
- *	(which uses the attributes / metadata passed in on open call)
- *?
- *?  O_SYNC is a reasonable match to CIFS writethrough flag
- *?  and the read write flags match reasonably.  O_LARGEFILE
- *?  is irrelevant because largefile support is always used
- *?  by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
- *	 O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
- *********************************************************************/
-
-	disposition = cifs_get_disposition(file->f_flags);
-
-	/* BB pass O_SYNC flag through on file attributes .. BB */
-
-	/* Also refresh inode by passing in file_info buf returned by SMBOpen
-	   and calling get_inode_info with returned buf (at least helps
-	   non-Unix server case) */
-
-	/* BB we can not do this if this is the second open of a file
-	   and the first handle has writebehind data, we might be
-	   able to simply do a filemap_fdatawrite/filemap_fdatawait first */
-	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-	if (!buf) {
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	if (tcon->ses->capabilities & CAP_NT_SMBS)
-		rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
-			 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
-			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
-				 & CIFS_MOUNT_MAP_SPECIAL_CHR);
-	else
-		rc = -EIO; /* no NT SMB support fall into legacy open below */
-
-	if (rc == -EIO) {
-		/* Old server, try legacy style OpenX */
-		rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
-			desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
-			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
-				& CIFS_MOUNT_MAP_SPECIAL_CHR);
-	}
-	if (rc) {
-		cFYI(1, "cifs_open returned 0x%x", rc);
-		goto out;
+	if (!posix_open_ok) {
+		rc = cifs_nt_open(full_path, inode, cifs_sb, tcon,
+				  file->f_flags, &oplock, &netfid, xid);
+		if (rc)
+			goto out;
 	}
 
-	rc = cifs_open_inode_helper(inode, tcon, oplock, buf, full_path, xid);
-	if (rc != 0)
-		goto out;
-
 	pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock);
 	if (pCifsFile == NULL) {
+		CIFSSMBClose(xid, tcon, netfid);
 		rc = -ENOMEM;
 		goto out;
 	}
 
 	cifs_fscache_set_inode_cookie(inode, file);
 
-	if (oplock & CIFS_CREATE_ACTION) {
+	if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) {
 		/* time to set mode which we can not set earlier due to
 		   problems creating new read-only files */
-		if (tcon->unix_ext) {
-			struct cifs_unix_set_info_args args = {
-				.mode	= inode->i_mode,
-				.uid	= NO_CHANGE_64,
-				.gid	= NO_CHANGE_64,
-				.ctime	= NO_CHANGE_64,
-				.atime	= NO_CHANGE_64,
-				.mtime	= NO_CHANGE_64,
-				.device	= 0,
-			};
-			CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
-					       cifs_sb->local_nls,
-					       cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-		}
+		struct cifs_unix_set_info_args args = {
+			.mode	= inode->i_mode,
+			.uid	= NO_CHANGE_64,
+			.gid	= NO_CHANGE_64,
+			.ctime	= NO_CHANGE_64,
+			.atime	= NO_CHANGE_64,
+			.mtime	= NO_CHANGE_64,
+			.device	= 0,
+		};
+		CIFSSMBUnixSetFileInfo(xid, tcon, &args, netfid,
+					pCifsFile->pid);
 	}
 
 out:
-	kfree(buf);
 	kfree(full_path);
 	FreeXid(xid);
 	cifs_put_tlink(tlink);
@@ -779,12 +734,12 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 
 		/* BB we could chain these into one lock request BB */
 		rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start,
-				 0, 1, lockType, 0 /* wait flag */ );
+				 0, 1, lockType, 0 /* wait flag */, 0);
 		if (rc == 0) {
 			rc = CIFSSMBLock(xid, tcon, netfid, length,
 					 pfLock->fl_start, 1 /* numUnlock */ ,
 					 0 /* numLock */ , lockType,
-					 0 /* wait flag */ );
+					 0 /* wait flag */, 0);
 			pfLock->fl_type = F_UNLCK;
 			if (rc != 0)
 				cERROR(1, "Error unlocking previously locked "
@@ -801,13 +756,13 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 			rc = CIFSSMBLock(xid, tcon, netfid, length,
 					 pfLock->fl_start, 0, 1,
 					 lockType | LOCKING_ANDX_SHARED_LOCK,
-					 0 /* wait flag */);
+					 0 /* wait flag */, 0);
 			if (rc == 0) {
 				rc = CIFSSMBLock(xid, tcon, netfid,
 						 length, pfLock->fl_start, 1, 0,
 						 lockType |
 						 LOCKING_ANDX_SHARED_LOCK,
-						 0 /* wait flag */);
+						 0 /* wait flag */, 0);
 				pfLock->fl_type = F_RDLCK;
 				if (rc != 0)
 					cERROR(1, "Error unlocking "
@@ -850,8 +805,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 
 	if (numLock) {
 		rc = CIFSSMBLock(xid, tcon, netfid, length,
-				 pfLock->fl_start,
-				 0, numLock, lockType, wait_flag);
+				 pfLock->fl_start, 0, numLock, lockType,
+				 wait_flag, 0);
 
 		if (rc == 0) {
 			/* For Windows locks we must store them. */
@@ -871,9 +826,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 				    (pfLock->fl_start + length) >=
 				    (li->offset + li->length)) {
 					stored_rc = CIFSSMBLock(xid, tcon,
-							netfid,
-							li->length, li->offset,
-							1, 0, li->type, false);
+							netfid, li->length,
+							li->offset, 1, 0,
+							li->type, false, 0);
 					if (stored_rc)
 						rc = stored_rc;
 					else {
@@ -892,29 +847,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 	return rc;
 }
 
-/*
- * Set the timeout on write requests past EOF. For some servers (Windows)
- * these calls can be very long.
- *
- * If we're writing >10M past the EOF we give a 180s timeout. Anything less
- * than that gets a 45s timeout. Writes not past EOF get 15s timeouts.
- * The 10M cutoff is totally arbitrary. A better scheme for this would be
- * welcome if someone wants to suggest one.
- *
- * We may be able to do a better job with this if there were some way to
- * declare that a file should be sparse.
- */
-static int
-cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset)
-{
-	if (offset <= cifsi->server_eof)
-		return CIFS_STD_OP;
-	else if (offset > (cifsi->server_eof + (10 * 1024 * 1024)))
-		return CIFS_VLONG_OP;
-	else
-		return CIFS_LONG_OP;
-}
-
 /* update the file size (if needed) after a write */
 static void
 cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
@@ -935,7 +867,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 	unsigned int total_written;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
-	int xid, long_op;
+	int xid;
 	struct cifsFileInfo *open_file;
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
 
@@ -956,7 +888,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 
 	xid = GetXid();
 
-	long_op = cifs_write_timeout(cifsi, *poffset);
 	for (total_written = 0; write_size > total_written;
 	     total_written += bytes_written) {
 		rc = -EAGAIN;
@@ -984,7 +915,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 					min_t(const int, cifs_sb->wsize,
 					      write_size - total_written),
 					*poffset, &bytes_written,
-					NULL, write_data + total_written, long_op);
+					NULL, write_data + total_written, 0);
 		}
 		if (rc || (bytes_written == 0)) {
 			if (total_written)
@@ -997,8 +928,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 			cifs_update_eof(cifsi, *poffset, bytes_written);
 			*poffset += bytes_written;
 		}
-		long_op = CIFS_STD_OP; /* subsequent writes fast -
-					  15 seconds is plenty */
 	}
 
 	cifs_stats_bytes_written(pTcon, total_written);
@@ -1027,7 +956,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 	unsigned int total_written;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
-	int xid, long_op;
+	int xid;
 	struct dentry *dentry = open_file->dentry;
 	struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
 
@@ -1040,7 +969,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 
 	xid = GetXid();
 
-	long_op = cifs_write_timeout(cifsi, *poffset);
 	for (total_written = 0; write_size > total_written;
 	     total_written += bytes_written) {
 		rc = -EAGAIN;
@@ -1070,7 +998,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 				rc = CIFSSMBWrite2(xid, pTcon,
 						open_file->netfid, len,
 						*poffset, &bytes_written,
-						iov, 1, long_op);
+						iov, 1, 0);
 			} else
 				rc = CIFSSMBWrite(xid, pTcon,
 					 open_file->netfid,
@@ -1078,7 +1006,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 					 write_size - total_written),
 					 *poffset, &bytes_written,
 					 write_data + total_written,
-					 NULL, long_op);
+					 NULL, 0);
 		}
 		if (rc || (bytes_written == 0)) {
 			if (total_written)
@@ -1091,8 +1019,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
 			cifs_update_eof(cifsi, *poffset, bytes_written);
 			*poffset += bytes_written;
 		}
-		long_op = CIFS_STD_OP; /* subsequent writes fast -
-					  15 seconds is plenty */
 	}
 
 	cifs_stats_bytes_written(pTcon, total_written);
@@ -1292,7 +1218,7 @@ static int cifs_writepages(struct address_space *mapping,
 	struct pagevec pvec;
 	int rc = 0;
 	int scanned = 0;
-	int xid, long_op;
+	int xid;
 
 	cifs_sb = CIFS_SB(mapping->host->i_sb);
 
@@ -1430,43 +1356,67 @@ retry:
 				break;
 			}
 			if (n_iov) {
+retry_write:
 				open_file = find_writable_file(CIFS_I(mapping->host),
 							false);
 				if (!open_file) {
 					cERROR(1, "No writable handles for inode");
 					rc = -EBADF;
 				} else {
-					long_op = cifs_write_timeout(cifsi, offset);
 					rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
 						   bytes_to_write, offset,
 						   &bytes_written, iov, n_iov,
-						   long_op);
+						   0);
 					cifsFileInfo_put(open_file);
-					cifs_update_eof(cifsi, offset, bytes_written);
 				}
 
-				if (rc || bytes_written < bytes_to_write) {
-					cERROR(1, "Write2 ret %d, wrote %d",
-						  rc, bytes_written);
-					mapping_set_error(mapping, rc);
-				} else {
+				cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written);
+
+				/*
+				 * For now, treat a short write as if nothing got
+				 * written. A zero length write however indicates
+				 * ENOSPC or EFBIG. We have no way to know which
+				 * though, so call it ENOSPC for now. EFBIG would
+				 * get translated to AS_EIO anyway.
+				 *
+				 * FIXME: make it take into account the data that did
+				 * get written
+				 */
+				if (rc == 0) {
+					if (bytes_written == 0)
+						rc = -ENOSPC;
+					else if (bytes_written < bytes_to_write)
+						rc = -EAGAIN;
+				}
+
+				/* retry on data-integrity flush */
+				if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
+					goto retry_write;
+
+				/* fix the stats and EOF */
+				if (bytes_written > 0) {
 					cifs_stats_bytes_written(tcon, bytes_written);
+					cifs_update_eof(cifsi, offset, bytes_written);
 				}
 
 				for (i = 0; i < n_iov; i++) {
 					page = pvec.pages[first + i];
-					/* Should we also set page error on
-					success rc but too little data written? */
-					/* BB investigate retry logic on temporary
-					server crash cases and how recovery works
-					when page marked as error */
-					if (rc)
+					/* on retryable write error, redirty page */
+					if (rc == -EAGAIN)
+						redirty_page_for_writepage(wbc, page);
+					else if (rc != 0)
 						SetPageError(page);
 					kunmap(page);
 					unlock_page(page);
 					end_page_writeback(page);
 					page_cache_release(page);
 				}
+
+				if (rc != -EAGAIN)
+					mapping_set_error(mapping, rc);
+				else
+					rc = 0;
+
 				if ((wbc->nr_to_write -= n_iov) <= 0)
 					done = 1;
 				index = next;
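
The retry policy introduced above boils down to a small decision table; restated as a hypothetical helper (the function name is illustrative, not part of the patch):

	/* Illustrative only: how the new hunk classifies a send result. */
	static int classify_write_result(int rc, unsigned int written,
					 unsigned int requested)
	{
		if (rc)
			return rc;	/* transport/server error: propagate */
		if (written == 0)
			return -ENOSPC;	/* could be EFBIG; both become AS_EIO */
		if (written < requested)
			return -EAGAIN;	/* short write: redirty, retry on WB_SYNC_ALL */
		return 0;
	}
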
@@ -1578,27 +1528,47 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	return rc;
 }
 
-int cifs_fsync(struct file *file, int datasync)
+int cifs_strict_fsync(struct file *file, int datasync)
 {
 	int xid;
 	int rc = 0;
 	struct cifsTconInfo *tcon;
 	struct cifsFileInfo *smbfile = file->private_data;
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
 	xid = GetXid();
 
 	cFYI(1, "Sync file - name: %s datasync: 0x%x",
 		file->f_path.dentry->d_name.name, datasync);
 
-	rc = filemap_write_and_wait(inode->i_mapping);
-	if (rc == 0) {
-		struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	if (!CIFS_I(inode)->clientCanCacheRead)
+		cifs_invalidate_mapping(inode);
 
-		tcon = tlink_tcon(smbfile->tlink);
-		if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
-			rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
-	}
+	tcon = tlink_tcon(smbfile->tlink);
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+		rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
+
+	FreeXid(xid);
+	return rc;
+}
+
+int cifs_fsync(struct file *file, int datasync)
+{
+	int xid;
+	int rc = 0;
+	struct cifsTconInfo *tcon;
+	struct cifsFileInfo *smbfile = file->private_data;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+
+	xid = GetXid();
+
+	cFYI(1, "Sync file - name: %s datasync: 0x%x",
+		file->f_path.dentry->d_name.name, datasync);
+
+	tcon = tlink_tcon(smbfile->tlink);
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+		rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
 
 	FreeXid(xid);
 	return rc;
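
The strict variant only matters if strict-I/O mounts are routed to it. The wiring lives in fs/cifs/cifsfs.c (not shown in this diff) and is presumably along these lines, matching the cifs_file_strict_ops tables referenced in the fs/cifs/inode.c hunks below:

	const struct file_operations cifs_file_strict_ops = {
		/* ... other methods as in cifs_file_ops ... */
		.aio_read	= cifs_strict_readv,	/* added later in this file */
		.fsync		= cifs_strict_fsync,
		.mmap		= cifs_file_strict_mmap,
	};
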
@@ -1649,42 +1619,42 @@ int cifs_flush(struct file *file, fl_owner_t id)
 	return rc;
 }
 
-ssize_t cifs_user_read(struct file *file, char __user *read_data,
-		       size_t read_size, loff_t *poffset)
+static ssize_t
+cifs_iovec_read(struct file *file, const struct iovec *iov,
+		unsigned long nr_segs, loff_t *poffset)
 {
-	int rc = -EACCES;
-	unsigned int bytes_read = 0;
-	unsigned int total_read = 0;
-	unsigned int current_read_size;
+	int rc;
+	int xid;
+	unsigned int total_read, bytes_read = 0;
+	size_t len, cur_len;
+	int iov_offset = 0;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
-	int xid;
 	struct cifsFileInfo *open_file;
-	char *smb_read_data;
-	char __user *current_offset;
 	struct smb_com_read_rsp *pSMBr;
+	char *read_data;
+
+	if (!nr_segs)
+		return 0;
+
+	len = iov_length(iov, nr_segs);
+	if (!len)
+		return 0;
 
 	xid = GetXid();
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 
-	if (file->private_data == NULL) {
-		rc = -EBADF;
-		FreeXid(xid);
-		return rc;
-	}
 	open_file = file->private_data;
 	pTcon = tlink_tcon(open_file->tlink);
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
 
-	for (total_read = 0, current_offset = read_data;
-	     read_size > total_read;
-	     total_read += bytes_read, current_offset += bytes_read) {
-		current_read_size = min_t(const int, read_size - total_read,
-					  cifs_sb->rsize);
+	for (total_read = 0; total_read < len; total_read += bytes_read) {
+		cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
 		rc = -EAGAIN;
-		smb_read_data = NULL;
+		read_data = NULL;
+
 		while (rc == -EAGAIN) {
 			int buf_type = CIFS_NO_BUFFER;
 			if (open_file->invalidHandle) {
@@ -1692,27 +1662,25 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 				if (rc != 0)
 					break;
 			}
-			rc = CIFSSMBRead(xid, pTcon,
-					 open_file->netfid,
-					 current_read_size, *poffset,
-					 &bytes_read, &smb_read_data,
-					 &buf_type);
-			pSMBr = (struct smb_com_read_rsp *)smb_read_data;
-			if (smb_read_data) {
-				if (copy_to_user(current_offset,
-						 smb_read_data +
-						 4 /* RFC1001 length field */ +
-						 le16_to_cpu(pSMBr->DataOffset),
-						 bytes_read))
+			rc = CIFSSMBRead(xid, pTcon, open_file->netfid,
+					 cur_len, *poffset, &bytes_read,
+					 &read_data, &buf_type);
+			pSMBr = (struct smb_com_read_rsp *)read_data;
+			if (read_data) {
+				char *data_offset = read_data + 4 +
+						le16_to_cpu(pSMBr->DataOffset);
+				if (memcpy_toiovecend(iov, data_offset,
+						      iov_offset, bytes_read))
 					rc = -EFAULT;
-
-				if (buf_type == CIFS_SMALL_BUFFER)
-					cifs_small_buf_release(smb_read_data);
-				else if (buf_type == CIFS_LARGE_BUFFER)
-					cifs_buf_release(smb_read_data);
-				smb_read_data = NULL;
+				if (buf_type == CIFS_SMALL_BUFFER)
+					cifs_small_buf_release(read_data);
+				else if (buf_type == CIFS_LARGE_BUFFER)
+					cifs_buf_release(read_data);
+				read_data = NULL;
+				iov_offset += bytes_read;
 			}
 		}
+
 		if (rc || (bytes_read == 0)) {
 			if (total_read) {
 				break;
@@ -1725,13 +1693,57 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 			*poffset += bytes_read;
 		}
 	}
+
 	FreeXid(xid);
 	return total_read;
 }
 
+ssize_t cifs_user_read(struct file *file, char __user *read_data,
+		       size_t read_size, loff_t *poffset)
+{
+	struct iovec iov;
+	iov.iov_base = read_data;
+	iov.iov_len = read_size;
+
+	return cifs_iovec_read(file, &iov, 1, poffset);
+}
+
+static ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos)
+{
+	ssize_t read;
+
+	read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos);
+	if (read > 0)
+		iocb->ki_pos = pos;
+
+	return read;
+}
+
+ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
+			  unsigned long nr_segs, loff_t pos)
+{
+	struct inode *inode;
+
+	inode = iocb->ki_filp->f_path.dentry->d_inode;
+
+	if (CIFS_I(inode)->clientCanCacheRead)
+		return generic_file_aio_read(iocb, iov, nr_segs, pos);
+
+	/*
+	 * In strict cache mode we need to read from the server all the time
+	 * if we don't have level II oplock because the server can delay mtime
+	 * change - so we can't make a decision about inode invalidating.
+	 * And we can also fail with pagereading if there are mandatory locks
+	 * on pages affected by this read but not on the region from pos to
+	 * pos+len-1.
+	 */
+
+	return cifs_user_readv(iocb, iov, nr_segs, pos);
+}
 
 static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 			 loff_t *poffset)
 {
 	int rc = -EACCES;
 	unsigned int bytes_read = 0;
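
cifs_user_read() above shows the intended pattern: a plain user buffer is just a one-element iovec. The other building block is memcpy_toiovecend() (net/core/iovec.c in this kernel generation), which copies kernel data into a user iovec array at a running byte offset — roughly this usage, assuming kbuf holds klen bytes of response data:

	if (memcpy_toiovecend(iov, kbuf, iov_offset, klen))
		rc = -EFAULT;		/* a user page was not writable */
	else
		iov_offset += klen;	/* advance within the iovec array */
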
@@ -1799,6 +1811,21 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 	return total_read;
 }
 
+int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int rc, xid;
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	xid = GetXid();
+
+	if (!CIFS_I(inode)->clientCanCacheRead)
+		cifs_invalidate_mapping(inode);
+
+	rc = generic_file_mmap(file, vma);
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	int rc, xid;
@@ -2245,7 +2272,8 @@ void cifs_oplock_break(struct work_struct *work)
 	 */
 	if (!cfile->oplock_break_cancelled) {
 		rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
-				 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false);
+				 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false,
+				 cinode->clientCanCacheRead ? 1 : 0);
 		cFYI(1, "Oplock release rc = %d", rc);
 	}
 
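
Every CIFSSMBLock() call in this patch gains a trailing argument; in the oplock-release path it carries the oplock level being acknowledged (1 while the client still holds read caching, i.e. a level II oplock, 0 otherwise). The updated prototype is presumably of this shape (parameter name illustrative):

	int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
			const __u16 netfid, const __u64 len, const __u64 offset,
			const __u32 numUnlock, const __u32 numLock,
			const __u8 lockType, const bool waitFlag,
			const __u8 oplock_level);
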
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a853a89857a5..8852470b4fbb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -32,7 +32,7 @@
 #include "fscache.h"
 
 
-static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
+static void cifs_set_ops(struct inode *inode)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
@@ -44,13 +44,17 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 				inode->i_fop = &cifs_file_direct_nobrl_ops;
 			else
 				inode->i_fop = &cifs_file_direct_ops;
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+				inode->i_fop = &cifs_file_strict_nobrl_ops;
+			else
+				inode->i_fop = &cifs_file_strict_ops;
 		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 			inode->i_fop = &cifs_file_nobrl_ops;
 		else { /* not direct, send byte range locks */
 			inode->i_fop = &cifs_file_ops;
 		}
 
-
 		/* check if server can support readpages */
 		if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
 				PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
@@ -60,7 +64,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 		break;
 	case S_IFDIR:
 #ifdef CONFIG_CIFS_DFS_UPCALL
-		if (is_dfs_referral) {
+		if (IS_AUTOMOUNT(inode)) {
 			inode->i_op = &cifs_dfs_referral_inode_operations;
 		} else {
 #else /* NO DFS support, treat as a directory */
@@ -167,7 +171,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	}
 	spin_unlock(&inode->i_lock);
 
-	cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
+	if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
+		inode->i_flags |= S_AUTOMOUNT;
+	cifs_set_ops(inode);
 }
 
 void
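
IS_AUTOMOUNT() is the generic VFS test the S_IFDIR case now relies on; it simply checks the inode flag set in cifs_fattr_to_inode() above. For reference, the 2.6.38-era definition in include/linux/fs.h:

	#define IS_AUTOMOUNT(inode)	((inode)->i_flags & S_AUTOMOUNT)
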
@@ -518,6 +524,7 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 
 	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
 	fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+	fattr->cf_createtime = le64_to_cpu(info->CreationTime);
 
 	if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
 		fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
@@ -779,6 +786,10 @@ cifs_find_inode(struct inode *inode, void *opaque)
 	if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
 		return 0;
 
+	/* use createtime like an i_generation field */
+	if (CIFS_I(inode)->createtime != fattr->cf_createtime)
+		return 0;
+
 	/* don't match inode of different type */
 	if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
 		return 0;
@@ -796,6 +807,7 @@ cifs_init_inode(struct inode *inode, void *opaque)
 	struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
 
 	CIFS_I(inode)->uniqueid = fattr->cf_uniqueid;
+	CIFS_I(inode)->createtime = fattr->cf_createtime;
 	return 0;
 }
 
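
These two callbacks feed the inode-cache lookup, which is why createtime has to be recorded on both sides. Sketched usage (cifs_iget() in this file passes them to iget5_locked(), simplified here): a recycled server uniqueid with a different create time now misses in the cache instead of reviving a stale inode.

	inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
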
@@ -1318,10 +1330,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 /*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need
 	to set uid/gid */
 			inc_nlink(inode);
-			if (pTcon->nocase)
-				d_set_d_op(direntry, &cifs_ci_dentry_ops);
-			else
-				d_set_d_op(direntry, &cifs_dentry_ops);
 
 			cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
 			cifs_fill_uniqueid(inode->i_sb, &fattr);
@@ -1362,10 +1370,6 @@ mkdir_get_info:
 			rc = cifs_get_inode_info(&newinode, full_path, NULL,
 						 inode->i_sb, xid, NULL);
 
-		if (pTcon->nocase)
-			d_set_d_op(direntry, &cifs_ci_dentry_ops);
-		else
-			d_set_d_op(direntry, &cifs_dentry_ops);
 		d_instantiate(direntry, newinode);
 		 /* setting nlink not necessary except in cases where we
 		  * failed to get it from the server or was set bogus */
@@ -1679,7 +1683,7 @@ cifs_inode_needs_reval(struct inode *inode)
 /*
  * Zap the cache. Called when invalid_mapping flag is set.
  */
-static void
+void
 cifs_invalidate_mapping(struct inode *inode)
 {
 	int rc;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index fe2f6a93c49e..306769de2fb5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -524,10 +524,6 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 			cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
 			     rc);
 		} else {
-			if (pTcon->nocase)
-				d_set_d_op(direntry, &cifs_ci_dentry_ops);
-			else
-				d_set_d_op(direntry, &cifs_dentry_ops);
 			d_instantiate(direntry, newinode);
 		}
 	}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 43f10281bc19..a09e077ba925 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -571,7 +571,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 			pCifsInode = CIFS_I(netfile->dentry->d_inode);
 
 			cifs_set_oplock_level(pCifsInode,
-					      pSMB->OplockLevel);
+				pSMB->OplockLevel ? OPLOCK_READ : 0);
 			/*
 			 * cifs_oplock_break_put() can't be called
 			 * from here. Get reference after queueing
@@ -637,77 +637,6 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
 	return;
 }
 
-/* Convert 16 bit Unicode pathname to wire format from string in current code
-   page.  Conversion may involve remapping up the seven characters that are
-   only legal in POSIX-like OS (if they are present in the string). Path
-   names are little endian 16 bit Unicode on the wire */
-int
-cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
-		 const struct nls_table *cp, int mapChars)
-{
-	int i, j, charlen;
-	int len_remaining = maxlen;
-	char src_char;
-	__u16 temp;
-
-	if (!mapChars)
-		return cifs_strtoUCS(target, source, PATH_MAX, cp);
-
-	for (i = 0, j = 0; i < maxlen; j++) {
-		src_char = source[i];
-		switch (src_char) {
-		case 0:
-			target[j] = 0;
-			goto ctoUCS_out;
-		case ':':
-			target[j] = cpu_to_le16(UNI_COLON);
-			break;
-		case '*':
-			target[j] = cpu_to_le16(UNI_ASTERIK);
-			break;
-		case '?':
-			target[j] = cpu_to_le16(UNI_QUESTION);
-			break;
-		case '<':
-			target[j] = cpu_to_le16(UNI_LESSTHAN);
-			break;
-		case '>':
-			target[j] = cpu_to_le16(UNI_GRTRTHAN);
-			break;
-		case '|':
-			target[j] = cpu_to_le16(UNI_PIPE);
-			break;
-		/* BB We can not handle remapping slash until
-		   all the calls to build_path_from_dentry
-		   are modified, as they use slash as separator BB */
-		/* case '\\':
-			target[j] = cpu_to_le16(UNI_SLASH);
-			break;*/
-		default:
-			charlen = cp->char2uni(source+i,
-				len_remaining, &temp);
-			/* if no match, use question mark, which
-			   at least in some cases servers as wild card */
-			if (charlen < 1) {
-				target[j] = cpu_to_le16(0x003f);
-				charlen = 1;
-			} else
-				target[j] = cpu_to_le16(temp);
-			len_remaining -= charlen;
-			/* character may take more than one byte in the
-			   the source string, but will take exactly two
-			   bytes in the target string */
-			i += charlen;
-			continue;
-		}
-		i++; /* move to next char in source string */
-		len_remaining--;
-	}
-
-ctoUCS_out:
-	return i;
-}
-
 void
 cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
 {
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 9aad47a2d62f..8d9189f64477 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -899,8 +899,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
 	}
 	/* else ERRHRD class errors or junk  - return EIO */
 
-	cFYI(1, "Mapping smb error code %d to POSIX err %d",
-		 smberrcode, rc);
+	cFYI(1, "Mapping smb error code 0x%x to POSIX err %d",
+		 le32_to_cpu(smb->Status.CifsError), rc);
 
 	/* generic corrective action e.g. reconnect SMB session on
 	 * ERRbaduid could be added */
@@ -916,14 +916,14 @@ unsigned int
 smbCalcSize(struct smb_hdr *ptr)
 {
 	return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
-		2 /* size of the bcc field */ + BCC(ptr));
+		2 /* size of the bcc field */ + get_bcc(ptr));
 }
 
 unsigned int
 smbCalcSize_LE(struct smb_hdr *ptr)
 {
 	return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
-		2 /* size of the bcc field */ + le16_to_cpu(BCC_LE(ptr)));
+		2 /* size of the bcc field */ + get_bcc_le(ptr));
 }
 
 /* The following are taken from fs/ntfs/util.c */
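
get_bcc()/get_bcc_le()/put_bcc_le() replace the raw BCC()/BCC_LE() macro uses throughout this series (see also the fs/cifs/sess.c hunks below). They are assumed to be thin accessors for the byte-count field, along these lines (a sketch, not the verbatim helpers):

	static inline __u16 get_bcc_le(struct smb_hdr *hdr)
	{
		__le16 *bc_ptr = (__le16 *)BCC(hdr);	/* byte count field */
		return le16_to_cpu(*bc_ptr);
	}

	static inline void put_bcc_le(__u16 count, struct smb_hdr *hdr)
	{
		__le16 *bc_ptr = (__le16 *)BCC(hdr);
		*bc_ptr = cpu_to_le16(count);
	}
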
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index ec5b68e3b928..7f25cc3d2256 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -102,11 +102,6 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
 			return NULL;
 	}
 
-	if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase)
-		d_set_d_op(dentry, &cifs_ci_dentry_ops);
-	else
-		d_set_d_op(dentry, &cifs_dentry_ops);
-
 	alias = d_materialise_unique(dentry, inode);
 	if (alias != NULL) {
 		dput(dentry);
@@ -160,6 +155,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
 	fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes);
 	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
 	fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+	fattr->cf_createtime = le64_to_cpu(info->CreationTime);
 	fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
 	fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
 	fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7b01d3f6eed6..1adc9625a344 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -277,7 +277,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
 }
 
 static void
-decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
+decode_unicode_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses,
 		      const struct nls_table *nls_cp)
 {
 	int len;
@@ -323,7 +323,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 	return;
 }
 
-static int decode_ascii_ssetup(char **pbcc_area, int bleft,
+static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
 			       struct cifsSesInfo *ses,
 			       const struct nls_table *nls_cp)
 {
@@ -420,7 +420,6 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 	return 0;
 }
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 /* BB Move to ntlmssp.c eventually */
 
 /* We do not malloc the blob, it is passed in pbuffer, because
@@ -431,13 +430,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
 	NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
 	__u32 flags;
 
+	memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE));
 	memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
 	sec_blob->MessageType = NtLmNegotiate;
 
 	/* BB is NTLMV2 session security format easier to use here? */
 	flags = NTLMSSP_NEGOTIATE_56 |	NTLMSSP_REQUEST_TARGET |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-		NTLMSSP_NEGOTIATE_NTLM;
+		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
 	if (ses->server->secMode &
 	    (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -446,7 +446,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
 			NTLMSSP_NEGOTIATE_EXTENDED_SEC;
 	}
 
-	sec_blob->NegotiateFlags |= cpu_to_le32(flags);
+	sec_blob->NegotiateFlags = cpu_to_le32(flags);
 
 	sec_blob->WorkstationName.BufferOffset = 0;
 	sec_blob->WorkstationName.Length = 0;
@@ -477,7 +477,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	flags = NTLMSSP_NEGOTIATE_56 |
 		NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-		NTLMSSP_NEGOTIATE_NTLM;
+		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
 	if (ses->server->secMode &
 	   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -485,7 +485,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
 
 	tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
-	sec_blob->NegotiateFlags |= cpu_to_le32(flags);
+	sec_blob->NegotiateFlags = cpu_to_le32(flags);
 
 	sec_blob->LmChallengeResponse.BufferOffset =
 				cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE));
@@ -544,8 +544,9 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	sec_blob->WorkstationName.MaximumLength = 0;
 	tmp += 2;
 
-	if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
-			!calc_seckey(ses)) {
+	if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) ||
+		(ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC))
+			&& !calc_seckey(ses)) {
 		memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
 		sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
@@ -563,17 +564,6 @@ setup_ntlmv2_ret:
 	return rc;
 }
 
-
-static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
-				  struct cifsSesInfo *ses)
-{
-	build_ntlmssp_negotiate_blob(&pSMB->req.SecurityBlob[0], ses);
-	pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
-
-	return;
-}
-#endif
-
 int
 CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	       const struct nls_table *nls_cp)
@@ -585,12 +575,11 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	char *str_area;
 	SESSION_SETUP_ANDX *pSMB;
 	__u32 capabilities;
-	int count;
+	__u16 count;
 	int resp_buf_type;
 	struct kvec iov[3];
 	enum securityEnum type;
-	__u16 action;
-	int bytes_remaining;
+	__u16 action, bytes_remaining;
 	struct key *spnego_key = NULL;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
 	u16 blob_len;
@@ -814,71 +803,70 @@ ssetup_ntlmssp_authenticate:
 		rc = -ENOSYS;
 		goto ssetup_exit;
 #endif /* CONFIG_CIFS_UPCALL */
-	} else {
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-		if (type == RawNTLMSSP) {
-			if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
-				cERROR(1, "NTLMSSP requires Unicode support");
-				rc = -ENOSYS;
+	} else if (type == RawNTLMSSP) {
+		if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
+			cERROR(1, "NTLMSSP requires Unicode support");
+			rc = -ENOSYS;
+			goto ssetup_exit;
+		}
+
+		cFYI(1, "ntlmssp session setup phase %d", phase);
+		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
+		capabilities |= CAP_EXTENDED_SECURITY;
+		pSMB->req.Capabilities |= cpu_to_le32(capabilities);
+		switch(phase) {
+		case NtLmNegotiate:
+			build_ntlmssp_negotiate_blob(
+				pSMB->req.SecurityBlob, ses);
+			iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
+			iov[1].iov_base = pSMB->req.SecurityBlob;
+			pSMB->req.SecurityBlobLength =
+				cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
+			break;
+		case NtLmAuthenticate:
+			/*
+			 * 5 is an empirical value, large enough to hold
+			 * authenticate message plus max 10 of av pairs,
+			 * domain, user, workstation names, flags, etc.
+			 */
+			ntlmsspblob = kzalloc(
+				5*sizeof(struct _AUTHENTICATE_MESSAGE),
+				GFP_KERNEL);
+			if (!ntlmsspblob) {
+				cERROR(1, "Can't allocate NTLMSSP blob");
+				rc = -ENOMEM;
 				goto ssetup_exit;
 			}
 
-			cFYI(1, "ntlmssp session setup phase %d", phase);
-			pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
-			capabilities |= CAP_EXTENDED_SECURITY;
-			pSMB->req.Capabilities |= cpu_to_le32(capabilities);
-			if (phase == NtLmNegotiate) {
-				setup_ntlmssp_neg_req(pSMB, ses);
-				iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
-				iov[1].iov_base = &pSMB->req.SecurityBlob[0];
-			} else if (phase == NtLmAuthenticate) {
-				/* 5 is an empirical value, large enought to
-				 * hold authenticate message, max 10 of
-				 * av paris, doamin,user,workstation mames,
-				 * flags etc..
-				 */
-				ntlmsspblob = kmalloc(
-					5*sizeof(struct _AUTHENTICATE_MESSAGE),
-					GFP_KERNEL);
-				if (!ntlmsspblob) {
-					cERROR(1, "Can't allocate NTLMSSP");
-					rc = -ENOMEM;
-					goto ssetup_exit;
-				}
-
-				rc = build_ntlmssp_auth_blob(ntlmsspblob,
-							&blob_len, ses, nls_cp);
-				if (rc)
-					goto ssetup_exit;
-				iov[1].iov_len = blob_len;
-				iov[1].iov_base = ntlmsspblob;
-				pSMB->req.SecurityBlobLength =
-						cpu_to_le16(blob_len);
-				/* Make sure that we tell the server that we
-				   are using the uid that it just gave us back
-				   on the response (challenge) */
-				smb_buf->Uid = ses->Suid;
-			} else {
-				cERROR(1, "invalid phase %d", phase);
-				rc = -ENOSYS;
+			rc = build_ntlmssp_auth_blob(ntlmsspblob,
+						&blob_len, ses, nls_cp);
+			if (rc)
 				goto ssetup_exit;
-			}
-			/* unicode strings must be word aligned */
-			if ((iov[0].iov_len + iov[1].iov_len) % 2) {
-				*bcc_ptr = 0;
-				bcc_ptr++;
-			}
-			unicode_oslm_strings(&bcc_ptr, nls_cp);
-		} else {
-			cERROR(1, "secType %d not supported!", type);
+			iov[1].iov_len = blob_len;
+			iov[1].iov_base = ntlmsspblob;
+			pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
+			/*
+			 * Make sure that we tell the server that we are using
+			 * the uid that it just gave us back on the response
+			 * (challenge)
+			 */
+			smb_buf->Uid = ses->Suid;
+			break;
+		default:
+			cERROR(1, "invalid phase %d", phase);
 			rc = -ENOSYS;
 			goto ssetup_exit;
-#else
+		}
+		/* unicode strings must be word aligned */
+		if ((iov[0].iov_len + iov[1].iov_len) % 2) {
+			*bcc_ptr = 0;
+			bcc_ptr++;
+		}
+		unicode_oslm_strings(&bcc_ptr, nls_cp);
+	} else {
 		cERROR(1, "secType %d not supported!", type);
 		rc = -ENOSYS;
 		goto ssetup_exit;
-#endif
 	}
 
 	iov[2].iov_base = str_area;
@@ -887,10 +875,10 @@ ssetup_ntlmssp_authenticate:
887 count = iov[1].iov_len + iov[2].iov_len; 875 count = iov[1].iov_len + iov[2].iov_len;
888 smb_buf->smb_buf_length += count; 876 smb_buf->smb_buf_length += count;
889 877
890 BCC_LE(smb_buf) = cpu_to_le16(count); 878 put_bcc_le(count, smb_buf);
891 879
892 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, 880 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
893 CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR); 881 CIFS_LOG_ERROR);
894 /* SMB request buf freed in SendReceive2 */ 882 /* SMB request buf freed in SendReceive2 */
895 883
896 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; 884 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
@@ -921,7 +909,7 @@ ssetup_ntlmssp_authenticate:
921 cFYI(1, "UID = %d ", ses->Suid); 909 cFYI(1, "UID = %d ", ses->Suid);
922 /* response can have either 3 or 4 word count - Samba sends 3 */ 910 /* response can have either 3 or 4 word count - Samba sends 3 */
923 /* and lanman response is 3 */ 911 /* and lanman response is 3 */
924 bytes_remaining = BCC(smb_buf); 912 bytes_remaining = get_bcc(smb_buf);
925 bcc_ptr = pByteArea(smb_buf); 913 bcc_ptr = pByteArea(smb_buf);
926 914
927 if (smb_buf->WordCount == 4) { 915 if (smb_buf->WordCount == 4) {
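
The sess.c hunks above replace the open-coded BCC()/BCC_LE() macros with get_bcc()/put_bcc_le() helper calls. As a minimal, non-authoritative sketch of what such helpers look like, assuming the byte count sits immediately after the parameter words (bcc_ptr_of() is a hypothetical name; the real definitions live in fs/cifs/cifspdu.h):

static inline __le16 *bcc_ptr_of(struct smb_hdr *hdr)
{
	/* BCC immediately follows the 2 * WordCount parameter words */
	return (__le16 *)((char *)hdr + sizeof(struct smb_hdr) +
			  (2 * hdr->WordCount));
}

static inline __u16 get_bcc(struct smb_hdr *hdr)
{
	return le16_to_cpu(*bcc_ptr_of(hdr));	/* CPU byte order */
}

static inline void put_bcc_le(__u16 count, struct smb_hdr *hdr)
{
	*bcc_ptr_of(hdr) = cpu_to_le16(count);	/* wire (LE) byte order */
}
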
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index e0588cdf4cc5..c1ccca1a933f 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -36,7 +36,13 @@
36 36
37extern mempool_t *cifs_mid_poolp; 37extern mempool_t *cifs_mid_poolp;
38 38
39static struct mid_q_entry * 39static void
40wake_up_task(struct mid_q_entry *mid)
41{
42 wake_up_process(mid->callback_data);
43}
44
45struct mid_q_entry *
40AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) 46AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
41{ 47{
42 struct mid_q_entry *temp; 48 struct mid_q_entry *temp;
@@ -58,28 +64,28 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
58 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ 64 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
59 /* when mid allocated can be before when sent */ 65 /* when mid allocated can be before when sent */
60 temp->when_alloc = jiffies; 66 temp->when_alloc = jiffies;
61 temp->tsk = current; 67
68 /*
69 * The default is for the mid to be synchronous, so the
70 * default callback just wakes up the current task.
71 */
72 temp->callback = wake_up_task;
73 temp->callback_data = current;
62 } 74 }
63 75
64 spin_lock(&GlobalMid_Lock);
65 list_add_tail(&temp->qhead, &server->pending_mid_q);
66 atomic_inc(&midCount); 76 atomic_inc(&midCount);
67 temp->midState = MID_REQUEST_ALLOCATED; 77 temp->midState = MID_REQUEST_ALLOCATED;
68 spin_unlock(&GlobalMid_Lock);
69 return temp; 78 return temp;
70} 79}
71 80
72static void 81void
73DeleteMidQEntry(struct mid_q_entry *midEntry) 82DeleteMidQEntry(struct mid_q_entry *midEntry)
74{ 83{
75#ifdef CONFIG_CIFS_STATS2 84#ifdef CONFIG_CIFS_STATS2
76 unsigned long now; 85 unsigned long now;
77#endif 86#endif
78 spin_lock(&GlobalMid_Lock);
79 midEntry->midState = MID_FREE; 87 midEntry->midState = MID_FREE;
80 list_del(&midEntry->qhead);
81 atomic_dec(&midCount); 88 atomic_dec(&midCount);
82 spin_unlock(&GlobalMid_Lock);
83 if (midEntry->largeBuf) 89 if (midEntry->largeBuf)
84 cifs_buf_release(midEntry->resp_buf); 90 cifs_buf_release(midEntry->resp_buf);
85 else 91 else
@@ -103,6 +109,16 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
103 mempool_free(midEntry, cifs_mid_poolp); 109 mempool_free(midEntry, cifs_mid_poolp);
104} 110}
105 111
112static void
113delete_mid(struct mid_q_entry *mid)
114{
115 spin_lock(&GlobalMid_Lock);
116 list_del(&mid->qhead);
117 spin_unlock(&GlobalMid_Lock);
118
119 DeleteMidQEntry(mid);
120}
121
106static int 122static int
107smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) 123smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
108{ 124{
@@ -119,7 +135,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
119 if (ssocket == NULL) 135 if (ssocket == NULL)
120 return -ENOTSOCK; /* BB eventually add reconnect code here */ 136 return -ENOTSOCK; /* BB eventually add reconnect code here */
121 137
122 smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr; 138 smb_msg.msg_name = (struct sockaddr *) &server->dstaddr;
123 smb_msg.msg_namelen = sizeof(struct sockaddr); 139 smb_msg.msg_namelen = sizeof(struct sockaddr);
124 smb_msg.msg_control = NULL; 140 smb_msg.msg_control = NULL;
125 smb_msg.msg_controllen = 0; 141 smb_msg.msg_controllen = 0;
@@ -244,31 +260,31 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
244 return smb_sendv(server, &iov, 1); 260 return smb_sendv(server, &iov, 1);
245} 261}
246 262
247static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op) 263static int wait_for_free_request(struct TCP_Server_Info *server,
264 const int long_op)
248{ 265{
249 if (long_op == CIFS_ASYNC_OP) { 266 if (long_op == CIFS_ASYNC_OP) {
250 /* oplock breaks must not be held up */ 267 /* oplock breaks must not be held up */
251 atomic_inc(&ses->server->inFlight); 268 atomic_inc(&server->inFlight);
252 return 0; 269 return 0;
253 } 270 }
254 271
255 spin_lock(&GlobalMid_Lock); 272 spin_lock(&GlobalMid_Lock);
256 while (1) { 273 while (1) {
257 if (atomic_read(&ses->server->inFlight) >= 274 if (atomic_read(&server->inFlight) >= cifs_max_pending) {
258 cifs_max_pending){
259 spin_unlock(&GlobalMid_Lock); 275 spin_unlock(&GlobalMid_Lock);
260#ifdef CONFIG_CIFS_STATS2 276#ifdef CONFIG_CIFS_STATS2
261 atomic_inc(&ses->server->num_waiters); 277 atomic_inc(&server->num_waiters);
262#endif 278#endif
263 wait_event(ses->server->request_q, 279 wait_event(server->request_q,
264 atomic_read(&ses->server->inFlight) 280 atomic_read(&server->inFlight)
265 < cifs_max_pending); 281 < cifs_max_pending);
266#ifdef CONFIG_CIFS_STATS2 282#ifdef CONFIG_CIFS_STATS2
267 atomic_dec(&ses->server->num_waiters); 283 atomic_dec(&server->num_waiters);
268#endif 284#endif
269 spin_lock(&GlobalMid_Lock); 285 spin_lock(&GlobalMid_Lock);
270 } else { 286 } else {
271 if (ses->server->tcpStatus == CifsExiting) { 287 if (server->tcpStatus == CifsExiting) {
272 spin_unlock(&GlobalMid_Lock); 288 spin_unlock(&GlobalMid_Lock);
273 return -ENOENT; 289 return -ENOENT;
274 } 290 }
@@ -278,7 +294,7 @@ static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
278 294
279 /* update # of requests on the wire to server */ 295 /* update # of requests on the wire to server */
280 if (long_op != CIFS_BLOCKING_OP) 296 if (long_op != CIFS_BLOCKING_OP)
281 atomic_inc(&ses->server->inFlight); 297 atomic_inc(&server->inFlight);
282 spin_unlock(&GlobalMid_Lock); 298 spin_unlock(&GlobalMid_Lock);
283 break; 299 break;
284 } 300 }
@@ -308,53 +324,81 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
308 *ppmidQ = AllocMidQEntry(in_buf, ses->server); 324 *ppmidQ = AllocMidQEntry(in_buf, ses->server);
309 if (*ppmidQ == NULL) 325 if (*ppmidQ == NULL)
310 return -ENOMEM; 326 return -ENOMEM;
327 spin_lock(&GlobalMid_Lock);
328 list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q);
329 spin_unlock(&GlobalMid_Lock);
311 return 0; 330 return 0;
312} 331}
313 332
314static int wait_for_response(struct cifsSesInfo *ses, 333static int
315 struct mid_q_entry *midQ, 334wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
316 unsigned long timeout,
317 unsigned long time_to_wait)
318{ 335{
319 unsigned long curr_timeout; 336 int error;
320 337
321 for (;;) { 338 error = wait_event_killable(server->response_q,
322 curr_timeout = timeout + jiffies; 339 midQ->midState != MID_REQUEST_SUBMITTED);
323 wait_event_timeout(ses->server->response_q, 340 if (error < 0)
324 midQ->midState != MID_REQUEST_SUBMITTED, timeout); 341 return -ERESTARTSYS;
325 342
326 if (time_after(jiffies, curr_timeout) && 343 return 0;
327 (midQ->midState == MID_REQUEST_SUBMITTED) && 344}
328 ((ses->server->tcpStatus == CifsGood) ||
329 (ses->server->tcpStatus == CifsNew))) {
330 345
331 unsigned long lrt;
332 346
333 /* We timed out. Is the server still 347/*
334 sending replies ? */ 348 * Send an SMB request and set the callback function in the mid to handle
335 spin_lock(&GlobalMid_Lock); 349 * the result. Caller is responsible for dealing with timeouts.
336 lrt = ses->server->lstrp; 350 */
337 spin_unlock(&GlobalMid_Lock); 351int
352cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
353 mid_callback_t *callback, void *cbdata)
354{
355 int rc;
356 struct mid_q_entry *mid;
338 357
339 /* Calculate time_to_wait past last receive time. 358 rc = wait_for_free_request(server, CIFS_ASYNC_OP);
340 Although we prefer not to time out if the 359 if (rc)
341 server is still responding - we will time 360 return rc;
342 out if the server takes more than 15 (or 45 361
343 or 180) seconds to respond to this request 362 mutex_lock(&server->srv_mutex);
344 and has not responded to any request from 363 mid = AllocMidQEntry(in_buf, server);
345 other threads on the client within 10 seconds */ 364 if (mid == NULL) {
346 lrt += time_to_wait; 365 mutex_unlock(&server->srv_mutex);
347 if (time_after(jiffies, lrt)) { 366 return -ENOMEM;
348 /* No replies for time_to_wait. */
349 cERROR(1, "server not responding");
350 return -1;
351 }
352 } else {
353 return 0;
354 }
355 } 367 }
356}
357 368
369 /* put it on the pending_mid_q */
370 spin_lock(&GlobalMid_Lock);
371 list_add_tail(&mid->qhead, &server->pending_mid_q);
372 spin_unlock(&GlobalMid_Lock);
373
374 rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
375 if (rc) {
376 mutex_unlock(&server->srv_mutex);
377 goto out_err;
378 }
379
380 mid->callback = callback;
381 mid->callback_data = cbdata;
382 mid->midState = MID_REQUEST_SUBMITTED;
383#ifdef CONFIG_CIFS_STATS2
384 atomic_inc(&server->inSend);
385#endif
386 rc = smb_send(server, in_buf, in_buf->smb_buf_length);
387#ifdef CONFIG_CIFS_STATS2
388 atomic_dec(&server->inSend);
389 mid->when_sent = jiffies;
390#endif
391 mutex_unlock(&server->srv_mutex);
392 if (rc)
393 goto out_err;
394
395 return rc;
396out_err:
397 delete_mid(mid);
398 atomic_dec(&server->inFlight);
399 wake_up(&server->request_q);
400 return rc;
401}
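
For context, a hypothetical caller of the new async path (the my_* names are illustrative, not part of this patch). The callback fires from the demultiplex thread once a response is matched to the mid; this sketch assumes that thread has already unlinked the mid from pending_mid_q, so the callback only frees it and releases its inFlight slot, mirroring the error path of cifs_call_async() above:

static void my_probe_callback(struct mid_q_entry *mid)
{
	struct TCP_Server_Info *server = mid->callback_data;

	/* inspect mid->midState and mid->resp_buf as needed ... */
	DeleteMidQEntry(mid);
	atomic_dec(&server->inFlight);
	wake_up(&server->request_q);
}

static int my_send_probe(struct TCP_Server_Info *server, struct smb_hdr *buf)
{
	/* fire and forget; the reply is handled entirely in the callback */
	return cifs_call_async(server, buf, my_probe_callback, server);
}
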
358 402
359/* 403/*
360 * 404 *
@@ -382,6 +426,81 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
382 return rc; 426 return rc;
383} 427}
384 428
429static int
430sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
431{
432 int rc = 0;
433
434 cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command,
435 mid->mid, mid->midState);
436
437 spin_lock(&GlobalMid_Lock);
438 /* ensure that it's no longer on the pending_mid_q */
439 list_del_init(&mid->qhead);
440
441 switch (mid->midState) {
442 case MID_RESPONSE_RECEIVED:
443 spin_unlock(&GlobalMid_Lock);
444 return rc;
445 case MID_REQUEST_SUBMITTED:
446 /* socket is going down, reject all calls */
447 if (server->tcpStatus == CifsExiting) {
448 cERROR(1, "%s: canceling mid=%d cmd=0x%x state=%d",
449 __func__, mid->mid, mid->command, mid->midState);
450 rc = -EHOSTDOWN;
451 break;
452 }
453 case MID_RETRY_NEEDED:
454 rc = -EAGAIN;
455 break;
456 default:
457 cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
458 mid->mid, mid->midState);
459 rc = -EIO;
460 }
461 spin_unlock(&GlobalMid_Lock);
462
463 DeleteMidQEntry(mid);
464 return rc;
465}
466
467/*
468 * An NT cancel request header looks just like the original request except:
469 *
470 * The Command is SMB_COM_NT_CANCEL
471 * The WordCount is zeroed out
472 * The ByteCount is zeroed out
473 *
474 * This function mangles an existing request buffer into a
475 * SMB_COM_NT_CANCEL request and then sends it.
476 */
477static int
478send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
479 struct mid_q_entry *mid)
480{
481 int rc = 0;
482
483 /* -4 for RFC1001 length and +2 for BCC field */
484 in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4 + 2;
485 in_buf->Command = SMB_COM_NT_CANCEL;
486 in_buf->WordCount = 0;
487 put_bcc_le(0, in_buf);
488
489 mutex_lock(&server->srv_mutex);
490 rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
491 if (rc) {
492 mutex_unlock(&server->srv_mutex);
493 return rc;
494 }
495 rc = smb_send(server, in_buf, in_buf->smb_buf_length);
496 mutex_unlock(&server->srv_mutex);
497
498 cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
499 in_buf->Mid, rc);
500
501 return rc;
502}
503
385int 504int
386SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, 505SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
387 struct kvec *iov, int n_vec, int *pRespBufType /* ret */, 506 struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
@@ -390,7 +509,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
390 int rc = 0; 509 int rc = 0;
391 int long_op; 510 int long_op;
392 unsigned int receive_len; 511 unsigned int receive_len;
393 unsigned long timeout;
394 struct mid_q_entry *midQ; 512 struct mid_q_entry *midQ;
395 struct smb_hdr *in_buf = iov[0].iov_base; 513 struct smb_hdr *in_buf = iov[0].iov_base;
396 514
@@ -413,7 +531,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
413 to the same server. We may make this configurable later or 531 to the same server. We may make this configurable later or
414 use ses->maxReq */ 532 use ses->maxReq */
415 533
416 rc = wait_for_free_request(ses, long_op); 534 rc = wait_for_free_request(ses->server, long_op);
417 if (rc) { 535 if (rc) {
418 cifs_small_buf_release(in_buf); 536 cifs_small_buf_release(in_buf);
419 return rc; 537 return rc;
@@ -457,65 +575,20 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
457 if (rc < 0) 575 if (rc < 0)
458 goto out; 576 goto out;
459 577
460 if (long_op == CIFS_STD_OP) 578 if (long_op == CIFS_ASYNC_OP)
461 timeout = 15 * HZ;
462 else if (long_op == CIFS_VLONG_OP) /* e.g. slow writes past EOF */
463 timeout = 180 * HZ;
464 else if (long_op == CIFS_LONG_OP)
465 timeout = 45 * HZ; /* should be greater than
466 servers oplock break timeout (about 43 seconds) */
467 else if (long_op == CIFS_ASYNC_OP)
468 goto out; 579 goto out;
469 else if (long_op == CIFS_BLOCKING_OP)
470 timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */
471 else {
472 cERROR(1, "unknown timeout flag %d", long_op);
473 rc = -EIO;
474 goto out;
475 }
476
477 /* wait for 15 seconds or until woken up due to response arriving or
478 due to last connection to this server being unmounted */
479 if (signal_pending(current)) {
480 /* if signal pending do not hold up user for full smb timeout
481 but we still give response a chance to complete */
482 timeout = 2 * HZ;
483 }
484
485 /* No user interrupts in wait - wreaks havoc with performance */
486 wait_for_response(ses, midQ, timeout, 10 * HZ);
487
488 spin_lock(&GlobalMid_Lock);
489 580
490 if (midQ->resp_buf == NULL) { 581 rc = wait_for_response(ses->server, midQ);
491 cERROR(1, "No response to cmd %d mid %d", 582 if (rc != 0)
492 midQ->command, midQ->mid); 583 goto out;
493 if (midQ->midState == MID_REQUEST_SUBMITTED) {
494 if (ses->server->tcpStatus == CifsExiting)
495 rc = -EHOSTDOWN;
496 else {
497 ses->server->tcpStatus = CifsNeedReconnect;
498 midQ->midState = MID_RETRY_NEEDED;
499 }
500 }
501 584
502 if (rc != -EHOSTDOWN) { 585 rc = sync_mid_result(midQ, ses->server);
503 if (midQ->midState == MID_RETRY_NEEDED) { 586 if (rc != 0) {
504 rc = -EAGAIN;
505 cFYI(1, "marking request for retry");
506 } else {
507 rc = -EIO;
508 }
509 }
510 spin_unlock(&GlobalMid_Lock);
511 DeleteMidQEntry(midQ);
512 /* Update # of requests on wire to server */
513 atomic_dec(&ses->server->inFlight); 587 atomic_dec(&ses->server->inFlight);
514 wake_up(&ses->server->request_q); 588 wake_up(&ses->server->request_q);
515 return rc; 589 return rc;
516 } 590 }
517 591
518 spin_unlock(&GlobalMid_Lock);
519 receive_len = midQ->resp_buf->smb_buf_length; 592 receive_len = midQ->resp_buf->smb_buf_length;
520 593
521 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 594 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -559,19 +632,18 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
559 if (receive_len >= sizeof(struct smb_hdr) - 4 632 if (receive_len >= sizeof(struct smb_hdr) - 4
560 /* do not count RFC1001 header */ + 633 /* do not count RFC1001 header */ +
561 (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ ) 634 (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
562 BCC(midQ->resp_buf) = 635 put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
563 le16_to_cpu(BCC_LE(midQ->resp_buf));
564 if ((flags & CIFS_NO_RESP) == 0) 636 if ((flags & CIFS_NO_RESP) == 0)
565 midQ->resp_buf = NULL; /* mark it so buf will 637 midQ->resp_buf = NULL; /* mark it so buf will
566 not be freed by 638 not be freed by
567 DeleteMidQEntry */ 639 delete_mid */
568 } else { 640 } else {
569 rc = -EIO; 641 rc = -EIO;
570 cFYI(1, "Bad MID state?"); 642 cFYI(1, "Bad MID state?");
571 } 643 }
572 644
573out: 645out:
574 DeleteMidQEntry(midQ); 646 delete_mid(midQ);
575 atomic_dec(&ses->server->inFlight); 647 atomic_dec(&ses->server->inFlight);
576 wake_up(&ses->server->request_q); 648 wake_up(&ses->server->request_q);
577 649
@@ -585,7 +657,6 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
585{ 657{
586 int rc = 0; 658 int rc = 0;
587 unsigned int receive_len; 659 unsigned int receive_len;
588 unsigned long timeout;
589 struct mid_q_entry *midQ; 660 struct mid_q_entry *midQ;
590 661
591 if (ses == NULL) { 662 if (ses == NULL) {
@@ -610,7 +681,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
610 return -EIO; 681 return -EIO;
611 } 682 }
612 683
613 rc = wait_for_free_request(ses, long_op); 684 rc = wait_for_free_request(ses->server, long_op);
614 if (rc) 685 if (rc)
615 return rc; 686 return rc;
616 687
@@ -649,64 +720,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
649 if (rc < 0) 720 if (rc < 0)
650 goto out; 721 goto out;
651 722
652 if (long_op == CIFS_STD_OP) 723 if (long_op == CIFS_ASYNC_OP)
653 timeout = 15 * HZ;
654 /* wait for 15 seconds or until woken up due to response arriving or
655 due to last connection to this server being unmounted */
656 else if (long_op == CIFS_ASYNC_OP)
657 goto out; 724 goto out;
658 else if (long_op == CIFS_VLONG_OP) /* writes past EOF can be slow */
659 timeout = 180 * HZ;
660 else if (long_op == CIFS_LONG_OP)
661 timeout = 45 * HZ; /* should be greater than
662 server's oplock break timeout (about 43 seconds) */
663 else if (long_op == CIFS_BLOCKING_OP)
664 timeout = 0x7FFFFFFF; /* large but not so large as to wrap */
665 else {
666 cERROR(1, "unknown timeout flag %d", long_op);
667 rc = -EIO;
668 goto out;
669 }
670 725
671 if (signal_pending(current)) { 726 rc = wait_for_response(ses->server, midQ);
672 /* if signal pending do not hold up user for full smb timeout 727 if (rc != 0)
673 but we still give response a chance to complete */ 728 goto out;
674 timeout = 2 * HZ;
675 }
676
677 /* No user interrupts in wait - wreaks havoc with performance */
678 wait_for_response(ses, midQ, timeout, 10 * HZ);
679
680 spin_lock(&GlobalMid_Lock);
681 if (midQ->resp_buf == NULL) {
682 cERROR(1, "No response for cmd %d mid %d",
683 midQ->command, midQ->mid);
684 if (midQ->midState == MID_REQUEST_SUBMITTED) {
685 if (ses->server->tcpStatus == CifsExiting)
686 rc = -EHOSTDOWN;
687 else {
688 ses->server->tcpStatus = CifsNeedReconnect;
689 midQ->midState = MID_RETRY_NEEDED;
690 }
691 }
692 729
693 if (rc != -EHOSTDOWN) { 730 rc = sync_mid_result(midQ, ses->server);
694 if (midQ->midState == MID_RETRY_NEEDED) { 731 if (rc != 0) {
695 rc = -EAGAIN;
696 cFYI(1, "marking request for retry");
697 } else {
698 rc = -EIO;
699 }
700 }
701 spin_unlock(&GlobalMid_Lock);
702 DeleteMidQEntry(midQ);
703 /* Update # of requests on wire to server */
704 atomic_dec(&ses->server->inFlight); 732 atomic_dec(&ses->server->inFlight);
705 wake_up(&ses->server->request_q); 733 wake_up(&ses->server->request_q);
706 return rc; 734 return rc;
707 } 735 }
708 736
709 spin_unlock(&GlobalMid_Lock);
710 receive_len = midQ->resp_buf->smb_buf_length; 737 receive_len = midQ->resp_buf->smb_buf_length;
711 738
712 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 739 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -748,43 +775,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
748 if (receive_len >= sizeof(struct smb_hdr) - 4 775 if (receive_len >= sizeof(struct smb_hdr) - 4
749 /* do not count RFC1001 header */ + 776 /* do not count RFC1001 header */ +
750 (2 * out_buf->WordCount) + 2 /* bcc */ ) 777 (2 * out_buf->WordCount) + 2 /* bcc */ )
751 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 778 put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
752 } else { 779 } else {
753 rc = -EIO; 780 rc = -EIO;
754 cERROR(1, "Bad MID state?"); 781 cERROR(1, "Bad MID state?");
755 } 782 }
756 783
757out: 784out:
758 DeleteMidQEntry(midQ); 785 delete_mid(midQ);
759 atomic_dec(&ses->server->inFlight); 786 atomic_dec(&ses->server->inFlight);
760 wake_up(&ses->server->request_q); 787 wake_up(&ses->server->request_q);
761 788
762 return rc; 789 return rc;
763} 790}
764 791
765/* Send an NT_CANCEL SMB to cause the POSIX blocking lock to return. */
766
767static int
768send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
769 struct mid_q_entry *midQ)
770{
771 int rc = 0;
772 struct cifsSesInfo *ses = tcon->ses;
773 __u16 mid = in_buf->Mid;
774
775 header_assemble(in_buf, SMB_COM_NT_CANCEL, tcon, 0);
776 in_buf->Mid = mid;
777 mutex_lock(&ses->server->srv_mutex);
778 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
779 if (rc) {
780 mutex_unlock(&ses->server->srv_mutex);
781 return rc;
782 }
783 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
784 mutex_unlock(&ses->server->srv_mutex);
785 return rc;
786}
787
788/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows 792/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows
789 blocking lock to return. */ 793 blocking lock to return. */
790 794
@@ -807,7 +811,7 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
807 pSMB->hdr.Mid = GetNextMid(ses->server); 811 pSMB->hdr.Mid = GetNextMid(ses->server);
808 812
809 return SendReceive(xid, ses, in_buf, out_buf, 813 return SendReceive(xid, ses, in_buf, out_buf,
810 &bytes_returned, CIFS_STD_OP); 814 &bytes_returned, 0);
811} 815}
812 816
813int 817int
@@ -845,7 +849,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
845 return -EIO; 849 return -EIO;
846 } 850 }
847 851
848 rc = wait_for_free_request(ses, CIFS_BLOCKING_OP); 852 rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP);
849 if (rc) 853 if (rc)
850 return rc; 854 return rc;
851 855
@@ -863,7 +867,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
863 867
864 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); 868 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
865 if (rc) { 869 if (rc) {
866 DeleteMidQEntry(midQ); 870 delete_mid(midQ);
867 mutex_unlock(&ses->server->srv_mutex); 871 mutex_unlock(&ses->server->srv_mutex);
868 return rc; 872 return rc;
869 } 873 }
@@ -880,7 +884,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
880 mutex_unlock(&ses->server->srv_mutex); 884 mutex_unlock(&ses->server->srv_mutex);
881 885
882 if (rc < 0) { 886 if (rc < 0) {
883 DeleteMidQEntry(midQ); 887 delete_mid(midQ);
884 return rc; 888 return rc;
885 } 889 }
886 890
@@ -899,10 +903,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
899 if (in_buf->Command == SMB_COM_TRANSACTION2) { 903 if (in_buf->Command == SMB_COM_TRANSACTION2) {
900 /* POSIX lock. We send a NT_CANCEL SMB to cause the 904 /* POSIX lock. We send a NT_CANCEL SMB to cause the
901 blocking lock to return. */ 905 blocking lock to return. */
902 906 rc = send_nt_cancel(ses->server, in_buf, midQ);
903 rc = send_nt_cancel(tcon, in_buf, midQ);
904 if (rc) { 907 if (rc) {
905 DeleteMidQEntry(midQ); 908 delete_mid(midQ);
906 return rc; 909 return rc;
907 } 910 }
908 } else { 911 } else {
@@ -914,47 +917,22 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
914 /* If we get -ENOLCK back the lock may have 917 /* If we get -ENOLCK back the lock may have
915 already been removed. Don't exit in this case. */ 918 already been removed. Don't exit in this case. */
916 if (rc && rc != -ENOLCK) { 919 if (rc && rc != -ENOLCK) {
917 DeleteMidQEntry(midQ); 920 delete_mid(midQ);
918 return rc; 921 return rc;
919 } 922 }
920 } 923 }
921 924
922 /* Wait 5 seconds for the response. */ 925 if (wait_for_response(ses->server, midQ) == 0) {
923 if (wait_for_response(ses, midQ, 5 * HZ, 5 * HZ) == 0) {
924 /* We got the response - restart system call. */ 926 /* We got the response - restart system call. */
925 rstart = 1; 927 rstart = 1;
926 } 928 }
927 } 929 }
928 930
929 spin_lock(&GlobalMid_Lock); 931 rc = sync_mid_result(midQ, ses->server);
930 if (midQ->resp_buf) { 932 if (rc != 0)
931 spin_unlock(&GlobalMid_Lock);
932 receive_len = midQ->resp_buf->smb_buf_length;
933 } else {
934 cERROR(1, "No response for cmd %d mid %d",
935 midQ->command, midQ->mid);
936 if (midQ->midState == MID_REQUEST_SUBMITTED) {
937 if (ses->server->tcpStatus == CifsExiting)
938 rc = -EHOSTDOWN;
939 else {
940 ses->server->tcpStatus = CifsNeedReconnect;
941 midQ->midState = MID_RETRY_NEEDED;
942 }
943 }
944
945 if (rc != -EHOSTDOWN) {
946 if (midQ->midState == MID_RETRY_NEEDED) {
947 rc = -EAGAIN;
948 cFYI(1, "marking request for retry");
949 } else {
950 rc = -EIO;
951 }
952 }
953 spin_unlock(&GlobalMid_Lock);
954 DeleteMidQEntry(midQ);
955 return rc; 933 return rc;
956 }
957 934
935 receive_len = midQ->resp_buf->smb_buf_length;
958 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 936 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
959 cERROR(1, "Frame too large received. Length: %d Xid: %d", 937 cERROR(1, "Frame too large received. Length: %d Xid: %d",
960 receive_len, xid); 938 receive_len, xid);
@@ -998,10 +976,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
998 if (receive_len >= sizeof(struct smb_hdr) - 4 976 if (receive_len >= sizeof(struct smb_hdr) - 4
999 /* do not count RFC1001 header */ + 977 /* do not count RFC1001 header */ +
1000 (2 * out_buf->WordCount) + 2 /* bcc */ ) 978 (2 * out_buf->WordCount) + 2 /* bcc */ )
1001 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 979 put_bcc(get_bcc_le(out_buf), out_buf);
1002 980
1003out: 981out:
1004 DeleteMidQEntry(midQ); 982 delete_mid(midQ);
1005 if (rstart && rc == -EACCES) 983 if (rstart && rc == -EACCES)
1006 return -ERESTARTSYS; 984 return -ERESTARTSYS;
1007 return rc; 985 return rc;
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 5525e1c660fd..690157876184 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -20,10 +20,9 @@
20#include <linux/spinlock.h> 20#include <linux/spinlock.h>
21 21
22#include <linux/coda.h> 22#include <linux/coda.h>
23#include <linux/coda_linux.h>
24#include <linux/coda_psdev.h> 23#include <linux/coda_psdev.h>
25#include <linux/coda_fs_i.h> 24#include "coda_linux.h"
26#include <linux/coda_cache.h> 25#include "coda_cache.h"
27 26
28static atomic_t permission_epoch = ATOMIC_INIT(0); 27static atomic_t permission_epoch = ATOMIC_INIT(0);
29 28
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 602240569c89..6475877b0763 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -7,9 +7,8 @@
7#include <linux/time.h> 7#include <linux/time.h>
8 8
9#include <linux/coda.h> 9#include <linux/coda.h>
10#include <linux/coda_linux.h>
11#include <linux/coda_fs_i.h>
12#include <linux/coda_psdev.h> 10#include <linux/coda_psdev.h>
11#include "coda_linux.h"
13 12
14static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) 13static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
15{ 14{
diff --git a/fs/coda/coda_cache.h b/fs/coda/coda_cache.h
new file mode 100644
index 000000000000..c910b5eb1ceb
--- /dev/null
+++ b/fs/coda/coda_cache.h
@@ -0,0 +1,22 @@
1/* Coda filesystem -- Linux Minicache
2 *
3 * Copyright (C) 1989 - 1997 Carnegie Mellon University
4 *
5 * Carnegie Mellon University encourages users of this software to
6 * contribute improvements to the Coda project. Contact Peter Braam
7 * <coda@cs.cmu.edu>
8 */
9
10#ifndef _CFSNC_HEADER_
11#define _CFSNC_HEADER_
12
13/* credential cache */
14void coda_cache_enter(struct inode *inode, int mask);
15void coda_cache_clear_inode(struct inode *);
16void coda_cache_clear_all(struct super_block *sb);
17int coda_cache_check(struct inode *inode, int mask);
18
19/* for downcalls and attributes and lookups */
20void coda_flag_inode_children(struct inode *inode, int flag);
21
22#endif /* _CFSNC_HEADER_ */
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
new file mode 100644
index 000000000000..e35071b1de0e
--- /dev/null
+++ b/fs/coda/coda_fs_i.h
@@ -0,0 +1,58 @@
1/*
2 * coda_fs_i.h
3 *
4 * Copyright (C) 1998 Carnegie Mellon University
5 *
6 */
7
8#ifndef _LINUX_CODA_FS_I
9#define _LINUX_CODA_FS_I
10
11#include <linux/types.h>
12#include <linux/list.h>
13#include <linux/spinlock.h>
14#include <linux/coda.h>
15
16/*
17 * coda fs inode data
18 * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and
19 * c_cached_perm.
20 * vfs_inode is set only when the inode is created and never changes.
21 * c_fid is set when the inode is created and should be considered immutable.
22 */
23struct coda_inode_info {
24 struct CodaFid c_fid; /* Coda identifier */
25 u_short c_flags; /* flags (see below) */
26 unsigned int c_mapcount; /* nr of times this inode is mapped */
27 unsigned int c_cached_epoch; /* epoch for cached permissions */
28 vuid_t c_uid; /* fsuid for cached permissions */
29 unsigned int c_cached_perm; /* cached access permissions */
30 spinlock_t c_lock;
31 struct inode vfs_inode;
32};
33
34/*
35 * coda fs file private data
36 */
37#define CODA_MAGIC 0xC0DAC0DA
38struct coda_file_info {
39 int cfi_magic; /* magic number */
40 struct file *cfi_container; /* container file for this cnode */
41 unsigned int cfi_mapcount; /* nr of times this file is mapped */
42};
43
44#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data))
45
46/* flags */
47#define C_VATTR 0x1 /* Validity of vattr in inode */
48#define C_FLUSH 0x2 /* used after a flush */
49#define C_DYING 0x4 /* from venus (which died) */
50#define C_PURGE 0x8
51
52int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *);
53struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
54int coda_cnode_makectl(struct inode **inode, struct super_block *sb);
55struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
56void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *);
57
58#endif
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index bf4a3fd3c8e3..2bdbcc11b373 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -17,9 +17,8 @@
17#include <linux/string.h> 17#include <linux/string.h>
18 18
19#include <linux/coda.h> 19#include <linux/coda.h>
20#include <linux/coda_linux.h>
21#include <linux/coda_psdev.h> 20#include <linux/coda_psdev.h>
22#include <linux/coda_fs_i.h> 21#include "coda_linux.h"
23 22
24/* initialize the debugging variables */ 23/* initialize the debugging variables */
25int coda_fake_statfs; 24int coda_fake_statfs;
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
new file mode 100644
index 000000000000..9b0c5323890b
--- /dev/null
+++ b/fs/coda/coda_linux.h
@@ -0,0 +1,101 @@
1/*
2 * Coda File System, Linux Kernel module
3 *
4 * Original version, adapted from cfs_mach.c, (C) Carnegie Mellon University
5 * Linux modifications (C) 1996, Peter J. Braam
6 * Rewritten for Linux 2.1 (C) 1997 Carnegie Mellon University
7 *
8 * Carnegie Mellon University encourages users of this software to
9 * contribute improvements to the Coda project.
10 */
11
12#ifndef _LINUX_CODA_FS
13#define _LINUX_CODA_FS
14
15#include <linux/kernel.h>
16#include <linux/param.h>
17#include <linux/mm.h>
18#include <linux/vmalloc.h>
19#include <linux/slab.h>
20#include <linux/wait.h>
21#include <linux/types.h>
22#include <linux/fs.h>
23#include "coda_fs_i.h"
24
25/* operations */
26extern const struct inode_operations coda_dir_inode_operations;
27extern const struct inode_operations coda_file_inode_operations;
28extern const struct inode_operations coda_ioctl_inode_operations;
29
30extern const struct dentry_operations coda_dentry_operations;
31
32extern const struct address_space_operations coda_file_aops;
33extern const struct address_space_operations coda_symlink_aops;
34
35extern const struct file_operations coda_dir_operations;
36extern const struct file_operations coda_file_operations;
37extern const struct file_operations coda_ioctl_operations;
38
39/* operations shared over more than one file */
40int coda_open(struct inode *i, struct file *f);
41int coda_release(struct inode *i, struct file *f);
42int coda_permission(struct inode *inode, int mask, unsigned int flags);
43int coda_revalidate_inode(struct dentry *);
44int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *);
45int coda_setattr(struct dentry *, struct iattr *);
46
47/* this file: helpers */
48char *coda_f2s(struct CodaFid *f);
49int coda_isroot(struct inode *i);
50int coda_iscontrol(const char *name, size_t length);
51
52void coda_vattr_to_iattr(struct inode *, struct coda_vattr *);
53void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *);
54unsigned short coda_flags_to_cflags(unsigned short);
55
56/* sysctl.h */
57void coda_sysctl_init(void);
58void coda_sysctl_clean(void);
59
60#define CODA_ALLOC(ptr, cast, size) do { \
61 if (size < PAGE_SIZE) \
62 ptr = kmalloc((unsigned long) size, GFP_KERNEL); \
63 else \
64 ptr = (cast)vmalloc((unsigned long) size); \
65 if (!ptr) \
66 printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
67 else memset( ptr, 0, size ); \
68} while (0)
69
70
71#define CODA_FREE(ptr,size) \
72 do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
73
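Illustrative usage of the pair above (not taken from the patch): CODA_ALLOC() falls back from kmalloc() to vmalloc() for requests of a page or more and zeroes the result, so allocation and free must agree on the size:

	struct coda_vattr *attr;

	CODA_ALLOC(attr, struct coda_vattr *, sizeof(*attr));
	if (!attr)
		return -ENOMEM;
	/* ... fill in and use attr ... */
	CODA_FREE(attr, sizeof(*attr));
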
74/* inode to cnode access functions */
75
76static inline struct coda_inode_info *ITOC(struct inode *inode)
77{
78 return list_entry(inode, struct coda_inode_info, vfs_inode);
79}
80
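list_entry() above is container_of() under another name, so ITOC() is equivalent to the sketch below; container_of() arguably states the intent more directly, since no list is involved:

static inline struct coda_inode_info *ITOC(struct inode *inode)
{
	return container_of(inode, struct coda_inode_info, vfs_inode);
}
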
81static __inline__ struct CodaFid *coda_i2f(struct inode *inode)
82{
83 return &(ITOC(inode)->c_fid);
84}
85
86static __inline__ char *coda_i2s(struct inode *inode)
87{
88 return coda_f2s(&(ITOC(inode)->c_fid));
89}
90
91/* this will not zap the inode away */
92static __inline__ void coda_flag_inode(struct inode *inode, int flag)
93{
94 struct coda_inode_info *cii = ITOC(inode);
95
96 spin_lock(&cii->c_lock);
97 cii->c_flags |= flag;
98 spin_unlock(&cii->c_lock);
99}
100
101#endif
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 29badd91360f..2b8dae4d121e 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -23,10 +23,9 @@
23#include <asm/uaccess.h> 23#include <asm/uaccess.h>
24 24
25#include <linux/coda.h> 25#include <linux/coda.h>
26#include <linux/coda_linux.h>
27#include <linux/coda_psdev.h> 26#include <linux/coda_psdev.h>
28#include <linux/coda_fs_i.h> 27#include "coda_linux.h"
29#include <linux/coda_cache.h> 28#include "coda_cache.h"
30 29
31#include "coda_int.h" 30#include "coda_int.h"
32 31
@@ -61,7 +60,7 @@ static int coda_return_EIO(void)
61} 60}
62#define CODA_EIO_ERROR ((void *) (coda_return_EIO)) 61#define CODA_EIO_ERROR ((void *) (coda_return_EIO))
63 62
64static const struct dentry_operations coda_dentry_operations = 63const struct dentry_operations coda_dentry_operations =
65{ 64{
66 .d_revalidate = coda_dentry_revalidate, 65 .d_revalidate = coda_dentry_revalidate,
67 .d_delete = coda_dentry_delete, 66 .d_delete = coda_dentry_delete,
@@ -126,8 +125,6 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
126 return ERR_PTR(error); 125 return ERR_PTR(error);
127 126
128exit: 127exit:
129 d_set_d_op(entry, &coda_dentry_operations);
130
131 if (inode && (type & CODA_NOCACHE)) 128 if (inode && (type & CODA_NOCACHE))
132 coda_flag_inode(inode, C_VATTR | C_PURGE); 129 coda_flag_inode(inode, C_VATTR | C_PURGE);
133 130
diff --git a/fs/coda/file.c b/fs/coda/file.c
index c8b50ba4366a..0433057be330 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -21,10 +21,9 @@
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22 22
23#include <linux/coda.h> 23#include <linux/coda.h>
24#include <linux/coda_linux.h>
25#include <linux/coda_fs_i.h>
26#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
27 25
26#include "coda_linux.h"
28#include "coda_int.h" 27#include "coda_int.h"
29 28
30static ssize_t 29static ssize_t
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 50dc7d189f56..871b27715465 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -28,10 +28,9 @@
28#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
29 29
30#include <linux/coda.h> 30#include <linux/coda.h>
31#include <linux/coda_linux.h>
32#include <linux/coda_psdev.h> 31#include <linux/coda_psdev.h>
33#include <linux/coda_fs_i.h> 32#include "coda_linux.h"
34#include <linux/coda_cache.h> 33#include "coda_cache.h"
35 34
36#include "coda_int.h" 35#include "coda_int.h"
37 36
@@ -45,7 +44,7 @@ static struct kmem_cache * coda_inode_cachep;
45static struct inode *coda_alloc_inode(struct super_block *sb) 44static struct inode *coda_alloc_inode(struct super_block *sb)
46{ 45{
47 struct coda_inode_info *ei; 46 struct coda_inode_info *ei;
48 ei = (struct coda_inode_info *)kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL); 47 ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL);
49 if (!ei) 48 if (!ei)
50 return NULL; 49 return NULL;
51 memset(&ei->c_fid, 0, sizeof(struct CodaFid)); 50 memset(&ei->c_fid, 0, sizeof(struct CodaFid));
@@ -193,6 +192,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
193 sb->s_blocksize_bits = 12; 192 sb->s_blocksize_bits = 12;
194 sb->s_magic = CODA_SUPER_MAGIC; 193 sb->s_magic = CODA_SUPER_MAGIC;
195 sb->s_op = &coda_super_operations; 194 sb->s_op = &coda_super_operations;
195 sb->s_d_op = &coda_dentry_operations;
196 sb->s_bdi = &vc->bdi; 196 sb->s_bdi = &vc->bdi;
197 197
198 /* get root fid from Venus: this needs the root inode */ 198 /* get root fid from Venus: this needs the root inode */
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 741f0bd03918..6cbb3afb36dc 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -19,10 +19,10 @@
19#include <asm/uaccess.h> 19#include <asm/uaccess.h>
20 20
21#include <linux/coda.h> 21#include <linux/coda.h>
22#include <linux/coda_linux.h>
23#include <linux/coda_fs_i.h>
24#include <linux/coda_psdev.h> 22#include <linux/coda_psdev.h>
25 23
24#include "coda_linux.h"
25
26/* pioctl ops */ 26/* pioctl ops */
27static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags); 27static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags);
28static long coda_pioctl(struct file *filp, unsigned int cmd, 28static long coda_pioctl(struct file *filp, unsigned int cmd,
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 62647a8595e4..8f616e0e252c 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -43,10 +43,10 @@
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
44 44
45#include <linux/coda.h> 45#include <linux/coda.h>
46#include <linux/coda_linux.h>
47#include <linux/coda_fs_i.h>
48#include <linux/coda_psdev.h> 46#include <linux/coda_psdev.h>
49 47
48#include "coda_linux.h"
49
50#include "coda_int.h" 50#include "coda_int.h"
51 51
52/* statistics */ 52/* statistics */
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index af78f007a2b0..ab94ef63caef 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -16,9 +16,9 @@
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17 17
18#include <linux/coda.h> 18#include <linux/coda.h>
19#include <linux/coda_linux.h>
20#include <linux/coda_psdev.h> 19#include <linux/coda_psdev.h>
21#include <linux/coda_fs_i.h> 20
21#include "coda_linux.h"
22 22
23static int coda_symlink_filler(struct file *file, struct page *page) 23static int coda_symlink_filler(struct file *file, struct page *page)
24{ 24{
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c3563cab9758..9727e0c52579 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -33,10 +33,9 @@
33#include <linux/vfs.h> 33#include <linux/vfs.h>
34 34
35#include <linux/coda.h> 35#include <linux/coda.h>
36#include <linux/coda_linux.h>
37#include <linux/coda_psdev.h> 36#include <linux/coda_psdev.h>
38#include <linux/coda_fs_i.h> 37#include "coda_linux.h"
39#include <linux/coda_cache.h> 38#include "coda_cache.h"
40 39
41#include "coda_int.h" 40#include "coda_int.h"
42 41
diff --git a/fs/compat.c b/fs/compat.c
index eb1740ac8c0a..f6fd0a00e6cc 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -257,7 +257,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
257} 257}
258 258
259/* 259/*
260 * The following statfs calls are copies of code from fs/open.c and 260 * The following statfs calls are copies of code from fs/statfs.c and
261 * should be checked against those from time to time 261 * should be checked against those from time to time
262 */ 262 */
263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) 263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
@@ -320,7 +320,9 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
320 __put_user(kbuf->f_namelen, &ubuf->f_namelen) || 320 __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
321 __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || 321 __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
322 __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || 322 __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
323 __put_user(kbuf->f_frsize, &ubuf->f_frsize)) 323 __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
324 __put_user(kbuf->f_flags, &ubuf->f_flags) ||
325 __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
324 return -EFAULT; 326 return -EFAULT;
325 return 0; 327 return 0;
326} 328}
@@ -597,10 +599,8 @@ ssize_t compat_rw_copy_check_uvector(int type,
597 if (nr_segs > fast_segs) { 599 if (nr_segs > fast_segs) {
598 ret = -ENOMEM; 600 ret = -ENOMEM;
599 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); 601 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
600 if (iov == NULL) { 602 if (iov == NULL)
601 *ret_pointer = fast_pointer;
602 goto out; 603 goto out;
603 }
604 } 604 }
605 *ret_pointer = iov; 605 *ret_pointer = iov;
606 606
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
index 13587cc97a0b..9febcdefdfdc 100644
--- a/fs/configfs/Kconfig
+++ b/fs/configfs/Kconfig
@@ -1,8 +1,8 @@
1config CONFIGFS_FS 1config CONFIGFS_FS
2 tristate "Userspace-driven configuration filesystem" 2 tristate "Userspace-driven configuration filesystem"
3 depends on SYSFS 3 select SYSFS
4 help 4 help
5 configfs is a ram-based filesystem that provides the converse 5 configfs is a RAM-based filesystem that provides the converse
6 of sysfs's functionality. Where sysfs is a filesystem-based 6 of sysfs's functionality. Where sysfs is a filesystem-based
7 view of kernel objects, configfs is a filesystem-based manager 7 view of kernel objects, configfs is a filesystem-based manager
8 of kernel objects, or config_items. 8 of kernel objects, or config_items.
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 026cf68553a4..82bda8fdfc1c 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -90,6 +90,7 @@ extern const struct file_operations configfs_file_operations;
90extern const struct file_operations bin_fops; 90extern const struct file_operations bin_fops;
91extern const struct inode_operations configfs_dir_inode_operations; 91extern const struct inode_operations configfs_dir_inode_operations;
92extern const struct inode_operations configfs_symlink_inode_operations; 92extern const struct inode_operations configfs_symlink_inode_operations;
93extern const struct dentry_operations configfs_dentry_ops;
93 94
94extern int configfs_symlink(struct inode *dir, struct dentry *dentry, 95extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
95 const char *symname); 96 const char *symname);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 36637a8c1ed3..90ff3cb10de3 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -72,7 +72,7 @@ static int configfs_d_delete(const struct dentry *dentry)
72 return 1; 72 return 1;
73} 73}
74 74
75static const struct dentry_operations configfs_dentry_ops = { 75const struct dentry_operations configfs_dentry_ops = {
76 .d_iput = configfs_d_iput, 76 .d_iput = configfs_d_iput,
77 /* simple_delete_dentry() isn't exported */ 77 /* simple_delete_dentry() isn't exported */
78 .d_delete = configfs_d_delete, 78 .d_delete = configfs_d_delete,
@@ -442,7 +442,6 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
442 return error; 442 return error;
443 } 443 }
444 444
445 d_set_d_op(dentry, &configfs_dentry_ops);
446 d_rehash(dentry); 445 d_rehash(dentry);
447 446
448 return 0; 447 return 0;
@@ -489,7 +488,6 @@ static struct dentry * configfs_lookup(struct inode *dir,
489 */ 488 */
490 if (dentry->d_name.len > NAME_MAX) 489 if (dentry->d_name.len > NAME_MAX)
491 return ERR_PTR(-ENAMETOOLONG); 490 return ERR_PTR(-ENAMETOOLONG);
492 d_set_d_op(dentry, &configfs_dentry_ops);
493 d_add(dentry, NULL); 491 d_add(dentry, NULL);
494 return NULL; 492 return NULL;
495 } 493 }
@@ -683,7 +681,6 @@ static int create_default_group(struct config_group *parent_group,
683 ret = -ENOMEM; 681 ret = -ENOMEM;
684 child = d_alloc(parent, &name); 682 child = d_alloc(parent, &name);
685 if (child) { 683 if (child) {
686 d_set_d_op(child, &configfs_dentry_ops);
687 d_add(child, NULL); 684 d_add(child, NULL);
688 685
689 ret = configfs_attach_group(&parent_group->cg_item, 686 ret = configfs_attach_group(&parent_group->cg_item,
@@ -1681,7 +1678,6 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
1681 err = -ENOMEM; 1678 err = -ENOMEM;
1682 dentry = d_alloc(configfs_sb->s_root, &name); 1679 dentry = d_alloc(configfs_sb->s_root, &name);
1683 if (dentry) { 1680 if (dentry) {
1684 d_set_d_op(dentry, &configfs_dentry_ops);
1685 d_add(dentry, NULL); 1681 d_add(dentry, NULL);
1686 1682
1687 err = configfs_attach_group(sd->s_element, &group->cg_item, 1683 err = configfs_attach_group(sd->s_element, &group->cg_item,
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 7d3607febe1c..ecc62178beda 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -101,6 +101,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
101 configfs_root_group.cg_item.ci_dentry = root; 101 configfs_root_group.cg_item.ci_dentry = root;
102 root->d_fsdata = &configfs_root; 102 root->d_fsdata = &configfs_root;
103 sb->s_root = root; 103 sb->s_root = root;
104 sb->s_d_op = &configfs_dentry_ops; /* the rest get that */
104 return 0; 105 return 0;
105} 106}
106 107
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 32fd5fe9ca0e..e141939080f0 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -34,57 +34,81 @@ static const struct address_space_operations cramfs_aops;
34static DEFINE_MUTEX(read_mutex); 34static DEFINE_MUTEX(read_mutex);
35 35
36 36
37/* These two macros may change in future, to provide better st_ino 37/* These macros may change in future, to provide better st_ino semantics. */
38 semantics. */
39#define CRAMINO(x) (((x)->offset && (x)->size)?(x)->offset<<2:1)
40#define OFFSET(x) ((x)->i_ino) 38#define OFFSET(x) ((x)->i_ino)
41 39
42static void setup_inode(struct inode *inode, struct cramfs_inode * cramfs_inode) 40static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset)
43{ 41{
42 if (!cino->offset)
43 return offset + 1;
44 if (!cino->size)
45 return offset + 1;
46
47 /*
48 * The file mode test fixes buggy mkcramfs implementations where
49 * cramfs_inode->offset is set to a non-zero value for entries
50 * which did not contain data, like device nodes and fifos.
51 */
52 switch (cino->mode & S_IFMT) {
53 case S_IFREG:
54 case S_IFDIR:
55 case S_IFLNK:
56 return cino->offset << 2;
57 default:
58 break;
59 }
60 return offset + 1;
61}
62
63static struct inode *get_cramfs_inode(struct super_block *sb,
64 struct cramfs_inode *cramfs_inode, unsigned int offset)
65{
66 struct inode *inode;
44 static struct timespec zerotime; 67 static struct timespec zerotime;
68
69 inode = iget_locked(sb, cramino(cramfs_inode, offset));
70 if (!inode)
71 return ERR_PTR(-ENOMEM);
72 if (!(inode->i_state & I_NEW))
73 return inode;
74
75 switch (cramfs_inode->mode & S_IFMT) {
76 case S_IFREG:
77 inode->i_fop = &generic_ro_fops;
78 inode->i_data.a_ops = &cramfs_aops;
79 break;
80 case S_IFDIR:
81 inode->i_op = &cramfs_dir_inode_operations;
82 inode->i_fop = &cramfs_directory_operations;
83 break;
84 case S_IFLNK:
85 inode->i_op = &page_symlink_inode_operations;
86 inode->i_data.a_ops = &cramfs_aops;
87 break;
88 default:
89 init_special_inode(inode, cramfs_inode->mode,
90 old_decode_dev(cramfs_inode->size));
91 }
92
45 inode->i_mode = cramfs_inode->mode; 93 inode->i_mode = cramfs_inode->mode;
46 inode->i_uid = cramfs_inode->uid; 94 inode->i_uid = cramfs_inode->uid;
47 inode->i_size = cramfs_inode->size;
48 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
49 inode->i_gid = cramfs_inode->gid; 95 inode->i_gid = cramfs_inode->gid;
96
97 /* if the lower 2 bits are zero, the inode contains data */
98 if (!(inode->i_ino & 3)) {
99 inode->i_size = cramfs_inode->size;
100 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
101 }
102
50 /* Struct copy intentional */ 103 /* Struct copy intentional */
51 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; 104 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
52 /* inode->i_nlink is left 1 - arguably wrong for directories, 105 /* inode->i_nlink is left 1 - arguably wrong for directories,
53 but it's the best we can do without reading the directory 106 but it's the best we can do without reading the directory
54 contents. 1 yields the right result in GNU find, even 107 contents. 1 yields the right result in GNU find, even
55 without -noleaf option. */ 108 without -noleaf option. */
56 if (S_ISREG(inode->i_mode)) {
57 inode->i_fop = &generic_ro_fops;
58 inode->i_data.a_ops = &cramfs_aops;
59 } else if (S_ISDIR(inode->i_mode)) {
60 inode->i_op = &cramfs_dir_inode_operations;
61 inode->i_fop = &cramfs_directory_operations;
62 } else if (S_ISLNK(inode->i_mode)) {
63 inode->i_op = &page_symlink_inode_operations;
64 inode->i_data.a_ops = &cramfs_aops;
65 } else {
66 init_special_inode(inode, inode->i_mode,
67 old_decode_dev(cramfs_inode->size));
68 }
69}
70 109
71static struct inode *get_cramfs_inode(struct super_block *sb, 110 unlock_new_inode(inode);
72 struct cramfs_inode * cramfs_inode) 111
73{
74 struct inode *inode;
75 if (CRAMINO(cramfs_inode) == 1) {
76 inode = new_inode(sb);
77 if (inode) {
78 inode->i_ino = 1;
79 setup_inode(inode, cramfs_inode);
80 }
81 } else {
82 inode = iget_locked(sb, CRAMINO(cramfs_inode));
83 if (inode && (inode->i_state & I_NEW)) {
84 setup_inode(inode, cramfs_inode);
85 unlock_new_inode(inode);
86 }
87 }
88 return inode; 112 return inode;
89} 113}
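
A worked example of the numbering scheme, with illustrative offsets: a regular file whose cramfs_inode->offset is 0x100 (stored in 4-byte units) gets ino = 0x100 << 2 = 0x400; the low two bits are clear, so get_cramfs_inode() trusts the size field and fills in i_size and i_blocks. A fifo whose directory entry sits at byte offset 0x2e8 gets ino = 0x2e8 + 1 = 0x2e9; directory entries are 4-byte aligned, so offset + 1 always has a low bit set, the (i_ino & 3) test fails, and the size field (which doubles as the device number for special files) is never misread as a byte count.
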
90 114
@@ -265,6 +289,9 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
265 printk(KERN_ERR "cramfs: root is not a directory\n"); 289 printk(KERN_ERR "cramfs: root is not a directory\n");
266 goto out; 290 goto out;
267 } 291 }
292 /* correct strange, hard-coded permissions of mkcramfs */
293 super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
294
268 root_offset = super.root.offset << 2; 295 root_offset = super.root.offset << 2;
269 if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { 296 if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) {
270 sbi->size=super.size; 297 sbi->size=super.size;
@@ -289,7 +316,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
289 316
290 /* Set it all up.. */ 317 /* Set it all up.. */
291 sb->s_op = &cramfs_ops; 318 sb->s_op = &cramfs_ops;
292 root = get_cramfs_inode(sb, &super.root); 319 root = get_cramfs_inode(sb, &super.root, 0);
293 if (!root) 320 if (!root)
294 goto out; 321 goto out;
295 sb->s_root = d_alloc_root(root); 322 sb->s_root = d_alloc_root(root);
@@ -365,7 +392,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
365 */ 392 */
366 namelen = de->namelen << 2; 393 namelen = de->namelen << 2;
367 memcpy(buf, name, namelen); 394 memcpy(buf, name, namelen);
368 ino = CRAMINO(de); 395 ino = cramino(de, OFFSET(inode) + offset);
369 mode = de->mode; 396 mode = de->mode;
370 mutex_unlock(&read_mutex); 397 mutex_unlock(&read_mutex);
371 nextoffset = offset + sizeof(*de) + namelen; 398 nextoffset = offset + sizeof(*de) + namelen;
@@ -404,8 +431,9 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
404 struct cramfs_inode *de; 431 struct cramfs_inode *de;
405 char *name; 432 char *name;
406 int namelen, retval; 433 int namelen, retval;
434 int dir_off = OFFSET(dir) + offset;
407 435
408 de = cramfs_read(dir->i_sb, OFFSET(dir) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); 436 de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN);
409 name = (char *)(de+1); 437 name = (char *)(de+1);
410 438
411 /* Try to take advantage of sorted directories */ 439 /* Try to take advantage of sorted directories */
@@ -436,7 +464,7 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
436 if (!retval) { 464 if (!retval) {
437 struct cramfs_inode entry = *de; 465 struct cramfs_inode entry = *de;
438 mutex_unlock(&read_mutex); 466 mutex_unlock(&read_mutex);
439 d_add(dentry, get_cramfs_inode(dir->i_sb, &entry)); 467 d_add(dentry, get_cramfs_inode(dir->i_sb, &entry, dir_off));
440 return NULL; 468 return NULL;
441 } 469 }
442 /* else (retval < 0) */ 470 /* else (retval < 0) */
diff --git a/fs/dcache.c b/fs/dcache.c
index 5699d4c027cb..2a6bd9a4ae97 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -176,6 +176,7 @@ static void d_free(struct dentry *dentry)
 
 /**
  * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+ * @dentry: the target dentry
  * After this call, in-progress rcu-walk path lookup will fail. This
  * should be called after unhashing, and after changing d_inode (if
  * the dentry has not already been unhashed).
@@ -281,6 +282,7 @@ static void dentry_lru_move_tail(struct dentry *dentry)
 /**
  * d_kill - kill dentry and return parent
  * @dentry: dentry to kill
+ * @parent: parent dentry
  *
  * The dentry must already be unhashed and removed from the LRU.
  *
@@ -1320,6 +1322,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 		__dget_dlock(parent);
 		dentry->d_parent = parent;
 		dentry->d_sb = parent->d_sb;
+		d_set_d_op(dentry, dentry->d_sb->s_d_op);
 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
 		spin_unlock(&parent->d_lock);
 	}
@@ -1335,6 +1338,7 @@ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
 	struct dentry *dentry = d_alloc(NULL, name);
 	if (dentry) {
 		dentry->d_sb = sb;
+		d_set_d_op(dentry, dentry->d_sb->s_d_op);
 		dentry->d_parent = dentry;
 		dentry->d_flags |= DCACHE_DISCONNECTED;
 	}
@@ -1355,8 +1359,8 @@ EXPORT_SYMBOL(d_alloc_name);
 
 void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 {
-	BUG_ON(dentry->d_op);
-	BUG_ON(dentry->d_flags & (DCACHE_OP_HASH |
+	WARN_ON_ONCE(dentry->d_op);
+	WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
 				DCACHE_OP_COMPARE |
 				DCACHE_OP_REVALIDATE |
 				DCACHE_OP_DELETE ));
@@ -1378,8 +1382,11 @@ EXPORT_SYMBOL(d_set_d_op);
 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 {
 	spin_lock(&dentry->d_lock);
-	if (inode)
+	if (inode) {
+		if (unlikely(IS_AUTOMOUNT(inode)))
+			dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
 		list_add(&dentry->d_alias, &inode->i_dentry);
+	}
 	dentry->d_inode = inode;
 	dentry_rcuwalk_barrier(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -1507,6 +1514,7 @@ struct dentry * d_alloc_root(struct inode * root_inode)
 	res = d_alloc(NULL, &name);
 	if (res) {
 		res->d_sb = root_inode->i_sb;
+		d_set_d_op(res, res->d_sb->s_d_op);
 		res->d_parent = res;
 		d_instantiate(res, root_inode);
 	}
@@ -1567,6 +1575,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	/* attach a disconnected dentry */
 	spin_lock(&tmp->d_lock);
 	tmp->d_sb = inode->i_sb;
+	d_set_d_op(tmp, tmp->d_sb->s_d_op);
 	tmp->d_inode = inode;
 	tmp->d_flags |= DCACHE_DISCONNECTED;
 	list_add(&tmp->d_alias, &inode->i_dentry);
@@ -2449,8 +2458,7 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
 }
 
 /**
- * Prepend path string to a buffer
- *
+ * prepend_path - Prepend path string to a buffer
  * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry (may be modified by this function)
 * @buffer: pointer to the end of the buffer
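The theme of the fs/dcache.c hunks: dentry_operations wiring moves from per-dentry d_set_d_op() calls into the superblock, so d_alloc(), d_alloc_root() and d_obtain_alias() pick the ops up from sb->s_d_op automatically (and d_set_d_op() now warns instead of BUGing on double assignment). A minimal sketch of the pattern this enables for a filesystem; the examplefs_* names are hypothetical and not part of this patch:

	static const struct dentry_operations examplefs_dops = {
		.d_revalidate	= examplefs_d_revalidate,
	};

	static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
	{
		sb->s_d_op = &examplefs_dops;	/* copied to every dentry d_alloc() creates */
		/* ... remaining fill_super work ... */
		return 0;
	}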
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 85882f6ba5f7..b044705eedd4 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -325,12 +325,16 @@ void dio_end_io(struct bio *bio, int error)
 }
 EXPORT_SYMBOL_GPL(dio_end_io);
 
-static int
+static void
 dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 	      sector_t first_sector, int nr_vecs)
 {
 	struct bio *bio;
 
+	/*
+	 * bio_alloc() is guaranteed to return a bio when called with
+	 * __GFP_WAIT and we request a valid number of vectors.
+	 */
 	bio = bio_alloc(GFP_KERNEL, nr_vecs);
 
 	bio->bi_bdev = bdev;
@@ -342,7 +346,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 
 	dio->bio = bio;
 	dio->logical_offset_in_bio = dio->cur_page_fs_offset;
-	return 0;
 }
 
 /*
@@ -583,8 +586,9 @@ static int dio_new_bio(struct dio *dio, sector_t start_sector)
 		goto out;
 	sector = start_sector << (dio->blkbits - 9);
 	nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
+	nr_pages = min(nr_pages, BIO_MAX_PAGES);
 	BUG_ON(nr_pages <= 0);
-	ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
+	dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
 	dio->boundary = 0;
 out:
 	return ret;
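The dio_bio_alloc() signature change works because bio_alloc() called with GFP_KERNEL (a __GFP_WAIT allocation) and a vector count clamped to BIO_MAX_PAGES blocks until it succeeds instead of returning NULL. The resulting call-site contract, as an illustrative sketch rather than additional patch content:

	nr_pages = min(nr_pages, BIO_MAX_PAGES);	/* keep the request valid */
	bio = bio_alloc(GFP_KERNEL, nr_pages);		/* may sleep, never NULL here */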
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 2dbb422e8116..1897eb1b4b6a 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,8 +1,7 @@
 menuconfig DLM
 	tristate "Distributed Lock Manager (DLM)"
 	depends on EXPERIMENTAL && INET
-	depends on SYSFS && (IPV6 || IPV6=n)
-	select CONFIGFS_FS
+	depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
 	select IP_SCTP
 	help
 	A general purpose distributed lock manager for kernel or userspace
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 37a34c2c622a..9c64ae9e4c1a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -63,6 +63,9 @@
 #define NEEDED_RMEM (4*1024*1024)
 #define CONN_HASH_SIZE 32
 
+/* Number of messages to send before rescheduling */
+#define MAX_SEND_MSG_COUNT 25
+
 struct cbuf {
 	unsigned int base;
 	unsigned int len;
@@ -108,6 +111,7 @@ struct connection {
 #define CF_INIT_PENDING 4
 #define CF_IS_OTHERCON 5
 #define CF_CLOSE 6
+#define CF_APP_LIMITED 7
 	struct list_head writequeue;  /* List of outgoing writequeue_entries */
 	spinlock_t writequeue_lock;
 	int (*rx_action) (struct connection *);	/* What to do when active */
@@ -295,7 +299,17 @@ static void lowcomms_write_space(struct sock *sk)
 {
 	struct connection *con = sock2con(sk);
 
-	if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+	if (!con)
+		return;
+
+	clear_bit(SOCK_NOSPACE, &con->sock->flags);
+
+	if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
+		con->sock->sk->sk_write_pending--;
+		clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
+	}
+
+	if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
 		queue_work(send_workqueue, &con->swork);
 }
 
@@ -915,6 +929,7 @@ static void tcp_connect_to_sock(struct connection *con)
 	struct sockaddr_storage saddr, src_addr;
 	int addr_len;
 	struct socket *sock = NULL;
+	int one = 1;
 
 	if (con->nodeid == 0) {
 		log_print("attempt to connect sock 0 foiled");
@@ -960,6 +975,11 @@ static void tcp_connect_to_sock(struct connection *con)
 	make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
 
 	log_print("connecting to %d", con->nodeid);
+
+	/* Turn off Nagle's algorithm */
+	kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+			  sizeof(one));
+
 	result =
 		sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
 				   O_NONBLOCK);
@@ -1011,6 +1031,10 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
 		goto create_out;
 	}
 
+	/* Turn off Nagle's algorithm */
+	kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+			  sizeof(one));
+
 	result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
 				   (char *)&one, sizeof(one));
 
@@ -1297,6 +1321,7 @@ static void send_to_sock(struct connection *con)
 	const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
 	struct writequeue_entry *e;
 	int len, offset;
+	int count = 0;
 
 	mutex_lock(&con->sock_mutex);
 	if (con->sock == NULL)
@@ -1319,14 +1344,27 @@ static void send_to_sock(struct connection *con)
 		ret = kernel_sendpage(con->sock, e->page, offset, len,
 				      msg_flags);
 		if (ret == -EAGAIN || ret == 0) {
+			if (ret == -EAGAIN &&
+			    test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
+			    !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
+				/* Notify TCP that we're limited by the
+				 * application window size.
+				 */
+				set_bit(SOCK_NOSPACE, &con->sock->flags);
+				con->sock->sk->sk_write_pending++;
+			}
 			cond_resched();
 			goto out;
 		}
 		if (ret <= 0)
 			goto send_error;
 	}
-	/* Don't starve people filling buffers */
+
+	/* Don't starve people filling buffers */
+	if (++count >= MAX_SEND_MSG_COUNT) {
 		cond_resched();
+		count = 0;
+	}
 
 	spin_lock(&con->writequeue_lock);
 	e->offset += ret;
@@ -1430,20 +1468,19 @@ static void work_stop(void)
 
 static int work_start(void)
 {
-	int error;
-	recv_workqueue = create_workqueue("dlm_recv");
-	error = IS_ERR(recv_workqueue);
-	if (error) {
-		log_print("can't start dlm_recv %d", error);
-		return error;
+	recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM |
+					 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
+	if (!recv_workqueue) {
+		log_print("can't start dlm_recv");
+		return -ENOMEM;
 	}
 
-	send_workqueue = create_singlethread_workqueue("dlm_send");
-	error = IS_ERR(send_workqueue);
-	if (error) {
-		log_print("can't start dlm_send %d", error);
+	send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM |
					 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
+	if (!send_workqueue) {
+		log_print("can't start dlm_send");
 		destroy_workqueue(recv_workqueue);
-		return error;
+		return -ENOMEM;
 	}
 
 	return 0;
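One subtlety in the work_start() rewrite: the old code tested the workqueue pointers with IS_ERR(), but create_workqueue() (like the alloc_workqueue() now used here) signals failure with a plain NULL, which IS_ERR() never matches, so the old error paths were unreachable. The corrected convention in isolation, as a sketch:

	struct workqueue_struct *wq;

	wq = alloc_workqueue("example", WQ_MEM_RECLAIM, 0);
	if (!wq)		/* NULL on failure; IS_ERR(NULL) is false */
		return -ENOMEM;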
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index cbadc1bee6e7..bfd8b680e648 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -348,7 +348,7 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
 	BUG_ON(!crypt_stat || !crypt_stat->tfm
 	       || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n",
+		ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
 				crypt_stat->key_size);
 		ecryptfs_dump_hex(crypt_stat->key,
 				  crypt_stat->key_size);
@@ -413,10 +413,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
 	rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
 				(extent_base + extent_offset));
 	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Error attempting to "
-				"derive IV for extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
+				"extent [0x%.16llx]; rc = [%d]\n",
+				(unsigned long long)(extent_base + extent_offset), rc);
 		goto out;
 	}
 	if (unlikely(ecryptfs_verbosity > 0)) {
@@ -443,9 +442,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
 	}
 	rc = 0;
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; "
+				"rc = [%d]\n",
+				(unsigned long long)(extent_base + extent_offset), rc);
 		ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
 				"encryption:\n");
 		ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8);
@@ -540,10 +539,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
 	rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
 				(extent_base + extent_offset));
 	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Error attempting to "
-				"derive IV for extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
+				"extent [0x%.16llx]; rc = [%d]\n",
+				(unsigned long long)(extent_base + extent_offset), rc);
 		goto out;
 	}
 	if (unlikely(ecryptfs_verbosity > 0)) {
@@ -571,9 +569,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
 	}
 	rc = 0;
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; "
+				"rc = [%d]\n",
+				(unsigned long long)(extent_base + extent_offset), rc);
 		ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
 				"decryption:\n");
 		ecryptfs_dump_hex((char *)(page_address(page)
@@ -780,7 +778,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
 	}
 	ecryptfs_printk(KERN_DEBUG,
 			"Initializing cipher [%s]; strlen = [%d]; "
-			"key_size_bits = [%d]\n",
+			"key_size_bits = [%zd]\n",
 			crypt_stat->cipher, (int)strlen(crypt_stat->cipher),
 			crypt_stat->key_size << 3);
 	if (crypt_stat->tfm) {
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 413a3c48f0bb..dbc84ed96336 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -192,7 +192,6 @@ ecryptfs_get_key_payload_data(struct key *key)
 		(((struct user_key_payload*)key->payload.data)->data);
 }
 
-#define ECRYPTFS_SUPER_MAGIC 0xf15f
 #define ECRYPTFS_MAX_KEYSET_SIZE 1024
 #define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
 #define ECRYPTFS_MAX_NUM_ENC_KEYS 64
@@ -584,6 +583,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
 
 #define ecryptfs_printk(type, fmt, arg...) \
 	__ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
+__attribute__ ((format(printf, 1, 2)))
 void __ecryptfs_printk(const char *fmt, ...);
 
 extern const struct file_operations ecryptfs_main_fops;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 91da02987bff..81e10e6a9443 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -47,7 +47,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
 					const struct iovec *iov,
 					unsigned long nr_segs, loff_t pos)
 {
-	int rc;
+	ssize_t rc;
 	struct dentry *lower_dentry;
 	struct vfsmount *lower_vfsmount;
 	struct file *file = iocb->ki_filp;
@@ -191,18 +191,16 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 				      | ECRYPTFS_ENCRYPTED);
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
-	if (!ecryptfs_inode_to_private(inode)->lower_file) {
-		rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
-		if (rc) {
-			printk(KERN_ERR "%s: Error attempting to initialize "
-			       "the persistent file for the dentry with name "
-			       "[%s]; rc = [%d]\n", __func__,
-			       ecryptfs_dentry->d_name.name, rc);
-			goto out_free;
-		}
+	rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to initialize "
+		       "the persistent file for the dentry with name "
+		       "[%s]; rc = [%d]\n", __func__,
+		       ecryptfs_dentry->d_name.name, rc);
+		goto out_free;
 	}
-	if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
-	    && !(file->f_flags & O_RDONLY)) {
+	if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
+	    == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) {
 		rc = -EPERM;
 		printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
 		       "file must hence be opened RO\n", __func__);
@@ -243,9 +241,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 		}
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
-	ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] "
-			"size: [0x%.16x]\n", inode, inode->i_ino,
-			i_size_read(inode));
+	ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = "
+			"[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino,
+			(unsigned long long)i_size_read(inode));
 	goto out;
 out_free:
 	kmem_cache_free(ecryptfs_file_info_cache,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 337352a94751..bd33f87a1907 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -185,15 +185,13 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
185 "context; rc = [%d]\n", rc); 185 "context; rc = [%d]\n", rc);
186 goto out; 186 goto out;
187 } 187 }
188 if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { 188 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
189 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 189 if (rc) {
190 if (rc) { 190 printk(KERN_ERR "%s: Error attempting to initialize "
191 printk(KERN_ERR "%s: Error attempting to initialize " 191 "the persistent file for the dentry with name "
192 "the persistent file for the dentry with name " 192 "[%s]; rc = [%d]\n", __func__,
193 "[%s]; rc = [%d]\n", __func__, 193 ecryptfs_dentry->d_name.name, rc);
194 ecryptfs_dentry->d_name.name, rc); 194 goto out;
195 goto out;
196 }
197 } 195 }
198 rc = ecryptfs_write_metadata(ecryptfs_dentry); 196 rc = ecryptfs_write_metadata(ecryptfs_dentry);
199 if (rc) { 197 if (rc) {
@@ -302,15 +300,13 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
302 rc = -ENOMEM; 300 rc = -ENOMEM;
303 goto out; 301 goto out;
304 } 302 }
305 if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { 303 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
306 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 304 if (rc) {
307 if (rc) { 305 printk(KERN_ERR "%s: Error attempting to initialize "
308 printk(KERN_ERR "%s: Error attempting to initialize " 306 "the persistent file for the dentry with name "
309 "the persistent file for the dentry with name " 307 "[%s]; rc = [%d]\n", __func__,
310 "[%s]; rc = [%d]\n", __func__, 308 ecryptfs_dentry->d_name.name, rc);
311 ecryptfs_dentry->d_name.name, rc); 309 goto out_free_kmem;
312 goto out_free_kmem;
313 }
314 } 310 }
315 crypt_stat = &ecryptfs_inode_to_private( 311 crypt_stat = &ecryptfs_inode_to_private(
316 ecryptfs_dentry->d_inode)->crypt_stat; 312 ecryptfs_dentry->d_inode)->crypt_stat;
@@ -441,7 +437,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
441 struct qstr lower_name; 437 struct qstr lower_name;
442 int rc = 0; 438 int rc = 0;
443 439
444 d_set_d_op(ecryptfs_dentry, &ecryptfs_dops);
445 if ((ecryptfs_dentry->d_name.len == 1 440 if ((ecryptfs_dentry->d_name.len == 1
446 && !strcmp(ecryptfs_dentry->d_name.name, ".")) 441 && !strcmp(ecryptfs_dentry->d_name.name, "."))
447 || (ecryptfs_dentry->d_name.len == 2 442 || (ecryptfs_dentry->d_name.len == 2
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index b1f6858a5223..c1436cff6f2d 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -59,7 +59,7 @@ static int process_request_key_err(long err_code)
 		break;
 	default:
 		ecryptfs_printk(KERN_WARNING, "Unknown error code: "
-				"[0x%.16x]\n", err_code);
+				"[0x%.16lx]\n", err_code);
 		rc = -EINVAL;
 	}
 	return rc;
@@ -130,7 +130,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,
 	} else {
 		rc = -EINVAL;
 		ecryptfs_printk(KERN_WARNING,
-				"Unsupported packet size: [%d]\n", size);
+				"Unsupported packet size: [%zd]\n", size);
 	}
 	return rc;
 }
@@ -1672,7 +1672,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 			auth_tok->session_key.decrypted_key_size);
 	crypt_stat->flags |= ECRYPTFS_KEY_VALID;
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "FEK of size [%d]:\n",
+		ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n",
 				crypt_stat->key_size);
 		ecryptfs_dump_hex(crypt_stat->key,
 				  crypt_stat->key_size);
@@ -1754,7 +1754,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
 			if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) {
 				ecryptfs_printk(KERN_ERR, "Expected "
 						"signature of size [%d]; "
-						"read size [%d]\n",
+						"read size [%zd]\n",
 						ECRYPTFS_SIG_SIZE,
 						tag_11_contents_size);
 				rc = -EIO;
@@ -1787,8 +1787,8 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
 				goto out_wipe_list;
 			break;
 		default:
-			ecryptfs_printk(KERN_DEBUG, "No packet at offset "
-					"[%d] of the file header; hex value of "
+			ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] "
+					"of the file header; hex value of "
 					"character is [0x%.2x]\n", i, src[i]);
 			next_packet_is_auth_tok_packet = 0;
 		}
@@ -1864,8 +1864,8 @@ found_matching_auth_tok:
 			"session key for authentication token with sig "
 			"[%.*s]; rc = [%d]. Removing auth tok "
 			"candidate from the list and searching for "
-			"the next match.\n", candidate_auth_tok_sig,
-			ECRYPTFS_SIG_SIZE_HEX, rc);
+			"the next match.\n", ECRYPTFS_SIG_SIZE_HEX,
+			candidate_auth_tok_sig, rc);
 		list_for_each_entry_safe(auth_tok_list_item,
 					 auth_tok_list_item_tmp,
 					 &auth_tok_list, list) {
@@ -2168,7 +2168,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	if (encrypted_session_key_valid) {
 		ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; "
 				"using auth_tok->session_key.encrypted_key, "
-				"where key_rec->enc_key_size = [%d]\n",
+				"where key_rec->enc_key_size = [%zd]\n",
 				key_rec->enc_key_size);
 		memcpy(key_rec->enc_key,
 		       auth_tok->session_key.encrypted_key,
@@ -2198,7 +2198,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	if (rc < 1 || rc > 2) {
 		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
 				"for crypt_stat session key; expected rc = 1; "
-				"got rc = [%d]. key_rec->enc_key_size = [%d]\n",
+				"got rc = [%d]. key_rec->enc_key_size = [%zd]\n",
 				rc, key_rec->enc_key_size);
 		rc = -ENOMEM;
 		goto out;
@@ -2209,7 +2209,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
 				"for crypt_stat encrypted session key; "
 				"expected rc = 1; got rc = [%d]. "
-				"key_rec->enc_key_size = [%d]\n", rc,
+				"key_rec->enc_key_size = [%zd]\n", rc,
 				key_rec->enc_key_size);
 		rc = -ENOMEM;
 		goto out;
@@ -2224,7 +2224,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 		goto out;
 	}
 	rc = 0;
-	ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n",
+	ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n",
 			crypt_stat->key_size);
 	rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg,
 				      (*key_rec).enc_key_size);
@@ -2235,7 +2235,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	}
 	ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n");
 	if (ecryptfs_verbosity > 0) {
-		ecryptfs_printk(KERN_DEBUG, "EFEK of size [%d]:\n",
+		ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n",
 				key_rec->enc_key_size);
 		ecryptfs_dump_hex(key_rec->enc_key,
 				  key_rec->enc_key_size);
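Most of the eCryptfs changes above are printk format hygiene: size_t arguments take %zd/%zu, long values take %lx, and 64-bit offsets are cast to unsigned long long for %llx. A small userspace illustration of why %d on a size_t is wrong on LP64 targets (values hypothetical):

	#include <stdio.h>

	int main(void)
	{
		size_t key_size = 16;	/* 8 bytes wide on LP64 */
		/* "%d" would consume only half of the 8-byte vararg slot and
		 * desynchronize any arguments that follow it. */
		printf("Key size [%zu]\n", key_size);
		return 0;
	}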
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 351038675376..758323a0f09a 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -36,6 +36,7 @@
 #include <linux/parser.h>
 #include <linux/fs_stack.h>
 #include <linux/slab.h>
+#include <linux/magic.h>
 #include "ecryptfs_kernel.h"
 
 /**
@@ -141,25 +142,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
 	return rc;
 }
 
-/**
- * ecryptfs_interpose
- * @lower_dentry: Existing dentry in the lower filesystem
- * @dentry: ecryptfs' dentry
- * @sb: ecryptfs's super_block
- * @flags: flags to govern behavior of interpose procedure
- *
- * Interposes upper and lower dentries.
- *
- * Returns zero on success; non-zero otherwise
- */
-int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
-		       struct super_block *sb, u32 flags)
+static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
+		       struct super_block *sb)
 {
-	struct inode *lower_inode;
 	struct inode *inode;
 	int rc = 0;
 
-	lower_inode = lower_dentry->d_inode;
 	if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
 		rc = -EXDEV;
 		goto out;
@@ -189,17 +177,38 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
 	if (special_file(lower_inode->i_mode))
 		init_special_inode(inode, lower_inode->i_mode,
 				   lower_inode->i_rdev);
-	d_set_d_op(dentry, &ecryptfs_dops);
 	fsstack_copy_attr_all(inode, lower_inode);
 	/* This size will be overwritten for real files w/ headers and
 	 * other metadata */
 	fsstack_copy_inode_size(inode, lower_inode);
+	return inode;
+out:
+	return ERR_PTR(rc);
+}
+
+/**
+ * ecryptfs_interpose
+ * @lower_dentry: Existing dentry in the lower filesystem
+ * @dentry: ecryptfs' dentry
+ * @sb: ecryptfs's super_block
+ * @flags: flags to govern behavior of interpose procedure
+ *
+ * Interposes upper and lower dentries.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
+		       struct super_block *sb, u32 flags)
+{
+	struct inode *lower_inode = lower_dentry->d_inode;
+	struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 	if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
 		d_add(dentry, inode);
 	else
 		d_instantiate(dentry, inode);
-out:
-	return rc;
+	return 0;
 }
 
 enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
@@ -492,59 +501,11 @@ struct kmem_cache *ecryptfs_sb_info_cache;
 static struct file_system_type ecryptfs_fs_type;
 
 /**
- * ecryptfs_read_super
- * @sb: The ecryptfs super block
- * @dev_name: The path to mount over
- *
- * Read the super block of the lower filesystem, and use
- * ecryptfs_interpose to create our initial inode and super block
- * struct.
- */
-static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
-{
-	struct path path;
-	int rc;
-
-	rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
-	if (rc) {
-		ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
-		goto out;
-	}
-	if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
-		rc = -EINVAL;
-		printk(KERN_ERR "Mount on filesystem of type "
-			"eCryptfs explicitly disallowed due to "
-			"known incompatibilities\n");
-		goto out_free;
-	}
-	ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
-	sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
-	sb->s_blocksize = path.dentry->d_sb->s_blocksize;
-	ecryptfs_set_dentry_lower(sb->s_root, path.dentry);
-	ecryptfs_set_dentry_lower_mnt(sb->s_root, path.mnt);
-	rc = ecryptfs_interpose(path.dentry, sb->s_root, sb, 0);
-	if (rc)
-		goto out_free;
-	rc = 0;
-	goto out;
-out_free:
-	path_put(&path);
-out:
-	return rc;
-}
-
-/**
  * ecryptfs_get_sb
  * @fs_type
  * @flags
 * @dev_name: The path to mount over
 * @raw_data: The options passed into the kernel
- *
- * The whole ecryptfs_get_sb process is broken into 3 functions:
- * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
- * ecryptfs_read_super(): this accesses the lower filesystem and uses
- *                        ecryptfs_interpose to perform most of the linking
- * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
 */
 static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags,
 			const char *dev_name, void *raw_data)
@@ -553,6 +514,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 	struct ecryptfs_sb_info *sbi;
 	struct ecryptfs_dentry_info *root_info;
 	const char *err = "Getting sb failed";
+	struct inode *inode;
+	struct path path;
 	int rc;
 
 	sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
@@ -575,10 +538,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 
 	s->s_flags = flags;
 	rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
-	if (rc) {
-		deactivate_locked_super(s);
-		goto out;
-	}
+	if (rc)
+		goto out1;
 
 	ecryptfs_set_superblock_private(s, sbi);
 	s->s_bdi = &sbi->bdi;
@@ -586,34 +547,55 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 	/* ->kill_sb() will take care of sbi after that point */
 	sbi = NULL;
 	s->s_op = &ecryptfs_sops;
+	s->s_d_op = &ecryptfs_dops;
 
-	rc = -ENOMEM;
-	s->s_root = d_alloc(NULL, &(const struct qstr) {
-		.hash = 0,.name = "/",.len = 1});
+	err = "Reading sb failed";
+	rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "kern_path() failed\n");
+		goto out1;
+	}
+	if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
+		rc = -EINVAL;
+		printk(KERN_ERR "Mount on filesystem of type "
+			"eCryptfs explicitly disallowed due to "
+			"known incompatibilities\n");
+		goto out_free;
+	}
+	ecryptfs_set_superblock_lower(s, path.dentry->d_sb);
+	s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
+	s->s_blocksize = path.dentry->d_sb->s_blocksize;
+	s->s_magic = ECRYPTFS_SUPER_MAGIC;
+
+	inode = ecryptfs_get_inode(path.dentry->d_inode, s);
+	rc = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_free;
+
+	s->s_root = d_alloc_root(inode);
 	if (!s->s_root) {
-		deactivate_locked_super(s);
-		goto out;
+		iput(inode);
+		rc = -ENOMEM;
+		goto out_free;
 	}
-	d_set_d_op(s->s_root, &ecryptfs_dops);
-	s->s_root->d_sb = s;
-	s->s_root->d_parent = s->s_root;
 
+	rc = -ENOMEM;
 	root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
-	if (!root_info) {
-		deactivate_locked_super(s);
-		goto out;
-	}
+	if (!root_info)
+		goto out_free;
+
 	/* ->kill_sb() will take care of root_info */
 	ecryptfs_set_dentry_private(s->s_root, root_info);
+	ecryptfs_set_dentry_lower(s->s_root, path.dentry);
+	ecryptfs_set_dentry_lower_mnt(s->s_root, path.mnt);
+
 	s->s_flags |= MS_ACTIVE;
-	rc = ecryptfs_read_super(s, dev_name);
-	if (rc) {
-		deactivate_locked_super(s);
-		err = "Reading sb failed";
-		goto out;
-	}
 	return dget(s->s_root);
 
+out_free:
+	path_put(&path);
+out1:
+	deactivate_locked_super(s);
 out:
 	if (sbi) {
 		ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
@@ -828,9 +810,10 @@ static int __init ecryptfs_init(void)
 		ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
 				"larger than the host's page size, and so "
 				"eCryptfs cannot run on this system. The "
-				"default eCryptfs extent size is [%d] bytes; "
-				"the page size is [%d] bytes.\n",
-				ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE);
+				"default eCryptfs extent size is [%u] bytes; "
+				"the page size is [%lu] bytes.\n",
+				ECRYPTFS_DEFAULT_EXTENT_SIZE,
+				(unsigned long)PAGE_CACHE_SIZE);
 		goto out;
 	}
 	rc = ecryptfs_init_kmem_caches();
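A note on the new mount path: d_alloc_root() takes over the inode reference only when it succeeds, so the failure branch must drop it by hand. The contract, extracted from the hunk above as a sketch:

	inode = ecryptfs_get_inode(path.dentry->d_inode, s);
	if (IS_ERR(inode))
		goto out_free;		/* no dentry yet, nothing extra to drop */

	s->s_root = d_alloc_root(inode);
	if (!s->s_root) {
		iput(inode);		/* d_alloc_root() failed; release our reference */
		rc = -ENOMEM;
		goto out_free;
	}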
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index b1d82756544b..cc64fca89f8d 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -65,7 +65,7 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
 	rc = ecryptfs_encrypt_page(page);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "Error encrypting "
-				"page (upper index [0x%.16x])\n", page->index);
+				"page (upper index [0x%.16lx])\n", page->index);
 		ClearPageUptodate(page);
 		goto out;
 	}
@@ -237,7 +237,7 @@ out:
 		ClearPageUptodate(page);
 	else
 		SetPageUptodate(page);
-	ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n",
+	ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n",
 			page->index);
 	unlock_page(page);
 	return rc;
@@ -290,6 +290,7 @@ static int ecryptfs_write_begin(struct file *file,
 		return -ENOMEM;
 	*pagep = page;
 
+	prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
 	if (!PageUptodate(page)) {
 		struct ecryptfs_crypt_stat *crypt_stat =
 			&ecryptfs_inode_to_private(mapping->host)->crypt_stat;
@@ -335,18 +336,23 @@ static int ecryptfs_write_begin(struct file *file,
 				SetPageUptodate(page);
 			}
 		} else {
-			rc = ecryptfs_decrypt_page(page);
-			if (rc) {
-				printk(KERN_ERR "%s: Error decrypting page "
-				       "at index [%ld]; rc = [%d]\n",
-				       __func__, page->index, rc);
-				ClearPageUptodate(page);
-				goto out;
+			if (prev_page_end_size
+			    >= i_size_read(page->mapping->host)) {
+				zero_user(page, 0, PAGE_CACHE_SIZE);
+			} else {
+				rc = ecryptfs_decrypt_page(page);
+				if (rc) {
+					printk(KERN_ERR "%s: Error decrypting "
+					       "page at index [%ld]; "
+					       "rc = [%d]\n",
+					       __func__, page->index, rc);
+					ClearPageUptodate(page);
+					goto out;
+				}
 			}
 			SetPageUptodate(page);
 		}
 	}
-	prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
 	/* If creating a page or more of holes, zero them out via truncate.
 	 * Note, this will increase i_size. */
 	if (index != 0) {
@@ -488,7 +494,7 @@ static int ecryptfs_write_end(struct file *file,
 	} else
 		ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
 	ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
-			"(page w/ index = [0x%.16x], to = [%d])\n", index, to);
+			"(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
 	if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
 		rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
 						       to);
@@ -503,19 +509,20 @@ static int ecryptfs_write_end(struct file *file,
 	rc = fill_zeros_to_end_of_page(page, to);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
-			"zeros in page with index = [0x%.16x]\n", index);
+			"zeros in page with index = [0x%.16lx]\n", index);
 		goto out;
 	}
 	rc = ecryptfs_encrypt_page(page);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
-				"index [0x%.16x])\n", index);
+				"index [0x%.16lx])\n", index);
 		goto out;
 	}
 	if (pos + copied > i_size_read(ecryptfs_inode)) {
 		i_size_write(ecryptfs_inode, pos + copied);
 		ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
-				"[0x%.16x]\n", i_size_read(ecryptfs_inode));
+				"[0x%.16llx]\n",
+				(unsigned long long)i_size_read(ecryptfs_inode));
 	}
 	rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
 	if (rc)
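The write_begin fix does two things: prev_page_end_size is now computed before its first use, and a page that starts at or beyond i_size is zero-filled rather than decrypted, since the lower file holds no data for it. The decision in isolation, as a sketch of the logic above:

	loff_t page_start = (loff_t)index << PAGE_CACHE_SHIFT;

	if (page_start >= i_size_read(mapping->host))
		zero_user(page, 0, PAGE_CACHE_SIZE);	/* hole: nothing on disk to decrypt */
	else
		rc = ecryptfs_decrypt_page(page);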
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8cf07242067d..cc8a9b7d6064 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -217,7 +217,7 @@ struct ep_send_events_data {
  * Configuration options available inside /proc/sys/fs/epoll/
  */
 /* Maximum number of epoll watched descriptors, per user */
-static int max_user_watches __read_mostly;
+static long max_user_watches __read_mostly;
 
 /*
  * This mutex is used to serialize ep_free() and eventpoll_release_file().
@@ -240,16 +240,18 @@ static struct kmem_cache *pwq_cache __read_mostly;
 
 #include <linux/sysctl.h>
 
-static int zero;
+static long zero;
+static long long_max = LONG_MAX;
 
 ctl_table epoll_table[] = {
 	{
 		.procname = "max_user_watches",
 		.data = &max_user_watches,
-		.maxlen = sizeof(int),
+		.maxlen = sizeof(max_user_watches),
 		.mode = 0644,
-		.proc_handler = proc_dointvec_minmax,
+		.proc_handler = proc_doulongvec_minmax,
 		.extra1 = &zero,
+		.extra2 = &long_max,
 	},
 	{ }
 };
@@ -561,7 +563,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	/* At this point it is safe to free the eventpoll item */
 	kmem_cache_free(epi_cache, epi);
 
-	atomic_dec(&ep->user->epoll_watches);
+	atomic_long_dec(&ep->user->epoll_watches);
 
 	return 0;
 }
@@ -898,11 +900,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 {
 	int error, revents, pwake = 0;
 	unsigned long flags;
+	long user_watches;
 	struct epitem *epi;
 	struct ep_pqueue epq;
 
-	if (unlikely(atomic_read(&ep->user->epoll_watches) >=
-		     max_user_watches))
+	user_watches = atomic_long_read(&ep->user->epoll_watches);
+	if (unlikely(user_watches >= max_user_watches))
 		return -ENOSPC;
 	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
 		return -ENOMEM;
@@ -966,7 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	spin_unlock_irqrestore(&ep->lock, flags);
 
-	atomic_inc(&ep->user->epoll_watches);
+	atomic_long_inc(&ep->user->epoll_watches);
 
 	/* We have to call this outside the lock */
 	if (pwake)
@@ -1426,6 +1429,7 @@ static int __init eventpoll_init(void)
 	 */
 	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
 		EP_ITEM_COST;
+	BUG_ON(max_user_watches < 0);
 
 	/* Initialize the structure used to perform safe poll wait head wake ups */
 	ep_nested_calls_init(&poll_safewake_ncalls);
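Why max_user_watches becomes a long: the budget is computed in bytes as (lowmem / 25) << PAGE_SHIFT, and with enough RAM that intermediate value exceeds what a 32-bit int can hold, so the stored limit could wrap negative. A userspace sketch of the truncation on an LP64 machine (memory size and per-watch cost hypothetical):

	#include <stdio.h>

	int main(void)
	{
		unsigned long totalram = 1UL << 31;	/* ~8 TiB worth of 4 KiB pages */
		long budget = ((totalram / 25) << 12) / 90;	/* ~90 bytes per watch */

		printf("as long: %ld, as int: %d\n", budget, (int)budget);
		return 0;
	}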
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2709b34206ab..47cda410b548 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -28,21 +28,30 @@
 
 typedef struct ext2_dir_entry_2 ext2_dirent;
 
+/*
+ * Tests against MAX_REC_LEN etc were put in place for 64k block
+ * sizes; if that is not possible on this arch, we can skip
+ * those tests and speed things up.
+ */
 static inline unsigned ext2_rec_len_from_disk(__le16 dlen)
 {
 	unsigned len = le16_to_cpu(dlen);
 
+#if (PAGE_CACHE_SIZE >= 65536)
 	if (len == EXT2_MAX_REC_LEN)
 		return 1 << 16;
+#endif
 	return len;
 }
 
 static inline __le16 ext2_rec_len_to_disk(unsigned len)
 {
+#if (PAGE_CACHE_SIZE >= 65536)
 	if (len == (1 << 16))
 		return cpu_to_le16(EXT2_MAX_REC_LEN);
 	else
 		BUG_ON(len > (1 << 16));
+#endif
 	return cpu_to_le16(len);
 }
 
@@ -129,15 +138,15 @@ static void ext2_check_page(struct page *page, int quiet)
 		p = (ext2_dirent *)(kaddr + offs);
 		rec_len = ext2_rec_len_from_disk(p->rec_len);
 
-		if (rec_len < EXT2_DIR_REC_LEN(1))
+		if (unlikely(rec_len < EXT2_DIR_REC_LEN(1)))
 			goto Eshort;
-		if (rec_len & 3)
+		if (unlikely(rec_len & 3))
 			goto Ealign;
-		if (rec_len < EXT2_DIR_REC_LEN(p->name_len))
+		if (unlikely(rec_len < EXT2_DIR_REC_LEN(p->name_len)))
 			goto Enamelen;
-		if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
+		if (unlikely(((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)))
 			goto Espan;
-		if (le32_to_cpu(p->inode) > max_inumber)
+		if (unlikely(le32_to_cpu(p->inode) > max_inumber))
 			goto Einumber;
 	}
 	if (offs != limit)
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index f8aecd2e3297..2e1d8341d827 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -67,7 +67,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
 	inode = NULL;
 	if (ino) {
 		inode = ext2_iget(dir->i_sb, ino);
-		if (unlikely(IS_ERR(inode))) {
+		if (IS_ERR(inode)) {
 			if (PTR_ERR(inode) == -ESTALE) {
 				ext2_error(dir->i_sb, __func__,
 					"deleted inode referenced: %lu",
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index e0c6380ff992..7731695e65d9 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -43,9 +43,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext2_sync_fs(struct super_block *sb, int wait);
 
-void ext2_error (struct super_block * sb, const char * function,
-		 const char * fmt, ...)
+void ext2_error(struct super_block *sb, const char *function,
+		const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = sbi->s_es;
@@ -59,9 +60,13 @@ void ext2_error (struct super_block * sb, const char * function,
 	}
 
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function);
-	vprintk(fmt, args);
-	printk("\n");
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n",
+	       sb->s_id, function, &vaf);
+
 	va_end(args);
 
 	if (test_opt(sb, ERRORS_PANIC))
@@ -76,12 +81,16 @@ void ext2_msg(struct super_block *sb, const char *prefix,
 void ext2_msg(struct super_block *sb, const char *prefix,
 	      const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
 
 	va_start(args, fmt);
-	printk("%sEXT2-fs (%s): ", prefix, sb->s_id);
-	vprintk(fmt, args);
-	printk("\n");
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk("%sEXT2-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+
 	va_end(args);
 }
 
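The ext2_error()/ext2_msg() conversion relies on the kernel's %pV printk extension: the format string and va_list are bundled into a struct va_format and expanded inside a single printk() call, so the prefix, message and trailing newline can no longer be interleaved with concurrent printk output. The pattern on its own, as a sketch mirroring the hunks above:

	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n", sb->s_id, function, &vaf);
	va_end(args);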
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index f84700be3274..c2e4dce984d2 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -199,14 +199,6 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
 			goto found;
 		entry = next;
 	}
-	/* Check the remaining name entries */
-	while (!IS_LAST_ENTRY(entry)) {
-		struct ext2_xattr_entry *next =
-			EXT2_XATTR_NEXT(entry);
-		if ((char *)next >= end)
-			goto bad_block;
-		entry = next;
-	}
 	if (ext2_xattr_cache_insert(bh))
 		ea_idebug(inode, "cache insert failed");
 	error = -ENODATA;
@@ -355,7 +347,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
 /*
  * ext2_xattr_set()
  *
- * Create, replace or remove an extended attribute for this inode. Buffer
+ * Create, replace or remove an extended attribute for this inode. Value
  * is NULL to remove an existing extended attribute, and non-NULL to
 * either replace an existing extended attribute, or create a new extended
 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index b3db22649426..045995c8ce5a 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -20,6 +20,7 @@
 #include <linux/ext3_jbd.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 
 /*
  * balloc.c contains the blocks allocation and deallocation routines
@@ -39,6 +40,21 @@
 
 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
 
+/*
+ * Calculate the block group number and offset, given a block number
+ */
+static void ext3_get_group_no_and_offset(struct super_block *sb,
+	ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
+{
+	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+
+	blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
+	if (offsetp)
+		*offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
+	if (blockgrpp)
+		*blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
+}
+
 /**
  * ext3_get_group_desc() -- load group descriptor from disk
 * @sb: super block
@@ -1885,3 +1901,253 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
1885 return ext3_bg_num_gdb_meta(sb,group); 1901 return ext3_bg_num_gdb_meta(sb,group);
1886 1902
1887} 1903}
1904
1905/**
1906 * ext3_trim_all_free -- function to trim all free space in alloc. group
1907 * @sb: super block for file system
1908 * @group: allocation group to trim
1909 * @start: first group block to examine
1910 * @max: last group block to examine
1911 * @gdp: allocation group description structure
1912 * @minblocks: minimum extent block count
1913 *
1914 * ext3_trim_all_free walks through group's block bitmap searching for free
1915 * blocks. When the free block is found, it tries to allocate this block and
1916 * consequent free block to get the biggest free extent possible, until it
1917 * reaches any used block. Then issue a TRIM command on this extent and free
1918 * the extent in the block bitmap. This is done until whole group is scanned.
1919 */
1920ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
1921 ext3_grpblk_t start, ext3_grpblk_t max,
1922 ext3_grpblk_t minblocks)
1923{
1924 handle_t *handle;
1925 ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
1926 ext3_fsblk_t discard_block;
1927 struct ext3_sb_info *sbi;
1928 struct buffer_head *gdp_bh, *bitmap_bh = NULL;
1929 struct ext3_group_desc *gdp;
1930 int err = 0, ret = 0;
1931
1932 /*
1933 * We will update one block bitmap, and one group descriptor
1934 */
1935 handle = ext3_journal_start_sb(sb, 2);
1936 if (IS_ERR(handle))
1937 return PTR_ERR(handle);
1938
1939 bitmap_bh = read_block_bitmap(sb, group);
1940 if (!bitmap_bh) {
1941 err = -EIO;
1942 goto err_out;
1943 }
1944
1945 BUFFER_TRACE(bitmap_bh, "getting undo access");
1946 err = ext3_journal_get_undo_access(handle, bitmap_bh);
1947 if (err)
1948 goto err_out;
1949
1950 gdp = ext3_get_group_desc(sb, group, &gdp_bh);
1951 if (!gdp) {
1952 err = -EIO;
1953 goto err_out;
1954 }
1955
1956 BUFFER_TRACE(gdp_bh, "get_write_access");
1957 err = ext3_journal_get_write_access(handle, gdp_bh);
1958 if (err)
1959 goto err_out;
1960
1961 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1962 sbi = EXT3_SB(sb);
1963
1964 /* Walk through the whole group */
1965 while (start < max) {
1966 start = bitmap_search_next_usable_block(start, bitmap_bh, max);
1967 if (start < 0)
1968 break;
1969 next = start;
1970
1971 /*
1972 * Allocate contiguous free extents by setting bits in the
1973 * block bitmap
1974 */
1975 while (next < max
1976 && claim_block(sb_bgl_lock(sbi, group),
1977 next, bitmap_bh)) {
1978 next++;
1979 }
1980
1981 /* We did not claim any blocks */
1982 if (next == start)
1983 continue;
1984
1985 discard_block = (ext3_fsblk_t)start +
1986 ext3_group_first_block_no(sb, group);
1987
1988 /* Update counters */
1989 spin_lock(sb_bgl_lock(sbi, group));
1990 le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
1991 spin_unlock(sb_bgl_lock(sbi, group));
1992 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
1993
1994 /* Do not issue a TRIM on extents smaller than minblocks */
1995 if ((next - start) < minblocks)
1996 goto free_extent;
1997
1998 /* Send the TRIM command down to the device */
1999 err = sb_issue_discard(sb, discard_block, next - start,
2000 GFP_NOFS, 0);
2001 count += (next - start);
2002free_extent:
2003 freed = 0;
2004
2005 /*
2006 * Clear bits in the bitmap
2007 */
2008 for (bit = start; bit < next; bit++) {
2009 BUFFER_TRACE(bitmap_bh, "clear bit");
2010 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
2011 bit, bitmap_bh->b_data)) {
2012 ext3_error(sb, __func__,
2013 "bit already cleared for block "E3FSBLK,
2014 (unsigned long)bit);
2015 BUFFER_TRACE(bitmap_bh, "bit already cleared");
2016 } else {
2017 freed++;
2018 }
2019 }
2020
 2021 /* Update counters */
2022 spin_lock(sb_bgl_lock(sbi, group));
2023 le16_add_cpu(&gdp->bg_free_blocks_count, freed);
2024 spin_unlock(sb_bgl_lock(sbi, group));
2025 percpu_counter_add(&sbi->s_freeblocks_counter, freed);
2026
2027 start = next;
2028 if (err < 0) {
2029 if (err != -EOPNOTSUPP)
2030 ext3_warning(sb, __func__, "Discard command "
2031 "returned error %d\n", err);
2032 break;
2033 }
2034
2035 if (fatal_signal_pending(current)) {
2036 err = -ERESTARTSYS;
2037 break;
2038 }
2039
2040 cond_resched();
2041
2042 /* No more suitable extents */
2043 if ((free_blocks - count) < minblocks)
2044 break;
2045 }
2046
2047 /* We dirtied the bitmap block */
2048 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
2049 ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
2050 if (!err)
2051 err = ret;
2052
2053 /* And the group descriptor block */
2054 BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
2055 ret = ext3_journal_dirty_metadata(handle, gdp_bh);
2056 if (!err)
2057 err = ret;
2058
2059 ext3_debug("trimmed %d blocks in the group %d\n",
2060 count, group);
2061
2062err_out:
2063 if (err)
2064 count = err;
2065 ext3_journal_stop(handle);
2066 brelse(bitmap_bh);
2067
2068 return count;
2069}
2070
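
Condensed, illustrative sketch of the scan loop above (journaling, counter updates and error handling elided; identifiers are those used in the patch). The key point is that each run of free blocks is first claimed in the bitmap, so the allocator cannot hand it out while the discard is in flight, and it is only released back afterwards:

	while (start < max) {
		start = bitmap_search_next_usable_block(start, bitmap_bh, max);
		if (start < 0)
			break;
		/* claim as many consecutive free blocks as possible */
		next = start;
		while (next < max &&
		       claim_block(sb_bgl_lock(sbi, group), next, bitmap_bh))
			next++;
		if (next == start)		/* lost the race, move on */
			continue;
		if (next - start >= minblocks)
			/* discard_block = start + ext3_group_first_block_no() */
			sb_issue_discard(sb, discard_block, next - start,
					 GFP_NOFS, 0);
		/* release the claimed run back to the bitmap */
		for (bit = start; bit < next; bit++)
			ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
					      bit, bitmap_bh->b_data);
		start = next;
	}
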
2071/**
 2072 * ext3_trim_fs() -- trim ioctl handler
 2073 * @sb: superblock for filesystem
 2074 * @start: first byte to trim
 2075 * @len: number of bytes to trim from start
 2076 * @minlen: minimum extent length in bytes
2077 *
 2078 * ext3_trim_fs goes through all the allocation groups containing bytes from
 2079 * start to start+len. For each such group, the ext3_trim_all_free function
 2080 * is invoked to trim all free space.
2081 */
2082int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2083{
2084 ext3_grpblk_t last_block, first_block, free_blocks;
2085 unsigned long first_group, last_group;
2086 unsigned long group, ngroups;
2087 struct ext3_group_desc *gdp;
2088 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
2089 uint64_t start, len, minlen, trimmed;
2090 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
2091 int ret = 0;
2092
2093 start = range->start >> sb->s_blocksize_bits;
2094 len = range->len >> sb->s_blocksize_bits;
2095 minlen = range->minlen >> sb->s_blocksize_bits;
2096 trimmed = 0;
2097
2098 if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
2099 return -EINVAL;
2100 if (start >= max_blks)
2101 goto out;
2102 if (start < le32_to_cpu(es->s_first_data_block)) {
2103 len -= le32_to_cpu(es->s_first_data_block) - start;
2104 start = le32_to_cpu(es->s_first_data_block);
2105 }
2106 if (start + len > max_blks)
2107 len = max_blks - start;
2108
2109 ngroups = EXT3_SB(sb)->s_groups_count;
2110 smp_rmb();
2111
2112 /* Determine first and last group to examine based on start and len */
2113 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
2114 &first_group, &first_block);
2115 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len),
2116 &last_group, &last_block);
2117 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
2118 last_block = EXT3_BLOCKS_PER_GROUP(sb);
2119
2120 if (first_group > last_group)
2121 return -EINVAL;
2122
2123 for (group = first_group; group <= last_group; group++) {
2124 gdp = ext3_get_group_desc(sb, group, NULL);
2125 if (!gdp)
2126 break;
2127
2128 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
2129 if (free_blocks < minlen)
2130 continue;
2131
2132 if (len >= EXT3_BLOCKS_PER_GROUP(sb))
2133 len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block);
2134 else
2135 last_block = first_block + len;
2136
2137 ret = ext3_trim_all_free(sb, group, first_block,
2138 last_block, minlen);
2139 if (ret < 0)
2140 break;
2141
2142 trimmed += ret;
2143 first_block = 0;
2144 }
2145
2146 if (ret >= 0)
2147 ret = 0;
2148
2149out:
2150 range->len = trimmed * sb->s_blocksize;
2151
2152 return ret;
2153}
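
A worked example of the range-to-group mapping, assuming 4 KiB blocks (s_blocksize_bits == 12, so 32768 blocks per group) and s_first_data_block == 0; all numbers are illustrative:

	/*
	 * range->start = 1 GiB   ->  start = 1 GiB >> 12   = block 262144
	 * range->len   = 512 MiB ->  len   = 512 MiB >> 12 = 131072 blocks
	 *
	 * ext3_get_group_no_and_offset(sb, 262144, ...) -> group 8,  offset 0
	 * ext3_get_group_no_and_offset(sb, 393216, ...) -> group 12, offset 0
	 *
	 * so the loop hands groups 8 through 12 to ext3_trim_all_free().
	 */
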
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index e2e72c367cf6..34f0a072b935 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -69,25 +69,26 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
69 const char * error_msg = NULL; 69 const char * error_msg = NULL;
70 const int rlen = ext3_rec_len_from_disk(de->rec_len); 70 const int rlen = ext3_rec_len_from_disk(de->rec_len);
71 71
72 if (rlen < EXT3_DIR_REC_LEN(1)) 72 if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
73 error_msg = "rec_len is smaller than minimal"; 73 error_msg = "rec_len is smaller than minimal";
74 else if (rlen % 4 != 0) 74 else if (unlikely(rlen % 4 != 0))
75 error_msg = "rec_len % 4 != 0"; 75 error_msg = "rec_len % 4 != 0";
76 else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) 76 else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
77 error_msg = "rec_len is too small for name_len"; 77 error_msg = "rec_len is too small for name_len";
78 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) 78 else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
79 error_msg = "directory entry across blocks"; 79 error_msg = "directory entry across blocks";
80 else if (le32_to_cpu(de->inode) > 80 else if (unlikely(le32_to_cpu(de->inode) >
81 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) 81 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
82 error_msg = "inode out of bounds"; 82 error_msg = "inode out of bounds";
83 83
84 if (error_msg != NULL) 84 if (unlikely(error_msg != NULL))
85 ext3_error (dir->i_sb, function, 85 ext3_error (dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
89 (unsigned long) le32_to_cpu(de->inode), 89 (unsigned long) le32_to_cpu(de->inode),
90 rlen, de->name_len); 90 rlen, de->name_len);
91
91 return error_msg == NULL ? 1 : 0; 92 return error_msg == NULL ? 1 : 0;
92} 93}
93 94
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index a9580617edd2..ae94f6d949f5 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2145,13 +2145,15 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
2145 if (try_to_extend_transaction(handle, inode)) { 2145 if (try_to_extend_transaction(handle, inode)) {
2146 if (bh) { 2146 if (bh) {
2147 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 2147 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2148 ext3_journal_dirty_metadata(handle, bh); 2148 if (ext3_journal_dirty_metadata(handle, bh))
2149 return;
2149 } 2150 }
2150 ext3_mark_inode_dirty(handle, inode); 2151 ext3_mark_inode_dirty(handle, inode);
2151 truncate_restart_transaction(handle, inode); 2152 truncate_restart_transaction(handle, inode);
2152 if (bh) { 2153 if (bh) {
2153 BUFFER_TRACE(bh, "retaking write access"); 2154 BUFFER_TRACE(bh, "retaking write access");
2154 ext3_journal_get_write_access(handle, bh); 2155 if (ext3_journal_get_write_access(handle, bh))
2156 return;
2155 } 2157 }
2156 } 2158 }
2157 2159
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 88974814783a..fc080dd561f7 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -276,7 +276,29 @@ group_add_out:
276 mnt_drop_write(filp->f_path.mnt); 276 mnt_drop_write(filp->f_path.mnt);
277 return err; 277 return err;
278 } 278 }
279 case FITRIM: {
279 280
281 struct super_block *sb = inode->i_sb;
282 struct fstrim_range range;
283 int ret = 0;
284
285 if (!capable(CAP_SYS_ADMIN))
286 return -EPERM;
287
288 if (copy_from_user(&range, (struct fstrim_range *)arg,
289 sizeof(range)))
290 return -EFAULT;
291
292 ret = ext3_trim_fs(sb, &range);
293 if (ret < 0)
294 return ret;
295
296 if (copy_to_user((struct fstrim_range *)arg, &range,
297 sizeof(range)))
298 return -EFAULT;
299
300 return 0;
301 }
280 302
281 default: 303 default:
282 return -ENOTTY; 304 return -ENOTTY;
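
For context, a minimal userspace sketch of how this new case is driven; it uses only the standard FITRIM interface from <linux/fs.h> (the mount-point path is an example, and the caller needs CAP_SYS_ADMIN, as checked above):

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

	int main(void)
	{
		struct fstrim_range range = {
			.start  = 0,
			.len    = UINT64_MAX,	/* trim the whole filesystem */
			.minlen = 0,		/* no minimum extent size */
		};
		int fd = open("/mnt", O_RDONLY); /* any fd on the ext3 fs */

		if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
			perror("FITRIM");
			return 1;
		}
		/* on return the kernel has stored the trimmed byte count */
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);
		return 0;
	}
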
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index bce9dce639b8..b27ba71810ec 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -858,6 +858,7 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
858 struct buffer_head * bh_use[NAMEI_RA_SIZE]; 858 struct buffer_head * bh_use[NAMEI_RA_SIZE];
859 struct buffer_head * bh, *ret = NULL; 859 struct buffer_head * bh, *ret = NULL;
860 unsigned long start, block, b; 860 unsigned long start, block, b;
861 const u8 *name = entry->name;
861 int ra_max = 0; /* Number of bh's in the readahead 862 int ra_max = 0; /* Number of bh's in the readahead
862 buffer, bh_use[] */ 863 buffer, bh_use[] */
863 int ra_ptr = 0; /* Current index into readahead 864 int ra_ptr = 0; /* Current index into readahead
@@ -871,6 +872,16 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
871 namelen = entry->len; 872 namelen = entry->len;
872 if (namelen > EXT3_NAME_LEN) 873 if (namelen > EXT3_NAME_LEN)
873 return NULL; 874 return NULL;
875 if ((namelen <= 2) && (name[0] == '.') &&
876 (name[1] == '.' || name[1] == 0)) {
877 /*
878 * "." or ".." will only be in the first block
879 * NFS may look up ".."; "." should be handled by the VFS
880 */
881 block = start = 0;
882 nblocks = 1;
883 goto restart;
884 }
874 if (is_dx(dir)) { 885 if (is_dx(dir)) {
875 bh = ext3_dx_find_entry(dir, entry, res_dir, &err); 886 bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
876 /* 887 /*
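
The fast path is safe because of the on-disk layout that ext3_mkdir (later in this patch) establishes: every directory's first block begins with the "." and ".." entries, so neither name ever requires the htree walk. Schematically:

	/*
	 * First block of an ext3 directory (illustrative):
	 *
	 *   offset 0 : { inode = <self>,   name_len = 1, name = "."  }
	 *   next     : { inode = <parent>, name_len = 2, name = ".." }
	 *   ...      : regular entries (or dx_root for indexed dirs)
	 */
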
@@ -961,55 +972,35 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
961 struct qstr *entry, struct ext3_dir_entry_2 **res_dir, 972 struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
962 int *err) 973 int *err)
963{ 974{
964 struct super_block * sb; 975 struct super_block *sb = dir->i_sb;
965 struct dx_hash_info hinfo; 976 struct dx_hash_info hinfo;
966 u32 hash;
967 struct dx_frame frames[2], *frame; 977 struct dx_frame frames[2], *frame;
968 struct ext3_dir_entry_2 *de, *top;
969 struct buffer_head *bh; 978 struct buffer_head *bh;
970 unsigned long block; 979 unsigned long block;
971 int retval; 980 int retval;
972 int namelen = entry->len;
973 const u8 *name = entry->name;
974 981
975 sb = dir->i_sb; 982 if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
976 /* NFS may look up ".." - look at dx_root directory block */ 983 return NULL;
977 if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) {
978 if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
979 return NULL;
980 } else {
981 frame = frames;
982 frame->bh = NULL; /* for dx_release() */
983 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
984 dx_set_block(frame->at, 0); /* dx_root block is 0 */
985 }
986 hash = hinfo.hash;
987 do { 984 do {
988 block = dx_get_block(frame->at); 985 block = dx_get_block(frame->at);
989 if (!(bh = ext3_bread (NULL,dir, block, 0, err))) 986 if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
990 goto errout; 987 goto errout;
991 de = (struct ext3_dir_entry_2 *) bh->b_data;
992 top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
993 EXT3_DIR_REC_LEN(0));
994 for (; de < top; de = ext3_next_entry(de)) {
995 int off = (block << EXT3_BLOCK_SIZE_BITS(sb))
996 + ((char *) de - bh->b_data);
997
998 if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) {
999 brelse(bh);
1000 *err = ERR_BAD_DX_DIR;
1001 goto errout;
1002 }
1003 988
1004 if (ext3_match(namelen, name, de)) { 989 retval = search_dirblock(bh, dir, entry,
1005 *res_dir = de; 990 block << EXT3_BLOCK_SIZE_BITS(sb),
1006 dx_release(frames); 991 res_dir);
1007 return bh; 992 if (retval == 1) {
1008 } 993 dx_release(frames);
994 return bh;
1009 } 995 }
1010 brelse (bh); 996 brelse(bh);
997 if (retval == -1) {
998 *err = ERR_BAD_DX_DIR;
999 goto errout;
1000 }
1001
1011 /* Check to see if we should continue to search */ 1002 /* Check to see if we should continue to search */
1012 retval = ext3_htree_next_block(dir, hash, frame, 1003 retval = ext3_htree_next_block(dir, hinfo.hash, frame,
1013 frames, NULL); 1004 frames, NULL);
1014 if (retval < 0) { 1005 if (retval < 0) {
1015 ext3_warning(sb, __func__, 1006 ext3_warning(sb, __func__,
@@ -1047,7 +1038,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
1047 return ERR_PTR(-EIO); 1038 return ERR_PTR(-EIO);
1048 } 1039 }
1049 inode = ext3_iget(dir->i_sb, ino); 1040 inode = ext3_iget(dir->i_sb, ino);
1050 if (unlikely(IS_ERR(inode))) { 1041 if (IS_ERR(inode)) {
1051 if (PTR_ERR(inode) == -ESTALE) { 1042 if (PTR_ERR(inode) == -ESTALE) {
1052 ext3_error(dir->i_sb, __func__, 1043 ext3_error(dir->i_sb, __func__,
1053 "deleted inode referenced: %lu", 1044 "deleted inode referenced: %lu",
@@ -1607,7 +1598,9 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1607 if (err) 1598 if (err)
1608 goto journal_error; 1599 goto journal_error;
1609 } 1600 }
1610 ext3_journal_dirty_metadata(handle, frames[0].bh); 1601 err = ext3_journal_dirty_metadata(handle, frames[0].bh);
1602 if (err)
1603 goto journal_error;
1611 } 1604 }
1612 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1605 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1613 if (!de) 1606 if (!de)
@@ -1644,8 +1637,13 @@ static int ext3_delete_entry (handle_t *handle,
1644 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) 1637 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
1645 return -EIO; 1638 return -EIO;
1646 if (de == de_del) { 1639 if (de == de_del) {
1640 int err;
1641
1647 BUFFER_TRACE(bh, "get_write_access"); 1642 BUFFER_TRACE(bh, "get_write_access");
1648 ext3_journal_get_write_access(handle, bh); 1643 err = ext3_journal_get_write_access(handle, bh);
1644 if (err)
1645 goto journal_error;
1646
1649 if (pde) 1647 if (pde)
1650 pde->rec_len = ext3_rec_len_to_disk( 1648 pde->rec_len = ext3_rec_len_to_disk(
1651 ext3_rec_len_from_disk(pde->rec_len) + 1649 ext3_rec_len_from_disk(pde->rec_len) +
@@ -1654,7 +1652,12 @@ static int ext3_delete_entry (handle_t *handle,
1654 de->inode = 0; 1652 de->inode = 0;
1655 dir->i_version++; 1653 dir->i_version++;
1656 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 1654 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1657 ext3_journal_dirty_metadata(handle, bh); 1655 err = ext3_journal_dirty_metadata(handle, bh);
1656 if (err) {
1657journal_error:
1658 ext3_std_error(dir->i_sb, err);
1659 return err;
1660 }
1658 return 0; 1661 return 0;
1659 } 1662 }
1660 i += ext3_rec_len_from_disk(de->rec_len); 1663 i += ext3_rec_len_from_disk(de->rec_len);
@@ -1762,7 +1765,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1762{ 1765{
1763 handle_t *handle; 1766 handle_t *handle;
1764 struct inode * inode; 1767 struct inode * inode;
1765 struct buffer_head * dir_block; 1768 struct buffer_head * dir_block = NULL;
1766 struct ext3_dir_entry_2 * de; 1769 struct ext3_dir_entry_2 * de;
1767 int err, retries = 0; 1770 int err, retries = 0;
1768 1771
@@ -1790,15 +1793,14 @@ retry:
1790 inode->i_fop = &ext3_dir_operations; 1793 inode->i_fop = &ext3_dir_operations;
1791 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; 1794 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1792 dir_block = ext3_bread (handle, inode, 0, 1, &err); 1795 dir_block = ext3_bread (handle, inode, 0, 1, &err);
1793 if (!dir_block) { 1796 if (!dir_block)
1794 drop_nlink(inode); /* is this nlink == 0? */ 1797 goto out_clear_inode;
1795 unlock_new_inode(inode); 1798
1796 ext3_mark_inode_dirty(handle, inode);
1797 iput (inode);
1798 goto out_stop;
1799 }
1800 BUFFER_TRACE(dir_block, "get_write_access"); 1799 BUFFER_TRACE(dir_block, "get_write_access");
1801 ext3_journal_get_write_access(handle, dir_block); 1800 err = ext3_journal_get_write_access(handle, dir_block);
1801 if (err)
1802 goto out_clear_inode;
1803
1802 de = (struct ext3_dir_entry_2 *) dir_block->b_data; 1804 de = (struct ext3_dir_entry_2 *) dir_block->b_data;
1803 de->inode = cpu_to_le32(inode->i_ino); 1805 de->inode = cpu_to_le32(inode->i_ino);
1804 de->name_len = 1; 1806 de->name_len = 1;
@@ -1814,11 +1816,16 @@ retry:
1814 ext3_set_de_type(dir->i_sb, de, S_IFDIR); 1816 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1815 inode->i_nlink = 2; 1817 inode->i_nlink = 2;
1816 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); 1818 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
1817 ext3_journal_dirty_metadata(handle, dir_block); 1819 err = ext3_journal_dirty_metadata(handle, dir_block);
1818 brelse (dir_block); 1820 if (err)
1819 ext3_mark_inode_dirty(handle, inode); 1821 goto out_clear_inode;
1820 err = ext3_add_entry (handle, dentry, inode); 1822
1823 err = ext3_mark_inode_dirty(handle, inode);
1824 if (!err)
1825 err = ext3_add_entry (handle, dentry, inode);
1826
1821 if (err) { 1827 if (err) {
1828out_clear_inode:
1822 inode->i_nlink = 0; 1829 inode->i_nlink = 0;
1823 unlock_new_inode(inode); 1830 unlock_new_inode(inode);
1824 ext3_mark_inode_dirty(handle, inode); 1831 ext3_mark_inode_dirty(handle, inode);
@@ -1827,10 +1834,14 @@ retry:
1827 } 1834 }
1828 inc_nlink(dir); 1835 inc_nlink(dir);
1829 ext3_update_dx_flag(dir); 1836 ext3_update_dx_flag(dir);
1830 ext3_mark_inode_dirty(handle, dir); 1837 err = ext3_mark_inode_dirty(handle, dir);
1838 if (err)
1839 goto out_clear_inode;
1840
1831 d_instantiate(dentry, inode); 1841 d_instantiate(dentry, inode);
1832 unlock_new_inode(inode); 1842 unlock_new_inode(inode);
1833out_stop: 1843out_stop:
1844 brelse(dir_block);
1834 ext3_journal_stop(handle); 1845 ext3_journal_stop(handle);
1835 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) 1846 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1836 goto retry; 1847 goto retry;
@@ -2353,7 +2364,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2353 goto end_rename; 2364 goto end_rename;
2354 } else { 2365 } else {
2355 BUFFER_TRACE(new_bh, "get write access"); 2366 BUFFER_TRACE(new_bh, "get write access");
2356 ext3_journal_get_write_access(handle, new_bh); 2367 retval = ext3_journal_get_write_access(handle, new_bh);
2368 if (retval)
2369 goto journal_error;
2357 new_de->inode = cpu_to_le32(old_inode->i_ino); 2370 new_de->inode = cpu_to_le32(old_inode->i_ino);
2358 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, 2371 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2359 EXT3_FEATURE_INCOMPAT_FILETYPE)) 2372 EXT3_FEATURE_INCOMPAT_FILETYPE))
@@ -2362,7 +2375,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2362 new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; 2375 new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
2363 ext3_mark_inode_dirty(handle, new_dir); 2376 ext3_mark_inode_dirty(handle, new_dir);
2364 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); 2377 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
2365 ext3_journal_dirty_metadata(handle, new_bh); 2378 retval = ext3_journal_dirty_metadata(handle, new_bh);
2379 if (retval)
2380 goto journal_error;
2366 brelse(new_bh); 2381 brelse(new_bh);
2367 new_bh = NULL; 2382 new_bh = NULL;
2368 } 2383 }
@@ -2411,10 +2426,17 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2411 ext3_update_dx_flag(old_dir); 2426 ext3_update_dx_flag(old_dir);
2412 if (dir_bh) { 2427 if (dir_bh) {
2413 BUFFER_TRACE(dir_bh, "get_write_access"); 2428 BUFFER_TRACE(dir_bh, "get_write_access");
2414 ext3_journal_get_write_access(handle, dir_bh); 2429 retval = ext3_journal_get_write_access(handle, dir_bh);
2430 if (retval)
2431 goto journal_error;
2415 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2432 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2416 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); 2433 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
2417 ext3_journal_dirty_metadata(handle, dir_bh); 2434 retval = ext3_journal_dirty_metadata(handle, dir_bh);
2435 if (retval) {
2436journal_error:
2437 ext3_std_error(new_dir->i_sb, retval);
2438 goto end_rename;
2439 }
2418 drop_nlink(old_dir); 2440 drop_nlink(old_dir);
2419 if (new_inode) { 2441 if (new_inode) {
2420 drop_nlink(new_inode); 2442 drop_nlink(new_inode);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index e746d30b1232..108b142e11ed 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -249,7 +249,11 @@ static int setup_new_group_blocks(struct super_block *sb,
249 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 249 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
250 set_buffer_uptodate(gdb); 250 set_buffer_uptodate(gdb);
251 unlock_buffer(gdb); 251 unlock_buffer(gdb);
252 ext3_journal_dirty_metadata(handle, gdb); 252 err = ext3_journal_dirty_metadata(handle, gdb);
253 if (err) {
254 brelse(gdb);
255 goto exit_bh;
256 }
253 ext3_set_bit(bit, bh->b_data); 257 ext3_set_bit(bit, bh->b_data);
254 brelse(gdb); 258 brelse(gdb);
255 } 259 }
@@ -269,7 +273,11 @@ static int setup_new_group_blocks(struct super_block *sb,
269 err = PTR_ERR(gdb); 273 err = PTR_ERR(gdb);
270 goto exit_bh; 274 goto exit_bh;
271 } 275 }
272 ext3_journal_dirty_metadata(handle, gdb); 276 err = ext3_journal_dirty_metadata(handle, gdb);
277 if (err) {
278 brelse(gdb);
279 goto exit_bh;
280 }
273 ext3_set_bit(bit, bh->b_data); 281 ext3_set_bit(bit, bh->b_data);
274 brelse(gdb); 282 brelse(gdb);
275 } 283 }
@@ -295,7 +303,11 @@ static int setup_new_group_blocks(struct super_block *sb,
295 err = PTR_ERR(it); 303 err = PTR_ERR(it);
296 goto exit_bh; 304 goto exit_bh;
297 } 305 }
298 ext3_journal_dirty_metadata(handle, it); 306 err = ext3_journal_dirty_metadata(handle, it);
307 if (err) {
308 brelse(it);
309 goto exit_bh;
310 }
299 brelse(it); 311 brelse(it);
300 ext3_set_bit(bit, bh->b_data); 312 ext3_set_bit(bit, bh->b_data);
301 } 313 }
@@ -306,7 +318,9 @@ static int setup_new_group_blocks(struct super_block *sb,
306 318
307 mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), 319 mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
308 bh->b_data); 320 bh->b_data);
309 ext3_journal_dirty_metadata(handle, bh); 321 err = ext3_journal_dirty_metadata(handle, bh);
322 if (err)
323 goto exit_bh;
310 brelse(bh); 324 brelse(bh);
311 325
312 /* Mark unused entries in inode bitmap used */ 326 /* Mark unused entries in inode bitmap used */
@@ -319,7 +333,7 @@ static int setup_new_group_blocks(struct super_block *sb,
319 333
320 mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), 334 mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
321 bh->b_data); 335 bh->b_data);
322 ext3_journal_dirty_metadata(handle, bh); 336 err = ext3_journal_dirty_metadata(handle, bh);
323exit_bh: 337exit_bh:
324 brelse(bh); 338 brelse(bh);
325 339
@@ -503,12 +517,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
503 * reserved inode, and will become GDT blocks (primary and backup). 517 * reserved inode, and will become GDT blocks (primary and backup).
504 */ 518 */
505 data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; 519 data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
506 ext3_journal_dirty_metadata(handle, dind); 520 err = ext3_journal_dirty_metadata(handle, dind);
521 if (err)
522 goto exit_group_desc;
507 brelse(dind); 523 brelse(dind);
524 dind = NULL;
508 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 525 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
509 ext3_mark_iloc_dirty(handle, inode, &iloc); 526 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
527 if (err)
528 goto exit_group_desc;
510 memset((*primary)->b_data, 0, sb->s_blocksize); 529 memset((*primary)->b_data, 0, sb->s_blocksize);
511 ext3_journal_dirty_metadata(handle, *primary); 530 err = ext3_journal_dirty_metadata(handle, *primary);
531 if (err)
532 goto exit_group_desc;
512 533
513 o_group_desc = EXT3_SB(sb)->s_group_desc; 534 o_group_desc = EXT3_SB(sb)->s_group_desc;
514 memcpy(n_group_desc, o_group_desc, 535 memcpy(n_group_desc, o_group_desc,
@@ -519,10 +540,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
519 kfree(o_group_desc); 540 kfree(o_group_desc);
520 541
521 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 542 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
522 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 543 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
544 if (err)
545 goto exit_inode;
523 546
524 return 0; 547 return 0;
525 548
549exit_group_desc:
550 kfree(n_group_desc);
526exit_inode: 551exit_inode:
527 //ext3_journal_release_buffer(handle, iloc.bh); 552 //ext3_journal_release_buffer(handle, iloc.bh);
528 brelse(iloc.bh); 553 brelse(iloc.bh);
@@ -706,16 +731,20 @@ static void update_backups(struct super_block *sb,
706 } 731 }
707 ext3_debug("update metadata backup %#04lx\n", 732 ext3_debug("update metadata backup %#04lx\n",
708 (unsigned long)bh->b_blocknr); 733 (unsigned long)bh->b_blocknr);
709 if ((err = ext3_journal_get_write_access(handle, bh))) 734 if ((err = ext3_journal_get_write_access(handle, bh))) {
735 brelse(bh);
710 break; 736 break;
737 }
711 lock_buffer(bh); 738 lock_buffer(bh);
712 memcpy(bh->b_data, data, size); 739 memcpy(bh->b_data, data, size);
713 if (rest) 740 if (rest)
714 memset(bh->b_data + size, 0, rest); 741 memset(bh->b_data + size, 0, rest);
715 set_buffer_uptodate(bh); 742 set_buffer_uptodate(bh);
716 unlock_buffer(bh); 743 unlock_buffer(bh);
717 ext3_journal_dirty_metadata(handle, bh); 744 err = ext3_journal_dirty_metadata(handle, bh);
718 brelse(bh); 745 brelse(bh);
746 if (err)
747 break;
719 } 748 }
720 if ((err2 = ext3_journal_stop(handle)) && !err) 749 if ((err2 = ext3_journal_stop(handle)) && !err)
721 err = err2; 750 err = err2;
@@ -922,7 +951,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
922 /* Update the global fs size fields */ 951 /* Update the global fs size fields */
923 sbi->s_groups_count++; 952 sbi->s_groups_count++;
924 953
925 ext3_journal_dirty_metadata(handle, primary); 954 err = ext3_journal_dirty_metadata(handle, primary);
955 if (err)
956 goto exit_journal;
926 957
927 /* Update the reserved block counts only once the new group is 958 /* Update the reserved block counts only once the new group is
928 * active. */ 959 * active. */
@@ -934,7 +965,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
934 percpu_counter_add(&sbi->s_freeinodes_counter, 965 percpu_counter_add(&sbi->s_freeinodes_counter,
935 EXT3_INODES_PER_GROUP(sb)); 966 EXT3_INODES_PER_GROUP(sb));
936 967
937 ext3_journal_dirty_metadata(handle, sbi->s_sbh); 968 err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
938 969
939exit_journal: 970exit_journal:
940 mutex_unlock(&sbi->s_resize_lock); 971 mutex_unlock(&sbi->s_resize_lock);
@@ -1064,8 +1095,14 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
1064 goto exit_put; 1095 goto exit_put;
1065 } 1096 }
1066 es->s_blocks_count = cpu_to_le32(o_blocks_count + add); 1097 es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
1067 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 1098 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1068 mutex_unlock(&EXT3_SB(sb)->s_resize_lock); 1099 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1100 if (err) {
1101 ext3_warning(sb, __func__,
1102 "error %d on journal dirty metadata", err);
1103 ext3_journal_stop(handle);
1104 goto exit_put;
1105 }
1069 ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n", 1106 ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
1070 o_blocks_count, o_blocks_count + add); 1107 o_blocks_count, o_blocks_count + add);
1071 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1108 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 77ce1616f725..85c8cc8f2473 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -143,12 +143,16 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
143void ext3_msg(struct super_block *sb, const char *prefix, 143void ext3_msg(struct super_block *sb, const char *prefix,
144 const char *fmt, ...) 144 const char *fmt, ...)
145{ 145{
146 struct va_format vaf;
146 va_list args; 147 va_list args;
147 148
148 va_start(args, fmt); 149 va_start(args, fmt);
149 printk("%sEXT3-fs (%s): ", prefix, sb->s_id); 150
150 vprintk(fmt, args); 151 vaf.fmt = fmt;
151 printk("\n"); 152 vaf.va = &args;
153
154 printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
155
152 va_end(args); 156 va_end(args);
153} 157}
154 158
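
This conversion replaces the printk()/vprintk()/printk() triple with a single printk() using the kernel's %pV extension and struct va_format (from <linux/kernel.h>), so output from a concurrent printk can no longer be interleaved between the prefix and the message body. The same pattern is applied to ext3_error, ext3_abort and ext3_warning below; in general form:

	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	printk("%s%s: %pV\n", level, id, &vaf);	/* emitted as one line */
	va_end(args);
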
@@ -195,15 +199,20 @@ static void ext3_handle_error(struct super_block *sb)
195 sb->s_id); 199 sb->s_id);
196} 200}
197 201
198void ext3_error (struct super_block * sb, const char * function, 202void ext3_error(struct super_block *sb, const char *function,
199 const char * fmt, ...) 203 const char *fmt, ...)
200{ 204{
205 struct va_format vaf;
201 va_list args; 206 va_list args;
202 207
203 va_start(args, fmt); 208 va_start(args, fmt);
204 printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); 209
205 vprintk(fmt, args); 210 vaf.fmt = fmt;
206 printk("\n"); 211 vaf.va = &args;
212
213 printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n",
214 sb->s_id, function, &vaf);
215
207 va_end(args); 216 va_end(args);
208 217
209 ext3_handle_error(sb); 218 ext3_handle_error(sb);
@@ -274,15 +283,20 @@ void __ext3_std_error (struct super_block * sb, const char * function,
274 * case we take the easy way out and panic immediately. 283 * case we take the easy way out and panic immediately.
275 */ 284 */
276 285
277void ext3_abort (struct super_block * sb, const char * function, 286void ext3_abort(struct super_block *sb, const char *function,
278 const char * fmt, ...) 287 const char *fmt, ...)
279{ 288{
289 struct va_format vaf;
280 va_list args; 290 va_list args;
281 291
282 va_start(args, fmt); 292 va_start(args, fmt);
283 printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function); 293
284 vprintk(fmt, args); 294 vaf.fmt = fmt;
285 printk("\n"); 295 vaf.va = &args;
296
297 printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n",
298 sb->s_id, function, &vaf);
299
286 va_end(args); 300 va_end(args);
287 301
288 if (test_opt(sb, ERRORS_PANIC)) 302 if (test_opt(sb, ERRORS_PANIC))
@@ -300,16 +314,20 @@ void ext3_abort (struct super_block * sb, const char * function,
300 journal_abort(EXT3_SB(sb)->s_journal, -EIO); 314 journal_abort(EXT3_SB(sb)->s_journal, -EIO);
301} 315}
302 316
303void ext3_warning (struct super_block * sb, const char * function, 317void ext3_warning(struct super_block *sb, const char *function,
304 const char * fmt, ...) 318 const char *fmt, ...)
305{ 319{
320 struct va_format vaf;
306 va_list args; 321 va_list args;
307 322
308 va_start(args, fmt); 323 va_start(args, fmt);
309 printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ", 324
310 sb->s_id, function); 325 vaf.fmt = fmt;
311 vprintk(fmt, args); 326 vaf.va = &args;
312 printk("\n"); 327
328 printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n",
329 sb->s_id, function, &vaf);
330
313 va_end(args); 331 va_end(args);
314} 332}
315 333
@@ -346,7 +364,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
346 struct block_device *bdev; 364 struct block_device *bdev;
347 char b[BDEVNAME_SIZE]; 365 char b[BDEVNAME_SIZE];
348 366
349 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 367 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
350 if (IS_ERR(bdev)) 368 if (IS_ERR(bdev))
351 goto fail; 369 goto fail;
352 return bdev; 370 return bdev;
@@ -363,8 +381,7 @@ fail:
363 */ 381 */
364static int ext3_blkdev_put(struct block_device *bdev) 382static int ext3_blkdev_put(struct block_device *bdev)
365{ 383{
366 bd_release(bdev); 384 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
367 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
368} 385}
369 386
370static int ext3_blkdev_remove(struct ext3_sb_info *sbi) 387static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
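
blkdev_get_by_dev() with FMODE_EXCL folds the old open_by_devnum() + bd_claim() pair into a single call: the last argument (here the super block) becomes the exclusive holder of the device, and a blkdev_put() whose mode includes FMODE_EXCL drops that claim again, which is why bd_release() disappears from ext3_blkdev_put() above. The paired calls, schematically:

	/* claim: sb becomes the exclusive holder of the device */
	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
	...
	/* release: mode must include FMODE_EXCL to drop the claim */
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
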
@@ -737,7 +754,7 @@ static int ext3_release_dquot(struct dquot *dquot);
737static int ext3_mark_dquot_dirty(struct dquot *dquot); 754static int ext3_mark_dquot_dirty(struct dquot *dquot);
738static int ext3_write_info(struct super_block *sb, int type); 755static int ext3_write_info(struct super_block *sb, int type);
739static int ext3_quota_on(struct super_block *sb, int type, int format_id, 756static int ext3_quota_on(struct super_block *sb, int type, int format_id,
740 char *path); 757 struct path *path);
741static int ext3_quota_on_mount(struct super_block *sb, int type); 758static int ext3_quota_on_mount(struct super_block *sb, int type);
742static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, 759static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
743 size_t len, loff_t off); 760 size_t len, loff_t off);
@@ -1848,13 +1865,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1848 goto failed_mount; 1865 goto failed_mount;
1849 } 1866 }
1850 1867
1851 if (generic_check_addressable(sb->s_blocksize_bits, 1868 err = generic_check_addressable(sb->s_blocksize_bits,
1852 le32_to_cpu(es->s_blocks_count))) { 1869 le32_to_cpu(es->s_blocks_count));
1870 if (err) {
1853 ext3_msg(sb, KERN_ERR, 1871 ext3_msg(sb, KERN_ERR,
1854 "error: filesystem is too large to mount safely"); 1872 "error: filesystem is too large to mount safely");
1855 if (sizeof(sector_t) < 8) 1873 if (sizeof(sector_t) < 8)
1856 ext3_msg(sb, KERN_ERR, 1874 ext3_msg(sb, KERN_ERR,
1857 "error: CONFIG_LBDAF not enabled"); 1875 "error: CONFIG_LBDAF not enabled");
1876 ret = err;
1858 goto failed_mount; 1877 goto failed_mount;
1859 } 1878 }
1860 1879
@@ -2142,13 +2161,6 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2142 if (bdev == NULL) 2161 if (bdev == NULL)
2143 return NULL; 2162 return NULL;
2144 2163
2145 if (bd_claim(bdev, sb)) {
2146 ext3_msg(sb, KERN_ERR,
2147 "error: failed to claim external journal device");
2148 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2149 return NULL;
2150 }
2151
2152 blocksize = sb->s_blocksize; 2164 blocksize = sb->s_blocksize;
2153 hblock = bdev_logical_block_size(bdev); 2165 hblock = bdev_logical_block_size(bdev);
2154 if (blocksize < hblock) { 2166 if (blocksize < hblock) {
@@ -2297,7 +2309,7 @@ static int ext3_load_journal(struct super_block *sb,
2297 EXT3_SB(sb)->s_journal = journal; 2309 EXT3_SB(sb)->s_journal = journal;
2298 ext3_clear_journal_err(sb, es); 2310 ext3_clear_journal_err(sb, es);
2299 2311
2300 if (journal_devnum && 2312 if (!really_read_only && journal_devnum &&
2301 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 2313 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2302 es->s_journal_dev = cpu_to_le32(journal_devnum); 2314 es->s_journal_dev = cpu_to_le32(journal_devnum);
2303 2315
@@ -2865,27 +2877,20 @@ static int ext3_quota_on_mount(struct super_block *sb, int type)
2865 * Standard function to be called on quota_on 2877 * Standard function to be called on quota_on
2866 */ 2878 */
2867static int ext3_quota_on(struct super_block *sb, int type, int format_id, 2879static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2868 char *name) 2880 struct path *path)
2869{ 2881{
2870 int err; 2882 int err;
2871 struct path path;
2872 2883
2873 if (!test_opt(sb, QUOTA)) 2884 if (!test_opt(sb, QUOTA))
2874 return -EINVAL; 2885 return -EINVAL;
2875 2886
2876 err = kern_path(name, LOOKUP_FOLLOW, &path);
2877 if (err)
2878 return err;
2879
2880 /* Quotafile not on the same filesystem? */ 2887 /* Quotafile not on the same filesystem? */
2881 if (path.mnt->mnt_sb != sb) { 2888 if (path->mnt->mnt_sb != sb)
2882 path_put(&path);
2883 return -EXDEV; 2889 return -EXDEV;
2884 }
2885 /* Journaling quota? */ 2890 /* Journaling quota? */
2886 if (EXT3_SB(sb)->s_qf_names[type]) { 2891 if (EXT3_SB(sb)->s_qf_names[type]) {
2887 /* Quotafile not of fs root? */ 2892 /* Quotafile not of fs root? */
2888 if (path.dentry->d_parent != sb->s_root) 2893 if (path->dentry->d_parent != sb->s_root)
2889 ext3_msg(sb, KERN_WARNING, 2894 ext3_msg(sb, KERN_WARNING,
2890 "warning: Quota file not on filesystem root. " 2895 "warning: Quota file not on filesystem root. "
2891 "Journaled quota will not work."); 2896 "Journaled quota will not work.");
@@ -2895,7 +2900,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2895 * When we journal data on quota file, we have to flush journal to see 2900 * When we journal data on quota file, we have to flush journal to see
2896 * all updates to the file when we bypass pagecache... 2901 * all updates to the file when we bypass pagecache...
2897 */ 2902 */
2898 if (ext3_should_journal_data(path.dentry->d_inode)) { 2903 if (ext3_should_journal_data(path->dentry->d_inode)) {
2899 /* 2904 /*
2900 * We don't need to lock updates but journal_flush() could 2905 * We don't need to lock updates but journal_flush() could
2901 * otherwise be livelocked... 2906 * otherwise be livelocked...
@@ -2903,15 +2908,11 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2903 journal_lock_updates(EXT3_SB(sb)->s_journal); 2908 journal_lock_updates(EXT3_SB(sb)->s_journal);
2904 err = journal_flush(EXT3_SB(sb)->s_journal); 2909 err = journal_flush(EXT3_SB(sb)->s_journal);
2905 journal_unlock_updates(EXT3_SB(sb)->s_journal); 2910 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2906 if (err) { 2911 if (err)
2907 path_put(&path);
2908 return err; 2912 return err;
2909 }
2910 } 2913 }
2911 2914
2912 err = dquot_quota_on_path(sb, type, format_id, &path); 2915 return dquot_quota_on(sb, type, format_id, path);
2913 path_put(&path);
2914 return err;
2915} 2916}
2916 2917
2917/* Read data from quotafile - avoid pagecache and such because we cannot afford 2918/* Read data from quotafile - avoid pagecache and such because we cannot afford
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e69dc6dfaa89..32e6cc23bd9a 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -925,7 +925,7 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
925/* 925/*
926 * ext3_xattr_set_handle() 926 * ext3_xattr_set_handle()
927 * 927 *
928 * Create, replace or remove an extended attribute for this inode. Buffer 928 * Create, replace or remove an extended attribute for this inode. Value
929 * is NULL to remove an existing extended attribute, and non-NULL to 929 * is NULL to remove an existing extended attribute, and non-NULL to
930 * either replace an existing extended attribute, or create a new extended 930 * either replace an existing extended attribute, or create a new extended
931 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 931 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 14c3af26c671..adf96b822781 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -592,7 +592,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
592 * Account for the allocated meta blocks. We will never 592 * Account for the allocated meta blocks. We will never
 593 * fail EDQUOT for metadata, but we do account for it. 593 * fail EDQUOT for metadata, but we do account for it.
594 */ 594 */
595 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 595 if (!(*errp) &&
596 ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
596 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 597 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
597 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 598 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
598 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 599 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ece76fb6a40c..164c56092e58 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -60,9 +60,13 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
60 return (ext4_filetype_table[filetype]); 60 return (ext4_filetype_table[filetype]);
61} 61}
62 62
63 63/*
64 * Return 0 if the directory entry is OK, and 1 if there is a problem
65 *
66 * Note: this is the opposite of what ext2 and ext3 historically returned...
67 */
64int __ext4_check_dir_entry(const char *function, unsigned int line, 68int __ext4_check_dir_entry(const char *function, unsigned int line,
65 struct inode *dir, 69 struct inode *dir, struct file *filp,
66 struct ext4_dir_entry_2 *de, 70 struct ext4_dir_entry_2 *de,
67 struct buffer_head *bh, 71 struct buffer_head *bh,
68 unsigned int offset) 72 unsigned int offset)
@@ -71,26 +75,37 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
71 const int rlen = ext4_rec_len_from_disk(de->rec_len, 75 const int rlen = ext4_rec_len_from_disk(de->rec_len,
72 dir->i_sb->s_blocksize); 76 dir->i_sb->s_blocksize);
73 77
74 if (rlen < EXT4_DIR_REC_LEN(1)) 78 if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
75 error_msg = "rec_len is smaller than minimal"; 79 error_msg = "rec_len is smaller than minimal";
76 else if (rlen % 4 != 0) 80 else if (unlikely(rlen % 4 != 0))
77 error_msg = "rec_len % 4 != 0"; 81 error_msg = "rec_len % 4 != 0";
78 else if (rlen < EXT4_DIR_REC_LEN(de->name_len)) 82 else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
79 error_msg = "rec_len is too small for name_len"; 83 error_msg = "rec_len is too small for name_len";
80 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) 84 else if (unlikely(((char *) de - bh->b_data) + rlen >
85 dir->i_sb->s_blocksize))
81 error_msg = "directory entry across blocks"; 86 error_msg = "directory entry across blocks";
82 else if (le32_to_cpu(de->inode) > 87 else if (unlikely(le32_to_cpu(de->inode) >
83 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)) 88 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
84 error_msg = "inode out of bounds"; 89 error_msg = "inode out of bounds";
90 else
91 return 0;
85 92
86 if (error_msg != NULL) 93 if (filp)
87 ext4_error_inode(dir, function, line, bh->b_blocknr, 94 ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
88 "bad entry in directory: %s - " 95 "bad entry in directory: %s - offset=%u(%u), "
89 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 96 "inode=%u, rec_len=%d, name_len=%d",
90 error_msg, (unsigned) (offset%bh->b_size), offset, 97 error_msg, (unsigned) (offset%bh->b_size),
91 le32_to_cpu(de->inode), 98 offset, le32_to_cpu(de->inode),
92 rlen, de->name_len); 99 rlen, de->name_len);
93 return error_msg == NULL ? 1 : 0; 100 else
101 ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
102 "bad entry in directory: %s - offset=%u(%u), "
103 "inode=%u, rec_len=%d, name_len=%d",
104 error_msg, (unsigned) (offset%bh->b_size),
105 offset, le32_to_cpu(de->inode),
106 rlen, de->name_len);
107
108 return 1;
94} 109}
95 110
96static int ext4_readdir(struct file *filp, 111static int ext4_readdir(struct file *filp,
@@ -152,8 +167,9 @@ static int ext4_readdir(struct file *filp,
152 */ 167 */
153 if (!bh) { 168 if (!bh) {
154 if (!dir_has_error) { 169 if (!dir_has_error) {
155 EXT4_ERROR_INODE(inode, "directory " 170 EXT4_ERROR_FILE(filp, 0,
156 "contains a hole at offset %Lu", 171 "directory contains a "
172 "hole at offset %llu",
157 (unsigned long long) filp->f_pos); 173 (unsigned long long) filp->f_pos);
158 dir_has_error = 1; 174 dir_has_error = 1;
159 } 175 }
@@ -194,8 +210,8 @@ revalidate:
194 while (!error && filp->f_pos < inode->i_size 210 while (!error && filp->f_pos < inode->i_size
195 && offset < sb->s_blocksize) { 211 && offset < sb->s_blocksize) {
196 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 212 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
197 if (!ext4_check_dir_entry(inode, de, 213 if (ext4_check_dir_entry(inode, filp, de,
198 bh, offset)) { 214 bh, offset)) {
199 /* 215 /*
200 * On error, skip the f_pos to the next block 216 * On error, skip the f_pos to the next block
201 */ 217 */
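
Since the return convention is now inverted relative to ext2/ext3 (0 is success, non-zero means a corrupt entry), every caller flips its test, as the readdir hunk above shows. Schematically:

	/*
	 *   ext3 (old): if (!ext3_check_dir_entry(...))           -> corrupt
	 *   ext4 (new): if (ext4_check_dir_entry(dir, filp,
	 *                                        de, bh, offset))  -> corrupt
	 */
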
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 94ce3d7a1c4b..0c8d97b56f34 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -62,8 +62,8 @@
62#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ 62#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
63 ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) 63 ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
64 64
65#define EXT4_ERROR_FILE(file, fmt, a...) \ 65#define EXT4_ERROR_FILE(file, block, fmt, a...) \
66 ext4_error_file(__func__, __LINE__, (file), (fmt), ## a) 66 ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
67 67
68/* data type for block offset of block group */ 68/* data type for block offset of block group */
69typedef int ext4_grpblk_t; 69typedef int ext4_grpblk_t;
@@ -561,23 +561,7 @@ struct ext4_new_group_data {
561#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 561#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
562#endif 562#endif
563 563
564 564/* Max physical block we can address w/o extents */
565/*
566 * Mount options
567 */
568struct ext4_mount_options {
569 unsigned long s_mount_opt;
570 uid_t s_resuid;
571 gid_t s_resgid;
572 unsigned long s_commit_interval;
573 u32 s_min_batch_time, s_max_batch_time;
574#ifdef CONFIG_QUOTA
575 int s_jquota_fmt;
576 char *s_qf_names[MAXQUOTAS];
577#endif
578};
579
580/* Max physical block we can addres w/o extents */
581#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF 565#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
582 566
583/* 567/*
@@ -709,6 +693,8 @@ do { \
709 if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ 693 if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
710 ext4_decode_extra_time(&(inode)->xtime, \ 694 ext4_decode_extra_time(&(inode)->xtime, \
711 raw_inode->xtime ## _extra); \ 695 raw_inode->xtime ## _extra); \
696 else \
697 (inode)->xtime.tv_nsec = 0; \
712} while (0) 698} while (0)
713 699
714#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ 700#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
@@ -719,6 +705,8 @@ do { \
719 if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ 705 if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
720 ext4_decode_extra_time(&(einode)->xtime, \ 706 ext4_decode_extra_time(&(einode)->xtime, \
721 raw_inode->xtime ## _extra); \ 707 raw_inode->xtime ## _extra); \
708 else \
709 (einode)->xtime.tv_nsec = 0; \
722} while (0) 710} while (0)
723 711
724#define i_disk_version osd1.linux1.l_i_version 712#define i_disk_version osd1.linux1.l_i_version
@@ -750,12 +738,13 @@ do { \
750 738
751/* 739/*
752 * storage for cached extent 740 * storage for cached extent
741 * If ec_len == 0, then the cache is invalid.
742 * If ec_start == 0, then the cache represents a gap (null mapping)
753 */ 743 */
754struct ext4_ext_cache { 744struct ext4_ext_cache {
755 ext4_fsblk_t ec_start; 745 ext4_fsblk_t ec_start;
756 ext4_lblk_t ec_block; 746 ext4_lblk_t ec_block;
757 __u32 ec_len; /* must be 32bit to return holes */ 747 __u32 ec_len; /* must be 32bit to return holes */
758 __u32 ec_type;
759}; 748};
760 749
761/* 750/*
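
With ec_type gone, validity and gap-ness are encoded entirely in the remaining fields; a sketch of the resulting checks (field names as in the struct above):

	/*
	 *   cache invalid : ec_len == 0
	 *   cached gap    : ec_len != 0 && ec_start == 0
	 *   cached extent : ec_len != 0 && ec_start != 0
	 */
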
@@ -774,10 +763,12 @@ struct ext4_inode_info {
774 * near to their parent directory's inode. 763 * near to their parent directory's inode.
775 */ 764 */
776 ext4_group_t i_block_group; 765 ext4_group_t i_block_group;
766 ext4_lblk_t i_dir_start_lookup;
767#if (BITS_PER_LONG < 64)
777 unsigned long i_state_flags; /* Dynamic state flags */ 768 unsigned long i_state_flags; /* Dynamic state flags */
769#endif
778 unsigned long i_flags; 770 unsigned long i_flags;
779 771
780 ext4_lblk_t i_dir_start_lookup;
781#ifdef CONFIG_EXT4_FS_XATTR 772#ifdef CONFIG_EXT4_FS_XATTR
782 /* 773 /*
783 * Extended attributes can be read independently of the main file 774 * Extended attributes can be read independently of the main file
@@ -820,7 +811,7 @@ struct ext4_inode_info {
820 */ 811 */
821 struct rw_semaphore i_data_sem; 812 struct rw_semaphore i_data_sem;
822 struct inode vfs_inode; 813 struct inode vfs_inode;
823 struct jbd2_inode jinode; 814 struct jbd2_inode *jinode;
824 815
825 struct ext4_ext_cache i_cached_extent; 816 struct ext4_ext_cache i_cached_extent;
826 /* 817 /*
@@ -840,14 +831,12 @@ struct ext4_inode_info {
840 unsigned int i_reserved_data_blocks; 831 unsigned int i_reserved_data_blocks;
841 unsigned int i_reserved_meta_blocks; 832 unsigned int i_reserved_meta_blocks;
842 unsigned int i_allocated_meta_blocks; 833 unsigned int i_allocated_meta_blocks;
843 unsigned short i_delalloc_reserved_flag; 834 ext4_lblk_t i_da_metadata_calc_last_lblock;
844 sector_t i_da_metadata_calc_last_lblock;
845 int i_da_metadata_calc_len; 835 int i_da_metadata_calc_len;
846 836
847 /* on-disk additional length */ 837 /* on-disk additional length */
848 __u16 i_extra_isize; 838 __u16 i_extra_isize;
849 839
850 spinlock_t i_block_reservation_lock;
851#ifdef CONFIG_QUOTA 840#ifdef CONFIG_QUOTA
852 /* quota space reservation, managed internally by quota code */ 841 /* quota space reservation, managed internally by quota code */
853 qsize_t i_reserved_quota; 842 qsize_t i_reserved_quota;
@@ -856,9 +845,11 @@ struct ext4_inode_info {
856 /* completed IOs that might need unwritten extents handling */ 845 /* completed IOs that might need unwritten extents handling */
857 struct list_head i_completed_io_list; 846 struct list_head i_completed_io_list;
858 spinlock_t i_completed_io_lock; 847 spinlock_t i_completed_io_lock;
848 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
859 /* current io_end structure for async DIO write*/ 849 /* current io_end structure for async DIO write*/
860 ext4_io_end_t *cur_aio_dio; 850 ext4_io_end_t *cur_aio_dio;
861 atomic_t i_ioend_count; /* Number of outstanding io_end structs */ 851
852 spinlock_t i_block_reservation_lock;
862 853
863 /* 854 /*
864 * Transactions that contain inode's metadata needed to complete 855 * Transactions that contain inode's metadata needed to complete
@@ -917,11 +908,20 @@ struct ext4_inode_info {
917#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ 908#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
918#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ 909#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
919 910
920#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 911#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
921#define set_opt(o, opt) o |= EXT4_MOUNT_##opt 912 ~EXT4_MOUNT_##opt
913#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
914 EXT4_MOUNT_##opt
922#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ 915#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \
923 EXT4_MOUNT_##opt) 916 EXT4_MOUNT_##opt)
924 917
918#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \
919 ~EXT4_MOUNT2_##opt
920#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \
921 EXT4_MOUNT2_##opt
922#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \
923 EXT4_MOUNT2_##opt)
924
925#define ext4_set_bit ext2_set_bit 925#define ext4_set_bit ext2_set_bit
926#define ext4_set_bit_atomic ext2_set_bit_atomic 926#define ext4_set_bit_atomic ext2_set_bit_atomic
927#define ext4_clear_bit ext2_clear_bit 927#define ext4_clear_bit ext2_clear_bit
@@ -1087,6 +1087,7 @@ struct ext4_sb_info {
1087 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ 1087 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
1088 struct buffer_head **s_group_desc; 1088 struct buffer_head **s_group_desc;
1089 unsigned int s_mount_opt; 1089 unsigned int s_mount_opt;
1090 unsigned int s_mount_opt2;
1090 unsigned int s_mount_flags; 1091 unsigned int s_mount_flags;
1091 ext4_fsblk_t s_sb_block; 1092 ext4_fsblk_t s_sb_block;
1092 uid_t s_resuid; 1093 uid_t s_resuid;
@@ -1237,24 +1238,39 @@ enum {
1237 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ 1238 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1238 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ 1239 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1239 EXT4_STATE_NEWENTRY, /* File just added to dir */ 1240 EXT4_STATE_NEWENTRY, /* File just added to dir */
1241 EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
1240}; 1242};
1241 1243
1242#define EXT4_INODE_BIT_FNS(name, field) \ 1244#define EXT4_INODE_BIT_FNS(name, field, offset) \
1243static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ 1245static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
1244{ \ 1246{ \
1245 return test_bit(bit, &EXT4_I(inode)->i_##field); \ 1247 return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
1246} \ 1248} \
1247static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ 1249static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
1248{ \ 1250{ \
1249 set_bit(bit, &EXT4_I(inode)->i_##field); \ 1251 set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
1250} \ 1252} \
1251static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ 1253static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
1252{ \ 1254{ \
1253 clear_bit(bit, &EXT4_I(inode)->i_##field); \ 1255 clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
1254} 1256}
1255 1257
1256EXT4_INODE_BIT_FNS(flag, flags) 1258EXT4_INODE_BIT_FNS(flag, flags, 0)
1257EXT4_INODE_BIT_FNS(state, state_flags) 1259#if (BITS_PER_LONG < 64)
1260EXT4_INODE_BIT_FNS(state, state_flags, 0)
1261
1262static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1263{
1264 (ei)->i_state_flags = 0;
1265}
1266#else
1267EXT4_INODE_BIT_FNS(state, flags, 32)
1268
1269static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1270{
1271 /* We depend on the fact that callers will set i_flags */
1272}
1273#endif
1258#else 1274#else
1259/* Assume that user mode programs are passing in an ext4fs superblock, not 1275/* Assume that user mode programs are passing in an ext4fs superblock, not
1260 * a kernel struct super_block. This will allow us to call the feature-test 1276 * a kernel struct super_block. This will allow us to call the feature-test
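
On 64-bit builds the dynamic state bits now share the single i_flags word with the on-disk flag bits; the new offset parameter shifts them into the upper half. For example (following the macro expansion above):

	/*
	 *   EXT4_INODE_BIT_FNS(state, flags, 32) makes
	 *       ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
	 *   expand to
	 *       set_bit(EXT4_STATE_DIO_UNWRITTEN + 32,
	 *               &EXT4_I(inode)->i_flags);
	 *
	 *   bits  0..31 : on-disk EXT4_*_FL flags
	 *   bits 32..63 : dynamic EXT4_STATE_* flags
	 */
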
@@ -1642,10 +1658,12 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1642 1658
1643/* dir.c */ 1659/* dir.c */
1644extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, 1660extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
1661 struct file *,
1645 struct ext4_dir_entry_2 *, 1662 struct ext4_dir_entry_2 *,
1646 struct buffer_head *, unsigned int); 1663 struct buffer_head *, unsigned int);
1647#define ext4_check_dir_entry(dir, de, bh, offset) \ 1664#define ext4_check_dir_entry(dir, filp, de, bh, offset) \
1648 __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset)) 1665 unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
1666 (de), (bh), (offset)))
1649extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 1667extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1650 __u32 minor_hash, 1668 __u32 minor_hash,
1651 struct ext4_dir_entry_2 *dirent); 1669 struct ext4_dir_entry_2 *dirent);
@@ -1653,6 +1671,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, int);
+extern int ext4_flush_completed_IO(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1752,8 +1771,8 @@ extern void ext4_error_inode(struct inode *, const char *, unsigned int,
                         ext4_fsblk_t, const char *, ...)
         __attribute__ ((format (printf, 5, 6)));
 extern void ext4_error_file(struct file *, const char *, unsigned int,
-                        const char *, ...)
-        __attribute__ ((format (printf, 4, 5)));
+                        ext4_fsblk_t, const char *, ...)
+        __attribute__ ((format (printf, 5, 6)));
 extern void __ext4_std_error(struct super_block *, const char *,
                              unsigned int, int);
 extern void __ext4_abort(struct super_block *, const char *, unsigned int,
@@ -2046,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
-extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
+extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
                           loff_t len);
 extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
                                           ssize_t len);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 28ce70fd9cd0..2e29abb30f76 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -119,10 +119,6 @@ struct ext4_ext_path {
  * structure for external API
  */
 
-#define EXT4_EXT_CACHE_NO 0
-#define EXT4_EXT_CACHE_GAP 1
-#define EXT4_EXT_CACHE_EXTENT 2
-
 /*
  * to be called by ext4_ext_walk_space()
  * negative retcode - error
@@ -197,7 +193,7 @@ static inline unsigned short ext_depth(struct inode *inode)
 static inline void
 ext4_ext_invalidate_cache(struct inode *inode)
 {
-        EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO;
+        EXT4_I(inode)->i_cached_extent.ec_len = 0;
 }
 
 static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
@@ -278,7 +274,7 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
 }
 
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
-                                         sector_t lblocks);
+                                         ext4_lblk_t lblocks);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
                                                    int num,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b0bd792c58c5..d8b992e658c1 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -253,7 +253,7 @@ static inline int ext4_journal_force_commit(journal_t *journal)
 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
         if (ext4_handle_valid(handle))
-                return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+                return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
         return 0;
 }
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0554c48cb1fd..63a75810b7c3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -117,11 +117,33 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
         struct ext4_extent *ex;
         depth = path->p_depth;
 
-        /* try to predict block placement */
+        /*
+         * Try to predict block placement assuming that we are
+         * filling in a file which will eventually be
+         * non-sparse --- i.e., in the case of libbfd writing
+         * an ELF object sections out-of-order but in a way
+         * that eventually results in a contiguous object or
+         * executable file, or some database extending a table
+         * space file.  However, this is actually somewhat
+         * non-ideal if we are writing a sparse file such as
+         * qemu or KVM writing a raw image file that is going
+         * to stay fairly sparse, since it will end up
+         * fragmenting the file system's free space.  Maybe we
+         * should have some heuristics or some way to allow
+         * userspace to pass a hint to file system,
+         * especially if the latter case turns out to be
+         * common.
+         */
         ex = path[depth].p_ext;
-        if (ex)
-                return (ext4_ext_pblock(ex) +
-                        (block - le32_to_cpu(ex->ee_block)));
+        if (ex) {
+                ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
+                ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
+
+                if (block > ext_block)
+                        return ext_pblk + (block - ext_block);
+                else
+                        return ext_pblk - (ext_block - block);
+        }
 
         /* it looks like index is empty;
          * try to find starting block from index itself */
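The reason for the new branch in ext4_ext_find_goal() is unsigned arithmetic: `block` and `ee_block` are 32-bit logical block numbers, so the old `block - ext_block` wrapped around whenever the target block sat *before* the nearest extent, producing a wildly wrong goal. A self-contained demonstration with made-up block numbers (the typedefs mirror ext4's, the values are illustrative only):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t ext4_fsblk_t;  /* physical block number (64-bit) */
typedef uint32_t ext4_lblk_t;   /* logical block number (32-bit) */

int main(void)
{
        ext4_fsblk_t ext_pblk = 1000;   /* extent's physical start */
        ext4_lblk_t ext_block = 50;     /* extent's logical start */
        ext4_lblk_t block = 10;         /* target block, before the extent */

        /* Old expression: the 32-bit subtraction wraps for block < ext_block. */
        ext4_fsblk_t old_goal = ext_pblk + (block - ext_block);

        /* New code branches on the ordering, so the distance is exact. */
        ext4_fsblk_t new_goal = (block > ext_block)
                ? ext_pblk + (block - ext_block)
                : ext_pblk - (ext_block - block);

        printf("old goal: %llu (wrapped)\n", (unsigned long long)old_goal);
        printf("new goal: %llu\n", (unsigned long long)new_goal);      /* 960 */
        return 0;
}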
@@ -244,7 +266,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
  * to allocate @blocks
  * Worse case is one block per extent
  */
-int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
+int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
         struct ext4_inode_info *ei = EXT4_I(inode);
         int idxs, num = 0;
@@ -1872,12 +1894,10 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
                         cbex.ec_block = start;
                         cbex.ec_len = end - start;
                         cbex.ec_start = 0;
-                        cbex.ec_type = EXT4_EXT_CACHE_GAP;
                 } else {
                         cbex.ec_block = le32_to_cpu(ex->ee_block);
                         cbex.ec_len = ext4_ext_get_actual_len(ex);
                         cbex.ec_start = ext4_ext_pblock(ex);
-                        cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
                 }
 
                 if (unlikely(cbex.ec_len == 0)) {
@@ -1917,13 +1937,12 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 
 static void
 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
-                        __u32 len, ext4_fsblk_t start, int type)
+                        __u32 len, ext4_fsblk_t start)
 {
         struct ext4_ext_cache *cex;
         BUG_ON(len == 0);
         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
         cex = &EXT4_I(inode)->i_cached_extent;
-        cex->ec_type = type;
         cex->ec_block = block;
         cex->ec_len = len;
         cex->ec_start = start;
@@ -1976,15 +1995,18 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
         }
 
         ext_debug(" -> %u:%lu\n", lblock, len);
-        ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
+        ext4_ext_put_in_cache(inode, lblock, len, 0);
 }
 
+/*
+ * Return 0 if cache is invalid; 1 if the cache is valid
+ */
 static int
 ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
                         struct ext4_extent *ex)
 {
         struct ext4_ext_cache *cex;
-        int ret = EXT4_EXT_CACHE_NO;
+        int ret = 0;
 
         /*
          * We borrow i_block_reservation_lock to protect i_cached_extent
@@ -1993,11 +2015,9 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
         cex = &EXT4_I(inode)->i_cached_extent;
 
         /* has cache valid data? */
-        if (cex->ec_type == EXT4_EXT_CACHE_NO)
+        if (cex->ec_len == 0)
                 goto errout;
 
-        BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
-                        cex->ec_type != EXT4_EXT_CACHE_EXTENT);
         if (in_range(block, cex->ec_block, cex->ec_len)) {
                 ex->ee_block = cpu_to_le32(cex->ec_block);
                 ext4_ext_store_pblock(ex, cex->ec_start);
@@ -2005,7 +2025,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
                 ext_debug("%u cached by %u:%u:%llu\n",
                                 block,
                                 cex->ec_block, cex->ec_len, cex->ec_start);
-                ret = cex->ec_type;
+                ret = 1;
         }
 errout:
         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
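With ec_type gone, the single-extent cache encodes its state implicitly: ec_len == 0 means the cache is empty, and, because physical block 0 holds the superblock and can never start a data extent, ec_start == 0 marks a cached hole. A simplified stand-in for struct ext4_ext_cache showing the two predicates this series now relies on (ext4_ext_map_blocks() below checks `newex.ee_start_lo/hi`, the fiemap callback checks `ec_start == 0`):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for struct ext4_ext_cache after this change. */
struct ext_cache {
        uint32_t ec_block;      /* first logical block covered */
        uint32_t ec_len;        /* 0 => nothing cached */
        uint64_t ec_start;      /* 0 => cached region is a hole */
};

static bool cache_valid(const struct ext_cache *c)  { return c->ec_len != 0; }
static bool cache_is_gap(const struct ext_cache *c) { return c->ec_start == 0; }

int main(void)
{
        struct ext_cache hole = { .ec_block = 100, .ec_len = 8, .ec_start = 0 };
        printf("valid=%d gap=%d\n", cache_valid(&hole), cache_is_gap(&hole));
        return 0;
}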
@@ -2825,14 +2845,14 @@ fix_extent_len:
  * to an uninitialized extent.
  *
  * Writing to an uninitized extent may result in splitting the uninitialized
- * extent into multiple /intialized unintialized extents (up to three)
+ * extent into multiple /initialized uninitialized extents (up to three)
  * There are three possibilities:
  *   a> There is no split required: Entire extent should be uninitialized
  *   b> Splits in two extents: Write is happening at either end of the extent
  *   c> Splits in three extents: Somone is writing in middle of the extent
  *
  * One of more index blocks maybe needed if the extent tree grow after
- * the unintialized extent split. To prevent ENOSPC occur at the IO
+ * the uninitialized extent split. To prevent ENOSPC occur at the IO
  * complete, we need to split the uninitialized extent before DIO submit
  * the IO. The uninitialized extent called at this time will be split
  * into three uninitialized extent(at most). After IO complete, the part
@@ -3082,7 +3102,7 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
  * Handle EOFBLOCKS_FL flag, clearing it if necessary
  */
 static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
-                              struct ext4_map_blocks *map,
+                              ext4_lblk_t lblk,
                               struct ext4_ext_path *path,
                               unsigned int len)
 {
@@ -3112,7 +3132,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
          * this turns out to be false, we can bail out from this
          * function immediately.
          */
-        if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
+        if (lblk + len < le32_to_cpu(last_ex->ee_block) +
             ext4_ext_get_actual_len(last_ex))
                 return 0;
         /*
@@ -3168,8 +3188,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                                                         path);
                 if (ret >= 0) {
                         ext4_update_inode_fsync_trans(handle, inode, 1);
-                        err = check_eofblocks_fl(handle, inode, map, path,
-                                                 map->m_len);
+                        err = check_eofblocks_fl(handle, inode, map->m_lblk,
+                                                 path, map->m_len);
                 } else
                         err = ret;
                 goto out2;
@@ -3199,7 +3219,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
                 if (ret >= 0) {
                         ext4_update_inode_fsync_trans(handle, inode, 1);
-                        err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
+                        err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
+                                                 map->m_len);
                         if (err < 0)
                                 goto out2;
                 }
@@ -3276,7 +3297,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
         struct ext4_extent_header *eh;
         struct ext4_extent newex, *ex;
         ext4_fsblk_t newblock;
-        int err = 0, depth, ret, cache_type;
+        int err = 0, depth, ret;
         unsigned int allocated = 0;
         struct ext4_allocation_request ar;
         ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3285,9 +3306,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                   map->m_lblk, map->m_len, inode->i_ino);
 
         /* check in cache */
-        cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
-        if (cache_type) {
-                if (cache_type == EXT4_EXT_CACHE_GAP) {
+        if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
+                if (!newex.ee_start_lo && !newex.ee_start_hi) {
                         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                                 /*
                                  * block isn't allocated yet and
@@ -3296,7 +3316,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                                 goto out2;
                         }
                         /* we should allocate requested block */
-                } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
+                } else {
                         /* block is already allocated */
                         newblock = map->m_lblk
                                    - le32_to_cpu(newex.ee_block)
@@ -3305,8 +3325,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                         allocated = ext4_ext_get_actual_len(&newex) -
                                 (map->m_lblk - le32_to_cpu(newex.ee_block));
                         goto out;
-                } else {
-                        BUG();
                 }
         }
 
@@ -3357,8 +3375,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                         /* Do not put uninitialized extent in the cache */
                         if (!ext4_ext_is_uninitialized(ex)) {
                                 ext4_ext_put_in_cache(inode, ee_block,
-                                                        ee_len, ee_start,
-                                                        EXT4_EXT_CACHE_EXTENT);
+                                                        ee_len, ee_start);
                                 goto out;
                         }
                         ret = ext4_ext_handle_uninitialized_extents(handle,
@@ -3456,7 +3473,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                 map->m_flags |= EXT4_MAP_UNINIT;
         }
 
-        err = check_eofblocks_fl(handle, inode, map, path, ar.len);
+        err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
         if (err)
                 goto out2;
 
@@ -3490,8 +3507,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
          * when it is _not_ an uninitialized extent.
          */
         if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
-                ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
-                                                EXT4_EXT_CACHE_EXTENT);
+                ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
                 ext4_update_inode_fsync_trans(handle, inode, 1);
         } else
                 ext4_update_inode_fsync_trans(handle, inode, 0);
@@ -3519,6 +3535,12 @@ void ext4_ext_truncate(struct inode *inode)
         int err = 0;
 
         /*
+         * finish any pending end_io work so we won't run the risk of
+         * converting any truncated blocks to initialized later
+         */
+        ext4_flush_completed_IO(inode);
+
+        /*
          * probably first extent we're gonna free will be last in block
          */
         err = ext4_writepage_trans_blocks(inode);
@@ -3605,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode,
 }
 
 /*
- * preallocate space for a file. This implements ext4's fallocate inode
+ * preallocate space for a file. This implements ext4's fallocate file
  * operation, which gets called from sys_fallocate system call.
  * For block-mapped files, posix_fallocate should fall back to the method
  * of writing zeroes to the required new blocks (the same behavior which is
  * expected for file systems which do not support fallocate() system call).
  */
-long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
+long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
+        struct inode *inode = file->f_path.dentry->d_inode;
         handle_t *handle;
         loff_t new_size;
         unsigned int max_blocks;
@@ -3622,6 +3645,10 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
         struct ext4_map_blocks map;
         unsigned int credits, blkbits = inode->i_blkbits;
 
+        /* We only support the FALLOC_FL_KEEP_SIZE mode */
+        if (mode & ~FALLOC_FL_KEEP_SIZE)
+                return -EOPNOTSUPP;
+
         /*
          * currently supporting (pre)allocate mode for extent-based
          * files _only_
@@ -3629,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                 return -EOPNOTSUPP;
 
-        /* preallocation to directories is currently not supported */
-        if (S_ISDIR(inode->i_mode))
-                return -ENODEV;
-
         map.m_lblk = offset >> blkbits;
         /*
          * We can't just convert len to max_blocks because
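Two things change at the top of ext4_fallocate(): it now receives a struct file (matching the new file_operations hook added in the file.c hunk below), and it rejects any mode bit other than FALLOC_FL_KEEP_SIZE before touching the inode. The S_ISDIR() check can go because the VFS refuses fallocate on directories before the filesystem hook runs. A user-space sketch of the mask-based mode check (the macro value matches linux/falloc.h; check_mode() is illustrative only):

#include <stdio.h>

#define FALLOC_FL_KEEP_SIZE     0x01    /* same value as linux/falloc.h */

/* Reject any flag we do not understand; unknown bits must fail early
 * so that future mode flags are not silently misinterpreted. */
static int check_mode(int mode)
{
        if (mode & ~FALLOC_FL_KEEP_SIZE)
                return -1;              /* -EOPNOTSUPP in the kernel */
        return 0;
}

int main(void)
{
        printf("KEEP_SIZE: %d, unknown 0x02: %d\n",
               check_mode(FALLOC_FL_KEEP_SIZE), check_mode(0x02));
        return 0;
}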
@@ -3767,7 +3790,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 
         logical = (__u64)newex->ec_block << blksize_bits;
 
-        if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
+        if (newex->ec_start == 0) {
                 pgoff_t offset;
                 struct page *page;
                 struct buffer_head *bh = NULL;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5a5c55ddceef..2e8322c8aa88 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -104,6 +104,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 {
         struct super_block *sb = inode->i_sb;
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+        struct ext4_inode_info *ei = EXT4_I(inode);
         struct vfsmount *mnt = filp->f_path.mnt;
         struct path path;
         char buf[64], *cp;
@@ -127,6 +128,27 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
                         ext4_mark_super_dirty(sb);
                 }
         }
+        /*
+         * Set up the jbd2_inode if we are opening the inode for
+         * writing and the journal is present
+         */
+        if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
+                struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
+
+                spin_lock(&inode->i_lock);
+                if (!ei->jinode) {
+                        if (!jinode) {
+                                spin_unlock(&inode->i_lock);
+                                return -ENOMEM;
+                        }
+                        ei->jinode = jinode;
+                        jbd2_journal_init_jbd_inode(ei->jinode, inode);
+                        jinode = NULL;
+                }
+                spin_unlock(&inode->i_lock);
+                if (unlikely(jinode != NULL))
+                        jbd2_free_inode(jinode);
+        }
         return dquot_file_open(inode, filp);
 }
 
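The jinode setup above is a classic optimistic-allocation pattern: the jbd2_inode is allocated with GFP_KERNEL before taking i_lock (a sleeping allocation is not allowed under a spinlock), installed only if no other opener got there first, and the spare copy is freed after unlocking. A user-space sketch of the same shape; struct state, malloc() and the commented-out lock calls stand in for the kernel objects:

#include <stdio.h>
#include <stdlib.h>

struct state { void *jinode; };

static void *install_jinode(struct state *ei)
{
        void *jinode = malloc(64);      /* jbd2_alloc_inode(GFP_KERNEL) */

        /* spin_lock(&inode->i_lock); */
        if (!ei->jinode) {
                if (!jinode)
                        return NULL;    /* report -ENOMEM (after unlocking) */
                ei->jinode = jinode;    /* we won the race: install ours */
                jinode = NULL;
        }
        /* spin_unlock(&inode->i_lock); */

        if (jinode)                     /* another opener beat us to it */
                free(jinode);
        return ei->jinode;
}

int main(void)
{
        struct state ei = { NULL };
        printf("installed: %s\n", install_jinode(&ei) ? "yes" : "no");
        return 0;
}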
@@ -188,6 +210,7 @@ const struct file_operations ext4_file_operations = {
         .fsync = ext4_sync_file,
         .splice_read = generic_file_splice_read,
         .splice_write = generic_file_splice_write,
+        .fallocate = ext4_fallocate,
 };
 
 const struct inode_operations ext4_file_inode_operations = {
@@ -201,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = {
         .removexattr = generic_removexattr,
 #endif
         .check_acl = ext4_check_acl,
-        .fallocate = ext4_fallocate,
         .fiemap = ext4_fiemap,
 };
 
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index c1a7bc923cf6..7829b287822a 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode)
  * to written.
  * The function return the number of pending IOs on success.
  */
-static int flush_completed_IO(struct inode *inode)
+extern int ext4_flush_completed_IO(struct inode *inode)
 {
         ext4_io_end_t *io;
         struct ext4_inode_info *ei = EXT4_I(inode);
@@ -169,7 +169,7 @@ int ext4_sync_file(struct file *file, int datasync)
         if (inode->i_sb->s_flags & MS_RDONLY)
                 return 0;
 
-        ret = flush_completed_IO(inode);
+        ret = ext4_flush_completed_IO(inode);
         if (ret < 0)
                 return ret;
 
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1ce240a23ebb..eb9097aec6f0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1027,7 +1027,7 @@ got:
         inode->i_generation = sbi->s_next_generation++;
         spin_unlock(&sbi->s_next_gen_lock);
 
-        ei->i_state_flags = 0;
+        ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
         ext4_set_inode_state(inode, EXT4_STATE_NEW);
 
         ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e659597b690b..9f7f9e49914f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,7 +39,9 @@
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/kernel.h>
+#include <linux/printk.h>
 #include <linux/slab.h>
+#include <linux/ratelimit.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -54,10 +56,17 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                               loff_t new_size)
 {
         trace_ext4_begin_ordered_truncate(inode, new_size);
-        return jbd2_journal_begin_ordered_truncate(
-                        EXT4_SB(inode->i_sb)->s_journal,
-                        &EXT4_I(inode)->jinode,
-                        new_size);
+        /*
+         * If jinode is zero, then we never opened the file for
+         * writing, so there's no need to call
+         * jbd2_journal_begin_ordered_truncate() since there's no
+         * outstanding writes we need to flush.
+         */
+        if (!EXT4_I(inode)->jinode)
+                return 0;
+        return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
+                                                   EXT4_I(inode)->jinode,
+                                                   new_size);
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
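Because the jbd2_inode is now allocated lazily on first open-for-write (the file.c hunk above), a NULL ei->jinode is a reliable sign that the file can have no ordered data outstanding, so truncate can skip the jbd2 flush entirely. A condensed user-space sketch of that guard clause (struct inode_info and the printf stand in for the real structures and the jbd2 call):

#include <stdio.h>

struct inode_info { void *jinode; };    /* stand-in for ext4_inode_info */

static int begin_ordered_truncate(struct inode_info *ei, long new_size)
{
        if (!ei->jinode)
                return 0;       /* never opened for write: nothing to flush */
        printf("would flush ordered data up to %ld\n", new_size);
        return 0;
}

int main(void)
{
        struct inode_info never_written = { NULL };
        return begin_ordered_truncate(&never_written, 4096);
}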
@@ -552,7 +561,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 }
 
 /**
- * ext4_blks_to_allocate: Look up the block map and count the number
+ * ext4_blks_to_allocate - Look up the block map and count the number
  * of direct blocks need to be allocated for the given branch.
  *
  * @branch: chain of indirect blocks
@@ -591,13 +600,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
 
 /**
  * ext4_alloc_blocks: multiple allocate blocks needed for a branch
+ * @handle: handle for this transaction
+ * @inode: inode which needs allocated blocks
+ * @iblock: the logical block to start allocated at
+ * @goal: preferred physical block of allocation
  * @indirect_blks: the number of blocks need to allocate for indirect
  *                 blocks
- *
+ * @blks: number of desired blocks
  * @new_blocks: on return it will store the new block numbers for
  * the indirect blocks(if needed) and the first direct block,
- * @blks: on return it will store the total number of allocated
- *        direct blocks
+ * @err: on return it will store the error code
+ *
+ * This function will return the number of blocks allocated as
+ * requested by the passed-in parameters.
  */
 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                              ext4_lblk_t iblock, ext4_fsblk_t goal,
@@ -711,9 +726,11 @@ failed_out:
 
 /**
  * ext4_alloc_branch - allocate and set up a chain of blocks.
+ * @handle: handle for this transaction
  * @inode: owner
 * @indirect_blks: number of allocated indirect blocks
 * @blks: number of allocated direct blocks
+ * @goal: preferred place for allocation
 * @offsets: offsets (in the blocks) to store the pointers to next.
 * @branch: place to store the chain in.
 *
@@ -826,6 +843,7 @@ failed:
 
 /**
 * ext4_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
 * @inode: owner
 * @block: (logical) number of block we are adding
 * @chain: chain of indirect blocks (with a missing link - see
@@ -1081,7 +1099,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
 * Calculate the number of metadata blocks need to reserve
 * to allocate a block located at @lblock
 */
-static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
+static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                 return ext4_ext_calc_metadata_amount(inode, lblock);
@@ -1320,7 +1338,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
          * avoid double accounting
          */
         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-                EXT4_I(inode)->i_delalloc_reserved_flag = 1;
+                ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
         /*
          * We need to check for EXT4 here because migrate
          * could have changed the inode type in between
@@ -1350,7 +1368,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                         ext4_da_update_reserve_space(inode, retval, 1);
         }
         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-                EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+                ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
         up_write((&EXT4_I(inode)->i_data_sem));
         if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
@@ -1878,7 +1896,7 @@ static int ext4_journalled_write_end(struct file *file,
 /*
  * Reserve a single block located at lblock
  */
-static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
+static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 {
         int retries = 0;
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2239,7 +2257,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
          * affects functions in many different parts of the allocation
          * call path. This flag exists primarily because we don't
          * want to change *many* call functions, so ext4_map_blocks()
-         * will set the magic i_delalloc_reserved_flag once the
+         * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
          * inode's allocation semaphore is taken.
          *
          * If the blocks in questions were delalloc blocks, set
@@ -3362,7 +3380,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
          * doing I/O at all.
          *
          * We could call write_cache_pages(), and then redirty all of
-         * the pages by calling redirty_page_for_writeback() but that
+         * the pages by calling redirty_page_for_writepage() but that
          * would be ugly in the extreme. So instead we would need to
          * replicate parts of the code in the above functions,
         * simplifying them becuase we wouldn't actually intend to
@@ -3720,8 +3738,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
 retry:
         io_end = ext4_init_io_end(inode, GFP_ATOMIC);
         if (!io_end) {
-                if (printk_ratelimit())
-                        printk(KERN_WARNING "%s: allocation fail\n", __func__);
+                pr_warn_ratelimited("%s: allocation fail\n", __func__);
                 schedule();
                 goto retry;
         }
@@ -3745,9 +3762,9 @@ retry:
 * preallocated extents, and those write extend the file, no need to
 * fall back to buffered IO.
 *
- * For holes, we fallocate those blocks, mark them as unintialized
+ * For holes, we fallocate those blocks, mark them as uninitialized
 * If those blocks were preallocated, we mark sure they are splited, but
- * still keep the range to write as unintialized.
+ * still keep the range to write as uninitialized.
 *
 * The unwrritten extents will be converted to written when DIO is completed.
 * For async direct IO, since the IO may still pending when return, we
@@ -4045,7 +4062,7 @@ int ext4_block_truncate_page(handle_t *handle,
         if (ext4_should_journal_data(inode)) {
                 err = ext4_handle_dirty_metadata(handle, inode, bh);
         } else {
-                if (ext4_should_order_data(inode))
+                if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
                         err = ext4_jbd2_file_inode(handle, inode);
                 mark_buffer_dirty(bh);
         }
@@ -4169,6 +4186,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 {
         __le32 *p;
         int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+        int err;
 
         if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
                 flags |= EXT4_FREE_BLOCKS_METADATA;
@@ -4184,11 +4202,23 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
         if (try_to_extend_transaction(handle, inode)) {
                 if (bh) {
                         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                        ext4_handle_dirty_metadata(handle, inode, bh);
+                        err = ext4_handle_dirty_metadata(handle, inode, bh);
+                        if (unlikely(err)) {
+                                ext4_std_error(inode->i_sb, err);
+                                return 1;
+                        }
+                }
+                err = ext4_mark_inode_dirty(handle, inode);
+                if (unlikely(err)) {
+                        ext4_std_error(inode->i_sb, err);
+                        return 1;
+                }
+                err = ext4_truncate_restart_trans(handle, inode,
+                                                  blocks_for_truncate(inode));
+                if (unlikely(err)) {
+                        ext4_std_error(inode->i_sb, err);
+                        return 1;
                 }
-                ext4_mark_inode_dirty(handle, inode);
-                ext4_truncate_restart_trans(handle, inode,
-                                            blocks_for_truncate(inode));
                 if (bh) {
                         BUFFER_TRACE(bh, "retaking write access");
                         ext4_journal_get_write_access(handle, bh);
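The pattern repeated here and in the namei.c, rename and resize hunks below is the same: journal helpers such as ext4_handle_dirty_metadata() return an error that used to be dropped on the floor; now the result is captured, reported through ext4_std_error(), and the operation bails out instead of continuing on a failed journal update. A compact sketch of that fail-fast shape (the two helpers are stand-ins that simulate an -EIO):

#include <stdio.h>

static int dirty_metadata(void) { return -5; }  /* pretend -EIO from jbd2 */
static void std_error(int err)  { printf("ext4 error: %d\n", err); }

static int clear_blocks_step(void)
{
        int err = dirty_metadata();
        if (err) {
                std_error(err); /* record the error... */
                return 1;       /* ...and stop instead of carrying on */
        }
        return 0;
}

int main(void) { return clear_blocks_step(); }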
@@ -4349,6 +4379,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                                            (__le32 *) bh->b_data,
                                            (__le32 *) bh->b_data + addr_per_block,
                                            depth);
+                        brelse(bh);
 
                         /*
                          * Everything below this this pointer has been
@@ -4859,7 +4890,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
         }
         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
 
-        ei->i_state_flags = 0;
+        ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
         ei->i_dir_start_lookup = 0;
         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
         /* We now have enough fields to check if the inode was active or not.
@@ -5118,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle,
         if (ext4_inode_blocks_set(handle, raw_inode, ei))
                 goto out_brelse;
         raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
-        raw_inode->i_flags = cpu_to_le32(ei->i_flags);
+        raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
         if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
             cpu_to_le32(EXT4_OS_HURD))
                 raw_inode->i_file_acl_high =
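The `& 0xFFFFFFFF` mask is the flip side of EXT4_INODE_BIT_FNS(state, flags, 32) from the ext4.h hunk: on 64-bit kernels the transient state bits live in the upper half of the in-memory i_flags word, and they must never leak into the 32-bit on-disk i_flags field. A small demonstration, assuming an LP64 machine where unsigned long is 64 bits (the bit positions are made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        unsigned long i_flags = 0;      /* 64-bit on LP64 */

        i_flags |= 1UL << 4;            /* a persistent flag: low 32 bits */
        i_flags |= 1UL << (32 + 2);     /* an in-memory state bit: high half */

        /* Only the persistent half may reach the on-disk inode. */
        uint32_t on_disk = (uint32_t)(i_flags & 0xFFFFFFFF);

        printf("in core: %#lx  on disk: %#x\n", i_flags, on_disk);
        return 0;
}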
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5b4d4e3a4d58..851f49b2f9d2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2608,18 +2608,12 @@ int ext4_mb_release(struct super_block *sb)
 static inline int ext4_issue_discard(struct super_block *sb,
                 ext4_group_t block_group, ext4_grpblk_t block, int count)
 {
-        int ret;
         ext4_fsblk_t discard_block;
 
         discard_block = block + ext4_group_first_block_no(sb, block_group);
         trace_ext4_discard_blocks(sb,
                         (unsigned long long) discard_block, count);
-        ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
-        if (ret == -EOPNOTSUPP) {
-                ext4_warning(sb, "discard not supported, disabling");
-                clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
-        }
-        return ret;
+        return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
 }
 
 /*
@@ -2631,7 +2625,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
         struct super_block *sb = journal->j_private;
         struct ext4_buddy e4b;
         struct ext4_group_info *db;
-        int err, count = 0, count2 = 0;
+        int err, ret, count = 0, count2 = 0;
         struct ext4_free_data *entry;
         struct list_head *l, *ltmp;
 
@@ -2641,9 +2635,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
                          entry->count, entry->group, entry);
 
-                if (test_opt(sb, DISCARD))
-                        ext4_issue_discard(sb, entry->group,
+                if (test_opt(sb, DISCARD)) {
+                        ret = ext4_issue_discard(sb, entry->group,
                                         entry->start_blk, entry->count);
+                        if (unlikely(ret == -EOPNOTSUPP)) {
+                                ext4_warning(sb, "discard not supported, "
+                                                 "disabling");
+                                clear_opt(sb, DISCARD);
+                        }
+                }
 
                 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
                 /* we expect to find existing buddy because it's pinned */
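Moving the -EOPNOTSUPP handling out of ext4_issue_discard() and into this commit callback separates mechanism from policy: the helper now just reports what the device said, and only the journal-commit path decides to warn and clear the DISCARD mount option, which also lets ext4_trim_extent() below see the raw return for FITRIM. A user-space sketch of the split (issue_discard() and the flag are stand-ins):

#include <stdio.h>

#define EOPNOTSUPP 95

static int issue_discard(void) { return -EOPNOTSUPP; }  /* mechanism */

static int discard_enabled = 1;                         /* policy state */

static void on_commit(void)
{
        if (!discard_enabled)
                return;
        if (issue_discard() == -EOPNOTSUPP) {
                printf("discard not supported, disabling\n");
                discard_enabled = 0;    /* clear_opt(sb, DISCARD) */
        }
}

int main(void)
{
        on_commit();    /* warns once and disables */
        on_commit();    /* silent: option already cleared */
        return 0;
}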
@@ -3881,19 +3881,6 @@ repeat:
                 }
         }
 
-/*
- * finds all preallocated spaces and return blocks being freed to them
- * if preallocated space becomes full (no block is used from the space)
- * then the function frees space in buddy
- * XXX: at the moment, truncate (which is the only way to free blocks)
- * discards all preallocations
- */
-static void ext4_mb_return_to_preallocation(struct inode *inode,
-                                        struct ext4_buddy *e4b,
-                                        sector_t block, int count)
-{
-        BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
-}
 #ifdef CONFIG_EXT4_DEBUG
 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 {
@@ -4283,7 +4270,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
          * EDQUOT check, as blocks and quotas have been already
          * reserved when data being copied into pagecache.
          */
-        if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+        if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
                 ar->flags |= EXT4_MB_DELALLOC_RESERVED;
         else {
                 /* Without delayed allocation we need to verify
@@ -4380,7 +4367,8 @@ out:
         if (inquota && ar->len < inquota)
                 dquot_free_block(ar->inode, inquota - ar->len);
         if (!ar->len) {
-                if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+                if (!ext4_test_inode_state(ar->inode,
+                                           EXT4_STATE_DELALLOC_RESERVED))
                         /* release all the reserved blocks if non delalloc */
                         percpu_counter_sub(&sbi->s_dirtyblocks_counter,
                                                 reserv_blks);
@@ -4626,7 +4614,11 @@ do_more:
                  * blocks being freed are metadata. these blocks shouldn't
                  * be used until this transaction is committed
                  */
                 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+                if (!new_entry) {
+                        err = -ENOMEM;
+                        goto error_return;
+                }
                 new_entry->start_blk = bit;
                 new_entry->group = block_group;
                 new_entry->count = count;
@@ -4643,7 +4635,6 @@ do_more:
                 ext4_lock_group(sb, block_group);
                 mb_clear_bits(bitmap_bh->b_data, bit, count);
                 mb_free_blocks(inode, &e4b, bit, count);
-                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
         }
 
         ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4718,8 +4709,6 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
         ext4_unlock_group(sb, group);
 
         ret = ext4_issue_discard(sb, group, start, count);
-        if (ret)
-                ext4_std_error(sb, ret);
 
         ext4_lock_group(sb, group);
         mb_free_blocks(NULL, e4b, start, ex.fe_len);
@@ -4819,6 +4808,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
         ext4_group_t group, ngroups = ext4_get_groups_count(sb);
         ext4_grpblk_t cnt = 0, first_block, last_block;
         uint64_t start, len, minlen, trimmed;
+        ext4_fsblk_t first_data_blk =
+                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
         int ret = 0;
 
         start = range->start >> sb->s_blocksize_bits;
@@ -4828,6 +4819,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
 
         if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
                 return -EINVAL;
+        if (start < first_data_blk) {
+                len -= first_data_blk - start;
+                start = first_data_blk;
+        }
 
         /* Determine first and last group to examine based on start and len */
         ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
@@ -4851,7 +4846,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
                 if (len >= EXT4_BLOCKS_PER_GROUP(sb))
                         len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
                 else
-                        last_block = len;
+                        last_block = first_block + len;
 
                 if (e4b.bd_info->bb_free >= minlen) {
                         cnt = ext4_trim_all_free(sb, &e4b, first_block,
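The last_block fix corrects an off-by-range bug: when the remaining length ends inside a group, the old code trimmed [first_block, len) -- an empty or truncated range whenever len <= first_block -- instead of [first_block, first_block + len). Worked numbers (made up, but the arithmetic is the point):

#include <stdio.h>

int main(void)
{
        unsigned first_block = 1000;    /* range starts here within the group */
        unsigned len = 500;             /* blocks left to trim */

        unsigned old_last = len;                /* 500: ends *before* the start */
        unsigned new_last = first_block + len;  /* 1500: the intended end */

        printf("old range: [%u, %u)  new range: [%u, %u)\n",
               first_block, old_last, first_block, new_last);
        return 0;
}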
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 25f3a974b725..b0a126f23c20 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -496,7 +496,7 @@ int ext4_ext_migrate(struct inode *inode)
         goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
                 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
         tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
-                                   S_IFREG, 0, goal);
+                                   S_IFREG, NULL, goal);
         if (IS_ERR(tmp_inode)) {
                 retval = -ENOMEM;
                 ext4_journal_stop(handle);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index dc40e75cba88..5485390d32c5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -581,9 +581,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
                                         dir->i_sb->s_blocksize -
                                         EXT4_DIR_REC_LEN(0));
         for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
-                if (!ext4_check_dir_entry(dir, de, bh,
+                if (ext4_check_dir_entry(dir, NULL, de, bh,
                                 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
-                                        +((char *)de - bh->b_data))) {
+                                        + ((char *)de - bh->b_data))) {
                         /* On error, skip the f_pos to the next block. */
                         dir_file->f_pos = (dir_file->f_pos |
                                         (dir->i_sb->s_blocksize - 1)) + 1;
@@ -820,7 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh,
                 if ((char *) de + namelen <= dlimit &&
                     ext4_match (namelen, name, de)) {
                         /* found a match - just to be sure, do a full check */
-                        if (!ext4_check_dir_entry(dir, de, bh, offset))
+                        if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
                                 return -1;
                         *res_dir = de;
                         return 1;
@@ -1036,7 +1036,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
                 return ERR_PTR(-EIO);
         }
         inode = ext4_iget(dir->i_sb, ino);
-        if (unlikely(IS_ERR(inode))) {
+        if (IS_ERR(inode)) {
                 if (PTR_ERR(inode) == -ESTALE) {
                         EXT4_ERROR_INODE(dir,
                                          "deleted inode referenced: %u",
@@ -1269,7 +1269,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                 de = (struct ext4_dir_entry_2 *)bh->b_data;
                 top = bh->b_data + blocksize - reclen;
                 while ((char *) de <= top) {
-                        if (!ext4_check_dir_entry(dir, de, bh, offset))
+                        if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
                                 return -EIO;
                         if (ext4_match(namelen, name, de))
                                 return -EEXIST;
@@ -1602,7 +1602,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                         if (err)
                                 goto journal_error;
                 }
-                ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
+                err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
+                if (err) {
+                        ext4_std_error(inode->i_sb, err);
+                        goto cleanup;
+                }
         }
         de = do_split(handle, dir, &bh, frame, &hinfo, &err);
         if (!de)
@@ -1630,17 +1634,21 @@ static int ext4_delete_entry(handle_t *handle,
 {
         struct ext4_dir_entry_2 *de, *pde;
         unsigned int blocksize = dir->i_sb->s_blocksize;
-        int i;
+        int i, err;
 
         i = 0;
         pde = NULL;
         de = (struct ext4_dir_entry_2 *) bh->b_data;
         while (i < bh->b_size) {
-                if (!ext4_check_dir_entry(dir, de, bh, i))
+                if (ext4_check_dir_entry(dir, NULL, de, bh, i))
                         return -EIO;
                 if (de == de_del) {
                         BUFFER_TRACE(bh, "get_write_access");
-                        ext4_journal_get_write_access(handle, bh);
+                        err = ext4_journal_get_write_access(handle, bh);
+                        if (unlikely(err)) {
+                                ext4_std_error(dir->i_sb, err);
+                                return err;
+                        }
                         if (pde)
                                 pde->rec_len = ext4_rec_len_to_disk(
                                         ext4_rec_len_from_disk(pde->rec_len,
@@ -1652,7 +1660,11 @@ static int ext4_delete_entry(handle_t *handle,
                         de->inode = 0;
                         dir->i_version++;
                         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                        ext4_handle_dirty_metadata(handle, dir, bh);
+                        err = ext4_handle_dirty_metadata(handle, dir, bh);
+                        if (unlikely(err)) {
+                                ext4_std_error(dir->i_sb, err);
+                                return err;
+                        }
                         return 0;
                 }
                 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
@@ -1789,7 +1801,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
         handle_t *handle;
         struct inode *inode;
-        struct buffer_head *dir_block;
+        struct buffer_head *dir_block = NULL;
         struct ext4_dir_entry_2 *de;
         unsigned int blocksize = dir->i_sb->s_blocksize;
         int err, retries = 0;
@@ -1822,7 +1834,9 @@ retry:
         if (!dir_block)
                 goto out_clear_inode;
         BUFFER_TRACE(dir_block, "get_write_access");
-        ext4_journal_get_write_access(handle, dir_block);
+        err = ext4_journal_get_write_access(handle, dir_block);
+        if (err)
+                goto out_clear_inode;
         de = (struct ext4_dir_entry_2 *) dir_block->b_data;
         de->inode = cpu_to_le32(inode->i_ino);
         de->name_len = 1;
@@ -1839,10 +1853,12 @@ retry:
         ext4_set_de_type(dir->i_sb, de, S_IFDIR);
         inode->i_nlink = 2;
         BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
-        ext4_handle_dirty_metadata(handle, dir, dir_block);
-        brelse(dir_block);
-        ext4_mark_inode_dirty(handle, inode);
-        err = ext4_add_entry(handle, dentry, inode);
+        err = ext4_handle_dirty_metadata(handle, dir, dir_block);
+        if (err)
+                goto out_clear_inode;
+        err = ext4_mark_inode_dirty(handle, inode);
+        if (!err)
+                err = ext4_add_entry(handle, dentry, inode);
         if (err) {
 out_clear_inode:
                 clear_nlink(inode);
@@ -1853,10 +1869,13 @@ out_clear_inode:
         }
         ext4_inc_count(handle, dir);
         ext4_update_dx_flag(dir);
-        ext4_mark_inode_dirty(handle, dir);
+        err = ext4_mark_inode_dirty(handle, dir);
+        if (err)
+                goto out_clear_inode;
         d_instantiate(dentry, inode);
         unlock_new_inode(inode);
 out_stop:
+        brelse(dir_block);
         ext4_journal_stop(handle);
         if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
                 goto retry;
@@ -1919,7 +1938,7 @@ static int empty_dir(struct inode *inode)
1919 } 1938 }
1920 de = (struct ext4_dir_entry_2 *) bh->b_data; 1939 de = (struct ext4_dir_entry_2 *) bh->b_data;
1921 } 1940 }
1922 if (!ext4_check_dir_entry(inode, de, bh, offset)) { 1941 if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
1923 de = (struct ext4_dir_entry_2 *)(bh->b_data + 1942 de = (struct ext4_dir_entry_2 *)(bh->b_data +
1924 sb->s_blocksize); 1943 sb->s_blocksize);
1925 offset = (offset | (sb->s_blocksize - 1)) + 1; 1944 offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2407,7 +2426,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2407 ext4_current_time(new_dir); 2426 ext4_current_time(new_dir);
2408 ext4_mark_inode_dirty(handle, new_dir); 2427 ext4_mark_inode_dirty(handle, new_dir);
2409 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); 2428 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2410 ext4_handle_dirty_metadata(handle, new_dir, new_bh); 2429 retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
2430 if (unlikely(retval)) {
2431 ext4_std_error(new_dir->i_sb, retval);
2432 goto end_rename;
2433 }
2411 brelse(new_bh); 2434 brelse(new_bh);
2412 new_bh = NULL; 2435 new_bh = NULL;
2413 } 2436 }
@@ -2459,7 +2482,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2459 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 2482 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2460 cpu_to_le32(new_dir->i_ino); 2483 cpu_to_le32(new_dir->i_ino);
2461 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2484 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2462 ext4_handle_dirty_metadata(handle, old_dir, dir_bh); 2485 retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2486 if (retval) {
2487 ext4_std_error(old_dir->i_sb, retval);
2488 goto end_rename;
2489 }
2463 ext4_dec_count(handle, old_dir); 2490 ext4_dec_count(handle, old_dir);
2464 if (new_inode) { 2491 if (new_inode) {
2465 /* checked empty_dir above, can't have another parent, 2492 /* checked empty_dir above, can't have another parent,
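The rename hunks adopt the same discipline: capture the return value of ext4_handle_dirty_metadata(), report it through ext4_std_error(), and bail out via end_rename. A loose userspace analogue of such a report-at-the-failure-site macro is sketched below; std_error and dirty_metadata are invented stand-ins, and unlike the real ext4_std_error() this one records nothing in a superblock.

#include <stdio.h>

#define std_error(err)                                          \
        do {                                                    \
                if (err)                                        \
                        fprintf(stderr, "error %d at %s:%d\n",  \
                                (err), __func__, __LINE__);     \
        } while (0)

static int dirty_metadata(int fail)     /* stand-in for the jbd2 call */
{
        return fail ? -5 : 0;           /* -EIO-style error */
}

int main(void)
{
        int retval = dirty_metadata(1);

        std_error(retval);
        return retval ? 1 : 0;
}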
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index beacce11ac50..7270dcfca92a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -44,7 +44,7 @@ int __init ext4_init_pageio(void)
44 if (io_page_cachep == NULL) 44 if (io_page_cachep == NULL)
45 return -ENOMEM; 45 return -ENOMEM;
46 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); 46 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
47 if (io_page_cachep == NULL) { 47 if (io_end_cachep == NULL) {
48 kmem_cache_destroy(io_page_cachep); 48 kmem_cache_destroy(io_page_cachep);
49 return -ENOMEM; 49 return -ENOMEM;
50 } 50 }
@@ -158,11 +158,8 @@ static void ext4_end_io_work(struct work_struct *work)
158 158
159ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 159ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
160{ 160{
161 ext4_io_end_t *io = NULL; 161 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
162
163 io = kmem_cache_alloc(io_end_cachep, flags);
164 if (io) { 162 if (io) {
165 memset(io, 0, sizeof(*io));
166 atomic_inc(&EXT4_I(inode)->i_ioend_count); 163 atomic_inc(&EXT4_I(inode)->i_ioend_count);
167 io->inode = inode; 164 io->inode = inode;
168 INIT_WORK(&io->work, ext4_end_io_work); 165 INIT_WORK(&io->work, ext4_end_io_work);
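kmem_cache_zalloc() returns memory that is already zeroed, which is what lets the hunk above drop both the NULL initialization and the memset(). The userspace counterpart is calloc() versus malloc()+memset(); a small sketch, where struct io_end is a made-up stand-in rather than the real ext4_io_end:

#include <stdlib.h>
#include <string.h>

struct io_end { int flags; void *inode; };

int main(void)
{
        /* Old shape: allocate, then zero by hand. */
        struct io_end *a = malloc(sizeof(*a));
        if (a)
                memset(a, 0, sizeof(*a));

        /* New shape: one call that hands back zeroed memory. */
        struct io_end *b = calloc(1, sizeof(*b));

        free(a);
        free(b);
        return 0;
}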
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 981c8477adab..3ecc6e45d2f9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -220,7 +220,11 @@ static int setup_new_group_blocks(struct super_block *sb,
220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
221 set_buffer_uptodate(gdb); 221 set_buffer_uptodate(gdb);
222 unlock_buffer(gdb); 222 unlock_buffer(gdb);
223 ext4_handle_dirty_metadata(handle, NULL, gdb); 223 err = ext4_handle_dirty_metadata(handle, NULL, gdb);
224 if (unlikely(err)) {
225 brelse(gdb);
226 goto exit_bh;
227 }
224 ext4_set_bit(bit, bh->b_data); 228 ext4_set_bit(bit, bh->b_data);
225 brelse(gdb); 229 brelse(gdb);
226 } 230 }
@@ -258,7 +262,11 @@ static int setup_new_group_blocks(struct super_block *sb,
258 262
259 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, 263 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
260 bh->b_data); 264 bh->b_data);
261 ext4_handle_dirty_metadata(handle, NULL, bh); 265 err = ext4_handle_dirty_metadata(handle, NULL, bh);
266 if (unlikely(err)) {
267 ext4_std_error(sb, err);
268 goto exit_bh;
269 }
262 brelse(bh); 270 brelse(bh);
263 /* Mark unused entries in inode bitmap used */ 271 /* Mark unused entries in inode bitmap used */
264 ext4_debug("clear inode bitmap %#04llx (+%llu)\n", 272 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
@@ -270,7 +278,9 @@ static int setup_new_group_blocks(struct super_block *sb,
270 278
271 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 279 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
272 bh->b_data); 280 bh->b_data);
273 ext4_handle_dirty_metadata(handle, NULL, bh); 281 err = ext4_handle_dirty_metadata(handle, NULL, bh);
282 if (unlikely(err))
283 ext4_std_error(sb, err);
274exit_bh: 284exit_bh:
275 brelse(bh); 285 brelse(bh);
276 286
@@ -422,17 +432,21 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
422 goto exit_dind; 432 goto exit_dind;
423 } 433 }
424 434
425 if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) 435 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
436 if (unlikely(err))
426 goto exit_dind; 437 goto exit_dind;
427 438
428 if ((err = ext4_journal_get_write_access(handle, *primary))) 439 err = ext4_journal_get_write_access(handle, *primary);
440 if (unlikely(err))
429 goto exit_sbh; 441 goto exit_sbh;
430 442
431 if ((err = ext4_journal_get_write_access(handle, dind))) 443 err = ext4_journal_get_write_access(handle, dind);
432 goto exit_primary; 444 if (unlikely(err))
445 ext4_std_error(sb, err);
433 446
434 /* ext4_reserve_inode_write() gets a reference on the iloc */ 447 /* ext4_reserve_inode_write() gets a reference on the iloc */
435 if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) 448 err = ext4_reserve_inode_write(handle, inode, &iloc);
449 if (unlikely(err))
436 goto exit_dindj; 450 goto exit_dindj;
437 451
438 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), 452 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
@@ -454,12 +468,20 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
454 * reserved inode, and will become GDT blocks (primary and backup). 468 * reserved inode, and will become GDT blocks (primary and backup).
455 */ 469 */
456 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; 470 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
457 ext4_handle_dirty_metadata(handle, NULL, dind); 471 err = ext4_handle_dirty_metadata(handle, NULL, dind);
458 brelse(dind); 472 if (unlikely(err)) {
473 ext4_std_error(sb, err);
474 goto exit_inode;
475 }
459 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 476 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
460 ext4_mark_iloc_dirty(handle, inode, &iloc); 477 ext4_mark_iloc_dirty(handle, inode, &iloc);
461 memset((*primary)->b_data, 0, sb->s_blocksize); 478 memset((*primary)->b_data, 0, sb->s_blocksize);
462 ext4_handle_dirty_metadata(handle, NULL, *primary); 479 err = ext4_handle_dirty_metadata(handle, NULL, *primary);
480 if (unlikely(err)) {
481 ext4_std_error(sb, err);
482 goto exit_inode;
483 }
484 brelse(dind);
463 485
464 o_group_desc = EXT4_SB(sb)->s_group_desc; 486 o_group_desc = EXT4_SB(sb)->s_group_desc;
465 memcpy(n_group_desc, o_group_desc, 487 memcpy(n_group_desc, o_group_desc,
@@ -470,19 +492,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
470 kfree(o_group_desc); 492 kfree(o_group_desc);
471 493
472 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 494 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
473 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); 495 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
496 if (err)
497 ext4_std_error(sb, err);
474 498
475 return 0; 499 return err;
476 500
477exit_inode: 501exit_inode:
478 /* ext4_journal_release_buffer(handle, iloc.bh); */ 502 /* ext4_journal_release_buffer(handle, iloc.bh); */
479 brelse(iloc.bh); 503 brelse(iloc.bh);
480exit_dindj: 504exit_dindj:
481 /* ext4_journal_release_buffer(handle, dind); */ 505 /* ext4_journal_release_buffer(handle, dind); */
482exit_primary:
483 /* ext4_journal_release_buffer(handle, *primary); */
484exit_sbh: 506exit_sbh:
485 /* ext4_journal_release_buffer(handle, *primary); */ 507 /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
486exit_dind: 508exit_dind:
487 brelse(dind); 509 brelse(dind);
488exit_bh: 510exit_bh:
@@ -665,7 +687,9 @@ static void update_backups(struct super_block *sb,
665 memset(bh->b_data + size, 0, rest); 687 memset(bh->b_data + size, 0, rest);
666 set_buffer_uptodate(bh); 688 set_buffer_uptodate(bh);
667 unlock_buffer(bh); 689 unlock_buffer(bh);
668 ext4_handle_dirty_metadata(handle, NULL, bh); 690 err = ext4_handle_dirty_metadata(handle, NULL, bh);
691 if (unlikely(err))
692 ext4_std_error(sb, err);
669 brelse(bh); 693 brelse(bh);
670 } 694 }
671 if ((err2 = ext4_journal_stop(handle)) && !err) 695 if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -883,7 +907,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
883 /* Update the global fs size fields */ 907 /* Update the global fs size fields */
884 sbi->s_groups_count++; 908 sbi->s_groups_count++;
885 909
886 ext4_handle_dirty_metadata(handle, NULL, primary); 910 err = ext4_handle_dirty_metadata(handle, NULL, primary);
911 if (unlikely(err)) {
912 ext4_std_error(sb, err);
913 goto exit_journal;
914 }
887 915
888 /* Update the reserved block counts only once the new group is 916 /* Update the reserved block counts only once the new group is
889 * active. */ 917 * active. */
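A recurring change in the resize.c hunks is unfolding `if ((err = f(...)))` into an assignment followed by a test, so the error path can grow extra statements without reflowing the condition. A trivial sketch of the two shapes, with get_write_access() as an invented stand-in:

#include <stdio.h>

static int get_write_access(int id)     /* stand-in; fails for id < 0 */
{
        return id < 0 ? -13 : 0;
}

int main(void)
{
        int err;

        /* Old shape: assignment buried inside the condition. */
        if ((err = get_write_access(-1)))
                fprintf(stderr, "old style: err=%d\n", err);

        /* New shape from the patch: assign first, then test. */
        err = get_write_access(-1);
        if (err)
                fprintf(stderr, "new style: err=%d\n", err);

        return 0;
}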
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cd37f9d5e447..48ce561fafac 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -388,13 +388,14 @@ static void ext4_handle_error(struct super_block *sb)
388void __ext4_error(struct super_block *sb, const char *function, 388void __ext4_error(struct super_block *sb, const char *function,
389 unsigned int line, const char *fmt, ...) 389 unsigned int line, const char *fmt, ...)
390{ 390{
391 struct va_format vaf;
391 va_list args; 392 va_list args;
392 393
393 va_start(args, fmt); 394 va_start(args, fmt);
394 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ", 395 vaf.fmt = fmt;
395 sb->s_id, function, line, current->comm); 396 vaf.va = &args;
396 vprintk(fmt, args); 397 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
397 printk("\n"); 398 sb->s_id, function, line, current->comm, &vaf);
398 va_end(args); 399 va_end(args);
399 400
400 ext4_handle_error(sb); 401 ext4_handle_error(sb);
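The struct va_format/%pV rework here and in the hunks that follow replaces printk-vprintk-printk triples with a single printk(), which keeps the log level attached to the whole message and stops concurrent messages from interleaving. %pV is a kernel printk extension, so the sketch below fakes the effect in userspace by pre-formatting the variadic part with vsnprintf(); fs_error() and the buffer size are invented.

#include <stdarg.h>
#include <stdio.h>

static void fs_error(const char *dev, const char *fmt, ...)
{
        char msg[256];
        va_list args;

        va_start(args, fmt);
        vsnprintf(msg, sizeof(msg), fmt, args);
        va_end(args);

        /* Prefix and message go out in one call, as %pV allows. */
        fprintf(stderr, "EXT4-fs error (device %s): %s\n", dev, msg);
}

int main(void)
{
        fs_error("sda1", "bad block %llu", 12345ULL);
        return 0;
}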
@@ -405,28 +406,31 @@ void ext4_error_inode(struct inode *inode, const char *function,
405 const char *fmt, ...) 406 const char *fmt, ...)
406{ 407{
407 va_list args; 408 va_list args;
409 struct va_format vaf;
408 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 410 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
409 411
410 es->s_last_error_ino = cpu_to_le32(inode->i_ino); 412 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
411 es->s_last_error_block = cpu_to_le64(block); 413 es->s_last_error_block = cpu_to_le64(block);
412 save_error_info(inode->i_sb, function, line); 414 save_error_info(inode->i_sb, function, line);
413 va_start(args, fmt); 415 va_start(args, fmt);
416 vaf.fmt = fmt;
417 vaf.va = &args;
414 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", 418 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
415 inode->i_sb->s_id, function, line, inode->i_ino); 419 inode->i_sb->s_id, function, line, inode->i_ino);
416 if (block) 420 if (block)
417 printk("block %llu: ", block); 421 printk(KERN_CONT "block %llu: ", block);
418 printk("comm %s: ", current->comm); 422 printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf);
419 vprintk(fmt, args);
420 printk("\n");
421 va_end(args); 423 va_end(args);
422 424
423 ext4_handle_error(inode->i_sb); 425 ext4_handle_error(inode->i_sb);
424} 426}
425 427
426void ext4_error_file(struct file *file, const char *function, 428void ext4_error_file(struct file *file, const char *function,
427 unsigned int line, const char *fmt, ...) 429 unsigned int line, ext4_fsblk_t block,
430 const char *fmt, ...)
428{ 431{
429 va_list args; 432 va_list args;
433 struct va_format vaf;
430 struct ext4_super_block *es; 434 struct ext4_super_block *es;
431 struct inode *inode = file->f_dentry->d_inode; 435 struct inode *inode = file->f_dentry->d_inode;
432 char pathname[80], *path; 436 char pathname[80], *path;
@@ -434,17 +438,18 @@ void ext4_error_file(struct file *file, const char *function,
434 es = EXT4_SB(inode->i_sb)->s_es; 438 es = EXT4_SB(inode->i_sb)->s_es;
435 es->s_last_error_ino = cpu_to_le32(inode->i_ino); 439 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
436 save_error_info(inode->i_sb, function, line); 440 save_error_info(inode->i_sb, function, line);
437 va_start(args, fmt);
438 path = d_path(&(file->f_path), pathname, sizeof(pathname)); 441 path = d_path(&(file->f_path), pathname, sizeof(pathname));
439 if (!path) 442 if (IS_ERR(path))
440 path = "(unknown)"; 443 path = "(unknown)";
441 printk(KERN_CRIT 444 printk(KERN_CRIT
442 "EXT4-fs error (device %s): %s:%d: inode #%lu " 445 "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
443 "(comm %s path %s): ", 446 inode->i_sb->s_id, function, line, inode->i_ino);
444 inode->i_sb->s_id, function, line, inode->i_ino, 447 if (block)
445 current->comm, path); 448 printk(KERN_CONT "block %llu: ", block);
446 vprintk(fmt, args); 449 va_start(args, fmt);
447 printk("\n"); 450 vaf.fmt = fmt;
451 vaf.va = &args;
452 printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf);
448 va_end(args); 453 va_end(args);
449 454
450 ext4_handle_error(inode->i_sb); 455 ext4_handle_error(inode->i_sb);
@@ -543,28 +548,29 @@ void __ext4_abort(struct super_block *sb, const char *function,
543 panic("EXT4-fs panic from previous error\n"); 548 panic("EXT4-fs panic from previous error\n");
544} 549}
545 550
546void ext4_msg (struct super_block * sb, const char *prefix, 551void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
547 const char *fmt, ...)
548{ 552{
553 struct va_format vaf;
549 va_list args; 554 va_list args;
550 555
551 va_start(args, fmt); 556 va_start(args, fmt);
552 printk("%sEXT4-fs (%s): ", prefix, sb->s_id); 557 vaf.fmt = fmt;
553 vprintk(fmt, args); 558 vaf.va = &args;
554 printk("\n"); 559 printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
555 va_end(args); 560 va_end(args);
556} 561}
557 562
558void __ext4_warning(struct super_block *sb, const char *function, 563void __ext4_warning(struct super_block *sb, const char *function,
559 unsigned int line, const char *fmt, ...) 564 unsigned int line, const char *fmt, ...)
560{ 565{
566 struct va_format vaf;
561 va_list args; 567 va_list args;
562 568
563 va_start(args, fmt); 569 va_start(args, fmt);
564 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ", 570 vaf.fmt = fmt;
565 sb->s_id, function, line); 571 vaf.va = &args;
566 vprintk(fmt, args); 572 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
567 printk("\n"); 573 sb->s_id, function, line, &vaf);
568 va_end(args); 574 va_end(args);
569} 575}
570 576
@@ -575,21 +581,25 @@ void __ext4_grp_locked_error(const char *function, unsigned int line,
575__releases(bitlock) 581__releases(bitlock)
576__acquires(bitlock) 582__acquires(bitlock)
577{ 583{
584 struct va_format vaf;
578 va_list args; 585 va_list args;
579 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 586 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
580 587
581 es->s_last_error_ino = cpu_to_le32(ino); 588 es->s_last_error_ino = cpu_to_le32(ino);
582 es->s_last_error_block = cpu_to_le64(block); 589 es->s_last_error_block = cpu_to_le64(block);
583 __save_error_info(sb, function, line); 590 __save_error_info(sb, function, line);
591
584 va_start(args, fmt); 592 va_start(args, fmt);
593
594 vaf.fmt = fmt;
595 vaf.va = &args;
585 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", 596 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
586 sb->s_id, function, line, grp); 597 sb->s_id, function, line, grp);
587 if (ino) 598 if (ino)
588 printk("inode %lu: ", ino); 599 printk(KERN_CONT "inode %lu: ", ino);
589 if (block) 600 if (block)
590 printk("block %llu:", (unsigned long long) block); 601 printk(KERN_CONT "block %llu:", (unsigned long long) block);
591 vprintk(fmt, args); 602 printk(KERN_CONT "%pV\n", &vaf);
592 printk("\n");
593 va_end(args); 603 va_end(args);
594 604
595 if (test_opt(sb, ERRORS_CONT)) { 605 if (test_opt(sb, ERRORS_CONT)) {
@@ -647,7 +657,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
647 struct block_device *bdev; 657 struct block_device *bdev;
648 char b[BDEVNAME_SIZE]; 658 char b[BDEVNAME_SIZE];
649 659
650 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 660 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
651 if (IS_ERR(bdev)) 661 if (IS_ERR(bdev))
652 goto fail; 662 goto fail;
653 return bdev; 663 return bdev;
@@ -663,8 +673,7 @@ fail:
663 */ 673 */
664static int ext4_blkdev_put(struct block_device *bdev) 674static int ext4_blkdev_put(struct block_device *bdev)
665{ 675{
666 bd_release(bdev); 676 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
667 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
668} 677}
669 678
670static int ext4_blkdev_remove(struct ext4_sb_info *sbi) 679static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
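blkdev_get_by_dev() with FMODE_EXCL folds the old open_by_devnum()+bd_claim() pair into one call that both opens the device and registers sb as its exclusive holder, which is why the bd_release() in ext4_blkdev_put() also disappears. Userspace can ask for a similar exclusive claim with O_EXCL on a block device node, as open(2) documents for Linux 2.6 and later; /dev/sdX below is a placeholder, so point it at a scratch device.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Fails with EBUSY if the device is already in use
         * (mounted, claimed by another holder, ...). */
        int fd = open("/dev/sdX", O_RDWR | O_EXCL);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        close(fd);
        return 0;
}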
@@ -808,21 +817,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
808 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 817 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
809 INIT_LIST_HEAD(&ei->i_prealloc_list); 818 INIT_LIST_HEAD(&ei->i_prealloc_list);
810 spin_lock_init(&ei->i_prealloc_lock); 819 spin_lock_init(&ei->i_prealloc_lock);
811 /*
812 * Note: We can be called before EXT4_SB(sb)->s_journal is set,
813 * therefore it can be null here. Don't check it, just initialize
814 * jinode.
815 */
816 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
817 ei->i_reserved_data_blocks = 0; 820 ei->i_reserved_data_blocks = 0;
818 ei->i_reserved_meta_blocks = 0; 821 ei->i_reserved_meta_blocks = 0;
819 ei->i_allocated_meta_blocks = 0; 822 ei->i_allocated_meta_blocks = 0;
820 ei->i_da_metadata_calc_len = 0; 823 ei->i_da_metadata_calc_len = 0;
821 ei->i_delalloc_reserved_flag = 0;
822 spin_lock_init(&(ei->i_block_reservation_lock)); 824 spin_lock_init(&(ei->i_block_reservation_lock));
823#ifdef CONFIG_QUOTA 825#ifdef CONFIG_QUOTA
824 ei->i_reserved_quota = 0; 826 ei->i_reserved_quota = 0;
825#endif 827#endif
828 ei->jinode = NULL;
826 INIT_LIST_HEAD(&ei->i_completed_io_list); 829 INIT_LIST_HEAD(&ei->i_completed_io_list);
827 spin_lock_init(&ei->i_completed_io_lock); 830 spin_lock_init(&ei->i_completed_io_lock);
828 ei->cur_aio_dio = NULL; 831 ei->cur_aio_dio = NULL;
@@ -898,9 +901,12 @@ void ext4_clear_inode(struct inode *inode)
898 end_writeback(inode); 901 end_writeback(inode);
899 dquot_drop(inode); 902 dquot_drop(inode);
900 ext4_discard_preallocations(inode); 903 ext4_discard_preallocations(inode);
901 if (EXT4_JOURNAL(inode)) 904 if (EXT4_I(inode)->jinode) {
902 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 905 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
903 &EXT4_I(inode)->jinode); 906 EXT4_I(inode)->jinode);
907 jbd2_free_inode(EXT4_I(inode)->jinode);
908 EXT4_I(inode)->jinode = NULL;
909 }
904} 910}
905 911
906static inline void ext4_show_quota_options(struct seq_file *seq, 912static inline void ext4_show_quota_options(struct seq_file *seq,
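The jinode field turns into a pointer that starts out NULL and is only allocated for inodes that actually join the journal, with ext4_clear_inode() releasing it; the allocation site itself lies outside this excerpt. A sketch of the lazy-allocation shape, with all type and function names invented:

#include <stdlib.h>

struct jinode { int dummy; };
struct inode_info { struct jinode *jinode; };

/* Allocate journal bookkeeping only when an inode first needs it,
 * instead of embedding it in every inode as before. */
static struct jinode *get_jinode(struct inode_info *ei)
{
        if (!ei->jinode)
                ei->jinode = calloc(1, sizeof(*ei->jinode));
        return ei->jinode;
}

static void clear_inode(struct inode_info *ei)
{
        free(ei->jinode);       /* free(NULL) is a no-op */
        ei->jinode = NULL;
}

int main(void)
{
        struct inode_info ei = { 0 };

        if (!get_jinode(&ei))
                return 1;
        clear_inode(&ei);
        return 0;
}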
@@ -1155,7 +1161,7 @@ static int ext4_release_dquot(struct dquot *dquot);
1155static int ext4_mark_dquot_dirty(struct dquot *dquot); 1161static int ext4_mark_dquot_dirty(struct dquot *dquot);
1156static int ext4_write_info(struct super_block *sb, int type); 1162static int ext4_write_info(struct super_block *sb, int type);
1157static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1163static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1158 char *path); 1164 struct path *path);
1159static int ext4_quota_off(struct super_block *sb, int type); 1165static int ext4_quota_off(struct super_block *sb, int type);
1160static int ext4_quota_on_mount(struct super_block *sb, int type); 1166static int ext4_quota_on_mount(struct super_block *sb, int type);
1161static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1167static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
@@ -1393,7 +1399,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1393 sbi->s_qf_names[qtype] = NULL; 1399 sbi->s_qf_names[qtype] = NULL;
1394 return 0; 1400 return 0;
1395 } 1401 }
1396 set_opt(sbi->s_mount_opt, QUOTA); 1402 set_opt(sb, QUOTA);
1397 return 1; 1403 return 1;
1398} 1404}
1399 1405
@@ -1448,21 +1454,21 @@ static int parse_options(char *options, struct super_block *sb,
1448 switch (token) { 1454 switch (token) {
1449 case Opt_bsd_df: 1455 case Opt_bsd_df:
1450 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1456 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1451 clear_opt(sbi->s_mount_opt, MINIX_DF); 1457 clear_opt(sb, MINIX_DF);
1452 break; 1458 break;
1453 case Opt_minix_df: 1459 case Opt_minix_df:
1454 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1460 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1455 set_opt(sbi->s_mount_opt, MINIX_DF); 1461 set_opt(sb, MINIX_DF);
1456 1462
1457 break; 1463 break;
1458 case Opt_grpid: 1464 case Opt_grpid:
1459 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1465 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1460 set_opt(sbi->s_mount_opt, GRPID); 1466 set_opt(sb, GRPID);
1461 1467
1462 break; 1468 break;
1463 case Opt_nogrpid: 1469 case Opt_nogrpid:
1464 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1470 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1465 clear_opt(sbi->s_mount_opt, GRPID); 1471 clear_opt(sb, GRPID);
1466 1472
1467 break; 1473 break;
1468 case Opt_resuid: 1474 case Opt_resuid:
@@ -1480,38 +1486,38 @@ static int parse_options(char *options, struct super_block *sb,
1480 /* *sb_block = match_int(&args[0]); */ 1486 /* *sb_block = match_int(&args[0]); */
1481 break; 1487 break;
1482 case Opt_err_panic: 1488 case Opt_err_panic:
1483 clear_opt(sbi->s_mount_opt, ERRORS_CONT); 1489 clear_opt(sb, ERRORS_CONT);
1484 clear_opt(sbi->s_mount_opt, ERRORS_RO); 1490 clear_opt(sb, ERRORS_RO);
1485 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 1491 set_opt(sb, ERRORS_PANIC);
1486 break; 1492 break;
1487 case Opt_err_ro: 1493 case Opt_err_ro:
1488 clear_opt(sbi->s_mount_opt, ERRORS_CONT); 1494 clear_opt(sb, ERRORS_CONT);
1489 clear_opt(sbi->s_mount_opt, ERRORS_PANIC); 1495 clear_opt(sb, ERRORS_PANIC);
1490 set_opt(sbi->s_mount_opt, ERRORS_RO); 1496 set_opt(sb, ERRORS_RO);
1491 break; 1497 break;
1492 case Opt_err_cont: 1498 case Opt_err_cont:
1493 clear_opt(sbi->s_mount_opt, ERRORS_RO); 1499 clear_opt(sb, ERRORS_RO);
1494 clear_opt(sbi->s_mount_opt, ERRORS_PANIC); 1500 clear_opt(sb, ERRORS_PANIC);
1495 set_opt(sbi->s_mount_opt, ERRORS_CONT); 1501 set_opt(sb, ERRORS_CONT);
1496 break; 1502 break;
1497 case Opt_nouid32: 1503 case Opt_nouid32:
1498 set_opt(sbi->s_mount_opt, NO_UID32); 1504 set_opt(sb, NO_UID32);
1499 break; 1505 break;
1500 case Opt_debug: 1506 case Opt_debug:
1501 set_opt(sbi->s_mount_opt, DEBUG); 1507 set_opt(sb, DEBUG);
1502 break; 1508 break;
1503 case Opt_oldalloc: 1509 case Opt_oldalloc:
1504 set_opt(sbi->s_mount_opt, OLDALLOC); 1510 set_opt(sb, OLDALLOC);
1505 break; 1511 break;
1506 case Opt_orlov: 1512 case Opt_orlov:
1507 clear_opt(sbi->s_mount_opt, OLDALLOC); 1513 clear_opt(sb, OLDALLOC);
1508 break; 1514 break;
1509#ifdef CONFIG_EXT4_FS_XATTR 1515#ifdef CONFIG_EXT4_FS_XATTR
1510 case Opt_user_xattr: 1516 case Opt_user_xattr:
1511 set_opt(sbi->s_mount_opt, XATTR_USER); 1517 set_opt(sb, XATTR_USER);
1512 break; 1518 break;
1513 case Opt_nouser_xattr: 1519 case Opt_nouser_xattr:
1514 clear_opt(sbi->s_mount_opt, XATTR_USER); 1520 clear_opt(sb, XATTR_USER);
1515 break; 1521 break;
1516#else 1522#else
1517 case Opt_user_xattr: 1523 case Opt_user_xattr:
@@ -1521,10 +1527,10 @@ static int parse_options(char *options, struct super_block *sb,
1521#endif 1527#endif
1522#ifdef CONFIG_EXT4_FS_POSIX_ACL 1528#ifdef CONFIG_EXT4_FS_POSIX_ACL
1523 case Opt_acl: 1529 case Opt_acl:
1524 set_opt(sbi->s_mount_opt, POSIX_ACL); 1530 set_opt(sb, POSIX_ACL);
1525 break; 1531 break;
1526 case Opt_noacl: 1532 case Opt_noacl:
1527 clear_opt(sbi->s_mount_opt, POSIX_ACL); 1533 clear_opt(sb, POSIX_ACL);
1528 break; 1534 break;
1529#else 1535#else
1530 case Opt_acl: 1536 case Opt_acl:
@@ -1543,7 +1549,7 @@ static int parse_options(char *options, struct super_block *sb,
1543 "Cannot specify journal on remount"); 1549 "Cannot specify journal on remount");
1544 return 0; 1550 return 0;
1545 } 1551 }
1546 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); 1552 set_opt(sb, UPDATE_JOURNAL);
1547 break; 1553 break;
1548 case Opt_journal_dev: 1554 case Opt_journal_dev:
1549 if (is_remount) { 1555 if (is_remount) {
@@ -1556,14 +1562,14 @@ static int parse_options(char *options, struct super_block *sb,
1556 *journal_devnum = option; 1562 *journal_devnum = option;
1557 break; 1563 break;
1558 case Opt_journal_checksum: 1564 case Opt_journal_checksum:
1559 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1565 set_opt(sb, JOURNAL_CHECKSUM);
1560 break; 1566 break;
1561 case Opt_journal_async_commit: 1567 case Opt_journal_async_commit:
1562 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); 1568 set_opt(sb, JOURNAL_ASYNC_COMMIT);
1563 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1569 set_opt(sb, JOURNAL_CHECKSUM);
1564 break; 1570 break;
1565 case Opt_noload: 1571 case Opt_noload:
1566 set_opt(sbi->s_mount_opt, NOLOAD); 1572 set_opt(sb, NOLOAD);
1567 break; 1573 break;
1568 case Opt_commit: 1574 case Opt_commit:
1569 if (match_int(&args[0], &option)) 1575 if (match_int(&args[0], &option))
@@ -1606,15 +1612,15 @@ static int parse_options(char *options, struct super_block *sb,
1606 return 0; 1612 return 0;
1607 } 1613 }
1608 } else { 1614 } else {
1609 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 1615 clear_opt(sb, DATA_FLAGS);
1610 sbi->s_mount_opt |= data_opt; 1616 sbi->s_mount_opt |= data_opt;
1611 } 1617 }
1612 break; 1618 break;
1613 case Opt_data_err_abort: 1619 case Opt_data_err_abort:
1614 set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1620 set_opt(sb, DATA_ERR_ABORT);
1615 break; 1621 break;
1616 case Opt_data_err_ignore: 1622 case Opt_data_err_ignore:
1617 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1623 clear_opt(sb, DATA_ERR_ABORT);
1618 break; 1624 break;
1619#ifdef CONFIG_QUOTA 1625#ifdef CONFIG_QUOTA
1620 case Opt_usrjquota: 1626 case Opt_usrjquota:
@@ -1654,12 +1660,12 @@ set_qf_format:
1654 break; 1660 break;
1655 case Opt_quota: 1661 case Opt_quota:
1656 case Opt_usrquota: 1662 case Opt_usrquota:
1657 set_opt(sbi->s_mount_opt, QUOTA); 1663 set_opt(sb, QUOTA);
1658 set_opt(sbi->s_mount_opt, USRQUOTA); 1664 set_opt(sb, USRQUOTA);
1659 break; 1665 break;
1660 case Opt_grpquota: 1666 case Opt_grpquota:
1661 set_opt(sbi->s_mount_opt, QUOTA); 1667 set_opt(sb, QUOTA);
1662 set_opt(sbi->s_mount_opt, GRPQUOTA); 1668 set_opt(sb, GRPQUOTA);
1663 break; 1669 break;
1664 case Opt_noquota: 1670 case Opt_noquota:
1665 if (sb_any_quota_loaded(sb)) { 1671 if (sb_any_quota_loaded(sb)) {
@@ -1667,9 +1673,9 @@ set_qf_format:
1667 "options when quota turned on"); 1673 "options when quota turned on");
1668 return 0; 1674 return 0;
1669 } 1675 }
1670 clear_opt(sbi->s_mount_opt, QUOTA); 1676 clear_opt(sb, QUOTA);
1671 clear_opt(sbi->s_mount_opt, USRQUOTA); 1677 clear_opt(sb, USRQUOTA);
1672 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1678 clear_opt(sb, GRPQUOTA);
1673 break; 1679 break;
1674#else 1680#else
1675 case Opt_quota: 1681 case Opt_quota:
@@ -1695,7 +1701,7 @@ set_qf_format:
1695 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; 1701 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1696 break; 1702 break;
1697 case Opt_nobarrier: 1703 case Opt_nobarrier:
1698 clear_opt(sbi->s_mount_opt, BARRIER); 1704 clear_opt(sb, BARRIER);
1699 break; 1705 break;
1700 case Opt_barrier: 1706 case Opt_barrier:
1701 if (args[0].from) { 1707 if (args[0].from) {
@@ -1704,9 +1710,9 @@ set_qf_format:
1704 } else 1710 } else
1705 option = 1; /* No argument, default to 1 */ 1711 option = 1; /* No argument, default to 1 */
1706 if (option) 1712 if (option)
1707 set_opt(sbi->s_mount_opt, BARRIER); 1713 set_opt(sb, BARRIER);
1708 else 1714 else
1709 clear_opt(sbi->s_mount_opt, BARRIER); 1715 clear_opt(sb, BARRIER);
1710 break; 1716 break;
1711 case Opt_ignore: 1717 case Opt_ignore:
1712 break; 1718 break;
@@ -1730,17 +1736,17 @@ set_qf_format:
1730 "Ignoring deprecated bh option"); 1736 "Ignoring deprecated bh option");
1731 break; 1737 break;
1732 case Opt_i_version: 1738 case Opt_i_version:
1733 set_opt(sbi->s_mount_opt, I_VERSION); 1739 set_opt(sb, I_VERSION);
1734 sb->s_flags |= MS_I_VERSION; 1740 sb->s_flags |= MS_I_VERSION;
1735 break; 1741 break;
1736 case Opt_nodelalloc: 1742 case Opt_nodelalloc:
1737 clear_opt(sbi->s_mount_opt, DELALLOC); 1743 clear_opt(sb, DELALLOC);
1738 break; 1744 break;
1739 case Opt_mblk_io_submit: 1745 case Opt_mblk_io_submit:
1740 set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); 1746 set_opt(sb, MBLK_IO_SUBMIT);
1741 break; 1747 break;
1742 case Opt_nomblk_io_submit: 1748 case Opt_nomblk_io_submit:
1743 clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); 1749 clear_opt(sb, MBLK_IO_SUBMIT);
1744 break; 1750 break;
1745 case Opt_stripe: 1751 case Opt_stripe:
1746 if (match_int(&args[0], &option)) 1752 if (match_int(&args[0], &option))
@@ -1750,13 +1756,13 @@ set_qf_format:
1750 sbi->s_stripe = option; 1756 sbi->s_stripe = option;
1751 break; 1757 break;
1752 case Opt_delalloc: 1758 case Opt_delalloc:
1753 set_opt(sbi->s_mount_opt, DELALLOC); 1759 set_opt(sb, DELALLOC);
1754 break; 1760 break;
1755 case Opt_block_validity: 1761 case Opt_block_validity:
1756 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 1762 set_opt(sb, BLOCK_VALIDITY);
1757 break; 1763 break;
1758 case Opt_noblock_validity: 1764 case Opt_noblock_validity:
1759 clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 1765 clear_opt(sb, BLOCK_VALIDITY);
1760 break; 1766 break;
1761 case Opt_inode_readahead_blks: 1767 case Opt_inode_readahead_blks:
1762 if (match_int(&args[0], &option)) 1768 if (match_int(&args[0], &option))
@@ -1780,7 +1786,7 @@ set_qf_format:
1780 option); 1786 option);
1781 break; 1787 break;
1782 case Opt_noauto_da_alloc: 1788 case Opt_noauto_da_alloc:
1783 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1789 set_opt(sb, NO_AUTO_DA_ALLOC);
1784 break; 1790 break;
1785 case Opt_auto_da_alloc: 1791 case Opt_auto_da_alloc:
1786 if (args[0].from) { 1792 if (args[0].from) {
@@ -1789,24 +1795,24 @@ set_qf_format:
1789 } else 1795 } else
1790 option = 1; /* No argument, default to 1 */ 1796 option = 1; /* No argument, default to 1 */
1791 if (option) 1797 if (option)
1792 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1798 clear_opt(sb, NO_AUTO_DA_ALLOC);
1793 else 1799 else
1794 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1800 set_opt(sb,NO_AUTO_DA_ALLOC);
1795 break; 1801 break;
1796 case Opt_discard: 1802 case Opt_discard:
1797 set_opt(sbi->s_mount_opt, DISCARD); 1803 set_opt(sb, DISCARD);
1798 break; 1804 break;
1799 case Opt_nodiscard: 1805 case Opt_nodiscard:
1800 clear_opt(sbi->s_mount_opt, DISCARD); 1806 clear_opt(sb, DISCARD);
1801 break; 1807 break;
1802 case Opt_dioread_nolock: 1808 case Opt_dioread_nolock:
1803 set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1809 set_opt(sb, DIOREAD_NOLOCK);
1804 break; 1810 break;
1805 case Opt_dioread_lock: 1811 case Opt_dioread_lock:
1806 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1812 clear_opt(sb, DIOREAD_NOLOCK);
1807 break; 1813 break;
1808 case Opt_init_inode_table: 1814 case Opt_init_inode_table:
1809 set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); 1815 set_opt(sb, INIT_INODE_TABLE);
1810 if (args[0].from) { 1816 if (args[0].from) {
1811 if (match_int(&args[0], &option)) 1817 if (match_int(&args[0], &option))
1812 return 0; 1818 return 0;
@@ -1817,7 +1823,7 @@ set_qf_format:
1817 sbi->s_li_wait_mult = option; 1823 sbi->s_li_wait_mult = option;
1818 break; 1824 break;
1819 case Opt_noinit_inode_table: 1825 case Opt_noinit_inode_table:
1820 clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE); 1826 clear_opt(sb, INIT_INODE_TABLE);
1821 break; 1827 break;
1822 default: 1828 default:
1823 ext4_msg(sb, KERN_ERR, 1829 ext4_msg(sb, KERN_ERR,
@@ -1829,10 +1835,10 @@ set_qf_format:
1829#ifdef CONFIG_QUOTA 1835#ifdef CONFIG_QUOTA
1830 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1836 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1831 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) 1837 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1832 clear_opt(sbi->s_mount_opt, USRQUOTA); 1838 clear_opt(sb, USRQUOTA);
1833 1839
1834 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) 1840 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1835 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1841 clear_opt(sb, GRPQUOTA);
1836 1842
1837 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { 1843 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1838 ext4_msg(sb, KERN_ERR, "old and new quota " 1844 ext4_msg(sb, KERN_ERR, "old and new quota "
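The long run of set_opt()/clear_opt() changes above is mechanical: the macros now take the superblock and dig out the mount-option word themselves, which is what makes room for the second word (s_mount_opt2) seen in the remount hunks below. A compilable sketch of macros with that shape, using made-up option bits and a simplified super_block:

#include <stdio.h>

#define MOUNT_DEBUG     0x0001
#define MOUNT_DISCARD   0x0002

struct sb_info { unsigned long s_mount_opt; };
struct super_block { struct sb_info *s_fs_info; };

#define SBI(sb)                 ((sb)->s_fs_info)
#define set_opt(sb, opt)        (SBI(sb)->s_mount_opt |= MOUNT_##opt)
#define clear_opt(sb, opt)      (SBI(sb)->s_mount_opt &= ~MOUNT_##opt)
#define test_opt(sb, opt)       (SBI(sb)->s_mount_opt & MOUNT_##opt)

int main(void)
{
        struct sb_info sbi = { 0 };
        struct super_block sb = { &sbi };

        set_opt(&sb, DEBUG);
        set_opt(&sb, DISCARD);
        clear_opt(&sb, DEBUG);
        printf("opts=%#lx discard=%d\n",
               sbi.s_mount_opt, !!test_opt(&sb, DISCARD));
        return 0;
}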
@@ -1902,12 +1908,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1902 ext4_commit_super(sb, 1); 1908 ext4_commit_super(sb, 1);
1903 if (test_opt(sb, DEBUG)) 1909 if (test_opt(sb, DEBUG))
1904 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " 1910 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1905 "bpg=%lu, ipg=%lu, mo=%04x]\n", 1911 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
1906 sb->s_blocksize, 1912 sb->s_blocksize,
1907 sbi->s_groups_count, 1913 sbi->s_groups_count,
1908 EXT4_BLOCKS_PER_GROUP(sb), 1914 EXT4_BLOCKS_PER_GROUP(sb),
1909 EXT4_INODES_PER_GROUP(sb), 1915 EXT4_INODES_PER_GROUP(sb),
1910 sbi->s_mount_opt); 1916 sbi->s_mount_opt, sbi->s_mount_opt2);
1911 1917
1912 return res; 1918 return res;
1913} 1919}
@@ -1937,14 +1943,13 @@ static int ext4_fill_flex_info(struct super_block *sb)
1937 size = flex_group_count * sizeof(struct flex_groups); 1943 size = flex_group_count * sizeof(struct flex_groups);
1938 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); 1944 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
1939 if (sbi->s_flex_groups == NULL) { 1945 if (sbi->s_flex_groups == NULL) {
1940 sbi->s_flex_groups = vmalloc(size); 1946 sbi->s_flex_groups = vzalloc(size);
1941 if (sbi->s_flex_groups) 1947 if (sbi->s_flex_groups == NULL) {
1942 memset(sbi->s_flex_groups, 0, size); 1948 ext4_msg(sb, KERN_ERR,
1943 } 1949 "not enough memory for %u flex groups",
1944 if (sbi->s_flex_groups == NULL) { 1950 flex_group_count);
1945 ext4_msg(sb, KERN_ERR, "not enough memory for " 1951 goto failed;
1946 "%u flex groups", flex_group_count); 1952 }
1947 goto failed;
1948 } 1953 }
1949 1954
1950 for (i = 0; i < sbi->s_groups_count; i++) { 1955 for (i = 0; i < sbi->s_groups_count; i++) {
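vzalloc() gives the vmalloc() path the same zeroed-on-return guarantee kzalloc() already gave the small-allocation path, so the fallback collapses to a single branch. The sketch below mirrors only the control flow; calloc() stands in for both allocators, which is of course not how the kernel distinguishes them.

#include <stdlib.h>

static void *flex_groups_alloc(size_t size)
{
        void *p = calloc(1, size);      /* "kzalloc": may fail when large */

        if (!p)
                p = calloc(1, size);    /* "vzalloc": large-buffer fallback */
        return p;                       /* zeroed either way */
}

int main(void)
{
        void *g = flex_groups_alloc(1 << 20);

        free(g);
        return g ? 0 : 1;
}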
@@ -2923,7 +2928,7 @@ static int ext4_register_li_request(struct super_block *sb,
2923 struct ext4_sb_info *sbi = EXT4_SB(sb); 2928 struct ext4_sb_info *sbi = EXT4_SB(sb);
2924 struct ext4_li_request *elr; 2929 struct ext4_li_request *elr;
2925 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 2930 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2926 int ret; 2931 int ret = 0;
2927 2932
2928 if (sbi->s_li_request != NULL) 2933 if (sbi->s_li_request != NULL)
2929 return 0; 2934 return 0;
@@ -3078,41 +3083,41 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3078 3083
3079 /* Set defaults before we parse the mount options */ 3084 /* Set defaults before we parse the mount options */
3080 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 3085 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3081 set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); 3086 set_opt(sb, INIT_INODE_TABLE);
3082 if (def_mount_opts & EXT4_DEFM_DEBUG) 3087 if (def_mount_opts & EXT4_DEFM_DEBUG)
3083 set_opt(sbi->s_mount_opt, DEBUG); 3088 set_opt(sb, DEBUG);
3084 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { 3089 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
3085 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", 3090 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
3086 "2.6.38"); 3091 "2.6.38");
3087 set_opt(sbi->s_mount_opt, GRPID); 3092 set_opt(sb, GRPID);
3088 } 3093 }
3089 if (def_mount_opts & EXT4_DEFM_UID16) 3094 if (def_mount_opts & EXT4_DEFM_UID16)
3090 set_opt(sbi->s_mount_opt, NO_UID32); 3095 set_opt(sb, NO_UID32);
3091#ifdef CONFIG_EXT4_FS_XATTR 3096#ifdef CONFIG_EXT4_FS_XATTR
3092 if (def_mount_opts & EXT4_DEFM_XATTR_USER) 3097 if (def_mount_opts & EXT4_DEFM_XATTR_USER)
3093 set_opt(sbi->s_mount_opt, XATTR_USER); 3098 set_opt(sb, XATTR_USER);
3094#endif 3099#endif
3095#ifdef CONFIG_EXT4_FS_POSIX_ACL 3100#ifdef CONFIG_EXT4_FS_POSIX_ACL
3096 if (def_mount_opts & EXT4_DEFM_ACL) 3101 if (def_mount_opts & EXT4_DEFM_ACL)
3097 set_opt(sbi->s_mount_opt, POSIX_ACL); 3102 set_opt(sb, POSIX_ACL);
3098#endif 3103#endif
3099 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 3104 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
3100 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 3105 set_opt(sb, JOURNAL_DATA);
3101 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 3106 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
3102 set_opt(sbi->s_mount_opt, ORDERED_DATA); 3107 set_opt(sb, ORDERED_DATA);
3103 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) 3108 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
3104 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 3109 set_opt(sb, WRITEBACK_DATA);
3105 3110
3106 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 3111 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
3107 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 3112 set_opt(sb, ERRORS_PANIC);
3108 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) 3113 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
3109 set_opt(sbi->s_mount_opt, ERRORS_CONT); 3114 set_opt(sb, ERRORS_CONT);
3110 else 3115 else
3111 set_opt(sbi->s_mount_opt, ERRORS_RO); 3116 set_opt(sb, ERRORS_RO);
3112 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) 3117 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
3113 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 3118 set_opt(sb, BLOCK_VALIDITY);
3114 if (def_mount_opts & EXT4_DEFM_DISCARD) 3119 if (def_mount_opts & EXT4_DEFM_DISCARD)
3115 set_opt(sbi->s_mount_opt, DISCARD); 3120 set_opt(sb, DISCARD);
3116 3121
3117 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 3122 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
3118 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 3123 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -3121,7 +3126,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3121 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 3126 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
3122 3127
3123 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) 3128 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
3124 set_opt(sbi->s_mount_opt, BARRIER); 3129 set_opt(sb, BARRIER);
3125 3130
3126 /* 3131 /*
3127 * enable delayed allocation by default 3132 * enable delayed allocation by default
@@ -3129,7 +3134,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3129 */ 3134 */
3130 if (!IS_EXT3_SB(sb) && 3135 if (!IS_EXT3_SB(sb) &&
3131 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 3136 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
3132 set_opt(sbi->s_mount_opt, DELALLOC); 3137 set_opt(sb, DELALLOC);
3133 3138
3134 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3139 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
3135 &journal_devnum, &journal_ioprio, NULL, 0)) { 3140 &journal_devnum, &journal_ioprio, NULL, 0)) {
@@ -3432,8 +3437,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3432 "suppressed and not mounted read-only"); 3437 "suppressed and not mounted read-only");
3433 goto failed_mount_wq; 3438 goto failed_mount_wq;
3434 } else { 3439 } else {
3435 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 3440 clear_opt(sb, DATA_FLAGS);
3436 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 3441 set_opt(sb, WRITEBACK_DATA);
3437 sbi->s_journal = NULL; 3442 sbi->s_journal = NULL;
3438 needs_recovery = 0; 3443 needs_recovery = 0;
3439 goto no_journal; 3444 goto no_journal;
@@ -3471,9 +3476,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3471 */ 3476 */
3472 if (jbd2_journal_check_available_features 3477 if (jbd2_journal_check_available_features
3473 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) 3478 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
3474 set_opt(sbi->s_mount_opt, ORDERED_DATA); 3479 set_opt(sb, ORDERED_DATA);
3475 else 3480 else
3476 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 3481 set_opt(sb, JOURNAL_DATA);
3477 break; 3482 break;
3478 3483
3479 case EXT4_MOUNT_ORDERED_DATA: 3484 case EXT4_MOUNT_ORDERED_DATA:
@@ -3563,18 +3568,18 @@ no_journal:
3563 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { 3568 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
3564 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " 3569 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
3565 "requested data journaling mode"); 3570 "requested data journaling mode");
3566 clear_opt(sbi->s_mount_opt, DELALLOC); 3571 clear_opt(sb, DELALLOC);
3567 } 3572 }
3568 if (test_opt(sb, DIOREAD_NOLOCK)) { 3573 if (test_opt(sb, DIOREAD_NOLOCK)) {
3569 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 3574 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3570 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3575 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3571 "option - requested data journaling mode"); 3576 "option - requested data journaling mode");
3572 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3577 clear_opt(sb, DIOREAD_NOLOCK);
3573 } 3578 }
3574 if (sb->s_blocksize < PAGE_SIZE) { 3579 if (sb->s_blocksize < PAGE_SIZE) {
3575 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3580 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3576 "option - block size is too small"); 3581 "option - block size is too small");
3577 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3582 clear_opt(sb, DIOREAD_NOLOCK);
3578 } 3583 }
3579 } 3584 }
3580 3585
@@ -3772,13 +3777,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
3772 if (bdev == NULL) 3777 if (bdev == NULL)
3773 return NULL; 3778 return NULL;
3774 3779
3775 if (bd_claim(bdev, sb)) {
3776 ext4_msg(sb, KERN_ERR,
3777 "failed to claim external journal device");
3778 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
3779 return NULL;
3780 }
3781
3782 blocksize = sb->s_blocksize; 3780 blocksize = sb->s_blocksize;
3783 hblock = bdev_logical_block_size(bdev); 3781 hblock = bdev_logical_block_size(bdev);
3784 if (blocksize < hblock) { 3782 if (blocksize < hblock) {
@@ -4173,6 +4171,22 @@ static int ext4_unfreeze(struct super_block *sb)
4173 return 0; 4171 return 0;
4174} 4172}
4175 4173
4174/*
4175 * Structure to save mount options for ext4_remount's benefit
4176 */
4177struct ext4_mount_options {
4178 unsigned long s_mount_opt;
4179 unsigned long s_mount_opt2;
4180 uid_t s_resuid;
4181 gid_t s_resgid;
4182 unsigned long s_commit_interval;
4183 u32 s_min_batch_time, s_max_batch_time;
4184#ifdef CONFIG_QUOTA
4185 int s_jquota_fmt;
4186 char *s_qf_names[MAXQUOTAS];
4187#endif
4188};
4189
4176static int ext4_remount(struct super_block *sb, int *flags, char *data) 4190static int ext4_remount(struct super_block *sb, int *flags, char *data)
4177{ 4191{
4178 struct ext4_super_block *es; 4192 struct ext4_super_block *es;
@@ -4193,6 +4207,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4193 lock_super(sb); 4207 lock_super(sb);
4194 old_sb_flags = sb->s_flags; 4208 old_sb_flags = sb->s_flags;
4195 old_opts.s_mount_opt = sbi->s_mount_opt; 4209 old_opts.s_mount_opt = sbi->s_mount_opt;
4210 old_opts.s_mount_opt2 = sbi->s_mount_opt2;
4196 old_opts.s_resuid = sbi->s_resuid; 4211 old_opts.s_resuid = sbi->s_resuid;
4197 old_opts.s_resgid = sbi->s_resgid; 4212 old_opts.s_resgid = sbi->s_resgid;
4198 old_opts.s_commit_interval = sbi->s_commit_interval; 4213 old_opts.s_commit_interval = sbi->s_commit_interval;
@@ -4346,6 +4361,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4346restore_opts: 4361restore_opts:
4347 sb->s_flags = old_sb_flags; 4362 sb->s_flags = old_sb_flags;
4348 sbi->s_mount_opt = old_opts.s_mount_opt; 4363 sbi->s_mount_opt = old_opts.s_mount_opt;
4364 sbi->s_mount_opt2 = old_opts.s_mount_opt2;
4349 sbi->s_resuid = old_opts.s_resuid; 4365 sbi->s_resuid = old_opts.s_resuid;
4350 sbi->s_resgid = old_opts.s_resgid; 4366 sbi->s_resgid = old_opts.s_resgid;
4351 sbi->s_commit_interval = old_opts.s_commit_interval; 4367 sbi->s_commit_interval = old_opts.s_commit_interval;
@@ -4542,27 +4558,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
4542 * Standard function to be called on quota_on 4558 * Standard function to be called on quota_on
4543 */ 4559 */
4544static int ext4_quota_on(struct super_block *sb, int type, int format_id, 4560static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4545 char *name) 4561 struct path *path)
4546{ 4562{
4547 int err; 4563 int err;
4548 struct path path;
4549 4564
4550 if (!test_opt(sb, QUOTA)) 4565 if (!test_opt(sb, QUOTA))
4551 return -EINVAL; 4566 return -EINVAL;
4552 4567
4553 err = kern_path(name, LOOKUP_FOLLOW, &path);
4554 if (err)
4555 return err;
4556
4557 /* Quotafile not on the same filesystem? */ 4568 /* Quotafile not on the same filesystem? */
4558 if (path.mnt->mnt_sb != sb) { 4569 if (path->mnt->mnt_sb != sb)
4559 path_put(&path);
4560 return -EXDEV; 4570 return -EXDEV;
4561 }
4562 /* Journaling quota? */ 4571 /* Journaling quota? */
4563 if (EXT4_SB(sb)->s_qf_names[type]) { 4572 if (EXT4_SB(sb)->s_qf_names[type]) {
4564 /* Quotafile not in fs root? */ 4573 /* Quotafile not in fs root? */
4565 if (path.dentry->d_parent != sb->s_root) 4574 if (path->dentry->d_parent != sb->s_root)
4566 ext4_msg(sb, KERN_WARNING, 4575 ext4_msg(sb, KERN_WARNING,
4567 "Quota file not on filesystem root. " 4576 "Quota file not on filesystem root. "
4568 "Journaled quota will not work"); 4577 "Journaled quota will not work");
@@ -4573,7 +4582,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4573 * all updates to the file when we bypass pagecache... 4582 * all updates to the file when we bypass pagecache...
4574 */ 4583 */
4575 if (EXT4_SB(sb)->s_journal && 4584 if (EXT4_SB(sb)->s_journal &&
4576 ext4_should_journal_data(path.dentry->d_inode)) { 4585 ext4_should_journal_data(path->dentry->d_inode)) {
4577 /* 4586 /*
4578 * We don't need to lock updates but journal_flush() could 4587 * We don't need to lock updates but journal_flush() could
4579 * otherwise be livelocked... 4588 * otherwise be livelocked...
@@ -4581,15 +4590,11 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4581 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 4590 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
4582 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 4591 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
4583 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 4592 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
4584 if (err) { 4593 if (err)
4585 path_put(&path);
4586 return err; 4594 return err;
4587 }
4588 } 4595 }
4589 4596
4590 err = dquot_quota_on_path(sb, type, format_id, &path); 4597 return dquot_quota_on(sb, type, format_id, path);
4591 path_put(&path);
4592 return err;
4593} 4598}
4594 4599
4595static int ext4_quota_off(struct super_block *sb, int type) 4600static int ext4_quota_off(struct super_block *sb, int type)
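The quota_on signature change moves path resolution to the caller: the VFS hands ext4 a resolved struct path that the hook merely borrows, so kern_path()/path_put() disappear from it. A sketch of that borrowed-argument convention, with a dummy struct path and an illustrative quota file name:

#include <stdio.h>

struct path { const char *name; };      /* stand-in for the VFS struct */

/* The hook only inspects the path; it neither looks it up
 * nor releases it. */
static int quota_on(struct path *path)
{
        printf("quota file: %s\n", path->name);
        return 0;
}

int main(void)
{
        struct path p = { "/aquota.user" };     /* caller did the lookup */
        int err = quota_on(&p);

        /* caller releases the path after the call */
        return err;
}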
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fa4b899da4b3..fc32176eee39 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -427,23 +427,23 @@ cleanup:
427static int 427static int
428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) 428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
429{ 429{
430 int i_error, b_error; 430 int ret, ret2;
431 431
432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem); 432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
433 i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size); 433 ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
434 if (i_error < 0) { 434 if (ret < 0)
435 b_error = 0; 435 goto errout;
436 } else { 436 if (buffer) {
437 if (buffer) { 437 buffer += ret;
438 buffer += i_error; 438 buffer_size -= ret;
439 buffer_size -= i_error;
440 }
441 b_error = ext4_xattr_block_list(dentry, buffer, buffer_size);
442 if (b_error < 0)
443 i_error = 0;
444 } 439 }
440 ret = ext4_xattr_block_list(dentry, buffer, buffer_size);
441 if (ret < 0)
442 goto errout;
443 ret += ret2;
444errout:
445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem); 445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
446 return i_error + b_error; 446 return ret;
447} 447}
448 448
449/* 449/*
@@ -947,7 +947,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
947/* 947/*
948 * ext4_xattr_set_handle() 948 * ext4_xattr_set_handle()
949 * 949 *
950 * Create, replace or remove an extended attribute for this inode. Buffer 950 * Create, replace or remove an extended attribute for this inode. Value
951 * is NULL to remove an existing extended attribute, and non-NULL to 951 * is NULL to remove an existing extended attribute, and non-NULL to
952 * either replace an existing extended attribute, or create a new extended 952 * either replace an existing extended attribute, or create a new extended
953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
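The rewritten ext4_xattr_list() above keeps the first lister's byte count in ret2, advances the caller's buffer, then adds the second count, returning the first negative error otherwise. The sketch below reproduces exactly that shape in userspace; list_names() and the attribute names are invented.

#include <stdio.h>
#include <string.h>

/* Stand-in for the in-inode/block listers: writes one name into buf
 * (when non-NULL) and returns bytes used, or a negative error. */
static int list_names(const char *src, char *buf, size_t size)
{
        size_t n = strlen(src) + 1;

        if (buf) {
                if (size < n)
                        return -34;     /* -ERANGE-style */
                memcpy(buf, src, n);
        }
        return (int)n;
}

static int xattr_list(char *buffer, size_t buffer_size)
{
        int ret, ret2;

        ret = ret2 = list_names("user.inline", buffer, buffer_size);
        if (ret < 0)
                goto errout;
        if (buffer) {
                buffer += ret;
                buffer_size -= ret;
        }
        ret = list_names("user.block", buffer, buffer_size);
        if (ret < 0)
                goto errout;
        ret += ret2;
errout:
        return ret;
}

int main(void)
{
        char buf[64];
        int n = xattr_list(buf, sizeof(buf));

        printf("total bytes: %d\n", n);
        return n < 0;
}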
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index d75a77f85c28..f50408901f7e 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -319,7 +319,8 @@ extern struct inode *fat_build_inode(struct super_block *sb,
319 struct msdos_dir_entry *de, loff_t i_pos); 319 struct msdos_dir_entry *de, loff_t i_pos);
320extern int fat_sync_inode(struct inode *inode); 320extern int fat_sync_inode(struct inode *inode);
321extern int fat_fill_super(struct super_block *sb, void *data, int silent, 321extern int fat_fill_super(struct super_block *sb, void *data, int silent,
322 const struct inode_operations *fs_dir_inode_ops, int isvfat); 322 const struct inode_operations *fs_dir_inode_ops,
323 int isvfat, void (*setup)(struct super_block *));
323 324
324extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 325extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
325 struct inode *i2); 326 struct inode *i2);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 206351af7c58..86753fe10bd1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -703,7 +703,6 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
703 struct fid *fid, int fh_len, int fh_type) 703 struct fid *fid, int fh_len, int fh_type)
704{ 704{
705 struct inode *inode = NULL; 705 struct inode *inode = NULL;
706 struct dentry *result;
707 u32 *fh = fid->raw; 706 u32 *fh = fid->raw;
708 707
709 if (fh_len < 5 || fh_type != 3) 708 if (fh_len < 5 || fh_type != 3)
@@ -748,10 +747,7 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
748 * the fat_iget lookup again. If that fails, then we are totally out 747 * the fat_iget lookup again. If that fails, then we are totally out
749 * of luck. But all that is for another day 748 * of luck. But all that is for another day
750 */ 749 */
751 result = d_obtain_alias(inode); 750 return d_obtain_alias(inode);
752 if (!IS_ERR(result))
753 d_set_d_op(result, sb->s_root->d_op);
754 return result;
755} 751}
756 752
757static int 753static int
@@ -799,8 +795,6 @@ static struct dentry *fat_get_parent(struct dentry *child)
799 brelse(bh); 795 brelse(bh);
800 796
801 parent = d_obtain_alias(inode); 797 parent = d_obtain_alias(inode);
802 if (!IS_ERR(parent))
803 d_set_d_op(parent, sb->s_root->d_op);
804out: 798out:
805 unlock_super(sb); 799 unlock_super(sb);
806 800
@@ -1244,7 +1238,8 @@ static int fat_read_root(struct inode *inode)
1244 * Read the super block of an MS-DOS FS. 1238 * Read the super block of an MS-DOS FS.
1245 */ 1239 */
1246int fat_fill_super(struct super_block *sb, void *data, int silent, 1240int fat_fill_super(struct super_block *sb, void *data, int silent,
1247 const struct inode_operations *fs_dir_inode_ops, int isvfat) 1241 const struct inode_operations *fs_dir_inode_ops, int isvfat,
1242 void (*setup)(struct super_block *))
1248{ 1243{
1249 struct inode *root_inode = NULL, *fat_inode = NULL; 1244 struct inode *root_inode = NULL, *fat_inode = NULL;
1250 struct buffer_head *bh; 1245 struct buffer_head *bh;
@@ -1280,6 +1275,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1280 if (error) 1275 if (error)
1281 goto out_fail; 1276 goto out_fail;
1282 1277
1278 setup(sb); /* flavour-specific stuff that needs options */
1279
1283 error = -EIO; 1280 error = -EIO;
1284 sb_min_blocksize(sb, 512); 1281 sb_min_blocksize(sb, 512);
1285 bh = sb_bread(sb, 0); 1282 bh = sb_bread(sb, 0);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 35ffe43afa4b..711499040eb6 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -227,11 +227,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
227 } 227 }
228out: 228out:
229 unlock_super(sb); 229 unlock_super(sb);
230 d_set_d_op(dentry, &msdos_dentry_operations); 230 return d_splice_alias(inode, dentry);
231 dentry = d_splice_alias(inode, dentry);
232 if (dentry)
233 d_set_d_op(dentry, &msdos_dentry_operations);
234 return dentry;
235 231
236error: 232error:
237 unlock_super(sb); 233 unlock_super(sb);
@@ -661,21 +657,16 @@ static const struct inode_operations msdos_dir_inode_operations = {
661 .getattr = fat_getattr, 657 .getattr = fat_getattr,
662}; 658};
663 659
664static int msdos_fill_super(struct super_block *sb, void *data, int silent) 660static void setup(struct super_block *sb)
665{ 661{
666 int res; 662 sb->s_d_op = &msdos_dentry_operations;
667
668 lock_super(sb);
669 res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
670 if (res) {
671 unlock_super(sb);
672 return res;
673 }
674
675 sb->s_flags |= MS_NOATIME; 663 sb->s_flags |= MS_NOATIME;
676 d_set_d_op(sb->s_root, &msdos_dentry_operations); 664}
677 unlock_super(sb); 665
678 return 0; 666static int msdos_fill_super(struct super_block *sb, void *data, int silent)
667{
668 return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations,
669 0, setup);
679} 670}
680 671
681static struct dentry *msdos_mount(struct file_system_type *fs_type, 672static struct dentry *msdos_mount(struct file_system_type *fs_type,
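fat_fill_super() gaining a setup() callback lets each flavour install sb->s_d_op after options are parsed but before the root dentry exists, so the scattered d_set_d_op() calls in the lookup paths can go (in this kernel era, d_alloc() applies s_d_op to every new dentry). A toy sketch of the callback shape; the tag string stands in for the real dentry_operations.

#include <stdio.h>

struct super_block { const char *s_d_op; };     /* tag instead of ops */

static int fill_super(struct super_block *sb,
                      void (*setup)(struct super_block *))
{
        /* ...option parsing happens first... */
        setup(sb);
        /* ...root and all later dentries inherit sb->s_d_op... */
        return 0;
}

static void msdos_setup(struct super_block *sb)
{
        sb->s_d_op = "msdos_dentry_operations";
}

int main(void)
{
        struct super_block sb = { 0 };

        fill_super(&sb, msdos_setup);
        printf("s_d_op = %s\n", sb.s_d_op);
        return 0;
}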
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index e3ffc5e12332..f88f752babd9 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -772,13 +772,10 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
772 772
773out: 773out:
774 unlock_super(sb); 774 unlock_super(sb);
775 d_set_d_op(dentry, sb->s_root->d_op);
776 dentry->d_time = dentry->d_parent->d_inode->i_version; 775 dentry->d_time = dentry->d_parent->d_inode->i_version;
777 dentry = d_splice_alias(inode, dentry); 776 dentry = d_splice_alias(inode, dentry);
778 if (dentry) { 777 if (dentry)
779 d_set_d_op(dentry, sb->s_root->d_op);
780 dentry->d_time = dentry->d_parent->d_inode->i_version; 778 dentry->d_time = dentry->d_parent->d_inode->i_version;
781 }
782 return dentry; 779 return dentry;
783 780
784error: 781error:
@@ -1066,24 +1063,18 @@ static const struct inode_operations vfat_dir_inode_operations = {
1066 .getattr = fat_getattr, 1063 .getattr = fat_getattr,
1067}; 1064};
1068 1065
1069static int vfat_fill_super(struct super_block *sb, void *data, int silent) 1066static void setup(struct super_block *sb)
1070{ 1067{
1071 int res;
1072
1073 lock_super(sb);
1074 res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
1075 if (res) {
1076 unlock_super(sb);
1077 return res;
1078 }
1079
1080 if (MSDOS_SB(sb)->options.name_check != 's') 1068 if (MSDOS_SB(sb)->options.name_check != 's')
1081 d_set_d_op(sb->s_root, &vfat_ci_dentry_ops); 1069 sb->s_d_op = &vfat_ci_dentry_ops;
1082 else 1070 else
1083 d_set_d_op(sb->s_root, &vfat_dentry_ops); 1071 sb->s_d_op = &vfat_dentry_ops;
1072}
1084 1073
1085 unlock_super(sb); 1074static int vfat_fill_super(struct super_block *sb, void *data, int silent)
1086 return 0; 1075{
1076 return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations,
1077 1, setup);
1087} 1078}
1088 1079
1089static struct dentry *vfat_mount(struct file_system_type *fs_type, 1080static struct dentry *vfat_mount(struct file_system_type *fs_type,
diff --git a/fs/file_table.c b/fs/file_table.c
index c3dee381f1b4..c3e89adf53c0 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -311,7 +311,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
311 struct files_struct *files = current->files; 311 struct files_struct *files = current->files;
312 312
313 *fput_needed = 0; 313 *fput_needed = 0;
314 if (likely((atomic_read(&files->count) == 1))) { 314 if (atomic_read(&files->count) == 1) {
315 file = fcheck_files(files, fd); 315 file = fcheck_files(files, fd);
316 } else { 316 } else {
317 rcu_read_lock(); 317 rcu_read_lock();
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3d06ccc953aa..59c6e4956786 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head)
84 return list_entry(head, struct inode, i_wb_list); 84 return list_entry(head, struct inode, i_wb_list);
85} 85}
86 86
87static void bdi_queue_work(struct backing_dev_info *bdi, 87/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
88 struct wb_writeback_work *work) 88static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
89{ 89{
90 trace_writeback_queue(bdi, work);
91
92 spin_lock_bh(&bdi->wb_lock);
93 list_add_tail(&work->list, &bdi->work_list);
94 if (bdi->wb.task) { 90 if (bdi->wb.task) {
95 wake_up_process(bdi->wb.task); 91 wake_up_process(bdi->wb.task);
96 } else { 92 } else {
@@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
98 * The bdi thread isn't there, wake up the forker thread which 94 * The bdi thread isn't there, wake up the forker thread which
99 * will create and run it. 95 * will create and run it.
100 */ 96 */
101 trace_writeback_nothread(bdi, work);
102 wake_up_process(default_backing_dev_info.wb.task); 97 wake_up_process(default_backing_dev_info.wb.task);
103 } 98 }
99}
100
101static void bdi_queue_work(struct backing_dev_info *bdi,
102 struct wb_writeback_work *work)
103{
104 trace_writeback_queue(bdi, work);
105
106 spin_lock_bh(&bdi->wb_lock);
107 list_add_tail(&work->list, &bdi->work_list);
108 if (!bdi->wb.task)
109 trace_writeback_nothread(bdi, work);
110 bdi_wakeup_flusher(bdi);
104 spin_unlock_bh(&bdi->wb_lock); 111 spin_unlock_bh(&bdi->wb_lock);
105} 112}
106 113
107static void 114static void
108__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 115__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
109 bool range_cyclic, bool for_background) 116 bool range_cyclic)
110{ 117{
111 struct wb_writeback_work *work; 118 struct wb_writeback_work *work;
112 119
@@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
126 work->sync_mode = WB_SYNC_NONE; 133 work->sync_mode = WB_SYNC_NONE;
127 work->nr_pages = nr_pages; 134 work->nr_pages = nr_pages;
128 work->range_cyclic = range_cyclic; 135 work->range_cyclic = range_cyclic;
129 work->for_background = for_background;
130 136
131 bdi_queue_work(bdi, work); 137 bdi_queue_work(bdi, work);
132} 138}
@@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
144 */ 150 */
145void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 151void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
146{ 152{
147 __bdi_start_writeback(bdi, nr_pages, true, false); 153 __bdi_start_writeback(bdi, nr_pages, true);
148} 154}
149 155
150/** 156/**
@@ -152,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
152 * @bdi: the backing device to write from 158 * @bdi: the backing device to write from
153 * 159 *
154 * Description: 160 * Description:
155 * This does WB_SYNC_NONE background writeback. The IO is only 161 * This makes sure WB_SYNC_NONE background writeback happens. When
156 * started when this function returns, we make no guarentees on 162 * this function returns, it is only guaranteed that for given BDI
157 * completion. Caller need not hold sb s_umount semaphore. 163 * some IO is happening if we are over background dirty threshold.
164 * Caller need not hold sb s_umount semaphore.
158 */ 165 */
159void bdi_start_background_writeback(struct backing_dev_info *bdi) 166void bdi_start_background_writeback(struct backing_dev_info *bdi)
160{ 167{
161 __bdi_start_writeback(bdi, LONG_MAX, true, true); 168 /*
169 * We just wake up the flusher thread. It will perform background
170 * writeback as soon as there is no other work to do.
171 */
172 trace_writeback_wake_background(bdi);
173 spin_lock_bh(&bdi->wb_lock);
174 bdi_wakeup_flusher(bdi);
175 spin_unlock_bh(&bdi->wb_lock);
162} 176}
163 177
164/* 178/*
@@ -616,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
616 }; 630 };
617 unsigned long oldest_jif; 631 unsigned long oldest_jif;
618 long wrote = 0; 632 long wrote = 0;
633 long write_chunk;
619 struct inode *inode; 634 struct inode *inode;
620 635
621 if (wbc.for_kupdate) { 636 if (wbc.for_kupdate) {
@@ -628,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
628 wbc.range_end = LLONG_MAX; 643 wbc.range_end = LLONG_MAX;
629 } 644 }
630 645
646 /*
647 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
648 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
649 * here avoids calling into writeback_inodes_wb() more than once.
650 *
651 * The intended call sequence for WB_SYNC_ALL writeback is:
652 *
653 * wb_writeback()
654 * __writeback_inodes_sb() <== called only once
655 * write_cache_pages() <== called once for each inode
656 * (quickly) tag currently dirty pages
657 * (maybe slowly) sync all tagged pages
658 */
659 if (wbc.sync_mode == WB_SYNC_NONE)
660 write_chunk = MAX_WRITEBACK_PAGES;
661 else
662 write_chunk = LONG_MAX;
663
631 wbc.wb_start = jiffies; /* livelock avoidance */ 664 wbc.wb_start = jiffies; /* livelock avoidance */
632 for (;;) { 665 for (;;) {
633 /* 666 /*
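
The comment above is the key to the chunking change: WB_SYNC_NONE keeps writing in MAX_WRITEBACK_PAGES slices so no single work item monopolizes the thread, while WB_SYNC_ALL uses LONG_MAX so __writeback_inodes_sb() runs once and page tagging, not chunking, prevents livelock. A tiny model of the quota selection and the per-iteration accounting (names hypothetical):

#include <limits.h>

#define MAX_WRITEBACK_PAGES 1024   /* kernel value, for illustration */

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

/* Pick the per-iteration write quota as wb_writeback() now does:
 * bounded chunks for best-effort writeback, effectively unbounded
 * for data-integrity sync, which relies on page tagging instead. */
static long pick_write_chunk(enum sync_mode mode)
{
    return mode == WB_SYNC_NONE ? MAX_WRITEBACK_PAGES : LONG_MAX;
}

/* One loop iteration's accounting, mirroring the hunks below:
 * what was consumed is write_chunk minus what is left of nr_to_write. */
static long account_progress(long write_chunk, long nr_to_write_left,
                             long *nr_pages)
{
    long wrote = write_chunk - nr_to_write_left;
    *nr_pages -= wrote;
    return wrote;
}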
@@ -637,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb,
637 break; 670 break;
638 671
639 /* 672 /*
673 * Background writeout and kupdate-style writeback may
674 * run forever. Stop them if there is other work to do
675 * so that e.g. sync can proceed. They'll be restarted
676 * after the other works are all done.
677 */
678 if ((work->for_background || work->for_kupdate) &&
679 !list_empty(&wb->bdi->work_list))
680 break;
681
682 /*
640 * For background writeout, stop when we are below the 683 * For background writeout, stop when we are below the
641 * background dirty threshold 684 * background dirty threshold
642 */ 685 */
@@ -644,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
644 break; 687 break;
645 688
646 wbc.more_io = 0; 689 wbc.more_io = 0;
647 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 690 wbc.nr_to_write = write_chunk;
648 wbc.pages_skipped = 0; 691 wbc.pages_skipped = 0;
649 692
650 trace_wbc_writeback_start(&wbc, wb->bdi); 693 trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -654,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
654 writeback_inodes_wb(wb, &wbc); 697 writeback_inodes_wb(wb, &wbc);
655 trace_wbc_writeback_written(&wbc, wb->bdi); 698 trace_wbc_writeback_written(&wbc, wb->bdi);
656 699
657 work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 700 work->nr_pages -= write_chunk - wbc.nr_to_write;
658 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 701 wrote += write_chunk - wbc.nr_to_write;
659 702
660 /* 703 /*
661 * If we consumed everything, see if we have more 704 * If we consumed everything, see if we have more
@@ -670,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
670 /* 713 /*
671 * Did we write something? Try for more 714 * Did we write something? Try for more
672 */ 715 */
673 if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) 716 if (wbc.nr_to_write < write_chunk)
674 continue; 717 continue;
675 /* 718 /*
676 * Nothing written. Wait for some inode to 719 * Nothing written. Wait for some inode to
@@ -718,6 +761,23 @@ static unsigned long get_nr_dirty_pages(void)
718 get_nr_dirty_inodes(); 761 get_nr_dirty_inodes();
719} 762}
720 763
764static long wb_check_background_flush(struct bdi_writeback *wb)
765{
766 if (over_bground_thresh()) {
767
768 struct wb_writeback_work work = {
769 .nr_pages = LONG_MAX,
770 .sync_mode = WB_SYNC_NONE,
771 .for_background = 1,
772 .range_cyclic = 1,
773 };
774
775 return wb_writeback(wb, &work);
776 }
777
778 return 0;
779}
780
721static long wb_check_old_data_flush(struct bdi_writeback *wb) 781static long wb_check_old_data_flush(struct bdi_writeback *wb)
722{ 782{
723 unsigned long expired; 783 unsigned long expired;
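
This is the flusher-side counterpart of the wake-only API above: the background work description lives on the thread's stack and is executed whenever the dirty threshold is exceeded, so no caller ever has to allocate it. A sketch with the threshold check stubbed out (hypothetical names):

#include <limits.h>
#include <stdbool.h>

struct wb_work {
    long nr_pages;
    bool for_background;
    bool range_cyclic;
};

static bool over_bground_thresh(void)
{
    return true;                    /* stub: the real check reads dirty stats */
}

static long do_writeback(const struct wb_work *w)
{
    /* stub: the real wb_writeback() loops until back under the threshold */
    return w->for_background ? 0 : w->nr_pages;
}

/* Runs after queued work and kupdate-style flushing, as in the hunk:
 * background writeback needs no queued item, just the thread's attention. */
static long check_background_flush(void)
{
    if (over_bground_thresh()) {
        struct wb_work work = {
            .nr_pages       = LONG_MAX,
            .for_background = true,
            .range_cyclic   = true,
        };
        return do_writeback(&work);
    }
    return 0;
}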
@@ -787,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
787 * Check for periodic writeback, kupdated() style 847 * Check for periodic writeback, kupdated() style
788 */ 848 */
789 wrote += wb_check_old_data_flush(wb); 849 wrote += wb_check_old_data_flush(wb);
850 wrote += wb_check_background_flush(wb);
790 clear_bit(BDI_writeback_running, &wb->bdi->state); 851 clear_bit(BDI_writeback_running, &wb->bdi->state);
791 852
792 return wrote; 853 return wrote;
@@ -873,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages)
873 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 934 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
874 if (!bdi_has_dirty_io(bdi)) 935 if (!bdi_has_dirty_io(bdi))
875 continue; 936 continue;
876 __bdi_start_writeback(bdi, nr_pages, false, false); 937 __bdi_start_writeback(bdi, nr_pages, false);
877 } 938 }
878 rcu_read_unlock(); 939 rcu_read_unlock();
879} 940}
@@ -1164,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
1164 * @sb: the superblock 1225 * @sb: the superblock
1165 * 1226 *
1166 * This function writes and waits on any dirty inode belonging to this 1227 * This function writes and waits on any dirty inode belonging to this
1167 * super_block. The number of pages synced is returned. 1228 * super_block.
1168 */ 1229 */
1169void sync_inodes_sb(struct super_block *sb) 1230void sync_inodes_sb(struct super_block *sb)
1170{ 1231{
@@ -1242,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1242EXPORT_SYMBOL(sync_inode); 1303EXPORT_SYMBOL(sync_inode);
1243 1304
1244/** 1305/**
1245 * sync_inode - write an inode to disk 1306 * sync_inode_metadata - write an inode to disk
1246 * @inode: the inode to sync 1307 * @inode: the inode to sync
1247 * @wait: wait for I/O to complete. 1308 * @wait: wait for I/O to complete.
1248 * 1309 *
1249 * Write an inode to disk and adjust it's dirty state after completion. 1310 * Write an inode to disk and adjust its dirty state after completion.
1250 * 1311 *
1251 * Note: only writes the actual inode, no associated data or other metadata. 1312 * Note: only writes the actual inode, no associated data or other metadata.
1252 */ 1313 */
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 68ca487bedb1..78b519c13536 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -4,6 +4,19 @@
4#include <linux/path.h> 4#include <linux/path.h>
5#include <linux/slab.h> 5#include <linux/slab.h>
6#include <linux/fs_struct.h> 6#include <linux/fs_struct.h>
7#include "internal.h"
8
9static inline void path_get_longterm(struct path *path)
10{
11 path_get(path);
12 mnt_make_longterm(path->mnt);
13}
14
15static inline void path_put_longterm(struct path *path)
16{
17 mnt_make_shortterm(path->mnt);
18 path_put(path);
19}
7 20
8/* 21/*
9 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. 22 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
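
The new helpers pair the ordinary path reference with a long-term marker on the vfsmount, signalling that fs->root and fs->pwd references outlive any single RCU-walk; teardown reverses the order. A toy model of the paired counters (hypothetical names, no locking):

#include <assert.h>

struct mount {
    int refs;        /* ordinary references (path_get/path_put) */
    int longterm;    /* long-lived holders (mnt_make_longterm/shortterm) */
};

static void path_get_longterm_model(struct mount *m)
{
    m->refs++;           /* path_get() */
    m->longterm++;       /* mnt_make_longterm() */
}

static void path_put_longterm_model(struct mount *m)
{
    m->longterm--;       /* mnt_make_shortterm() first... */
    m->refs--;           /* ...then path_put(), mirroring the helper */
}

int main(void)
{
    struct mount m = { 0, 0 };
    path_get_longterm_model(&m);
    path_put_longterm_model(&m);
    assert(m.refs == 0 && m.longterm == 0);
    return 0;
}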
@@ -17,11 +30,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
17 write_seqcount_begin(&fs->seq); 30 write_seqcount_begin(&fs->seq);
18 old_root = fs->root; 31 old_root = fs->root;
19 fs->root = *path; 32 fs->root = *path;
20 path_get_long(path); 33 path_get_longterm(path);
21 write_seqcount_end(&fs->seq); 34 write_seqcount_end(&fs->seq);
22 spin_unlock(&fs->lock); 35 spin_unlock(&fs->lock);
23 if (old_root.dentry) 36 if (old_root.dentry)
24 path_put_long(&old_root); 37 path_put_longterm(&old_root);
25} 38}
26 39
27/* 40/*
@@ -36,12 +49,12 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
36 write_seqcount_begin(&fs->seq); 49 write_seqcount_begin(&fs->seq);
37 old_pwd = fs->pwd; 50 old_pwd = fs->pwd;
38 fs->pwd = *path; 51 fs->pwd = *path;
39 path_get_long(path); 52 path_get_longterm(path);
40 write_seqcount_end(&fs->seq); 53 write_seqcount_end(&fs->seq);
41 spin_unlock(&fs->lock); 54 spin_unlock(&fs->lock);
42 55
43 if (old_pwd.dentry) 56 if (old_pwd.dentry)
44 path_put_long(&old_pwd); 57 path_put_longterm(&old_pwd);
45} 58}
46 59
47void chroot_fs_refs(struct path *old_root, struct path *new_root) 60void chroot_fs_refs(struct path *old_root, struct path *new_root)
@@ -59,13 +72,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
59 write_seqcount_begin(&fs->seq); 72 write_seqcount_begin(&fs->seq);
60 if (fs->root.dentry == old_root->dentry 73 if (fs->root.dentry == old_root->dentry
61 && fs->root.mnt == old_root->mnt) { 74 && fs->root.mnt == old_root->mnt) {
62 path_get_long(new_root); 75 path_get_longterm(new_root);
63 fs->root = *new_root; 76 fs->root = *new_root;
64 count++; 77 count++;
65 } 78 }
66 if (fs->pwd.dentry == old_root->dentry 79 if (fs->pwd.dentry == old_root->dentry
67 && fs->pwd.mnt == old_root->mnt) { 80 && fs->pwd.mnt == old_root->mnt) {
68 path_get_long(new_root); 81 path_get_longterm(new_root);
69 fs->pwd = *new_root; 82 fs->pwd = *new_root;
70 count++; 83 count++;
71 } 84 }
@@ -76,13 +89,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
76 } while_each_thread(g, p); 89 } while_each_thread(g, p);
77 read_unlock(&tasklist_lock); 90 read_unlock(&tasklist_lock);
78 while (count--) 91 while (count--)
79 path_put_long(old_root); 92 path_put_longterm(old_root);
80} 93}
81 94
82void free_fs_struct(struct fs_struct *fs) 95void free_fs_struct(struct fs_struct *fs)
83{ 96{
84 path_put_long(&fs->root); 97 path_put_longterm(&fs->root);
85 path_put_long(&fs->pwd); 98 path_put_longterm(&fs->pwd);
86 kmem_cache_free(fs_cachep, fs); 99 kmem_cache_free(fs_cachep, fs);
87} 100}
88 101
@@ -118,9 +131,9 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
118 131
119 spin_lock(&old->lock); 132 spin_lock(&old->lock);
120 fs->root = old->root; 133 fs->root = old->root;
121 path_get_long(&fs->root); 134 path_get_longterm(&fs->root);
122 fs->pwd = old->pwd; 135 fs->pwd = old->pwd;
123 path_get_long(&fs->pwd); 136 path_get_longterm(&fs->pwd);
124 spin_unlock(&old->lock); 137 spin_unlock(&old->lock);
125 } 138 }
126 return fs; 139 return fs;
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index b9f34eaede09..48a18f184d50 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -101,7 +101,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
101 object->n_ops++; 101 object->n_ops++;
102 object->n_exclusive++; /* reads and writes must wait */ 102 object->n_exclusive++; /* reads and writes must wait */
103 103
104 if (object->n_ops > 0) { 104 if (object->n_ops > 1) {
105 atomic_inc(&op->usage); 105 atomic_inc(&op->usage);
106 list_add_tail(&op->pend_link, &object->pending_ops); 106 list_add_tail(&op->pend_link, &object->pending_ops);
107 fscache_stat(&fscache_n_op_pend); 107 fscache_stat(&fscache_n_op_pend);
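
A one-character fix with a real consequence: n_ops was incremented two lines earlier, so the submitted op always counts itself, and the old n_ops > 0 test parked every exclusive op on pending_ops even on an idle object. A trivial demonstration of the two tests:

#include <stdio.h>

int main(void)
{
    int n_ops = 0;

    n_ops++;   /* the op being submitted is counted here */

    /* old test: true even when we are the only op -> always pends */
    printf("n_ops > 0: %s\n", n_ops > 0 ? "pend (wrong)" : "run");
    /* fixed test: pend only behind a genuinely earlier op */
    printf("n_ops > 1: %s\n", n_ops > 1 ? "pend" : "run (correct)");
    return 0;
}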
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6e07696308dc..cf8d28d1fbad 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -251,6 +251,20 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
251 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 251 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
252} 252}
253 253
254void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
255 u64 nodeid, u64 nlookup)
256{
257 forget->forget_one.nodeid = nodeid;
258 forget->forget_one.nlookup = nlookup;
259
260 spin_lock(&fc->lock);
261 fc->forget_list_tail->next = forget;
262 fc->forget_list_tail = forget;
263 wake_up(&fc->waitq);
264 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
265 spin_unlock(&fc->lock);
266}
267
254static void flush_bg_queue(struct fuse_conn *fc) 268static void flush_bg_queue(struct fuse_conn *fc)
255{ 269{
256 while (fc->active_background < fc->max_background && 270 while (fc->active_background < fc->max_background &&
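
FORGET messages no longer consume a full fuse_req; each is a small fuse_forget_link strung onto a singly linked list with a tail pointer, making the enqueue under fc->lock O(1). A userspace sketch of the queue (names hypothetical; locking omitted):

#include <stdint.h>
#include <stddef.h>

struct forget_link {
    uint64_t nodeid;
    uint64_t nlookup;
    struct forget_link *next;
};

struct conn {
    struct forget_link head;       /* dummy head, like fc->forget_list_head */
    struct forget_link *tail;      /* fc->forget_list_tail */
};

static void conn_init(struct conn *c)
{
    c->head.next = NULL;
    c->tail = &c->head;            /* empty list: tail points at the head */
}

/* O(1) enqueue; the kernel does this under fc->lock and then wakes
 * the reader, as in fuse_queue_forget() above. */
static void queue_forget(struct conn *c, struct forget_link *f,
                         uint64_t nodeid, uint64_t nlookup)
{
    f->nodeid = nodeid;
    f->nlookup = nlookup;
    f->next = NULL;
    c->tail->next = f;
    c->tail = f;
}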
@@ -438,12 +452,6 @@ static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
438 } 452 }
439} 453}
440 454
441void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
442{
443 req->isreply = 0;
444 fuse_request_send_nowait(fc, req);
445}
446
447void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) 455void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
448{ 456{
449 req->isreply = 1; 457 req->isreply = 1;
@@ -896,9 +904,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
896 return err; 904 return err;
897} 905}
898 906
907static int forget_pending(struct fuse_conn *fc)
908{
909 return fc->forget_list_head.next != NULL;
910}
911
899static int request_pending(struct fuse_conn *fc) 912static int request_pending(struct fuse_conn *fc)
900{ 913{
901 return !list_empty(&fc->pending) || !list_empty(&fc->interrupts); 914 return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) ||
915 forget_pending(fc);
902} 916}
903 917
904/* Wait until a request is available on the pending list */ 918/* Wait until a request is available on the pending list */
@@ -960,6 +974,120 @@ __releases(fc->lock)
960 return err ? err : reqsize; 974 return err ? err : reqsize;
961} 975}
962 976
977static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
978 unsigned max,
979 unsigned *countp)
980{
981 struct fuse_forget_link *head = fc->forget_list_head.next;
982 struct fuse_forget_link **newhead = &head;
983 unsigned count;
984
985 for (count = 0; *newhead != NULL && count < max; count++)
986 newhead = &(*newhead)->next;
987
988 fc->forget_list_head.next = *newhead;
989 *newhead = NULL;
990 if (fc->forget_list_head.next == NULL)
991 fc->forget_list_tail = &fc->forget_list_head;
992
993 if (countp != NULL)
994 *countp = count;
995
996 return head;
997}
998
999static int fuse_read_single_forget(struct fuse_conn *fc,
1000 struct fuse_copy_state *cs,
1001 size_t nbytes)
1002__releases(fc->lock)
1003{
1004 int err;
1005 struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL);
1006 struct fuse_forget_in arg = {
1007 .nlookup = forget->forget_one.nlookup,
1008 };
1009 struct fuse_in_header ih = {
1010 .opcode = FUSE_FORGET,
1011 .nodeid = forget->forget_one.nodeid,
1012 .unique = fuse_get_unique(fc),
1013 .len = sizeof(ih) + sizeof(arg),
1014 };
1015
1016 spin_unlock(&fc->lock);
1017 kfree(forget);
1018 if (nbytes < ih.len)
1019 return -EINVAL;
1020
1021 err = fuse_copy_one(cs, &ih, sizeof(ih));
1022 if (!err)
1023 err = fuse_copy_one(cs, &arg, sizeof(arg));
1024 fuse_copy_finish(cs);
1025
1026 if (err)
1027 return err;
1028
1029 return ih.len;
1030}
1031
1032static int fuse_read_batch_forget(struct fuse_conn *fc,
1033 struct fuse_copy_state *cs, size_t nbytes)
1034__releases(fc->lock)
1035{
1036 int err;
1037 unsigned max_forgets;
1038 unsigned count;
1039 struct fuse_forget_link *head;
1040 struct fuse_batch_forget_in arg = { .count = 0 };
1041 struct fuse_in_header ih = {
1042 .opcode = FUSE_BATCH_FORGET,
1043 .unique = fuse_get_unique(fc),
1044 .len = sizeof(ih) + sizeof(arg),
1045 };
1046
1047 if (nbytes < ih.len) {
1048 spin_unlock(&fc->lock);
1049 return -EINVAL;
1050 }
1051
1052 max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
1053 head = dequeue_forget(fc, max_forgets, &count);
1054 spin_unlock(&fc->lock);
1055
1056 arg.count = count;
1057 ih.len += count * sizeof(struct fuse_forget_one);
1058 err = fuse_copy_one(cs, &ih, sizeof(ih));
1059 if (!err)
1060 err = fuse_copy_one(cs, &arg, sizeof(arg));
1061
1062 while (head) {
1063 struct fuse_forget_link *forget = head;
1064
1065 if (!err) {
1066 err = fuse_copy_one(cs, &forget->forget_one,
1067 sizeof(forget->forget_one));
1068 }
1069 head = forget->next;
1070 kfree(forget);
1071 }
1072
1073 fuse_copy_finish(cs);
1074
1075 if (err)
1076 return err;
1077
1078 return ih.len;
1079}
1080
1081static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs,
1082 size_t nbytes)
1083__releases(fc->lock)
1084{
1085 if (fc->minor < 16 || fc->forget_list_head.next->next == NULL)
1086 return fuse_read_single_forget(fc, cs, nbytes);
1087 else
1088 return fuse_read_batch_forget(fc, cs, nbytes);
1089}
1090
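
dequeue_forget() above is a classic pointer-to-pointer walk: after advancing across at most max nodes, *newhead names the first node left behind, so splitting off the batch is two assignments, and the tail pointer needs repair only when the queue drains. A runnable model:

#include <stdio.h>
#include <stddef.h>

struct link { int v; struct link *next; };

struct queue { struct link head; struct link *tail; };

/* Detach up to max nodes from the front; returns the detached chain. */
static struct link *dequeue_batch(struct queue *q, unsigned max,
                                  unsigned *countp)
{
    struct link *batch = q->head.next;
    struct link **newhead = &batch;
    unsigned count;

    for (count = 0; *newhead != NULL && count < max; count++)
        newhead = &(*newhead)->next;

    q->head.next = *newhead;   /* remainder stays queued */
    *newhead = NULL;           /* terminate the detached chain */
    if (q->head.next == NULL)
        q->tail = &q->head;    /* list drained: reset tail */

    if (countp)
        *countp = count;
    return batch;
}

int main(void)
{
    struct link n[4] = { {0, &n[1]}, {1, &n[2]}, {2, &n[3]}, {3, NULL} };
    struct queue q = { { -1, &n[0] }, &n[3] };
    unsigned got;
    struct link *b = dequeue_batch(&q, 3, &got);

    for (; b; b = b->next)
        printf("%d ", b->v);                       /* prints: 0 1 2 */
    printf("(count=%u, left=%d)\n", got, q.head.next->v);   /* left=3 */
    return 0;
}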
963/* 1091/*
964 * Read a single request into the userspace filesystem's buffer. This 1092 * Read a single request into the userspace filesystem's buffer. This
965 * function waits until a request is available, then removes it from 1093 * function waits until a request is available, then removes it from
@@ -998,6 +1126,14 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
998 return fuse_read_interrupt(fc, cs, nbytes, req); 1126 return fuse_read_interrupt(fc, cs, nbytes, req);
999 } 1127 }
1000 1128
1129 if (forget_pending(fc)) {
1130 if (list_empty(&fc->pending) || fc->forget_batch-- > 0)
1131 return fuse_read_forget(fc, cs, nbytes);
1132
1133 if (fc->forget_batch <= -8)
1134 fc->forget_batch = 16;
1135 }
1136
1001 req = list_entry(fc->pending.next, struct fuse_req, list); 1137 req = list_entry(fc->pending.next, struct fuse_req, list);
1002 req->state = FUSE_REQ_READING; 1138 req->state = FUSE_REQ_READING;
1003 list_move(&req->list, &fc->io); 1139 list_move(&req->list, &fc->io);
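
fuse_dev_do_read() now interleaves the two queues: forgets drain freely when nothing else is pending; otherwise a counter admits a burst of about 16 forgets, then goes negative to let up to 8 regular requests through before resetting. A compact model of that decision (hypothetical names):

#include <stdbool.h>

/* Decide what to hand to the userspace server next, modelling the
 * fc->forget_batch heuristic: bursts of forgets, but regular requests
 * are never starved for long. */
static bool take_forget(bool forgets_pending, bool requests_pending,
                        int *forget_batch)
{
    if (!forgets_pending)
        return false;
    if (!requests_pending || (*forget_batch)-- > 0)
        return true;                 /* drain a forget */
    if (*forget_batch <= -8)
        *forget_batch = 16;          /* arm the next burst of forgets */
    return false;                    /* serve a regular request */
}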
@@ -1090,7 +1226,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1090 if (!fc) 1226 if (!fc)
1091 return -EPERM; 1227 return -EPERM;
1092 1228
1093 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); 1229 bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
1094 if (!bufs) 1230 if (!bufs)
1095 return -ENOMEM; 1231 return -ENOMEM;
1096 1232
@@ -1626,7 +1762,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
1626 if (!fc) 1762 if (!fc)
1627 return -EPERM; 1763 return -EPERM;
1628 1764
1629 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); 1765 bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
1630 if (!bufs) 1766 if (!bufs)
1631 return -ENOMEM; 1767 return -ENOMEM;
1632 1768
@@ -1770,6 +1906,8 @@ __acquires(fc->lock)
1770 flush_bg_queue(fc); 1906 flush_bg_queue(fc);
1771 end_requests(fc, &fc->pending); 1907 end_requests(fc, &fc->pending);
1772 end_requests(fc, &fc->processing); 1908 end_requests(fc, &fc->processing);
1909 while (forget_pending(fc))
1910 kfree(dequeue_forget(fc, 1, NULL));
1773} 1911}
1774 1912
1775/* 1913/*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index f738599fd8cd..bfed8447ed80 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -10,9 +10,9 @@
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/file.h> 12#include <linux/file.h>
13#include <linux/gfp.h>
14#include <linux/sched.h> 13#include <linux/sched.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
15#include <linux/slab.h>
16 16
17#if BITS_PER_LONG >= 64 17#if BITS_PER_LONG >= 64
18static inline void fuse_dentry_settime(struct dentry *entry, u64 time) 18static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
@@ -169,7 +169,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
169 struct fuse_entry_out outarg; 169 struct fuse_entry_out outarg;
170 struct fuse_conn *fc; 170 struct fuse_conn *fc;
171 struct fuse_req *req; 171 struct fuse_req *req;
172 struct fuse_req *forget_req; 172 struct fuse_forget_link *forget;
173 struct dentry *parent; 173 struct dentry *parent;
174 u64 attr_version; 174 u64 attr_version;
175 175
@@ -182,8 +182,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
182 if (IS_ERR(req)) 182 if (IS_ERR(req))
183 return 0; 183 return 0;
184 184
185 forget_req = fuse_get_req(fc); 185 forget = fuse_alloc_forget();
186 if (IS_ERR(forget_req)) { 186 if (!forget) {
187 fuse_put_request(fc, req); 187 fuse_put_request(fc, req);
188 return 0; 188 return 0;
189 } 189 }
@@ -203,15 +203,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
203 if (!err) { 203 if (!err) {
204 struct fuse_inode *fi = get_fuse_inode(inode); 204 struct fuse_inode *fi = get_fuse_inode(inode);
205 if (outarg.nodeid != get_node_id(inode)) { 205 if (outarg.nodeid != get_node_id(inode)) {
206 fuse_send_forget(fc, forget_req, 206 fuse_queue_forget(fc, forget, outarg.nodeid, 1);
207 outarg.nodeid, 1);
208 return 0; 207 return 0;
209 } 208 }
210 spin_lock(&fc->lock); 209 spin_lock(&fc->lock);
211 fi->nlookup++; 210 fi->nlookup++;
212 spin_unlock(&fc->lock); 211 spin_unlock(&fc->lock);
213 } 212 }
214 fuse_put_request(fc, forget_req); 213 kfree(forget);
215 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) 214 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
216 return 0; 215 return 0;
217 216
@@ -263,7 +262,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
263{ 262{
264 struct fuse_conn *fc = get_fuse_conn_super(sb); 263 struct fuse_conn *fc = get_fuse_conn_super(sb);
265 struct fuse_req *req; 264 struct fuse_req *req;
266 struct fuse_req *forget_req; 265 struct fuse_forget_link *forget;
267 u64 attr_version; 266 u64 attr_version;
268 int err; 267 int err;
269 268
@@ -277,9 +276,9 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
277 if (IS_ERR(req)) 276 if (IS_ERR(req))
278 goto out; 277 goto out;
279 278
280 forget_req = fuse_get_req(fc); 279 forget = fuse_alloc_forget();
281 err = PTR_ERR(forget_req); 280 err = -ENOMEM;
282 if (IS_ERR(forget_req)) { 281 if (!forget) {
283 fuse_put_request(fc, req); 282 fuse_put_request(fc, req);
284 goto out; 283 goto out;
285 } 284 }
@@ -305,13 +304,13 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
305 attr_version); 304 attr_version);
306 err = -ENOMEM; 305 err = -ENOMEM;
307 if (!*inode) { 306 if (!*inode) {
308 fuse_send_forget(fc, forget_req, outarg->nodeid, 1); 307 fuse_queue_forget(fc, forget, outarg->nodeid, 1);
309 goto out; 308 goto out;
310 } 309 }
311 err = 0; 310 err = 0;
312 311
313 out_put_forget: 312 out_put_forget:
314 fuse_put_request(fc, forget_req); 313 kfree(forget);
315 out: 314 out:
316 return err; 315 return err;
317} 316}
@@ -351,7 +350,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
351 } 350 }
352 351
353 entry = newent ? newent : entry; 352 entry = newent ? newent : entry;
354 d_set_d_op(entry, &fuse_dentry_operations);
355 if (outarg_valid) 353 if (outarg_valid)
356 fuse_change_entry_timeout(entry, &outarg); 354 fuse_change_entry_timeout(entry, &outarg);
357 else 355 else
@@ -378,7 +376,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
378 struct inode *inode; 376 struct inode *inode;
379 struct fuse_conn *fc = get_fuse_conn(dir); 377 struct fuse_conn *fc = get_fuse_conn(dir);
380 struct fuse_req *req; 378 struct fuse_req *req;
381 struct fuse_req *forget_req; 379 struct fuse_forget_link *forget;
382 struct fuse_create_in inarg; 380 struct fuse_create_in inarg;
383 struct fuse_open_out outopen; 381 struct fuse_open_out outopen;
384 struct fuse_entry_out outentry; 382 struct fuse_entry_out outentry;
@@ -392,9 +390,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
392 if (flags & O_DIRECT) 390 if (flags & O_DIRECT)
393 return -EINVAL; 391 return -EINVAL;
394 392
395 forget_req = fuse_get_req(fc); 393 forget = fuse_alloc_forget();
396 if (IS_ERR(forget_req)) 394 if (!forget)
397 return PTR_ERR(forget_req); 395 return -ENOMEM;
398 396
399 req = fuse_get_req(fc); 397 req = fuse_get_req(fc);
400 err = PTR_ERR(req); 398 err = PTR_ERR(req);
@@ -452,10 +450,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
452 if (!inode) { 450 if (!inode) {
453 flags &= ~(O_CREAT | O_EXCL | O_TRUNC); 451 flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
454 fuse_sync_release(ff, flags); 452 fuse_sync_release(ff, flags);
455 fuse_send_forget(fc, forget_req, outentry.nodeid, 1); 453 fuse_queue_forget(fc, forget, outentry.nodeid, 1);
456 return -ENOMEM; 454 return -ENOMEM;
457 } 455 }
458 fuse_put_request(fc, forget_req); 456 kfree(forget);
459 d_instantiate(entry, inode); 457 d_instantiate(entry, inode);
460 fuse_change_entry_timeout(entry, &outentry); 458 fuse_change_entry_timeout(entry, &outentry);
461 fuse_invalidate_attr(dir); 459 fuse_invalidate_attr(dir);
@@ -473,7 +471,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
473 out_put_request: 471 out_put_request:
474 fuse_put_request(fc, req); 472 fuse_put_request(fc, req);
475 out_put_forget_req: 473 out_put_forget_req:
476 fuse_put_request(fc, forget_req); 474 kfree(forget);
477 return err; 475 return err;
478} 476}
479 477
@@ -487,12 +485,12 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
487 struct fuse_entry_out outarg; 485 struct fuse_entry_out outarg;
488 struct inode *inode; 486 struct inode *inode;
489 int err; 487 int err;
490 struct fuse_req *forget_req; 488 struct fuse_forget_link *forget;
491 489
492 forget_req = fuse_get_req(fc); 490 forget = fuse_alloc_forget();
493 if (IS_ERR(forget_req)) { 491 if (!forget) {
494 fuse_put_request(fc, req); 492 fuse_put_request(fc, req);
495 return PTR_ERR(forget_req); 493 return -ENOMEM;
496 } 494 }
497 495
498 memset(&outarg, 0, sizeof(outarg)); 496 memset(&outarg, 0, sizeof(outarg));
@@ -519,10 +517,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
519 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, 517 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
520 &outarg.attr, entry_attr_timeout(&outarg), 0); 518 &outarg.attr, entry_attr_timeout(&outarg), 0);
521 if (!inode) { 519 if (!inode) {
522 fuse_send_forget(fc, forget_req, outarg.nodeid, 1); 520 fuse_queue_forget(fc, forget, outarg.nodeid, 1);
523 return -ENOMEM; 521 return -ENOMEM;
524 } 522 }
525 fuse_put_request(fc, forget_req); 523 kfree(forget);
526 524
527 if (S_ISDIR(inode->i_mode)) { 525 if (S_ISDIR(inode->i_mode)) {
528 struct dentry *alias; 526 struct dentry *alias;
@@ -545,7 +543,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
545 return 0; 543 return 0;
546 544
547 out_put_forget_req: 545 out_put_forget_req:
548 fuse_put_request(fc, forget_req); 546 kfree(forget);
549 return err; 547 return err;
550} 548}
551 549
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8b984a2cebbd..95da1bc1c826 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1634,9 +1634,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1634 * and 64bit. Fortunately we can determine which structure the server 1634 * and 64bit. Fortunately we can determine which structure the server
1635 * used from the size of the reply. 1635 * used from the size of the reply.
1636 */ 1636 */
1637static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src, 1637static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
1638 size_t transferred, unsigned count, 1638 size_t transferred, unsigned count,
1639 bool is_compat) 1639 bool is_compat)
1640{ 1640{
1641#ifdef CONFIG_COMPAT 1641#ifdef CONFIG_COMPAT
1642 if (count * sizeof(struct compat_iovec) == transferred) { 1642 if (count * sizeof(struct compat_iovec) == transferred) {
@@ -1680,6 +1680,42 @@ static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
1680 return 0; 1680 return 0;
1681} 1681}
1682 1682
1683static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
1684 void *src, size_t transferred, unsigned count,
1685 bool is_compat)
1686{
1687 unsigned i;
1688 struct fuse_ioctl_iovec *fiov = src;
1689
1690 if (fc->minor < 16) {
1691 return fuse_copy_ioctl_iovec_old(dst, src, transferred,
1692 count, is_compat);
1693 }
1694
1695 if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
1696 return -EIO;
1697
1698 for (i = 0; i < count; i++) {
1699 /* Did the server supply an inappropriate value? */
1700 if (fiov[i].base != (unsigned long) fiov[i].base ||
1701 fiov[i].len != (unsigned long) fiov[i].len)
1702 return -EIO;
1703
1704 dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
1705 dst[i].iov_len = (size_t) fiov[i].len;
1706
1707#ifdef CONFIG_COMPAT
1708 if (is_compat &&
1709 (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
1710 (compat_size_t) dst[i].iov_len != fiov[i].len))
1711 return -EIO;
1712#endif
1713 }
1714
1715 return 0;
1716}
1717
1718
1683/* 1719/*
1684 * For ioctls, there is no generic way to determine how much memory 1720 * For ioctls, there is no generic way to determine how much memory
1685 * needs to be read and/or written. Furthermore, ioctls are allowed 1721 * needs to be read and/or written. Furthermore, ioctls are allowed
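
From protocol minor 16 on, the server replies with fixed-width struct fuse_ioctl_iovec entries (64-bit base and len) regardless of its word size, and the kernel verifies that each value round-trips through the native iovec types (plus compat_iovec for 32-bit callers) before trusting it. A sketch of the round-trip check:

#include <stdint.h>
#include <stddef.h>

struct wire_iovec {                  /* struct fuse_ioctl_iovec: always 64-bit */
    uint64_t base;
    uint64_t len;
};

/* Reject entries this kernel cannot represent: the value must survive
 * conversion to the native pointer/size types unchanged (only ever
 * fails on 32-bit, exactly like the fiov[i] checks above). */
static int check_entry(const struct wire_iovec *w,
                       void **base_out, size_t *len_out)
{
    if (w->base != (uintptr_t)w->base || w->len != (size_t)w->len)
        return -1;                   /* -EIO in the kernel */
    *base_out = (void *)(uintptr_t)w->base;
    *len_out = (size_t)w->len;
    return 0;
}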
@@ -1740,18 +1776,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1740 struct fuse_ioctl_out outarg; 1776 struct fuse_ioctl_out outarg;
1741 struct fuse_req *req = NULL; 1777 struct fuse_req *req = NULL;
1742 struct page **pages = NULL; 1778 struct page **pages = NULL;
1743 struct page *iov_page = NULL; 1779 struct iovec *iov_page = NULL;
1744 struct iovec *in_iov = NULL, *out_iov = NULL; 1780 struct iovec *in_iov = NULL, *out_iov = NULL;
1745 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; 1781 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
1746 size_t in_size, out_size, transferred; 1782 size_t in_size, out_size, transferred;
1747 int err; 1783 int err;
1748 1784
1785#if BITS_PER_LONG == 32
1786 inarg.flags |= FUSE_IOCTL_32BIT;
1787#else
1788 if (flags & FUSE_IOCTL_COMPAT)
1789 inarg.flags |= FUSE_IOCTL_32BIT;
1790#endif
1791
1749 /* assume all the iovs returned by client always fits in a page */ 1792 /* assume all the iovs returned by client always fits in a page */
1750 BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); 1793 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1751 1794
1752 err = -ENOMEM; 1795 err = -ENOMEM;
1753 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); 1796 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
1754 iov_page = alloc_page(GFP_KERNEL); 1797 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
1755 if (!pages || !iov_page) 1798 if (!pages || !iov_page)
1756 goto out; 1799 goto out;
1757 1800
@@ -1760,7 +1803,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1760 * RETRY from server is not allowed. 1803 * RETRY from server is not allowed.
1761 */ 1804 */
1762 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { 1805 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
1763 struct iovec *iov = page_address(iov_page); 1806 struct iovec *iov = iov_page;
1764 1807
1765 iov->iov_base = (void __user *)arg; 1808 iov->iov_base = (void __user *)arg;
1766 iov->iov_len = _IOC_SIZE(cmd); 1809 iov->iov_len = _IOC_SIZE(cmd);
@@ -1841,7 +1884,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1841 1884
1842 /* did it ask for retry? */ 1885 /* did it ask for retry? */
1843 if (outarg.flags & FUSE_IOCTL_RETRY) { 1886 if (outarg.flags & FUSE_IOCTL_RETRY) {
1844 char *vaddr; 1887 void *vaddr;
1845 1888
1846 /* no retry if in restricted mode */ 1889 /* no retry if in restricted mode */
1847 err = -EIO; 1890 err = -EIO;
@@ -1862,14 +1905,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1862 goto out; 1905 goto out;
1863 1906
1864 vaddr = kmap_atomic(pages[0], KM_USER0); 1907 vaddr = kmap_atomic(pages[0], KM_USER0);
1865 err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr, 1908 err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
1866 transferred, in_iovs + out_iovs, 1909 transferred, in_iovs + out_iovs,
1867 (flags & FUSE_IOCTL_COMPAT) != 0); 1910 (flags & FUSE_IOCTL_COMPAT) != 0);
1868 kunmap_atomic(vaddr, KM_USER0); 1911 kunmap_atomic(vaddr, KM_USER0);
1869 if (err) 1912 if (err)
1870 goto out; 1913 goto out;
1871 1914
1872 in_iov = page_address(iov_page); 1915 in_iov = iov_page;
1873 out_iov = in_iov + in_iovs; 1916 out_iov = in_iov + in_iovs;
1874 1917
1875 err = fuse_verify_ioctl_iov(in_iov, in_iovs); 1918 err = fuse_verify_ioctl_iov(in_iov, in_iovs);
@@ -1891,8 +1934,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1891 out: 1934 out:
1892 if (req) 1935 if (req)
1893 fuse_put_request(fc, req); 1936 fuse_put_request(fc, req);
1894 if (iov_page) 1937 free_page((unsigned long) iov_page);
1895 __free_page(iov_page);
1896 while (num_pages) 1938 while (num_pages)
1897 __free_page(pages[--num_pages]); 1939 __free_page(pages[--num_pages]);
1898 kfree(pages); 1940 kfree(pages);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 57d4a3a0f102..ae5744a2f9e9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -53,6 +53,12 @@ extern struct mutex fuse_mutex;
53extern unsigned max_user_bgreq; 53extern unsigned max_user_bgreq;
54extern unsigned max_user_congthresh; 54extern unsigned max_user_congthresh;
55 55
56/* One forget request */
57struct fuse_forget_link {
58 struct fuse_forget_one forget_one;
59 struct fuse_forget_link *next;
60};
61
56/** FUSE inode */ 62/** FUSE inode */
57struct fuse_inode { 63struct fuse_inode {
58 /** Inode data */ 64 /** Inode data */
@@ -66,7 +72,7 @@ struct fuse_inode {
66 u64 nlookup; 72 u64 nlookup;
67 73
68 /** The request used for sending the FORGET message */ 74 /** The request used for sending the FORGET message */
69 struct fuse_req *forget_req; 75 struct fuse_forget_link *forget;
70 76
71 /** Time in jiffies until the file attributes are valid */ 77 /** Time in jiffies until the file attributes are valid */
72 u64 i_time; 78 u64 i_time;
@@ -255,7 +261,6 @@ struct fuse_req {
255 261
256 /** Data for asynchronous requests */ 262 /** Data for asynchronous requests */
257 union { 263 union {
258 struct fuse_forget_in forget_in;
259 struct { 264 struct {
260 struct fuse_release_in in; 265 struct fuse_release_in in;
261 struct path path; 266 struct path path;
@@ -369,6 +374,13 @@ struct fuse_conn {
369 /** Pending interrupts */ 374 /** Pending interrupts */
370 struct list_head interrupts; 375 struct list_head interrupts;
371 376
377 /** Queue of pending forgets */
378 struct fuse_forget_link forget_list_head;
379 struct fuse_forget_link *forget_list_tail;
380
381 /** Batching of FORGET requests (positive indicates FORGET batch) */
382 int forget_batch;
383
372 /** Flag indicating if connection is blocked. This will be 384 /** Flag indicating if connection is blocked. This will be
373 the case before the INIT reply is received, and if there 385 the case before the INIT reply is received, and if there
 374 are too many outstanding background requests */ 386
@@ -543,8 +555,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
543/** 555/**
544 * Send FORGET command 556 * Send FORGET command
545 */ 557 */
546void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, 558void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
547 u64 nodeid, u64 nlookup); 559 u64 nodeid, u64 nlookup);
560
561struct fuse_forget_link *fuse_alloc_forget(void);
548 562
549/** 563/**
550 * Initialize READ or READDIR request 564 * Initialize READ or READDIR request
@@ -656,11 +670,6 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
656void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); 670void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
657 671
658/** 672/**
659 * Send a request with no reply
660 */
661void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
662
663/**
664 * Send a request in the background 673 * Send a request in the background
665 */ 674 */
666void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); 675void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a8b31da19b93..9e3f68cc1bd1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -71,6 +71,11 @@ struct fuse_mount_data {
71 unsigned blksize; 71 unsigned blksize;
72}; 72};
73 73
74struct fuse_forget_link *fuse_alloc_forget()
75{
76 return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
77}
78
74static struct inode *fuse_alloc_inode(struct super_block *sb) 79static struct inode *fuse_alloc_inode(struct super_block *sb)
75{ 80{
76 struct inode *inode; 81 struct inode *inode;
@@ -90,8 +95,8 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
90 INIT_LIST_HEAD(&fi->queued_writes); 95 INIT_LIST_HEAD(&fi->queued_writes);
91 INIT_LIST_HEAD(&fi->writepages); 96 INIT_LIST_HEAD(&fi->writepages);
92 init_waitqueue_head(&fi->page_waitq); 97 init_waitqueue_head(&fi->page_waitq);
93 fi->forget_req = fuse_request_alloc(); 98 fi->forget = fuse_alloc_forget();
94 if (!fi->forget_req) { 99 if (!fi->forget) {
95 kmem_cache_free(fuse_inode_cachep, inode); 100 kmem_cache_free(fuse_inode_cachep, inode);
96 return NULL; 101 return NULL;
97 } 102 }
@@ -111,24 +116,10 @@ static void fuse_destroy_inode(struct inode *inode)
111 struct fuse_inode *fi = get_fuse_inode(inode); 116 struct fuse_inode *fi = get_fuse_inode(inode);
112 BUG_ON(!list_empty(&fi->write_files)); 117 BUG_ON(!list_empty(&fi->write_files));
113 BUG_ON(!list_empty(&fi->queued_writes)); 118 BUG_ON(!list_empty(&fi->queued_writes));
114 if (fi->forget_req) 119 kfree(fi->forget);
115 fuse_request_free(fi->forget_req);
116 call_rcu(&inode->i_rcu, fuse_i_callback); 120 call_rcu(&inode->i_rcu, fuse_i_callback);
117} 121}
118 122
119void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
120 u64 nodeid, u64 nlookup)
121{
122 struct fuse_forget_in *inarg = &req->misc.forget_in;
123 inarg->nlookup = nlookup;
124 req->in.h.opcode = FUSE_FORGET;
125 req->in.h.nodeid = nodeid;
126 req->in.numargs = 1;
127 req->in.args[0].size = sizeof(struct fuse_forget_in);
128 req->in.args[0].value = inarg;
129 fuse_request_send_noreply(fc, req);
130}
131
132static void fuse_evict_inode(struct inode *inode) 123static void fuse_evict_inode(struct inode *inode)
133{ 124{
134 truncate_inode_pages(&inode->i_data, 0); 125 truncate_inode_pages(&inode->i_data, 0);
@@ -136,8 +127,8 @@ static void fuse_evict_inode(struct inode *inode)
136 if (inode->i_sb->s_flags & MS_ACTIVE) { 127 if (inode->i_sb->s_flags & MS_ACTIVE) {
137 struct fuse_conn *fc = get_fuse_conn(inode); 128 struct fuse_conn *fc = get_fuse_conn(inode);
138 struct fuse_inode *fi = get_fuse_inode(inode); 129 struct fuse_inode *fi = get_fuse_inode(inode);
139 fuse_send_forget(fc, fi->forget_req, fi->nodeid, fi->nlookup); 130 fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
140 fi->forget_req = NULL; 131 fi->forget = NULL;
141 } 132 }
142} 133}
143 134
@@ -541,6 +532,7 @@ void fuse_conn_init(struct fuse_conn *fc)
541 INIT_LIST_HEAD(&fc->interrupts); 532 INIT_LIST_HEAD(&fc->interrupts);
542 INIT_LIST_HEAD(&fc->bg_queue); 533 INIT_LIST_HEAD(&fc->bg_queue);
543 INIT_LIST_HEAD(&fc->entry); 534 INIT_LIST_HEAD(&fc->entry);
535 fc->forget_list_tail = &fc->forget_list_head;
544 atomic_set(&fc->num_waiting, 0); 536 atomic_set(&fc->num_waiting, 0);
545 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; 537 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
546 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; 538 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
@@ -625,10 +617,8 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
625 goto out_iput; 617 goto out_iput;
626 618
627 entry = d_obtain_alias(inode); 619 entry = d_obtain_alias(inode);
628 if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) { 620 if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID)
629 d_set_d_op(entry, &fuse_dentry_operations);
630 fuse_invalidate_entry_cache(entry); 621 fuse_invalidate_entry_cache(entry);
631 }
632 622
633 return entry; 623 return entry;
634 624
@@ -727,10 +717,8 @@ static struct dentry *fuse_get_parent(struct dentry *child)
727 } 717 }
728 718
729 parent = d_obtain_alias(inode); 719 parent = d_obtain_alias(inode);
730 if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) { 720 if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID)
731 d_set_d_op(parent, &fuse_dentry_operations);
732 fuse_invalidate_entry_cache(parent); 721 fuse_invalidate_entry_cache(parent);
733 }
734 722
735 return parent; 723 return parent;
736} 724}
@@ -997,6 +985,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
997 iput(root); 985 iput(root);
998 goto err_put_conn; 986 goto err_put_conn;
999 } 987 }
988 /* only now - we want root dentry with NULL ->d_op */
989 sb->s_d_op = &fuse_dentry_operations;
1000 990
1001 init_req = fuse_request_alloc(); 991 init_req = fuse_request_alloc();
1002 if (!init_req) 992 if (!init_req)
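
sb->s_d_op only applies to dentries allocated after it is set, which is exactly what the new comment relies on: the root is created first and keeps a NULL ->d_op, then fuse_dentry_operations becomes the default for everything looked up later. A tiny model of that ordering (hypothetical names):

#include <stdio.h>

struct sb { const void *s_d_op; };
struct dentry { const void *d_op; };

/* d_alloc(): new dentries copy the superblock default at creation time */
static struct dentry make_dentry(const struct sb *sb)
{
    struct dentry d = { sb->s_d_op };
    return d;
}

static const int fuse_dops;          /* stands in for fuse_dentry_operations */

int main(void)
{
    struct sb sb = { NULL };
    struct dentry root = make_dentry(&sb);  /* before: d_op stays NULL */
    sb.s_d_op = &fuse_dops;                 /* install the default afterwards */
    struct dentry child = make_dentry(&sb); /* inherits fuse_dops */

    printf("root d_op %s, child d_op %s\n",
           root.d_op ? "set" : "NULL", child.d_op ? "set" : "NULL");
    return 0;
}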
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 97012ecff560..9023db8184f9 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,12 +126,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
126 126
127static struct dentry *gfs2_get_parent(struct dentry *child) 127static struct dentry *gfs2_get_parent(struct dentry *child)
128{ 128{
129 struct dentry *dentry; 129 return d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
130
131 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
132 if (!IS_ERR(dentry))
133 d_set_d_op(dentry, &gfs2_dops);
134 return dentry;
135} 130}
136 131
137static struct dentry *gfs2_get_dentry(struct super_block *sb, 132static struct dentry *gfs2_get_dentry(struct super_block *sb,
@@ -139,7 +134,6 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
139{ 134{
140 struct gfs2_sbd *sdp = sb->s_fs_info; 135 struct gfs2_sbd *sdp = sb->s_fs_info;
141 struct inode *inode; 136 struct inode *inode;
142 struct dentry *dentry;
143 137
144 inode = gfs2_ilookup(sb, inum->no_addr); 138 inode = gfs2_ilookup(sb, inum->no_addr);
145 if (inode) { 139 if (inode) {
@@ -156,10 +150,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
156 return ERR_CAST(inode); 150 return ERR_CAST(inode);
157 151
158out_inode: 152out_inode:
159 dentry = d_obtain_alias(inode); 153 return d_obtain_alias(inode);
160 if (!IS_ERR(dentry))
161 d_set_d_op(dentry, &gfs2_dops);
162 return dentry;
163} 154}
164 155
165static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, 156static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index fca6689e12e6..7cfdcb913363 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -19,6 +19,8 @@
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
21#include <linux/ext2_fs.h> 21#include <linux/ext2_fs.h>
22#include <linux/falloc.h>
23#include <linux/swap.h>
22#include <linux/crc32.h> 24#include <linux/crc32.h>
23#include <linux/writeback.h> 25#include <linux/writeback.h>
24#include <asm/uaccess.h> 26#include <asm/uaccess.h>
@@ -610,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
610 return generic_file_aio_write(iocb, iov, nr_segs, pos); 612 return generic_file_aio_write(iocb, iov, nr_segs, pos);
611} 613}
612 614
615static void empty_write_end(struct page *page, unsigned from,
616 unsigned to)
617{
618 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
619
620 page_zero_new_buffers(page, from, to);
621 flush_dcache_page(page);
622 mark_page_accessed(page);
623
624 if (!gfs2_is_writeback(ip))
625 gfs2_page_add_databufs(ip, page, from, to);
626
627 block_commit_write(page, from, to);
628}
629
630static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
631{
632 unsigned start, end, next;
633 struct buffer_head *bh, *head;
634 int error;
635
636 if (!page_has_buffers(page)) {
637 error = __block_write_begin(page, from, to - from, gfs2_block_map);
638 if (unlikely(error))
639 return error;
640
641 empty_write_end(page, from, to);
642 return 0;
643 }
644
645 bh = head = page_buffers(page);
646 next = end = 0;
647 while (next < from) {
648 next += bh->b_size;
649 bh = bh->b_this_page;
650 }
651 start = next;
652 do {
653 next += bh->b_size;
654 if (buffer_mapped(bh)) {
655 if (end) {
656 error = __block_write_begin(page, start, end - start,
657 gfs2_block_map);
658 if (unlikely(error))
659 return error;
660 empty_write_end(page, start, end);
661 end = 0;
662 }
663 start = next;
664 }
665 else
666 end = next;
667 bh = bh->b_this_page;
668 } while (next < to);
669
670 if (end) {
671 error = __block_write_begin(page, start, end - start, gfs2_block_map);
672 if (unlikely(error))
673 return error;
674 empty_write_end(page, start, end);
675 }
676
677 return 0;
678}
679
680static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
681 int mode)
682{
683 struct gfs2_inode *ip = GFS2_I(inode);
684 struct buffer_head *dibh;
685 int error;
686 u64 start = offset >> PAGE_CACHE_SHIFT;
687 unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
688 u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
689 pgoff_t curr;
690 struct page *page;
691 unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
692 unsigned int from, to;
693
694 if (!end_offset)
695 end_offset = PAGE_CACHE_SIZE;
696
697 error = gfs2_meta_inode_buffer(ip, &dibh);
698 if (unlikely(error))
699 goto out;
700
701 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
702
703 if (gfs2_is_stuffed(ip)) {
704 error = gfs2_unstuff_dinode(ip, NULL);
705 if (unlikely(error))
706 goto out;
707 }
708
709 curr = start;
710 offset = start << PAGE_CACHE_SHIFT;
711 from = start_offset;
712 to = PAGE_CACHE_SIZE;
713 while (curr <= end) {
714 page = grab_cache_page_write_begin(inode->i_mapping, curr,
715 AOP_FLAG_NOFS);
716 if (unlikely(!page)) {
717 error = -ENOMEM;
718 goto out;
719 }
720
721 if (curr == end)
722 to = end_offset;
723 error = write_empty_blocks(page, from, to);
724 if (!error && offset + to > inode->i_size &&
725 !(mode & FALLOC_FL_KEEP_SIZE)) {
726 i_size_write(inode, offset + to);
727 }
728 unlock_page(page);
729 page_cache_release(page);
730 if (error)
731 goto out;
732 curr++;
733 offset += PAGE_CACHE_SIZE;
734 from = 0;
735 }
736
737 gfs2_dinode_out(ip, dibh->b_data);
738 mark_inode_dirty(inode);
739
740 brelse(dibh);
741
742out:
743 return error;
744}
745
746static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
747 unsigned int *data_blocks, unsigned int *ind_blocks)
748{
749 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
750 unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
751 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
752
753 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
754 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
755 max_data -= tmp;
756 }
 757 /* This calculation isn't the exact reverse of gfs2_write_calc_reserv,
758 so it might end up with fewer data blocks */
759 if (max_data <= *data_blocks)
760 return;
761 *data_blocks = max_data;
762 *ind_blocks = max_blocks - max_data;
763 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
764 if (*len > max) {
765 *len = max;
766 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
767 }
768}
769
770static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
771 loff_t len)
772{
773 struct inode *inode = file->f_path.dentry->d_inode;
774 struct gfs2_sbd *sdp = GFS2_SB(inode);
775 struct gfs2_inode *ip = GFS2_I(inode);
776 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
777 loff_t bytes, max_bytes;
778 struct gfs2_alloc *al;
779 int error;
780 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
781 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
782
783 /* We only support the FALLOC_FL_KEEP_SIZE mode */
784 if (mode & ~FALLOC_FL_KEEP_SIZE)
785 return -EOPNOTSUPP;
786
787 offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
788 sdp->sd_sb.sb_bsize_shift;
789
790 len = next - offset;
791 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
792 if (!bytes)
793 bytes = UINT_MAX;
794
795 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
796 error = gfs2_glock_nq(&ip->i_gh);
797 if (unlikely(error))
798 goto out_uninit;
799
800 if (!gfs2_write_alloc_required(ip, offset, len))
801 goto out_unlock;
802
803 while (len > 0) {
804 if (len < bytes)
805 bytes = len;
806 al = gfs2_alloc_get(ip);
807 if (!al) {
808 error = -ENOMEM;
809 goto out_unlock;
810 }
811
812 error = gfs2_quota_lock_check(ip);
813 if (error)
814 goto out_alloc_put;
815
816retry:
817 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
818
819 al->al_requested = data_blocks + ind_blocks;
820 error = gfs2_inplace_reserve(ip);
821 if (error) {
822 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
823 bytes >>= 1;
824 goto retry;
825 }
826 goto out_qunlock;
827 }
828 max_bytes = bytes;
829 calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
830 al->al_requested = data_blocks + ind_blocks;
831
832 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
833 RES_RG_HDR + gfs2_rg_blocks(al);
834 if (gfs2_is_jdata(ip))
835 rblocks += data_blocks ? data_blocks : 1;
836
837 error = gfs2_trans_begin(sdp, rblocks,
838 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
839 if (error)
840 goto out_trans_fail;
841
842 error = fallocate_chunk(inode, offset, max_bytes, mode);
843 gfs2_trans_end(sdp);
844
845 if (error)
846 goto out_trans_fail;
847
848 len -= max_bytes;
849 offset += max_bytes;
850 gfs2_inplace_release(ip);
851 gfs2_quota_unlock(ip);
852 gfs2_alloc_put(ip);
853 }
854 goto out_unlock;
855
856out_trans_fail:
857 gfs2_inplace_release(ip);
858out_qunlock:
859 gfs2_quota_unlock(ip);
860out_alloc_put:
861 gfs2_alloc_put(ip);
862out_unlock:
863 gfs2_glock_dq(&ip->i_gh);
864out_uninit:
865 gfs2_holder_uninit(&ip->i_gh);
866 return error;
867}
868
613#ifdef CONFIG_GFS2_FS_LOCKING_DLM 869#ifdef CONFIG_GFS2_FS_LOCKING_DLM
614 870
615/** 871/**
@@ -765,6 +1021,7 @@ const struct file_operations gfs2_file_fops = {
765 .splice_read = generic_file_splice_read, 1021 .splice_read = generic_file_splice_read,
766 .splice_write = generic_file_splice_write, 1022 .splice_write = generic_file_splice_write,
767 .setlease = gfs2_setlease, 1023 .setlease = gfs2_setlease,
1024 .fallocate = gfs2_fallocate,
768}; 1025};
769 1026
770const struct file_operations gfs2_dir_fops = { 1027const struct file_operations gfs2_dir_fops = {
@@ -794,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = {
794 .splice_read = generic_file_splice_read, 1051 .splice_read = generic_file_splice_read,
795 .splice_write = generic_file_splice_write, 1052 .splice_write = generic_file_splice_write,
796 .setlease = generic_setlease, 1053 .setlease = generic_setlease,
1054 .fallocate = gfs2_fallocate,
797}; 1055};
798 1056
799const struct file_operations gfs2_dir_fops_nolock = { 1057const struct file_operations gfs2_dir_fops_nolock = {
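
write_empty_blocks() above walks the page's buffer list and batches maximal runs of still-unmapped buffers into __block_write_begin() calls (which allocate and zero them), flushing a pending run whenever it meets an already-mapped buffer and once more at the end. A runnable model of the run splitting over a boolean "mapped" array:

#include <stdio.h>
#include <stdbool.h>

/* Model of write_empty_blocks(): walk fixed-size "buffers" across
 * [from, to) and report each maximal run that still needs allocation. */
static void split_runs(const bool *mapped, unsigned bsize,
                       unsigned from, unsigned to)
{
    unsigned start, end = 0, next = 0, i = 0;

    while (next < from) {            /* skip buffers before the range */
        next += bsize;
        i++;
    }
    start = next;
    do {
        next += bsize;
        if (mapped[i]) {
            if (end) {               /* flush the pending unmapped run */
                printf("alloc+zero [%u,%u)\n", start, end);
                end = 0;
            }
            start = next;            /* restart past the mapped buffer */
        } else {
            end = next;              /* extend the run */
        }
        i++;
    } while (next < to);

    if (end)
        printf("alloc+zero [%u,%u)\n", start, end);
}

int main(void)
{
    bool mapped[8] = { false, false, true, false, true, true, false, false };
    split_runs(mapped, 512, 0, 4096);
    /* prints [0,1024), [1536,2048), [3072,4096) */
    return 0;
}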
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 8d3d2b4a0a7d..a79790c06275 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -11,6 +11,7 @@
11#define __INCORE_DOT_H__ 11#define __INCORE_DOT_H__
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/kobject.h>
14#include <linux/workqueue.h> 15#include <linux/workqueue.h>
15#include <linux/dlm.h> 16#include <linux/dlm.h>
16#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2232b3c780bd..7aa7d4f8984a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -74,16 +74,14 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
74} 74}
75 75
76/** 76/**
77 * GFS2 lookup code fills in vfs inode contents based on info obtained 77 * gfs2_set_iop - Sets inode operations
78 * from directory entry inside gfs2_inode_lookup(). This has caused issues 78 * @inode: The inode with correct i_mode filled in
79 * with NFS code path since its get_dentry routine doesn't have the relevant
80 * directory entry when gfs2_inode_lookup() is invoked. Part of the code
81 * segment inside gfs2_inode_lookup code needs to get moved around.
82 * 79 *
83 * Clears I_NEW as well. 80 * GFS2 lookup code fills in vfs inode contents based on info obtained
84 **/ 81 * from directory entry inside gfs2_inode_lookup().
82 */
85 83
86void gfs2_set_iop(struct inode *inode) 84static void gfs2_set_iop(struct inode *inode)
87{ 85{
88 struct gfs2_sbd *sdp = GFS2_SB(inode); 86 struct gfs2_sbd *sdp = GFS2_SB(inode);
89 umode_t mode = inode->i_mode; 87 umode_t mode = inode->i_mode;
@@ -106,8 +104,6 @@ void gfs2_set_iop(struct inode *inode)
106 inode->i_op = &gfs2_file_iops; 104 inode->i_op = &gfs2_file_iops;
107 init_special_inode(inode, inode->i_mode, inode->i_rdev); 105 init_special_inode(inode, inode->i_mode, inode->i_rdev);
108 } 106 }
109
110 unlock_new_inode(inode);
111} 107}
112 108
113/** 109/**
@@ -119,10 +115,8 @@ void gfs2_set_iop(struct inode *inode)
119 * Returns: A VFS inode, or an error 115 * Returns: A VFS inode, or an error
120 */ 116 */
121 117
122struct inode *gfs2_inode_lookup(struct super_block *sb, 118struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
123 unsigned int type, 119 u64 no_addr, u64 no_formal_ino)
124 u64 no_addr,
125 u64 no_formal_ino)
126{ 120{
127 struct inode *inode; 121 struct inode *inode;
128 struct gfs2_inode *ip; 122 struct gfs2_inode *ip;
@@ -152,51 +146,37 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
152 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); 146 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
153 if (unlikely(error)) 147 if (unlikely(error))
154 goto fail_iopen; 148 goto fail_iopen;
155 ip->i_iopen_gh.gh_gl->gl_object = ip;
156 149
150 ip->i_iopen_gh.gh_gl->gl_object = ip;
157 gfs2_glock_put(io_gl); 151 gfs2_glock_put(io_gl);
158 io_gl = NULL; 152 io_gl = NULL;
159 153
160 if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
161 goto gfs2_nfsbypass;
162
163 inode->i_mode = DT2IF(type);
164
165 /*
166 * We must read the inode in order to work out its type in
167 * this case. Note that this doesn't happen often as we normally
168 * know the type beforehand. This code path only occurs during
169 * unlinked inode recovery (where it is safe to do this glock,
170 * which is not true in the general case).
171 */
172 if (type == DT_UNKNOWN) { 154 if (type == DT_UNKNOWN) {
173 struct gfs2_holder gh; 155 /* Inode glock must be locked already */
174 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 156 error = gfs2_inode_refresh(GFS2_I(inode));
175 if (unlikely(error)) 157 if (error)
176 goto fail_glock; 158 goto fail_refresh;
177 /* Inode is now uptodate */ 159 } else {
178 gfs2_glock_dq_uninit(&gh); 160 inode->i_mode = DT2IF(type);
179 } 161 }
180 162
181 gfs2_set_iop(inode); 163 gfs2_set_iop(inode);
164 unlock_new_inode(inode);
182 } 165 }
183 166
184gfs2_nfsbypass:
185 return inode; 167 return inode;
186fail_glock: 168
187 gfs2_glock_dq(&ip->i_iopen_gh); 169fail_refresh:
170 ip->i_iopen_gh.gh_gl->gl_object = NULL;
171 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
188fail_iopen: 172fail_iopen:
189 if (io_gl) 173 if (io_gl)
190 gfs2_glock_put(io_gl); 174 gfs2_glock_put(io_gl);
191fail_put: 175fail_put:
192 if (inode->i_state & I_NEW) 176 ip->i_gl->gl_object = NULL;
193 ip->i_gl->gl_object = NULL;
194 gfs2_glock_put(ip->i_gl); 177 gfs2_glock_put(ip->i_gl);
195fail: 178fail:
196 if (inode->i_state & I_NEW) 179 iget_failed(inode);
197 iget_failed(inode);
198 else
199 iput(inode);
200 return ERR_PTR(error); 180 return ERR_PTR(error);
201} 181}
202 182
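
The rewritten error paths in gfs2_inode_lookup() follow the standard
I_NEW protocol: a newly hashed inode stays locked until it is fully set
up, and must end in exactly one of unlock_new_inode() (success) or
iget_failed() (error), which is why the old i_state checks in the fail
labels could go. A generic sketch of the protocol (the test/set helpers
are hypothetical, not gfs2's):

    struct inode *example_iget(struct super_block *sb, u64 no_addr)
    {
            struct inode *inode;

            inode = iget5_locked(sb, no_addr, example_test, example_set,
                                 &no_addr);
            if (!inode)
                    return ERR_PTR(-ENOMEM);
            if (!(inode->i_state & I_NEW))
                    return inode;            /* cached and initialized */

            /* ... read the on-disk inode, set i_op/i_fop ... */

            /* on error instead: iget_failed(inode), which unhashes,
             * unlocks and drops the half-built inode */
            unlock_new_inode(inode);         /* publish, wake I_NEW waiters */
            return inode;
    }
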
@@ -221,14 +201,6 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
221 if (IS_ERR(inode)) 201 if (IS_ERR(inode))
222 goto fail; 202 goto fail;
223 203
224 error = gfs2_inode_refresh(GFS2_I(inode));
225 if (error)
226 goto fail_iput;
227
228 /* Pick up the works we bypass in gfs2_inode_lookup */
229 if (inode->i_state & I_NEW)
230 gfs2_set_iop(inode);
231
232 /* Two extra checks for NFS only */ 204 /* Two extra checks for NFS only */
233 if (no_formal_ino) { 205 if (no_formal_ino) {
234 error = -ESTALE; 206 error = -ESTALE;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 732a183efdb3..3e00a66e7cbd 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -96,7 +96,6 @@ err:
96 return -EIO; 96 return -EIO;
97} 97}
98 98
99extern void gfs2_set_iop(struct inode *inode);
100extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 99extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
101 u64 no_addr, u64 no_formal_ino); 100 u64 no_addr, u64 no_formal_ino);
102extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, 101extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 2aeabd4218cc..777927ce6f79 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -440,7 +440,6 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
440 iput(inode); 440 iput(inode);
441 return -ENOMEM; 441 return -ENOMEM;
442 } 442 }
443 d_set_d_op(dentry, &gfs2_dops);
444 *dptr = dentry; 443 *dptr = dentry;
445 return 0; 444 return 0;
446} 445}
@@ -1106,6 +1105,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1106 1105
1107 sb->s_magic = GFS2_MAGIC; 1106 sb->s_magic = GFS2_MAGIC;
1108 sb->s_op = &gfs2_super_ops; 1107 sb->s_op = &gfs2_super_ops;
1108 sb->s_d_op = &gfs2_dops;
1109 sb->s_export_op = &gfs2_export_ops; 1109 sb->s_export_op = &gfs2_export_ops;
1110 sb->s_xattr = gfs2_xattr_handlers; 1110 sb->s_xattr = gfs2_xattr_handlers;
1111 sb->s_qcop = &gfs2_quotactl_ops; 1111 sb->s_qcop = &gfs2_quotactl_ops;
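
The sb->s_d_op line added here is the pattern applied throughout this
diff (gfs2, hfs, hfsplus, hostfs, hpfs, isofs, jfs): rather than calling
d_set_d_op() on each dentry inside ->lookup(), a filesystem now sets a
default dentry_operations table on the superblock once, and d_alloc()
copies it into every new dentry. A sketch with hypothetical names:

    static int examplefs_fill_super(struct super_block *sb, void *data,
                                    int silent)
    {
            sb->s_op = &examplefs_super_ops;     /* hypothetical ops */
            sb->s_d_op = &examplefs_dentry_ops;  /* inherited by d_alloc() */
            /* ->lookup() no longer needs per-dentry d_set_d_op() calls */
            return 0;
    }
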
@@ -1268,7 +1268,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1268{ 1268{
1269 struct block_device *bdev; 1269 struct block_device *bdev;
1270 struct super_block *s; 1270 struct super_block *s;
1271 fmode_t mode = FMODE_READ; 1271 fmode_t mode = FMODE_READ | FMODE_EXCL;
1272 int error; 1272 int error;
1273 struct gfs2_args args; 1273 struct gfs2_args args;
1274 struct gfs2_sbd *sdp; 1274 struct gfs2_sbd *sdp;
@@ -1276,7 +1276,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1276 if (!(flags & MS_RDONLY)) 1276 if (!(flags & MS_RDONLY))
1277 mode |= FMODE_WRITE; 1277 mode |= FMODE_WRITE;
1278 1278
1279 bdev = open_bdev_exclusive(dev_name, mode, fs_type); 1279 bdev = blkdev_get_by_path(dev_name, mode, fs_type);
1280 if (IS_ERR(bdev)) 1280 if (IS_ERR(bdev))
1281 return ERR_CAST(bdev); 1281 return ERR_CAST(bdev);
1282 1282
@@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1298 goto error_bdev; 1298 goto error_bdev;
1299 1299
1300 if (s->s_root) 1300 if (s->s_root)
1301 close_bdev_exclusive(bdev, mode); 1301 blkdev_put(bdev, mode);
1302 1302
1303 memset(&args, 0, sizeof(args)); 1303 memset(&args, 0, sizeof(args));
1304 args.ar_quota = GFS2_QUOTA_DEFAULT; 1304 args.ar_quota = GFS2_QUOTA_DEFAULT;
@@ -1342,7 +1342,7 @@ error_super:
1342 deactivate_locked_super(s); 1342 deactivate_locked_super(s);
1343 return ERR_PTR(error); 1343 return ERR_PTR(error);
1344error_bdev: 1344error_bdev:
1345 close_bdev_exclusive(bdev, mode); 1345 blkdev_put(bdev, mode);
1346 return ERR_PTR(error); 1346 return ERR_PTR(error);
1347} 1347}
1348 1348
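
The mount-path changes above are part of the block-device API
conversion: open_bdev_exclusive()/close_bdev_exclusive() give way to
blkdev_get_by_path()/blkdev_put(), with exclusivity expressed as
FMODE_EXCL in the mode bits and the claim identified by the holder
argument (fs_type here) instead of a separate bd_claim() step. Roughly:

    fmode_t mode = FMODE_READ | FMODE_EXCL;
    struct block_device *bdev;

    bdev = blkdev_get_by_path(dev_name, mode, fs_type); /* open + claim */
    if (IS_ERR(bdev))
            return ERR_CAST(bdev);
    /* ... */
    blkdev_put(bdev, mode);  /* mode must still include FMODE_EXCL */
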
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1501db4f0e6d..d8b26ac2e20b 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,8 +18,6 @@
18#include <linux/gfs2_ondisk.h> 18#include <linux/gfs2_ondisk.h>
19#include <linux/crc32.h> 19#include <linux/crc32.h>
20#include <linux/fiemap.h> 20#include <linux/fiemap.h>
21#include <linux/swap.h>
22#include <linux/falloc.h>
23#include <asm/uaccess.h> 21#include <asm/uaccess.h>
24 22
25#include "gfs2.h" 23#include "gfs2.h"
@@ -106,8 +104,6 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
106{ 104{
107 struct inode *inode = NULL; 105 struct inode *inode = NULL;
108 106
109 d_set_d_op(dentry, &gfs2_dops);
110
111 inode = gfs2_lookupi(dir, &dentry->d_name, 0); 107 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
112 if (inode && IS_ERR(inode)) 108 if (inode && IS_ERR(inode))
113 return ERR_CAST(inode); 109 return ERR_CAST(inode);
@@ -1259,257 +1255,6 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
1259 return ret; 1255 return ret;
1260} 1256}
1261 1257
1262static void empty_write_end(struct page *page, unsigned from,
1263 unsigned to)
1264{
1265 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
1266
1267 page_zero_new_buffers(page, from, to);
1268 flush_dcache_page(page);
1269 mark_page_accessed(page);
1270
1271 if (!gfs2_is_writeback(ip))
1272 gfs2_page_add_databufs(ip, page, from, to);
1273
1274 block_commit_write(page, from, to);
1275}
1276
1277
1278static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
1279{
1280 unsigned start, end, next;
1281 struct buffer_head *bh, *head;
1282 int error;
1283
1284 if (!page_has_buffers(page)) {
1285 error = __block_write_begin(page, from, to - from, gfs2_block_map);
1286 if (unlikely(error))
1287 return error;
1288
1289 empty_write_end(page, from, to);
1290 return 0;
1291 }
1292
1293 bh = head = page_buffers(page);
1294 next = end = 0;
1295 while (next < from) {
1296 next += bh->b_size;
1297 bh = bh->b_this_page;
1298 }
1299 start = next;
1300 do {
1301 next += bh->b_size;
1302 if (buffer_mapped(bh)) {
1303 if (end) {
1304 error = __block_write_begin(page, start, end - start,
1305 gfs2_block_map);
1306 if (unlikely(error))
1307 return error;
1308 empty_write_end(page, start, end);
1309 end = 0;
1310 }
1311 start = next;
1312 }
1313 else
1314 end = next;
1315 bh = bh->b_this_page;
1316 } while (next < to);
1317
1318 if (end) {
1319 error = __block_write_begin(page, start, end - start, gfs2_block_map);
1320 if (unlikely(error))
1321 return error;
1322 empty_write_end(page, start, end);
1323 }
1324
1325 return 0;
1326}
1327
1328static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
1329 int mode)
1330{
1331 struct gfs2_inode *ip = GFS2_I(inode);
1332 struct buffer_head *dibh;
1333 int error;
1334 u64 start = offset >> PAGE_CACHE_SHIFT;
1335 unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
1336 u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
1337 pgoff_t curr;
1338 struct page *page;
1339 unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
1340 unsigned int from, to;
1341
1342 if (!end_offset)
1343 end_offset = PAGE_CACHE_SIZE;
1344
1345 error = gfs2_meta_inode_buffer(ip, &dibh);
1346 if (unlikely(error))
1347 goto out;
1348
1349 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1350
1351 if (gfs2_is_stuffed(ip)) {
1352 error = gfs2_unstuff_dinode(ip, NULL);
1353 if (unlikely(error))
1354 goto out;
1355 }
1356
1357 curr = start;
1358 offset = start << PAGE_CACHE_SHIFT;
1359 from = start_offset;
1360 to = PAGE_CACHE_SIZE;
1361 while (curr <= end) {
1362 page = grab_cache_page_write_begin(inode->i_mapping, curr,
1363 AOP_FLAG_NOFS);
1364 if (unlikely(!page)) {
1365 error = -ENOMEM;
1366 goto out;
1367 }
1368
1369 if (curr == end)
1370 to = end_offset;
1371 error = write_empty_blocks(page, from, to);
1372 if (!error && offset + to > inode->i_size &&
1373 !(mode & FALLOC_FL_KEEP_SIZE)) {
1374 i_size_write(inode, offset + to);
1375 }
1376 unlock_page(page);
1377 page_cache_release(page);
1378 if (error)
1379 goto out;
1380 curr++;
1381 offset += PAGE_CACHE_SIZE;
1382 from = 0;
1383 }
1384
1385 gfs2_dinode_out(ip, dibh->b_data);
1386 mark_inode_dirty(inode);
1387
1388 brelse(dibh);
1389
1390out:
1391 return error;
1392}
1393
1394static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
1395 unsigned int *data_blocks, unsigned int *ind_blocks)
1396{
1397 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1398 unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
1399 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
1400
1401 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
1402 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1403 max_data -= tmp;
1404 }
1405 /* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
1406 so it might end up with fewer data blocks */
1407 if (max_data <= *data_blocks)
1408 return;
1409 *data_blocks = max_data;
1410 *ind_blocks = max_blocks - max_data;
1411 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
1412 if (*len > max) {
1413 *len = max;
1414 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
1415 }
1416}
1417
1418static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
1419 loff_t len)
1420{
1421 struct gfs2_sbd *sdp = GFS2_SB(inode);
1422 struct gfs2_inode *ip = GFS2_I(inode);
1423 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
1424 loff_t bytes, max_bytes;
1425 struct gfs2_alloc *al;
1426 int error;
1427 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
1428 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
1429
1430 offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
1431 sdp->sd_sb.sb_bsize_shift;
1432
1433 len = next - offset;
1434 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
1435 if (!bytes)
1436 bytes = UINT_MAX;
1437
1438 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
1439 error = gfs2_glock_nq(&ip->i_gh);
1440 if (unlikely(error))
1441 goto out_uninit;
1442
1443 if (!gfs2_write_alloc_required(ip, offset, len))
1444 goto out_unlock;
1445
1446 while (len > 0) {
1447 if (len < bytes)
1448 bytes = len;
1449 al = gfs2_alloc_get(ip);
1450 if (!al) {
1451 error = -ENOMEM;
1452 goto out_unlock;
1453 }
1454
1455 error = gfs2_quota_lock_check(ip);
1456 if (error)
1457 goto out_alloc_put;
1458
1459retry:
1460 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
1461
1462 al->al_requested = data_blocks + ind_blocks;
1463 error = gfs2_inplace_reserve(ip);
1464 if (error) {
1465 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
1466 bytes >>= 1;
1467 goto retry;
1468 }
1469 goto out_qunlock;
1470 }
1471 max_bytes = bytes;
1472 calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
1473 al->al_requested = data_blocks + ind_blocks;
1474
1475 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
1476 RES_RG_HDR + gfs2_rg_blocks(al);
1477 if (gfs2_is_jdata(ip))
1478 rblocks += data_blocks ? data_blocks : 1;
1479
1480 error = gfs2_trans_begin(sdp, rblocks,
1481 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
1482 if (error)
1483 goto out_trans_fail;
1484
1485 error = fallocate_chunk(inode, offset, max_bytes, mode);
1486 gfs2_trans_end(sdp);
1487
1488 if (error)
1489 goto out_trans_fail;
1490
1491 len -= max_bytes;
1492 offset += max_bytes;
1493 gfs2_inplace_release(ip);
1494 gfs2_quota_unlock(ip);
1495 gfs2_alloc_put(ip);
1496 }
1497 goto out_unlock;
1498
1499out_trans_fail:
1500 gfs2_inplace_release(ip);
1501out_qunlock:
1502 gfs2_quota_unlock(ip);
1503out_alloc_put:
1504 gfs2_alloc_put(ip);
1505out_unlock:
1506 gfs2_glock_dq(&ip->i_gh);
1507out_uninit:
1508 gfs2_holder_uninit(&ip->i_gh);
1509 return error;
1510}
1511
1512
1513static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1258static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1514 u64 start, u64 len) 1259 u64 start, u64 len)
1515{ 1260{
@@ -1560,7 +1305,6 @@ const struct inode_operations gfs2_file_iops = {
1560 .getxattr = gfs2_getxattr, 1305 .getxattr = gfs2_getxattr,
1561 .listxattr = gfs2_listxattr, 1306 .listxattr = gfs2_listxattr,
1562 .removexattr = gfs2_removexattr, 1307 .removexattr = gfs2_removexattr,
1563 .fallocate = gfs2_fallocate,
1564 .fiemap = gfs2_fiemap, 1308 .fiemap = gfs2_fiemap,
1565}; 1309};
1566 1310
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 16c2ecac7eb7..ec73ed70bae1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1336,6 +1336,7 @@ static void gfs2_evict_inode(struct inode *inode)
1336 if (error) 1336 if (error)
1337 goto out_truncate; 1337 goto out_truncate;
1338 1338
1339 ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
1339 gfs2_glock_dq_wait(&ip->i_iopen_gh); 1340 gfs2_glock_dq_wait(&ip->i_iopen_gh);
1340 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); 1341 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
1341 error = gfs2_glock_nq(&ip->i_iopen_gh); 1342 error = gfs2_glock_nq(&ip->i_iopen_gh);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index ea4aefe7c652..afa66aaa2237 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -25,8 +25,6 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
25 struct inode *inode = NULL; 25 struct inode *inode = NULL;
26 int res; 26 int res;
27 27
28 d_set_d_op(dentry, &hfs_dentry_operations);
29
30 hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); 28 hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
31 hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); 29 hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
32 res = hfs_brec_read(&fd, &rec, sizeof(rec)); 30 res = hfs_brec_read(&fd, &rec, sizeof(rec));
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 0bef62aa4f42..1b55f704fb22 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -429,13 +429,12 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
429 if (!root_inode) 429 if (!root_inode)
430 goto bail_no_root; 430 goto bail_no_root;
431 431
432 sb->s_d_op = &hfs_dentry_operations;
432 res = -ENOMEM; 433 res = -ENOMEM;
433 sb->s_root = d_alloc_root(root_inode); 434 sb->s_root = d_alloc_root(root_inode);
434 if (!sb->s_root) 435 if (!sb->s_root)
435 goto bail_iput; 436 goto bail_iput;
436 437
437 d_set_d_op(sb->s_root, &hfs_dentry_operations);
438
439 /* everything's okay */ 438 /* everything's okay */
440 return 0; 439 return 0;
441 440
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index f896dc843026..4df5059c25da 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -37,7 +37,6 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
37 37
38 sb = dir->i_sb; 38 sb = dir->i_sb;
39 39
40 d_set_d_op(dentry, &hfsplus_dentry_operations);
41 dentry->d_fsdata = NULL; 40 dentry->d_fsdata = NULL;
42 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); 41 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); 42 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 6ee6ad20acf2..9a3b4795f43c 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -444,13 +444,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
444 err = PTR_ERR(root); 444 err = PTR_ERR(root);
445 goto cleanup; 445 goto cleanup;
446 } 446 }
447 sb->s_d_op = &hfsplus_dentry_operations;
447 sb->s_root = d_alloc_root(root); 448 sb->s_root = d_alloc_root(root);
448 if (!sb->s_root) { 449 if (!sb->s_root) {
449 iput(root); 450 iput(root);
450 err = -ENOMEM; 451 err = -ENOMEM;
451 goto cleanup; 452 goto cleanup;
452 } 453 }
453 d_set_d_op(sb->s_root, &hfsplus_dentry_operations);
454 454
455 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 455 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
456 str.name = HFSP_HIDDENDIR_NAME; 456 str.name = HFSP_HIDDENDIR_NAME;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index d3244d949a4e..2638c834ed28 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -612,7 +612,6 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
612 goto out_put; 612 goto out_put;
613 613
614 d_add(dentry, inode); 614 d_add(dentry, inode);
615 d_set_d_op(dentry, &hostfs_dentry_ops);
616 return NULL; 615 return NULL;
617 616
618 out_put: 617 out_put:
@@ -922,6 +921,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
922 sb->s_blocksize_bits = 10; 921 sb->s_blocksize_bits = 10;
923 sb->s_magic = HOSTFS_SUPER_MAGIC; 922 sb->s_magic = HOSTFS_SUPER_MAGIC;
924 sb->s_op = &hostfs_sbops; 923 sb->s_op = &hostfs_sbops;
924 sb->s_d_op = &hostfs_dentry_ops;
925 sb->s_maxbytes = MAX_LFS_FILESIZE; 925 sb->s_maxbytes = MAX_LFS_FILESIZE;
926 926
927 /* NULL is printed as <NULL> by sprintf: avoid that. */ 927 /* NULL is printed as <NULL> by sprintf: avoid that. */
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 32c13a94e1e9..05d4816e4e77 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -58,12 +58,7 @@ static int hpfs_compare_dentry(const struct dentry *parent,
58 return 0; 58 return 0;
59} 59}
60 60
61static const struct dentry_operations hpfs_dentry_operations = { 61const struct dentry_operations hpfs_dentry_operations = {
62 .d_hash = hpfs_hash_dentry, 62 .d_hash = hpfs_hash_dentry,
63 .d_compare = hpfs_compare_dentry, 63 .d_compare = hpfs_compare_dentry,
64}; 64};
65
66void hpfs_set_dentry_operations(struct dentry *dentry)
67{
68 d_set_d_op(dentry, &hpfs_dentry_operations);
69}
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 2338130cceba..d32f63a569f7 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -298,7 +298,6 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
298 298
299 end: 299 end:
300 end_add: 300 end_add:
301 hpfs_set_dentry_operations(dentry);
302 unlock_kernel(); 301 unlock_kernel();
303 d_add(dentry, result); 302 d_add(dentry, result);
304 return NULL; 303 return NULL;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 2fee17d0d9ab..1c43dbea55e8 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -233,7 +233,7 @@ void hpfs_mark_4buffers_dirty(struct quad_buffer_head *);
233 233
234/* dentry.c */ 234/* dentry.c */
235 235
236void hpfs_set_dentry_operations(struct dentry *); 236extern const struct dentry_operations hpfs_dentry_operations;
237 237
238/* dir.c */ 238/* dir.c */
239 239
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 56f0da1cfd10..1ae35baa539e 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -281,7 +281,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
281 attr->ia_size != i_size_read(inode)) { 281 attr->ia_size != i_size_read(inode)) {
282 error = vmtruncate(inode, attr->ia_size); 282 error = vmtruncate(inode, attr->ia_size);
283 if (error) 283 if (error)
284 return error; 284 goto out_unlock;
285 } 285 }
286 286
287 setattr_copy(inode, attr); 287 setattr_copy(inode, attr);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 49935ba78db8..b30426b1fc97 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -550,6 +550,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
550 /* Fill superblock stuff */ 550 /* Fill superblock stuff */
551 s->s_magic = HPFS_SUPER_MAGIC; 551 s->s_magic = HPFS_SUPER_MAGIC;
552 s->s_op = &hpfs_sops; 552 s->s_op = &hpfs_sops;
553 s->s_d_op = &hpfs_dentry_operations;
553 554
554 sbi->sb_root = superblock->root; 555 sbi->sb_root = superblock->root;
555 sbi->sb_fs_size = superblock->n_sectors; 556 sbi->sb_fs_size = superblock->n_sectors;
@@ -651,7 +652,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
651 iput(root); 652 iput(root);
652 goto bail0; 653 goto bail0;
653 } 654 }
654 hpfs_set_dentry_operations(s->s_root);
655 655
656 /* 656 /*
657 * find the root directory's . pointer & finish filling in the inode 657 * find the root directory's . pointer & finish filling in the inode
diff --git a/fs/internal.h b/fs/internal.h
index 9687c2ee2735..0663568b1247 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -70,6 +70,10 @@ extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
70extern void release_mounts(struct list_head *); 70extern void release_mounts(struct list_head *);
71extern void umount_tree(struct vfsmount *, int, struct list_head *); 71extern void umount_tree(struct vfsmount *, int, struct list_head *);
72extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); 72extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
73extern int finish_automount(struct vfsmount *, struct path *);
74
75extern void mnt_make_longterm(struct vfsmount *);
76extern void mnt_make_shortterm(struct vfsmount *);
73 77
74extern void __init mnt_init(void); 78extern void __init mnt_init(void);
75 79
diff --git a/fs/ioctl.c b/fs/ioctl.c
index d6cc16476620..a59635e295fa 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -86,7 +86,7 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
86 u64 phys, u64 len, u32 flags) 86 u64 phys, u64 len, u32 flags)
87{ 87{
88 struct fiemap_extent extent; 88 struct fiemap_extent extent;
89 struct fiemap_extent *dest = fieinfo->fi_extents_start; 89 struct fiemap_extent __user *dest = fieinfo->fi_extents_start;
90 90
91 /* only count the extents */ 91 /* only count the extents */
92 if (fieinfo->fi_extents_max == 0) { 92 if (fieinfo->fi_extents_max == 0) {
@@ -173,6 +173,7 @@ static int fiemap_check_ranges(struct super_block *sb,
173static int ioctl_fiemap(struct file *filp, unsigned long arg) 173static int ioctl_fiemap(struct file *filp, unsigned long arg)
174{ 174{
175 struct fiemap fiemap; 175 struct fiemap fiemap;
176 struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
176 struct fiemap_extent_info fieinfo = { 0, }; 177 struct fiemap_extent_info fieinfo = { 0, };
177 struct inode *inode = filp->f_path.dentry->d_inode; 178 struct inode *inode = filp->f_path.dentry->d_inode;
178 struct super_block *sb = inode->i_sb; 179 struct super_block *sb = inode->i_sb;
@@ -182,8 +183,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
182 if (!inode->i_op->fiemap) 183 if (!inode->i_op->fiemap)
183 return -EOPNOTSUPP; 184 return -EOPNOTSUPP;
184 185
185 if (copy_from_user(&fiemap, (struct fiemap __user *)arg, 186 if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
186 sizeof(struct fiemap)))
187 return -EFAULT; 187 return -EFAULT;
188 188
189 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) 189 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
@@ -196,7 +196,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
196 196
197 fieinfo.fi_flags = fiemap.fm_flags; 197 fieinfo.fi_flags = fiemap.fm_flags;
198 fieinfo.fi_extents_max = fiemap.fm_extent_count; 198 fieinfo.fi_extents_max = fiemap.fm_extent_count;
199 fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); 199 fieinfo.fi_extents_start = ufiemap->fm_extents;
200 200
201 if (fiemap.fm_extent_count != 0 && 201 if (fiemap.fm_extent_count != 0 &&
202 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, 202 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
@@ -209,7 +209,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
209 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); 209 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
210 fiemap.fm_flags = fieinfo.fi_flags; 210 fiemap.fm_flags = fieinfo.fi_flags;
211 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; 211 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
212 if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) 212 if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
213 error = -EFAULT; 213 error = -EFAULT;
214 214
215 return error; 215 return error;
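
The ioctl_fiemap() hunks are sparse-annotation fixes rather than
behaviour changes: user-space pointers gain __user so sparse can flag
any direct dereference, and the repeated casts of arg collapse into the
single ufiemap alias. Note that ufiemap->fm_extents is safe without a
copy because fm_extents is an array member, so the expression only
forms an address and never touches user memory:

    struct fiemap __user *ufiemap = (struct fiemap __user *)arg;
    struct fiemap fiemap;

    if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))  /* checked copy */
            return -EFAULT;
    fieinfo.fi_extents_start = ufiemap->fm_extents;  /* address only */
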
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 844a7903c72f..a0f3833c0dbf 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -939,17 +939,18 @@ root_found:
939 goto out_iput; 939 goto out_iput;
940 } 940 }
941 941
942 /* get the root dentry */
943 s->s_root = d_alloc_root(inode);
944 if (!(s->s_root))
945 goto out_no_root;
946
947 table = 0; 942 table = 0;
948 if (joliet_level) 943 if (joliet_level)
949 table += 2; 944 table += 2;
950 if (opt.check == 'r') 945 if (opt.check == 'r')
951 table++; 946 table++;
952 d_set_d_op(s->s_root, &isofs_dentry_ops[table]); 947
948 s->s_d_op = &isofs_dentry_ops[table];
949
950 /* get the root dentry */
951 s->s_root = d_alloc_root(inode);
952 if (!(s->s_root))
953 goto out_no_root;
953 954
954 kfree(opt.iocharset); 955 kfree(opt.iocharset);
955 956
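
The reordering in isofs is load-bearing: with the new scheme,
d_alloc_root() inherits s_d_op into the root dentry, so the
dentry-operations table must be selected before the root dentry is
created rather than patched in afterwards. In short:

    s->s_d_op = &isofs_dentry_ops[table];  /* choose the table first */

    s->s_root = d_alloc_root(inode);       /* root dentry inherits s_d_op */
    if (!s->s_root)
            goto out_no_root;
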
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 679a849c3b27..4fb3e8074fd4 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -172,8 +172,6 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
172 struct inode *inode; 172 struct inode *inode;
173 struct page *page; 173 struct page *page;
174 174
175 d_set_d_op(dentry, dir->i_sb->s_root->d_op);
176
177 page = alloc_page(GFP_USER); 175 page = alloc_page(GFP_USER);
178 if (!page) 176 if (!page)
179 return ERR_PTR(-ENOMEM); 177 return ERR_PTR(-ENOMEM);
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 846a3f314111..5b2e4c30a2a1 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -207,7 +207,7 @@ repeat_locked:
207 * the committing transaction. Really, we only need to give it 207 * the committing transaction. Really, we only need to give it
208 * committing_transaction->t_outstanding_credits plus "enough" for 208 * committing_transaction->t_outstanding_credits plus "enough" for
209 * the log control blocks. 209 * the log control blocks.
210 * Also, this test is inconsitent with the matching one in 210 * Also, this test is inconsistent with the matching one in
211 * journal_extend(). 211 * journal_extend().
212 */ 212 */
213 if (__log_space_left(journal) < jbd_space_needed(journal)) { 213 if (__log_space_left(journal) < jbd_space_needed(journal)) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f837ba953529..9e4686900f18 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -43,6 +43,7 @@
43#include <linux/vmalloc.h> 43#include <linux/vmalloc.h>
44#include <linux/backing-dev.h> 44#include <linux/backing-dev.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/ratelimit.h>
46 47
47#define CREATE_TRACE_POINTS 48#define CREATE_TRACE_POINTS
48#include <trace/events/jbd2.h> 49#include <trace/events/jbd2.h>
@@ -93,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_file_inode);
93EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); 94EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
94EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); 95EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
95EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); 96EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
97EXPORT_SYMBOL(jbd2_inode_cache);
96 98
97static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 99static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
98static void __journal_abort_soft (journal_t *journal, int errno); 100static void __journal_abort_soft (journal_t *journal, int errno);
@@ -827,7 +829,7 @@ static journal_t * journal_init_common (void)
827 829
828 journal = kzalloc(sizeof(*journal), GFP_KERNEL); 830 journal = kzalloc(sizeof(*journal), GFP_KERNEL);
829 if (!journal) 831 if (!journal)
830 goto fail; 832 return NULL;
831 833
832 init_waitqueue_head(&journal->j_wait_transaction_locked); 834 init_waitqueue_head(&journal->j_wait_transaction_locked);
833 init_waitqueue_head(&journal->j_wait_logspace); 835 init_waitqueue_head(&journal->j_wait_logspace);
@@ -852,14 +854,12 @@ static journal_t * journal_init_common (void)
852 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); 854 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
853 if (err) { 855 if (err) {
854 kfree(journal); 856 kfree(journal);
855 goto fail; 857 return NULL;
856 } 858 }
857 859
858 spin_lock_init(&journal->j_history_lock); 860 spin_lock_init(&journal->j_history_lock);
859 861
860 return journal; 862 return journal;
861fail:
862 return NULL;
863} 863}
864 864
865/* jbd2_journal_init_dev and jbd2_journal_init_inode: 865/* jbd2_journal_init_dev and jbd2_journal_init_inode:
@@ -1982,7 +1982,6 @@ static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
1982static struct journal_head *journal_alloc_journal_head(void) 1982static struct journal_head *journal_alloc_journal_head(void)
1983{ 1983{
1984 struct journal_head *ret; 1984 struct journal_head *ret;
1985 static unsigned long last_warning;
1986 1985
1987#ifdef CONFIG_JBD2_DEBUG 1986#ifdef CONFIG_JBD2_DEBUG
1988 atomic_inc(&nr_journal_heads); 1987 atomic_inc(&nr_journal_heads);
@@ -1990,11 +1989,7 @@ static struct journal_head *journal_alloc_journal_head(void)
1990 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 1989 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
1991 if (!ret) { 1990 if (!ret) {
1992 jbd_debug(1, "out of memory for journal_head\n"); 1991 jbd_debug(1, "out of memory for journal_head\n");
1993 if (time_after(jiffies, last_warning + 5*HZ)) { 1992 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
1994 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1995 __func__);
1996 last_warning = jiffies;
1997 }
1998 while (!ret) { 1993 while (!ret) {
1999 yield(); 1994 yield();
2000 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 1995 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
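
pr_notice_ratelimited() (hence the linux/ratelimit.h include added at
the top of this file) replaces the hand-rolled last_warning/jiffies
bookkeeping; the helper keeps static per-call-site state and by default
allows only a short burst of messages per 5-second window, roughly what
the removed code approximated. Equivalent usage:

    #include <linux/ratelimit.h>

    /* Rate-limit state is kept inside the macro, per call site. */
    pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
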
@@ -2292,17 +2287,19 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
2292 2287
2293#endif 2288#endif
2294 2289
2295struct kmem_cache *jbd2_handle_cache; 2290struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
2296 2291
2297static int __init journal_init_handle_cache(void) 2292static int __init journal_init_handle_cache(void)
2298{ 2293{
2299 jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", 2294 jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
2300 sizeof(handle_t),
2301 0, /* offset */
2302 SLAB_TEMPORARY, /* flags */
2303 NULL); /* ctor */
2304 if (jbd2_handle_cache == NULL) { 2295 if (jbd2_handle_cache == NULL) {
2305 printk(KERN_EMERG "JBD: failed to create handle cache\n"); 2296 printk(KERN_EMERG "JBD2: failed to create handle cache\n");
2297 return -ENOMEM;
2298 }
2299 jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
2300 if (jbd2_inode_cache == NULL) {
2301 printk(KERN_EMERG "JBD2: failed to create inode cache\n");
2302 kmem_cache_destroy(jbd2_handle_cache);
2306 return -ENOMEM; 2303 return -ENOMEM;
2307 } 2304 }
2308 return 0; 2305 return 0;
@@ -2312,6 +2309,9 @@ static void jbd2_journal_destroy_handle_cache(void)
2312{ 2309{
2313 if (jbd2_handle_cache) 2310 if (jbd2_handle_cache)
2314 kmem_cache_destroy(jbd2_handle_cache); 2311 kmem_cache_destroy(jbd2_handle_cache);
2312 if (jbd2_inode_cache)
2313 kmem_cache_destroy(jbd2_inode_cache);
2314
2315} 2315}
2316 2316
2317/* 2317/*
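
KMEM_CACHE() derives the cache name, object size and alignment from the
struct type itself, which is why the open-coded kmem_cache_create()
arguments above disappear. The call in this hunk expands to roughly:

    kmem_cache_create("jbd2_journal_handle",
                      sizeof(struct jbd2_journal_handle),
                      __alignof__(struct jbd2_journal_handle),
                      SLAB_TEMPORARY, NULL);
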
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 2bc4d5f116f1..1cad869494f0 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -299,10 +299,10 @@ int jbd2_journal_skip_recovery(journal_t *journal)
299#ifdef CONFIG_JBD2_DEBUG 299#ifdef CONFIG_JBD2_DEBUG
300 int dropped = info.end_transaction - 300 int dropped = info.end_transaction -
301 be32_to_cpu(journal->j_superblock->s_sequence); 301 be32_to_cpu(journal->j_superblock->s_sequence);
302#endif
303 jbd_debug(1, 302 jbd_debug(1,
304 "JBD: ignoring %d transaction%s from the journal.\n", 303 "JBD: ignoring %d transaction%s from the journal.\n",
305 dropped, (dropped == 1) ? "" : "s"); 304 dropped, (dropped == 1) ? "" : "s");
305#endif
306 journal->j_transaction_sequence = ++info.end_transaction; 306 journal->j_transaction_sequence = ++info.end_transaction;
307 } 307 }
308 308
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6bf0a242613e..faad2bd787c7 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -251,7 +251,7 @@ repeat:
251 * the committing transaction. Really, we only need to give it 251 * the committing transaction. Really, we only need to give it
252 * committing_transaction->t_outstanding_credits plus "enough" for 252 * committing_transaction->t_outstanding_credits plus "enough" for
253 * the log control blocks. 253 * the log control blocks.
254 * Also, this test is inconsitent with the matching one in 254 * Also, this test is inconsistent with the matching one in
255 * jbd2_journal_extend(). 255 * jbd2_journal_extend().
256 */ 256 */
257 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { 257 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
@@ -340,9 +340,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
340 jbd2_free_handle(handle); 340 jbd2_free_handle(handle);
341 current->journal_info = NULL; 341 current->journal_info = NULL;
342 handle = ERR_PTR(err); 342 handle = ERR_PTR(err);
343 goto out;
344 } 343 }
345out:
346 return handle; 344 return handle;
347} 345}
348EXPORT_SYMBOL(jbd2__journal_start); 346EXPORT_SYMBOL(jbd2__journal_start);
@@ -589,7 +587,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
589 transaction = handle->h_transaction; 587 transaction = handle->h_transaction;
590 journal = transaction->t_journal; 588 journal = transaction->t_journal;
591 589
592 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); 590 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
593 591
594 JBUFFER_TRACE(jh, "entry"); 592 JBUFFER_TRACE(jh, "entry");
595repeat: 593repeat:
@@ -774,7 +772,7 @@ done:
774 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), 772 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
775 "Possible IO failure.\n"); 773 "Possible IO failure.\n");
776 page = jh2bh(jh)->b_page; 774 page = jh2bh(jh)->b_page;
777 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; 775 offset = offset_in_page(jh2bh(jh)->b_data);
778 source = kmap_atomic(page, KM_USER0); 776 source = kmap_atomic(page, KM_USER0);
779 /* Fire data frozen trigger just before we copy the data */ 777 /* Fire data frozen trigger just before we copy the data */
780 jbd2_buffer_frozen_trigger(jh, source + offset, 778 jbd2_buffer_frozen_trigger(jh, source + offset,
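
offset_in_page() is the canonical spelling of the masking arithmetic it
replaces here; behaviour is unchanged. For reference, the helper is
defined as:

    #define offset_in_page(p)  ((unsigned long)(p) & ~PAGE_MASK)
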
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 85c6be2db02f..3005ec4520ad 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -336,14 +336,13 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
336 size = sizeof(struct jffs2_eraseblock) * c->nr_blocks; 336 size = sizeof(struct jffs2_eraseblock) * c->nr_blocks;
337#ifndef __ECOS 337#ifndef __ECOS
338 if (jffs2_blocks_use_vmalloc(c)) 338 if (jffs2_blocks_use_vmalloc(c))
339 c->blocks = vmalloc(size); 339 c->blocks = vzalloc(size);
340 else 340 else
341#endif 341#endif
342 c->blocks = kmalloc(size, GFP_KERNEL); 342 c->blocks = kzalloc(size, GFP_KERNEL);
343 if (!c->blocks) 343 if (!c->blocks)
344 return -ENOMEM; 344 return -ENOMEM;
345 345
346 memset(c->blocks, 0, size);
347 for (i=0; i<c->nr_blocks; i++) { 346 for (i=0; i<c->nr_blocks; i++) {
348 INIT_LIST_HEAD(&c->blocks[i].list); 347 INIT_LIST_HEAD(&c->blocks[i].list);
349 c->blocks[i].offset = i * c->sector_size; 348 c->blocks[i].offset = i * c->sector_size;
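
vzalloc() and kzalloc() fold the follow-up memset() into the
allocation, which is why the explicit memset(c->blocks, 0, size) could
be dropped above. vzalloc(size) behaves like:

    void *p = vmalloc(size);
    if (p)
            memset(p, 0, size);
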
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index f864005de64c..0bc6a6c80a56 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -144,4 +144,4 @@ struct jffs2_sb_info {
144 void *os_priv; 144 void *os_priv;
145}; 145};
146 146
147#endif /* _JFFS2_FB_SB */ 147#endif /* _JFFS2_FS_SB */
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9b572ca40a49..4f9cc0482949 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -151,7 +151,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
151 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", 151 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
152 offset, je32_to_cpu(rx.hdr_crc), crc); 152 offset, je32_to_cpu(rx.hdr_crc), crc);
153 xd->flags |= JFFS2_XFLAGS_INVALID; 153 xd->flags |= JFFS2_XFLAGS_INVALID;
154 return EIO; 154 return -EIO;
155 } 155 }
156 totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len)); 156 totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
157 if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK 157 if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
@@ -167,7 +167,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
167 je32_to_cpu(rx.xid), xd->xid, 167 je32_to_cpu(rx.xid), xd->xid,
168 je32_to_cpu(rx.version), xd->version); 168 je32_to_cpu(rx.version), xd->version);
169 xd->flags |= JFFS2_XFLAGS_INVALID; 169 xd->flags |= JFFS2_XFLAGS_INVALID;
170 return EIO; 170 return -EIO;
171 } 171 }
172 xd->xprefix = rx.xprefix; 172 xd->xprefix = rx.xprefix;
173 xd->name_len = rx.name_len; 173 xd->name_len = rx.name_len;
@@ -230,7 +230,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum
230 ref_offset(xd->node), xd->data_crc, crc); 230 ref_offset(xd->node), xd->data_crc, crc);
231 kfree(data); 231 kfree(data);
232 xd->flags |= JFFS2_XFLAGS_INVALID; 232 xd->flags |= JFFS2_XFLAGS_INVALID;
233 return EIO; 233 return -EIO;
234 } 234 }
235 235
236 xd->flags |= JFFS2_XFLAGS_HOT; 236 xd->flags |= JFFS2_XFLAGS_HOT;
@@ -268,7 +268,7 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
268 if (xd->xname) 268 if (xd->xname)
269 return 0; 269 return 0;
270 if (xd->flags & JFFS2_XFLAGS_INVALID) 270 if (xd->flags & JFFS2_XFLAGS_INVALID)
271 return EIO; 271 return -EIO;
272 if (unlikely(is_xattr_datum_unchecked(c, xd))) 272 if (unlikely(is_xattr_datum_unchecked(c, xd)))
273 rc = do_verify_xattr_datum(c, xd); 273 rc = do_verify_xattr_datum(c, xd);
274 if (!rc) 274 if (!rc)
@@ -460,7 +460,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
460 if (crc != je32_to_cpu(rr.node_crc)) { 460 if (crc != je32_to_cpu(rr.node_crc)) {
461 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", 461 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
462 offset, je32_to_cpu(rr.node_crc), crc); 462 offset, je32_to_cpu(rr.node_crc), crc);
463 return EIO; 463 return -EIO;
464 } 464 }
465 if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK 465 if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
466 || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF 466 || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF
@@ -470,7 +470,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
470 offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK, 470 offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
471 je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF, 471 je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
472 je32_to_cpu(rr.totlen), PAD(sizeof(rr))); 472 je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
473 return EIO; 473 return -EIO;
474 } 474 }
475 ref->ino = je32_to_cpu(rr.ino); 475 ref->ino = je32_to_cpu(rr.ino);
476 ref->xid = je32_to_cpu(rr.xid); 476 ref->xid = je32_to_cpu(rr.xid);
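
All six changes in this file are the same sign fix: the kernel
convention is to return negative errno values, and a bare EIO is a
positive constant that callers testing rc < 0 would silently treat as
success. The corrected pattern:

    if (crc != je32_to_cpu(rx.hdr_crc)) {
            xd->flags |= JFFS2_XFLAGS_INVALID;
            return -EIO;            /* negative errno, not bare EIO */
    }
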
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index e1b8493b9aaa..278e3fb40b71 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1120,16 +1120,13 @@ int lmLogOpen(struct super_block *sb)
1120 * file systems to log may have n-to-1 relationship; 1120 * file systems to log may have n-to-1 relationship;
1121 */ 1121 */
1122 1122
1123 bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE); 1123 bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
1124 log);
1124 if (IS_ERR(bdev)) { 1125 if (IS_ERR(bdev)) {
1125 rc = -PTR_ERR(bdev); 1126 rc = -PTR_ERR(bdev);
1126 goto free; 1127 goto free;
1127 } 1128 }
1128 1129
1129 if ((rc = bd_claim(bdev, log))) {
1130 goto close;
1131 }
1132
1133 log->bdev = bdev; 1130 log->bdev = bdev;
1134 memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); 1131 memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));
1135 1132
@@ -1137,7 +1134,7 @@ int lmLogOpen(struct super_block *sb)
1137 * initialize log: 1134 * initialize log:
1138 */ 1135 */
1139 if ((rc = lmLogInit(log))) 1136 if ((rc = lmLogInit(log)))
1140 goto unclaim; 1137 goto close;
1141 1138
1142 list_add(&log->journal_list, &jfs_external_logs); 1139 list_add(&log->journal_list, &jfs_external_logs);
1143 1140
@@ -1163,11 +1160,8 @@ journal_found:
1163 list_del(&log->journal_list); 1160 list_del(&log->journal_list);
1164 lbmLogShutdown(log); 1161 lbmLogShutdown(log);
1165 1162
1166 unclaim:
1167 bd_release(bdev);
1168
1169 close: /* close external log device */ 1163 close: /* close external log device */
1170 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 1164 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1171 1165
1172 free: /* free log descriptor */ 1166 free: /* free log descriptor */
1173 mutex_unlock(&jfs_log_mutex); 1167 mutex_unlock(&jfs_log_mutex);
@@ -1512,8 +1506,7 @@ int lmLogClose(struct super_block *sb)
1512 bdev = log->bdev; 1506 bdev = log->bdev;
1513 rc = lmLogShutdown(log); 1507 rc = lmLogShutdown(log);
1514 1508
1515 bd_release(bdev); 1509 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1516 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1517 1510
1518 kfree(log); 1511 kfree(log);
1519 1512
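
The jfs external-log open gets the same block-device API conversion as
the gfs2 mount path earlier in this diff: blkdev_get_by_dev() opens by
dev_t and, with FMODE_EXCL set, claims the device for the given holder
(the log) in one call, so the separate bd_claim()/bd_release() pair and
the unclaim: label disappear. Shape of the new sequence:

    bdev = blkdev_get_by_dev(sbi->logdev,
                             FMODE_READ | FMODE_WRITE | FMODE_EXCL, log);
    if (IS_ERR(bdev))
            return PTR_ERR(bdev);
    /* ... */
    blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
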
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4414e3a42264..81ead850ddb6 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1465,9 +1465,6 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1465 1465
1466 jfs_info("jfs_lookup: name = %s", name); 1466 jfs_info("jfs_lookup: name = %s", name);
1467 1467
1468 if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)
1469 d_set_d_op(dentry, &jfs_ci_dentry_operations);
1470
1471 if ((name[0] == '.') && (len == 1)) 1468 if ((name[0] == '.') && (len == 1))
1472 inum = dip->i_ino; 1469 inum = dip->i_ino;
1473 else if (strcmp(name, "..") == 0) 1470 else if (strcmp(name, "..") == 0)
@@ -1492,12 +1489,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1492 return ERR_CAST(ip); 1489 return ERR_CAST(ip);
1493 } 1490 }
1494 1491
1495 dentry = d_splice_alias(ip, dentry); 1492 return d_splice_alias(ip, dentry);
1496
1497 if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2))
1498 d_set_d_op(dentry, &jfs_ci_dentry_operations);
1499
1500 return dentry;
1501} 1493}
1502 1494
1503static struct inode *jfs_nfs_get_inode(struct super_block *sb, 1495static struct inode *jfs_nfs_get_inode(struct super_block *sb,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 3150d766e0d4..eeca48a031ab 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -515,6 +515,9 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
515 515
516 sb->s_magic = JFS_SUPER_MAGIC; 516 sb->s_magic = JFS_SUPER_MAGIC;
517 517
518 if (sbi->mntflag & JFS_OS2)
519 sb->s_d_op = &jfs_ci_dentry_operations;
520
518 inode = jfs_iget(sb, ROOT_I); 521 inode = jfs_iget(sb, ROOT_I);
519 if (IS_ERR(inode)) { 522 if (IS_ERR(inode)) {
520 ret = PTR_ERR(inode); 523 ret = PTR_ERR(inode);
@@ -524,9 +527,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
524 if (!sb->s_root) 527 if (!sb->s_root)
525 goto out_no_root; 528 goto out_no_root;
526 529
527 if (sbi->mntflag & JFS_OS2)
528 d_set_d_op(sb->s_root, &jfs_ci_dentry_operations);
529
530 /* logical blocks are represented by 40 bits in pxd_t, etc. */ 530 /* logical blocks are represented by 40 bits in pxd_t, etc. */
531 sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; 531 sb->s_maxbytes = ((u64) sb->s_blocksize) << 40;
532#if BITS_PER_LONG == 32 532#if BITS_PER_LONG == 32
diff --git a/fs/libfs.c b/fs/libfs.c
index 889311e3d06b..c88eab55aec9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -217,7 +217,8 @@ static const struct super_operations simple_super_operations = {
217 * will never be mountable) 217 * will never be mountable)
218 */ 218 */
219struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, 219struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
220 const struct super_operations *ops, unsigned long magic) 220 const struct super_operations *ops,
221 const struct dentry_operations *dops, unsigned long magic)
221{ 222{
222 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); 223 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
223 struct dentry *dentry; 224 struct dentry *dentry;
@@ -254,6 +255,7 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
254 dentry->d_parent = dentry; 255 dentry->d_parent = dentry;
255 d_instantiate(dentry, root); 256 d_instantiate(dentry, root);
256 s->s_root = dentry; 257 s->s_root = dentry;
258 s->s_d_op = dops;
257 s->s_flags |= MS_ACTIVE; 259 s->s_flags |= MS_ACTIVE;
258 return dget(s->s_root); 260 return dget(s->s_root);
259 261
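
mount_pseudo() grows a dentry_operations parameter so that pseudo
filesystems can seed s_d_op the same way the disk filesystems in this
series do; callers pass their ops table or NULL. A hypothetical caller
(names illustrative, not from this patch):

    static struct dentry *examplefs_mount(struct file_system_type *fs_type,
                                          int flags, const char *dev_name,
                                          void *data)
    {
            return mount_pseudo(fs_type, "examplefs:", &examplefs_sops,
                                &examplefs_dops, EXAMPLEFS_MAGIC);
    }
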
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 97f6073ab339..ca58d64374ca 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-$(CONFIG_LOCKD) += lockd.o 5obj-$(CONFIG_LOCKD) += lockd.o
6 6
7lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ 7lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
8 svcproc.o svcsubs.o mon.o xdr.o grace.o 8 svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o
9lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o 9lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
10lockd-objs := $(lockd-objs-y) 10lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
new file mode 100644
index 000000000000..f848b52c67b1
--- /dev/null
+++ b/fs/lockd/clnt4xdr.c
@@ -0,0 +1,605 @@
1/*
2 * linux/fs/lockd/clnt4xdr.c
3 *
4 * XDR functions to encode/decode NLM version 4 RPC arguments and results.
5 *
6 * NLM client-side only.
7 *
8 * Copyright (C) 2010, Oracle. All rights reserved.
9 */
10
11#include <linux/types.h>
12#include <linux/sunrpc/xdr.h>
13#include <linux/sunrpc/clnt.h>
14#include <linux/sunrpc/stats.h>
15#include <linux/lockd/lockd.h>
16
17#define NLMDBG_FACILITY NLMDBG_XDR
18
19#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
20# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
21#endif
22
23#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
24# error "NLM host name cannot be larger than NLM's maximum string length!"
25#endif
26
27/*
28 * Declare the space requirements for NLM arguments and replies as
29 * number of 32bit-words
30 */
31#define NLM4_void_sz (0)
32#define NLM4_cookie_sz (1+(NLM_MAXCOOKIELEN>>2))
33#define NLM4_caller_sz (1+(NLMCLNT_OHSIZE>>2))
34#define NLM4_owner_sz (1+(NLMCLNT_OHSIZE>>2))
35#define NLM4_fhandle_sz (1+(NFS3_FHSIZE>>2))
36#define NLM4_lock_sz (5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz)
37#define NLM4_holder_sz (6+NLM4_owner_sz)
38
39#define NLM4_testargs_sz (NLM4_cookie_sz+1+NLM4_lock_sz)
40#define NLM4_lockargs_sz (NLM4_cookie_sz+4+NLM4_lock_sz)
41#define NLM4_cancargs_sz (NLM4_cookie_sz+2+NLM4_lock_sz)
42#define NLM4_unlockargs_sz (NLM4_cookie_sz+NLM4_lock_sz)
43
44#define NLM4_testres_sz (NLM4_cookie_sz+1+NLM4_holder_sz)
45#define NLM4_res_sz (NLM4_cookie_sz+1)
46#define NLM4_norep_sz (0)
47
48
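
These _sz constants count 32-bit XDR words and are typically consumed
as maximum argument/reply buffer sizes by the rpc_procinfo table that
registers the encode/decode functions below. A sketch of one such entry
(the table, the decoder name and the cast typedefs are assumptions
following this series' naming scheme, not part of this hunk):

    static struct rpc_procinfo nlm4_procedures[] = {
            [NLMPROC_TEST] = {
                    .p_proc    = NLMPROC_TEST,
                    .p_encode  = (kxdreproc_t)nlm4_xdr_enc_testargs,
                    .p_decode  = (kxdrdproc_t)nlm4_xdr_dec_testres,
                    .p_arglen  = NLM4_testargs_sz,
                    .p_replen  = NLM4_testres_sz,
                    .p_name    = "TEST",
            },
    };
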
49static s64 loff_t_to_s64(loff_t offset)
50{
51 s64 res;
52
53 if (offset >= NLM4_OFFSET_MAX)
54 res = NLM4_OFFSET_MAX;
55 else if (offset <= -NLM4_OFFSET_MAX)
56 res = -NLM4_OFFSET_MAX;
57 else
58 res = offset;
59 return res;
60}
61
62static void nlm4_compute_offsets(const struct nlm_lock *lock,
63 u64 *l_offset, u64 *l_len)
64{
65 const struct file_lock *fl = &lock->fl;
66
67 BUG_ON(fl->fl_start > NLM4_OFFSET_MAX);
68 BUG_ON(fl->fl_end > NLM4_OFFSET_MAX &&
69 fl->fl_end != OFFSET_MAX);
70
71 *l_offset = loff_t_to_s64(fl->fl_start);
72 if (fl->fl_end == OFFSET_MAX)
73 *l_len = 0;
74 else
75 *l_len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
76}
77
78/*
79 * Handle decode buffer overflows out-of-line.
80 */
81static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
82{
83 dprintk("lockd: %s prematurely hit the end of our receive buffer. "
84 "Remaining buffer length is %tu words.\n",
85 func, xdr->end - xdr->p);
86}
87
88
89/*
90 * Encode/decode NLMv4 basic data types
91 *
92 * Basic NLMv4 data types are defined in Appendix II, section 6.1.4
93 * of RFC 1813: "NFS Version 3 Protocol Specification" and in Chapter
94 * 10 of X/Open's "Protocols for Interworking: XNFS, Version 3W".
95 *
96 * Not all basic data types have their own encoding and decoding
97 * functions. For run-time efficiency, some data types are encoded
98 * or decoded inline.
99 */
100
101static void encode_bool(struct xdr_stream *xdr, const int value)
102{
103 __be32 *p;
104
105 p = xdr_reserve_space(xdr, 4);
106 *p = value ? xdr_one : xdr_zero;
107}
108
109static void encode_int32(struct xdr_stream *xdr, const s32 value)
110{
111 __be32 *p;
112
113 p = xdr_reserve_space(xdr, 4);
114 *p = cpu_to_be32(value);
115}
116
117/*
118 * typedef opaque netobj<MAXNETOBJ_SZ>
119 */
120static void encode_netobj(struct xdr_stream *xdr,
121 const u8 *data, const unsigned int length)
122{
123 __be32 *p;
124
125 BUG_ON(length > XDR_MAX_NETOBJ);
126 p = xdr_reserve_space(xdr, 4 + length);
127 xdr_encode_opaque(p, data, length);
128}
129
130static int decode_netobj(struct xdr_stream *xdr,
131 struct xdr_netobj *obj)
132{
133 u32 length;
134 __be32 *p;
135
136 p = xdr_inline_decode(xdr, 4);
137 if (unlikely(p == NULL))
138 goto out_overflow;
139 length = be32_to_cpup(p++);
140 if (unlikely(length > XDR_MAX_NETOBJ))
141 goto out_size;
142 obj->len = length;
143 obj->data = (u8 *)p;
144 return 0;
145out_size:
146 dprintk("NFS: returned netobj was too long: %u\n", length);
147 return -EIO;
148out_overflow:
149 print_overflow_msg(__func__, xdr);
150 return -EIO;
151}
152
153/*
154 * netobj cookie;
155 */
156static void encode_cookie(struct xdr_stream *xdr,
157 const struct nlm_cookie *cookie)
158{
159 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
160 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
161}
162
163static int decode_cookie(struct xdr_stream *xdr,
164 struct nlm_cookie *cookie)
165{
166 u32 length;
167 __be32 *p;
168
169 p = xdr_inline_decode(xdr, 4);
170 if (unlikely(p == NULL))
171 goto out_overflow;
172 length = be32_to_cpup(p++);
173 /* apparently HPUX can return empty cookies */
174 if (length == 0)
175 goto out_hpux;
176 if (length > NLM_MAXCOOKIELEN)
177 goto out_size;
178 p = xdr_inline_decode(xdr, length);
179 if (unlikely(p == NULL))
180 goto out_overflow;
181 cookie->len = length;
182 memcpy(cookie->data, p, length);
183 return 0;
184out_hpux:
185 cookie->len = 4;
186 memset(cookie->data, 0, 4);
187 return 0;
188out_size:
189 dprintk("NFS: returned cookie was too long: %u\n", length);
190 return -EIO;
191out_overflow:
192 print_overflow_msg(__func__, xdr);
193 return -EIO;
194}
195
196/*
197 * netobj fh;
198 */
199static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
200{
201 BUG_ON(fh->size > NFS3_FHSIZE);
202 encode_netobj(xdr, (u8 *)&fh->data, fh->size);
203}
204
205/*
206 * enum nlm4_stats {
207 * NLM4_GRANTED = 0,
208 * NLM4_DENIED = 1,
209 * NLM4_DENIED_NOLOCKS = 2,
210 * NLM4_BLOCKED = 3,
211 * NLM4_DENIED_GRACE_PERIOD = 4,
212 * NLM4_DEADLCK = 5,
213 * NLM4_ROFS = 6,
214 * NLM4_STALE_FH = 7,
215 * NLM4_FBIG = 8,
216 * NLM4_FAILED = 9
217 * };
218 *
219 * struct nlm4_stat {
220 * nlm4_stats stat;
221 * };
222 *
223 * NB: we don't swap bytes for the NLM status values. The upper
224 * layers deal directly with the status value in network byte
225 * order.
226 */
227static void encode_nlm4_stat(struct xdr_stream *xdr,
228 const __be32 stat)
229{
230 __be32 *p;
231
232 BUG_ON(be32_to_cpu(stat) > NLM_FAILED);
233 p = xdr_reserve_space(xdr, 4);
234 *p = stat;
235}
236
237static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat)
238{
239 __be32 *p;
240
241 p = xdr_inline_decode(xdr, 4);
242 if (unlikely(p == NULL))
243 goto out_overflow;
244 if (unlikely(*p > nlm4_failed))
245 goto out_bad_xdr;
246 *stat = *p;
247 return 0;
248out_bad_xdr:
249 dprintk("%s: server returned invalid nlm4_stats value: %u\n",
250 __func__, be32_to_cpup(p));
251 return -EIO;
252out_overflow:
253 print_overflow_msg(__func__, xdr);
254 return -EIO;
255}
256
257/*
258 * struct nlm4_holder {
259 * bool exclusive;
260 * int32 svid;
261 * netobj oh;
262 * uint64 l_offset;
263 * uint64 l_len;
264 * };
265 */
266static void encode_nlm4_holder(struct xdr_stream *xdr,
267 const struct nlm_res *result)
268{
269 const struct nlm_lock *lock = &result->lock;
270 u64 l_offset, l_len;
271 __be32 *p;
272
273 encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
274 encode_int32(xdr, lock->svid);
275 encode_netobj(xdr, lock->oh.data, lock->oh.len);
276
277 p = xdr_reserve_space(xdr, 4 + 4);
278 nlm4_compute_offsets(lock, &l_offset, &l_len);
279 p = xdr_encode_hyper(p, l_offset);
280 xdr_encode_hyper(p, l_len);
281}
282
283static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
284{
285 struct nlm_lock *lock = &result->lock;
286 struct file_lock *fl = &lock->fl;
287 u64 l_offset, l_len;
288 u32 exclusive;
289 int error;
290 __be32 *p;
291 s32 end;
292
293 memset(lock, 0, sizeof(*lock));
294 locks_init_lock(fl);
295
296 p = xdr_inline_decode(xdr, 4 + 4);
297 if (unlikely(p == NULL))
298 goto out_overflow;
299 exclusive = be32_to_cpup(p++);
300 lock->svid = be32_to_cpup(p);
301 fl->fl_pid = (pid_t)lock->svid;
302
303 error = decode_netobj(xdr, &lock->oh);
304 if (unlikely(error))
305 goto out;
306
307 p = xdr_inline_decode(xdr, 8 + 8);
308 if (unlikely(p == NULL))
309 goto out_overflow;
310
311 fl->fl_flags = FL_POSIX;
312 fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
313 p = xdr_decode_hyper(p, &l_offset);
314 xdr_decode_hyper(p, &l_len);
315 end = l_offset + l_len - 1;
316
317 fl->fl_start = (loff_t)l_offset;
318 if (l_len == 0 || end < 0)
319 fl->fl_end = OFFSET_MAX;
320 else
321 fl->fl_end = (loff_t)end;
322 error = 0;
323out:
324 return error;
325out_overflow:
326 print_overflow_msg(__func__, xdr);
327 return -EIO;
328}
329
330/*
331 * string caller_name<LM_MAXSTRLEN>;
332 */
333static void encode_caller_name(struct xdr_stream *xdr, const char *name)
334{
335 /* NB: client-side does not set lock->len */
336 u32 length = strlen(name);
337 __be32 *p;
338
339 BUG_ON(length > NLM_MAXSTRLEN);
340 p = xdr_reserve_space(xdr, 4 + length);
341 xdr_encode_opaque(p, name, length);
342}
343
344/*
345 * struct nlm4_lock {
346 * string caller_name<LM_MAXSTRLEN>;
347 * netobj fh;
348 * netobj oh;
349 * int32 svid;
350 * uint64 l_offset;
351 * uint64 l_len;
352 * };
353 */
354static void encode_nlm4_lock(struct xdr_stream *xdr,
355 const struct nlm_lock *lock)
356{
357 u64 l_offset, l_len;
358 __be32 *p;
359
360 encode_caller_name(xdr, lock->caller);
361 encode_fh(xdr, &lock->fh);
362 encode_netobj(xdr, lock->oh.data, lock->oh.len);
363
364 p = xdr_reserve_space(xdr, 4 + 8 + 8);
365 *p++ = cpu_to_be32(lock->svid);
366
367 nlm4_compute_offsets(lock, &l_offset, &l_len);
368 p = xdr_encode_hyper(p, l_offset);
369 xdr_encode_hyper(p, l_len);
370}
371
372
373/*
374 * NLMv4 XDR encode functions
375 *
376 * NLMv4 argument types are defined in Appendix II of RFC 1813:
377 * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
378 * "Protocols for Interworking: XNFS, Version 3W".
379 */
380
381/*
382 * struct nlm4_testargs {
383 * netobj cookie;
384 * bool exclusive;
385 * struct nlm4_lock alock;
386 * };
387 */
388static void nlm4_xdr_enc_testargs(struct rpc_rqst *req,
389 struct xdr_stream *xdr,
390 const struct nlm_args *args)
391{
392 const struct nlm_lock *lock = &args->lock;
393
394 encode_cookie(xdr, &args->cookie);
395 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
396 encode_nlm4_lock(xdr, lock);
397}
398
399/*
400 * struct nlm4_lockargs {
401 * netobj cookie;
402 * bool block;
403 * bool exclusive;
404 * struct nlm4_lock alock;
405 * bool reclaim;
406 * int state;
407 * };
408 */
409static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req,
410 struct xdr_stream *xdr,
411 const struct nlm_args *args)
412{
413 const struct nlm_lock *lock = &args->lock;
414
415 encode_cookie(xdr, &args->cookie);
416 encode_bool(xdr, args->block);
417 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
418 encode_nlm4_lock(xdr, lock);
419 encode_bool(xdr, args->reclaim);
420 encode_int32(xdr, args->state);
421}
422
423/*
424 * struct nlm4_cancargs {
425 * netobj cookie;
426 * bool block;
427 * bool exclusive;
428 * struct nlm4_lock alock;
429 * };
430 */
431static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req,
432 struct xdr_stream *xdr,
433 const struct nlm_args *args)
434{
435 const struct nlm_lock *lock = &args->lock;
436
437 encode_cookie(xdr, &args->cookie);
438 encode_bool(xdr, args->block);
439 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
440 encode_nlm4_lock(xdr, lock);
441}
442
443/*
444 * struct nlm4_unlockargs {
445 * netobj cookie;
446 * struct nlm4_lock alock;
447 * };
448 */
449static void nlm4_xdr_enc_unlockargs(struct rpc_rqst *req,
450 struct xdr_stream *xdr,
451 const struct nlm_args *args)
452{
453 const struct nlm_lock *lock = &args->lock;
454
455 encode_cookie(xdr, &args->cookie);
456 encode_nlm4_lock(xdr, lock);
457}
458
459/*
460 * struct nlm4_res {
461 * netobj cookie;
462 * nlm4_stat stat;
463 * };
464 */
465static void nlm4_xdr_enc_res(struct rpc_rqst *req,
466 struct xdr_stream *xdr,
467 const struct nlm_res *result)
468{
469 encode_cookie(xdr, &result->cookie);
470 encode_nlm4_stat(xdr, result->status);
471}
472
473/*
474 * union nlm4_testrply switch (nlm4_stats stat) {
475 * case NLM4_DENIED:
476 * struct nlm4_holder holder;
477 * default:
478 * void;
479 * };
480 *
481 * struct nlm4_testres {
482 * netobj cookie;
483 * nlm4_testrply test_stat;
484 * };
485 */
486static void nlm4_xdr_enc_testres(struct rpc_rqst *req,
487 struct xdr_stream *xdr,
488 const struct nlm_res *result)
489{
490 encode_cookie(xdr, &result->cookie);
491 encode_nlm4_stat(xdr, result->status);
492 if (result->status == nlm_lck_denied)
493 encode_nlm4_holder(xdr, result);
494}
495
496
497/*
498 * NLMv4 XDR decode functions
499 *
500 * NLMv4 result types are defined in Appendix II of RFC 1813:
501 * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
502 * "Protocols for Interworking: XNFS, Version 3W".
503 */
504
505/*
506 * union nlm4_testrply switch (nlm4_stats stat) {
507 * case NLM4_DENIED:
508 * struct nlm4_holder holder;
509 * default:
510 * void;
511 * };
512 *
513 * struct nlm4_testres {
514 * netobj cookie;
515 * nlm4_testrply test_stat;
516 * };
517 */
518static int decode_nlm4_testrply(struct xdr_stream *xdr,
519 struct nlm_res *result)
520{
521 int error;
522
523 error = decode_nlm4_stat(xdr, &result->status);
524 if (unlikely(error))
525 goto out;
526 if (result->status == nlm_lck_denied)
527 error = decode_nlm4_holder(xdr, result);
528out:
529 return error;
530}
531
532static int nlm4_xdr_dec_testres(struct rpc_rqst *req,
533 struct xdr_stream *xdr,
534 struct nlm_res *result)
535{
536 int error;
537
538 error = decode_cookie(xdr, &result->cookie);
539 if (unlikely(error))
540 goto out;
541 error = decode_nlm4_testrply(xdr, result);
542out:
543 return error;
544}
545
546/*
547 * struct nlm4_res {
548 * netobj cookie;
549 * nlm4_stat stat;
550 * };
551 */
552static int nlm4_xdr_dec_res(struct rpc_rqst *req,
553 struct xdr_stream *xdr,
554 struct nlm_res *result)
555{
556 int error;
557
558 error = decode_cookie(xdr, &result->cookie);
559 if (unlikely(error))
560 goto out;
561 error = decode_nlm4_stat(xdr, &result->status);
562out:
563 return error;
564}
565
566
567/*
568 * For NLM, a void procedure really returns nothing
569 */
570#define nlm4_xdr_dec_norep NULL
571
572#define PROC(proc, argtype, restype) \
573[NLMPROC_##proc] = { \
574 .p_proc = NLMPROC_##proc, \
575 .p_encode = (kxdreproc_t)nlm4_xdr_enc_##argtype, \
576 .p_decode = (kxdrdproc_t)nlm4_xdr_dec_##restype, \
577 .p_arglen = NLM4_##argtype##_sz, \
578 .p_replen = NLM4_##restype##_sz, \
579 .p_statidx = NLMPROC_##proc, \
580 .p_name = #proc, \
581 }
582
583static struct rpc_procinfo nlm4_procedures[] = {
584 PROC(TEST, testargs, testres),
585 PROC(LOCK, lockargs, res),
586 PROC(CANCEL, cancargs, res),
587 PROC(UNLOCK, unlockargs, res),
588 PROC(GRANTED, testargs, res),
589 PROC(TEST_MSG, testargs, norep),
590 PROC(LOCK_MSG, lockargs, norep),
591 PROC(CANCEL_MSG, cancargs, norep),
592 PROC(UNLOCK_MSG, unlockargs, norep),
593 PROC(GRANTED_MSG, testargs, norep),
594 PROC(TEST_RES, testres, norep),
595 PROC(LOCK_RES, res, norep),
596 PROC(CANCEL_RES, res, norep),
597 PROC(UNLOCK_RES, res, norep),
598 PROC(GRANTED_RES, res, norep),
599};
600
601struct rpc_version nlm_version4 = {
602 .number = 4,
603 .nrprocs = ARRAY_SIZE(nlm4_procedures),
604 .procs = nlm4_procedures,
605};
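The PROC() macro above builds the procedure table by token pasting (##) and stringification (#). A toy stand-alone version of the same pattern, with hypothetical types in place of struct rpc_procinfo:

#include <stdio.h>

enum { DEMOPROC_TEST = 1, DEMOPROC_LOCK = 2, DEMOPROC_MAX };

struct demo_procinfo {
	unsigned int p_proc;		/* procedure number */
	const char *p_name;		/* stringified name, for stats */
};

#define DEMO_PROC(proc)				\
[DEMOPROC_##proc] = {				\
	.p_proc = DEMOPROC_##proc,		\
	.p_name = #proc,			\
}

static const struct demo_procinfo demo_procedures[DEMOPROC_MAX] = {
	DEMO_PROC(TEST),	/* [1] = { .p_proc = 1, .p_name = "TEST" } */
	DEMO_PROC(LOCK),
};

int main(void)
{
	printf("%u %s\n", demo_procedures[DEMOPROC_TEST].p_proc,
	       demo_procedures[DEMOPROC_TEST].p_name);
	return 0;
}
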
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 25509eb28fd7..8d4ea8351e3d 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -79,7 +79,7 @@ EXPORT_SYMBOL_GPL(nlmclnt_init);
79 */ 79 */
80void nlmclnt_done(struct nlm_host *host) 80void nlmclnt_done(struct nlm_host *host)
81{ 81{
82 nlm_release_host(host); 82 nlmclnt_release_host(host);
83 lockd_down(); 83 lockd_down();
84} 84}
85EXPORT_SYMBOL_GPL(nlmclnt_done); 85EXPORT_SYMBOL_GPL(nlmclnt_done);
@@ -273,7 +273,7 @@ restart:
273 spin_unlock(&nlm_blocked_lock); 273 spin_unlock(&nlm_blocked_lock);
274 274
275 /* Release host handle after use */ 275 /* Release host handle after use */
276 nlm_release_host(host); 276 nlmclnt_release_host(host);
277 lockd_down(); 277 lockd_down();
278 return 0; 278 return 0;
279} 279}
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 332c54cf75e0..adb45ec9038c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -58,7 +58,7 @@ static void nlm_put_lockowner(struct nlm_lockowner *lockowner)
58 return; 58 return;
59 list_del(&lockowner->list); 59 list_del(&lockowner->list);
60 spin_unlock(&lockowner->host->h_lock); 60 spin_unlock(&lockowner->host->h_lock);
61 nlm_release_host(lockowner->host); 61 nlmclnt_release_host(lockowner->host);
62 kfree(lockowner); 62 kfree(lockowner);
63} 63}
64 64
@@ -207,22 +207,22 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
207 printk("nlm_alloc_call: failed, waiting for memory\n"); 207 printk("nlm_alloc_call: failed, waiting for memory\n");
208 schedule_timeout_interruptible(5*HZ); 208 schedule_timeout_interruptible(5*HZ);
209 } 209 }
210 nlm_release_host(host); 210 nlmclnt_release_host(host);
211 return NULL; 211 return NULL;
212} 212}
213 213
214void nlm_release_call(struct nlm_rqst *call) 214void nlmclnt_release_call(struct nlm_rqst *call)
215{ 215{
216 if (!atomic_dec_and_test(&call->a_count)) 216 if (!atomic_dec_and_test(&call->a_count))
217 return; 217 return;
218 nlm_release_host(call->a_host); 218 nlmclnt_release_host(call->a_host);
219 nlmclnt_release_lockargs(call); 219 nlmclnt_release_lockargs(call);
220 kfree(call); 220 kfree(call);
221} 221}
222 222
223static void nlmclnt_rpc_release(void *data) 223static void nlmclnt_rpc_release(void *data)
224{ 224{
225 nlm_release_call(data); 225 nlmclnt_release_call(data);
226} 226}
227 227
228static int nlm_wait_on_grace(wait_queue_head_t *queue) 228static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -436,7 +436,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
436 status = nlm_stat_to_errno(req->a_res.status); 436 status = nlm_stat_to_errno(req->a_res.status);
437 } 437 }
438out: 438out:
439 nlm_release_call(req); 439 nlmclnt_release_call(req);
440 return status; 440 return status;
441} 441}
442 442
@@ -593,7 +593,7 @@ again:
593out_unblock: 593out_unblock:
594 nlmclnt_finish_block(block); 594 nlmclnt_finish_block(block);
595out: 595out:
596 nlm_release_call(req); 596 nlmclnt_release_call(req);
597 return status; 597 return status;
598out_unlock: 598out_unlock:
599 /* Fatal error: ensure that we remove the lock altogether */ 599 /* Fatal error: ensure that we remove the lock altogether */
@@ -694,7 +694,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
694 /* What to do now? I'm out of my depth... */ 694 /* What to do now? I'm out of my depth... */
695 status = -ENOLCK; 695 status = -ENOLCK;
696out: 696out:
697 nlm_release_call(req); 697 nlmclnt_release_call(req);
698 return status; 698 return status;
699} 699}
700 700
@@ -755,7 +755,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
755 NLMPROC_CANCEL, &nlmclnt_cancel_ops); 755 NLMPROC_CANCEL, &nlmclnt_cancel_ops);
756 if (status == 0 && req->a_res.status == nlm_lck_denied) 756 if (status == 0 && req->a_res.status == nlm_lck_denied)
757 status = -ENOLCK; 757 status = -ENOLCK;
758 nlm_release_call(req); 758 nlmclnt_release_call(req);
759 return status; 759 return status;
760} 760}
761 761
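The renamed nlmclnt_release_call() above is the classic drop-the-last-reference idiom: every holder decrements, and only the caller that brings the count to zero tears the object down. A user-space sketch of the same shape using C11 atomics (all names hypothetical):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_call {
	atomic_int a_count;
};

static void demo_release_call(struct demo_call *call)
{
	/* fetch_sub returns the old value; 1 means we held the last ref */
	if (atomic_fetch_sub(&call->a_count, 1) != 1)
		return;
	/* last reference gone: free owned resources, then the object */
	free(call);
}

int main(void)
{
	struct demo_call *call = malloc(sizeof(*call));

	if (call == NULL)
		return 1;
	atomic_init(&call->a_count, 2);
	demo_release_call(call);	/* another holder remains */
	demo_release_call(call);	/* frees here */
	return 0;
}
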
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
new file mode 100644
index 000000000000..180ac34feb9a
--- /dev/null
+++ b/fs/lockd/clntxdr.c
@@ -0,0 +1,627 @@
1/*
2 * linux/fs/lockd/clntxdr.c
3 *
4 * XDR functions to encode/decode NLM version 3 RPC arguments and results.
5 * NLM version 3 is backwards compatible with NLM versions 1 and 2.
6 *
7 * NLM client-side only.
8 *
9 * Copyright (C) 2010, Oracle. All rights reserved.
10 */
11
12#include <linux/types.h>
13#include <linux/sunrpc/xdr.h>
14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/stats.h>
16#include <linux/lockd/lockd.h>
17
18#define NLMDBG_FACILITY NLMDBG_XDR
19
20#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
21# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
22#endif
23
24/*
25 * Declare the space requirements for NLM arguments and replies as
26 * number of 32-bit words
27 */
28#define NLM_cookie_sz (1+(NLM_MAXCOOKIELEN>>2))
29#define NLM_caller_sz (1+(NLMCLNT_OHSIZE>>2))
30#define NLM_owner_sz (1+(NLMCLNT_OHSIZE>>2))
31#define NLM_fhandle_sz (1+(NFS2_FHSIZE>>2))
32#define NLM_lock_sz (3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz)
33#define NLM_holder_sz (4+NLM_owner_sz)
34
35#define NLM_testargs_sz (NLM_cookie_sz+1+NLM_lock_sz)
36#define NLM_lockargs_sz (NLM_cookie_sz+4+NLM_lock_sz)
37#define NLM_cancargs_sz (NLM_cookie_sz+2+NLM_lock_sz)
38#define NLM_unlockargs_sz (NLM_cookie_sz+NLM_lock_sz)
39
40#define NLM_testres_sz (NLM_cookie_sz+1+NLM_holder_sz)
41#define NLM_res_sz (NLM_cookie_sz+1)
42#define NLM_norep_sz (0)
43
44
45static s32 loff_t_to_s32(loff_t offset)
46{
47 s32 res;
48
49 if (offset >= NLM_OFFSET_MAX)
50 res = NLM_OFFSET_MAX;
51 else if (offset <= -NLM_OFFSET_MAX)
52 res = -NLM_OFFSET_MAX;
53 else
54 res = offset;
55 return res;
56}
57
58static void nlm_compute_offsets(const struct nlm_lock *lock,
59 u32 *l_offset, u32 *l_len)
60{
61 const struct file_lock *fl = &lock->fl;
62
63 BUG_ON(fl->fl_start > NLM_OFFSET_MAX);
64 BUG_ON(fl->fl_end > NLM_OFFSET_MAX &&
65 fl->fl_end != OFFSET_MAX);
66
67 *l_offset = loff_t_to_s32(fl->fl_start);
68 if (fl->fl_end == OFFSET_MAX)
69 *l_len = 0;
70 else
71 *l_len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
72}
73
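NLMv3 carries only 32-bit offsets on the wire, which is why loff_t_to_s32() above clamps out-of-range values instead of letting them wrap. A stand-alone sketch of the clamp, with NLM_OFFSET_MAX_DEMO standing in for the kernel's NLM_OFFSET_MAX:

#include <stdint.h>
#include <stdio.h>

#define NLM_OFFSET_MAX_DEMO ((int64_t)0x7fffffff)

static int32_t demo_loff_to_s32(int64_t offset)
{
	if (offset >= NLM_OFFSET_MAX_DEMO)
		return (int32_t)NLM_OFFSET_MAX_DEMO;	/* clamp, don't wrap */
	if (offset <= -NLM_OFFSET_MAX_DEMO)
		return (int32_t)-NLM_OFFSET_MAX_DEMO;
	return (int32_t)offset;
}

int main(void)
{
	printf("%d\n", (int)demo_loff_to_s32(5LL << 32));	/* 2147483647 */
	printf("%d\n", (int)demo_loff_to_s32(4096));		/* 4096 */
	return 0;
}
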
74/*
75 * Handle decode buffer overflows out-of-line.
76 */
77static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
78{
79 dprintk("lockd: %s prematurely hit the end of our receive buffer. "
80 "Remaining buffer length is %tu words.\n",
81 func, xdr->end - xdr->p);
82}
83
84
85/*
86 * Encode/decode NLMv3 basic data types
87 *
88 * Basic NLMv3 data types are not defined in an IETF standards
89 * document. X/Open has a description of these data types that
90 * is useful. See Chapter 10 of "Protocols for Interworking:
91 * XNFS, Version 3W".
92 *
93 * Not all basic data types have their own encoding and decoding
94 * functions. For run-time efficiency, some data types are encoded
95 * or decoded inline.
96 */
97
98static void encode_bool(struct xdr_stream *xdr, const int value)
99{
100 __be32 *p;
101
102 p = xdr_reserve_space(xdr, 4);
103 *p = value ? xdr_one : xdr_zero;
104}
105
106static void encode_int32(struct xdr_stream *xdr, const s32 value)
107{
108 __be32 *p;
109
110 p = xdr_reserve_space(xdr, 4);
111 *p = cpu_to_be32(value);
112}
113
114/*
115 * typedef opaque netobj<MAXNETOBJ_SZ>
116 */
117static void encode_netobj(struct xdr_stream *xdr,
118 const u8 *data, const unsigned int length)
119{
120 __be32 *p;
121
122 BUG_ON(length > XDR_MAX_NETOBJ);
123 p = xdr_reserve_space(xdr, 4 + length);
124 xdr_encode_opaque(p, data, length);
125}
126
127static int decode_netobj(struct xdr_stream *xdr,
128 struct xdr_netobj *obj)
129{
130 u32 length;
131 __be32 *p;
132
133 p = xdr_inline_decode(xdr, 4);
134 if (unlikely(p == NULL))
135 goto out_overflow;
136 length = be32_to_cpup(p++);
137 if (unlikely(length > XDR_MAX_NETOBJ))
138 goto out_size;
139 obj->len = length;
140 obj->data = (u8 *)p;
141 return 0;
142out_size:
143 dprintk("NFS: returned netobj was too long: %u\n", length);
144 return -EIO;
145out_overflow:
146 print_overflow_msg(__func__, xdr);
147 return -EIO;
148}
149
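A netobj is a standard XDR variable-length opaque: a four-byte big-endian length, the payload, then zero padding to the next four-byte boundary. A minimal user-space encoder showing that wire layout (the caller guarantees buf is large enough):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t demo_encode_netobj(uint8_t *buf, const uint8_t *data,
				 uint32_t len)
{
	uint32_t be_len = htonl(len);
	size_t padded = (len + 3) & ~3u;	/* round up to 4 bytes */

	memcpy(buf, &be_len, 4);		/* length word */
	memcpy(buf + 4, data, len);		/* payload */
	memset(buf + 4 + len, 0, padded - len);	/* XDR pad bytes */
	return 4 + padded;
}

int main(void)
{
	uint8_t buf[32];
	size_t n = demo_encode_netobj(buf, (const uint8_t *)"abcde", 5);

	printf("encoded %zu bytes\n", n);	/* 4 + 8 = 12 */
	return 0;
}
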
150/*
151 * netobj cookie;
152 */
153static void encode_cookie(struct xdr_stream *xdr,
154 const struct nlm_cookie *cookie)
155{
156 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
157 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
158}
159
160static int decode_cookie(struct xdr_stream *xdr,
161 struct nlm_cookie *cookie)
162{
163 u32 length;
164 __be32 *p;
165
166 p = xdr_inline_decode(xdr, 4);
167 if (unlikely(p == NULL))
168 goto out_overflow;
169 length = be32_to_cpup(p++);
170 /* apparently HPUX can return empty cookies */
171 if (length == 0)
172 goto out_hpux;
173 if (length > NLM_MAXCOOKIELEN)
174 goto out_size;
175 p = xdr_inline_decode(xdr, length);
176 if (unlikely(p == NULL))
177 goto out_overflow;
178 cookie->len = length;
179 memcpy(cookie->data, p, length);
180 return 0;
181out_hpux:
182 cookie->len = 4;
183 memset(cookie->data, 0, 4);
184 return 0;
185out_size:
186 dprintk("NFS: returned cookie was too long: %u\n", length);
187 return -EIO;
188out_overflow:
189 print_overflow_msg(__func__, xdr);
190 return -EIO;
191}
192
193/*
194 * netobj fh;
195 */
196static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
197{
198 BUG_ON(fh->size != NFS2_FHSIZE);
199 encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE);
200}
201
202/*
203 * enum nlm_stats {
204 * LCK_GRANTED = 0,
205 * LCK_DENIED = 1,
206 * LCK_DENIED_NOLOCKS = 2,
207 * LCK_BLOCKED = 3,
208 * LCK_DENIED_GRACE_PERIOD = 4
209 * };
210 *
211 *
212 * struct nlm_stat {
213 * nlm_stats stat;
214 * };
215 *
216 * NB: we don't swap bytes for the NLM status values. The upper
217 * layers deal directly with the status value in network byte
218 * order.
219 */
220
221static void encode_nlm_stat(struct xdr_stream *xdr,
222 const __be32 stat)
223{
224 __be32 *p;
225
226 BUG_ON(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD);
227 p = xdr_reserve_space(xdr, 4);
228 *p = stat;
229}
230
231static int decode_nlm_stat(struct xdr_stream *xdr,
232 __be32 *stat)
233{
234 __be32 *p;
235
236 p = xdr_inline_decode(xdr, 4);
237 if (unlikely(p == NULL))
238 goto out_overflow;
239	if (unlikely(be32_to_cpup(p) > be32_to_cpu(nlm_lck_denied_grace_period)))
240 goto out_enum;
241 *stat = *p;
242 return 0;
243out_enum:
244 dprintk("%s: server returned invalid nlm_stats value: %u\n",
245 __func__, be32_to_cpup(p));
246 return -EIO;
247out_overflow:
248 print_overflow_msg(__func__, xdr);
249 return -EIO;
250}
251
252/*
253 * struct nlm_holder {
254 * bool exclusive;
255 * int uppid;
256 * netobj oh;
257 * unsigned l_offset;
258 * unsigned l_len;
259 * };
260 */
261static void encode_nlm_holder(struct xdr_stream *xdr,
262 const struct nlm_res *result)
263{
264 const struct nlm_lock *lock = &result->lock;
265 u32 l_offset, l_len;
266 __be32 *p;
267
268 encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
269 encode_int32(xdr, lock->svid);
270 encode_netobj(xdr, lock->oh.data, lock->oh.len);
271
272 p = xdr_reserve_space(xdr, 4 + 4);
273 nlm_compute_offsets(lock, &l_offset, &l_len);
274 *p++ = cpu_to_be32(l_offset);
275 *p = cpu_to_be32(l_len);
276}
277
278static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
279{
280 struct nlm_lock *lock = &result->lock;
281 struct file_lock *fl = &lock->fl;
282 u32 exclusive, l_offset, l_len;
283 int error;
284 __be32 *p;
285 s32 end;
286
287 memset(lock, 0, sizeof(*lock));
288 locks_init_lock(fl);
289
290 p = xdr_inline_decode(xdr, 4 + 4);
291 if (unlikely(p == NULL))
292 goto out_overflow;
293 exclusive = be32_to_cpup(p++);
294 lock->svid = be32_to_cpup(p);
295 fl->fl_pid = (pid_t)lock->svid;
296
297 error = decode_netobj(xdr, &lock->oh);
298 if (unlikely(error))
299 goto out;
300
301 p = xdr_inline_decode(xdr, 4 + 4);
302 if (unlikely(p == NULL))
303 goto out_overflow;
304
305 fl->fl_flags = FL_POSIX;
306 fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
307 l_offset = be32_to_cpup(p++);
308 l_len = be32_to_cpup(p);
309 end = l_offset + l_len - 1;
310
311 fl->fl_start = (loff_t)l_offset;
312 if (l_len == 0 || end < 0)
313 fl->fl_end = OFFSET_MAX;
314 else
315 fl->fl_end = (loff_t)end;
316 error = 0;
317out:
318 return error;
319out_overflow:
320 print_overflow_msg(__func__, xdr);
321 return -EIO;
322}
323
324/*
325 * string caller_name<LM_MAXSTRLEN>;
326 */
327static void encode_caller_name(struct xdr_stream *xdr, const char *name)
328{
329 /* NB: client-side does not set lock->len */
330 u32 length = strlen(name);
331 __be32 *p;
332
333 BUG_ON(length > NLM_MAXSTRLEN);
334 p = xdr_reserve_space(xdr, 4 + length);
335 xdr_encode_opaque(p, name, length);
336}
337
338/*
339 * struct nlm_lock {
340 * string caller_name<LM_MAXSTRLEN>;
341 * netobj fh;
342 * netobj oh;
343 * int uppid;
344 * unsigned l_offset;
345 * unsigned l_len;
346 * };
347 */
348static void encode_nlm_lock(struct xdr_stream *xdr,
349 const struct nlm_lock *lock)
350{
351 u32 l_offset, l_len;
352 __be32 *p;
353
354 encode_caller_name(xdr, lock->caller);
355 encode_fh(xdr, &lock->fh);
356 encode_netobj(xdr, lock->oh.data, lock->oh.len);
357
358 p = xdr_reserve_space(xdr, 4 + 4 + 4);
359 *p++ = cpu_to_be32(lock->svid);
360
361 nlm_compute_offsets(lock, &l_offset, &l_len);
362 *p++ = cpu_to_be32(l_offset);
363 *p = cpu_to_be32(l_len);
364}
365
366
367/*
368 * NLMv3 XDR encode functions
369 *
370 * NLMv3 argument types are defined in Chapter 10 of The Open Group's
371 * "Protocols for Interworking: XNFS, Version 3W".
372 */
373
374/*
375 * struct nlm_testargs {
376 * netobj cookie;
377 * bool exclusive;
378 * struct nlm_lock alock;
379 * };
380 */
381static void nlm_xdr_enc_testargs(struct rpc_rqst *req,
382 struct xdr_stream *xdr,
383 const struct nlm_args *args)
384{
385 const struct nlm_lock *lock = &args->lock;
386
387 encode_cookie(xdr, &args->cookie);
388 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
389 encode_nlm_lock(xdr, lock);
390}
391
392/*
393 * struct nlm_lockargs {
394 * netobj cookie;
395 * bool block;
396 * bool exclusive;
397 * struct nlm_lock alock;
398 * bool reclaim;
399 * int state;
400 * };
401 */
402static void nlm_xdr_enc_lockargs(struct rpc_rqst *req,
403 struct xdr_stream *xdr,
404 const struct nlm_args *args)
405{
406 const struct nlm_lock *lock = &args->lock;
407
408 encode_cookie(xdr, &args->cookie);
409 encode_bool(xdr, args->block);
410 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
411 encode_nlm_lock(xdr, lock);
412 encode_bool(xdr, args->reclaim);
413 encode_int32(xdr, args->state);
414}
415
416/*
417 * struct nlm_cancargs {
418 * netobj cookie;
419 * bool block;
420 * bool exclusive;
421 * struct nlm_lock alock;
422 * };
423 */
424static void nlm_xdr_enc_cancargs(struct rpc_rqst *req,
425 struct xdr_stream *xdr,
426 const struct nlm_args *args)
427{
428 const struct nlm_lock *lock = &args->lock;
429
430 encode_cookie(xdr, &args->cookie);
431 encode_bool(xdr, args->block);
432 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
433 encode_nlm_lock(xdr, lock);
434}
435
436/*
437 * struct nlm_unlockargs {
438 * netobj cookie;
439 * struct nlm_lock alock;
440 * };
441 */
442static void nlm_xdr_enc_unlockargs(struct rpc_rqst *req,
443 struct xdr_stream *xdr,
444 const struct nlm_args *args)
445{
446 const struct nlm_lock *lock = &args->lock;
447
448 encode_cookie(xdr, &args->cookie);
449 encode_nlm_lock(xdr, lock);
450}
451
452/*
453 * struct nlm_res {
454 * netobj cookie;
455 * nlm_stat stat;
456 * };
457 */
458static void nlm_xdr_enc_res(struct rpc_rqst *req,
459 struct xdr_stream *xdr,
460 const struct nlm_res *result)
461{
462 encode_cookie(xdr, &result->cookie);
463 encode_nlm_stat(xdr, result->status);
464}
465
466/*
467 * union nlm_testrply switch (nlm_stats stat) {
468 * case LCK_DENIED:
469 * struct nlm_holder holder;
470 * default:
471 * void;
472 * };
473 *
474 * struct nlm_testres {
475 * netobj cookie;
476 * nlm_testrply test_stat;
477 * };
478 */
479static void encode_nlm_testrply(struct xdr_stream *xdr,
480 const struct nlm_res *result)
481{
482 if (result->status == nlm_lck_denied)
483 encode_nlm_holder(xdr, result);
484}
485
486static void nlm_xdr_enc_testres(struct rpc_rqst *req,
487 struct xdr_stream *xdr,
488 const struct nlm_res *result)
489{
490 encode_cookie(xdr, &result->cookie);
491 encode_nlm_stat(xdr, result->status);
492 encode_nlm_testrply(xdr, result);
493}
494
495
496/*
497 * NLMv3 XDR decode functions
498 *
499 * NLMv3 result types are defined in Chapter 10 of The Open Group's
500 * "Protocols for Interworking: XNFS, Version 3W".
501 */
502
503/*
504 * union nlm_testrply switch (nlm_stats stat) {
505 * case LCK_DENIED:
506 * struct nlm_holder holder;
507 * default:
508 * void;
509 * };
510 *
511 * struct nlm_testres {
512 * netobj cookie;
513 * nlm_testrply test_stat;
514 * };
515 */
516static int decode_nlm_testrply(struct xdr_stream *xdr,
517 struct nlm_res *result)
518{
519 int error;
520
521 error = decode_nlm_stat(xdr, &result->status);
522 if (unlikely(error))
523 goto out;
524 if (result->status == nlm_lck_denied)
525 error = decode_nlm_holder(xdr, result);
526out:
527 return error;
528}
529
530static int nlm_xdr_dec_testres(struct rpc_rqst *req,
531 struct xdr_stream *xdr,
532 struct nlm_res *result)
533{
534 int error;
535
536 error = decode_cookie(xdr, &result->cookie);
537 if (unlikely(error))
538 goto out;
539 error = decode_nlm_testrply(xdr, result);
540out:
541 return error;
542}
543
544/*
545 * struct nlm_res {
546 * netobj cookie;
547 * nlm_stat stat;
548 * };
549 */
550static int nlm_xdr_dec_res(struct rpc_rqst *req,
551 struct xdr_stream *xdr,
552 struct nlm_res *result)
553{
554 int error;
555
556 error = decode_cookie(xdr, &result->cookie);
557 if (unlikely(error))
558 goto out;
559 error = decode_nlm_stat(xdr, &result->status);
560out:
561 return error;
562}
563
564
565/*
566 * For NLM, a void procedure really returns nothing
567 */
568#define nlm_xdr_dec_norep NULL
569
570#define PROC(proc, argtype, restype) \
571[NLMPROC_##proc] = { \
572 .p_proc = NLMPROC_##proc, \
573 .p_encode = (kxdreproc_t)nlm_xdr_enc_##argtype, \
574 .p_decode = (kxdrdproc_t)nlm_xdr_dec_##restype, \
575 .p_arglen = NLM_##argtype##_sz, \
576 .p_replen = NLM_##restype##_sz, \
577 .p_statidx = NLMPROC_##proc, \
578 .p_name = #proc, \
579 }
580
581static struct rpc_procinfo nlm_procedures[] = {
582 PROC(TEST, testargs, testres),
583 PROC(LOCK, lockargs, res),
584 PROC(CANCEL, cancargs, res),
585 PROC(UNLOCK, unlockargs, res),
586 PROC(GRANTED, testargs, res),
587 PROC(TEST_MSG, testargs, norep),
588 PROC(LOCK_MSG, lockargs, norep),
589 PROC(CANCEL_MSG, cancargs, norep),
590 PROC(UNLOCK_MSG, unlockargs, norep),
591 PROC(GRANTED_MSG, testargs, norep),
592 PROC(TEST_RES, testres, norep),
593 PROC(LOCK_RES, res, norep),
594 PROC(CANCEL_RES, res, norep),
595 PROC(UNLOCK_RES, res, norep),
596 PROC(GRANTED_RES, res, norep),
597};
598
599static struct rpc_version nlm_version1 = {
600 .number = 1,
601 .nrprocs = ARRAY_SIZE(nlm_procedures),
602 .procs = nlm_procedures,
603};
604
605static struct rpc_version nlm_version3 = {
606 .number = 3,
607 .nrprocs = ARRAY_SIZE(nlm_procedures),
608 .procs = nlm_procedures,
609};
610
611static struct rpc_version *nlm_versions[] = {
612 [1] = &nlm_version1,
613 [3] = &nlm_version3,
614#ifdef CONFIG_LOCKD_V4
615 [4] = &nlm_version4,
616#endif
617};
618
619static struct rpc_stat nlm_rpc_stats;
620
621struct rpc_program nlm_program = {
622 .name = "lockd",
623 .number = NLM_PROGRAM,
624 .nrvers = ARRAY_SIZE(nlm_versions),
625 .version = nlm_versions,
626 .stats = &nlm_rpc_stats,
627};
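nlm_versions[] above is indexed directly by protocol version number, so the holes stay NULL (there is no NLM version 2) and lookup is a bounds check plus a NULL check. A user-space sketch of that dispatch pattern, names illustrative:

#include <stdio.h>

struct demo_version { unsigned int number; };

static const struct demo_version v1 = { 1 }, v3 = { 3 }, v4 = { 4 };

static const struct demo_version *demo_versions[] = {
	[1] = &v1,
	[3] = &v3,
	[4] = &v4,	/* present only when the v4 option is built in */
};

static const struct demo_version *demo_lookup(unsigned int vers)
{
	size_t nrvers = sizeof(demo_versions) / sizeof(demo_versions[0]);

	if (vers >= nrvers || demo_versions[vers] == NULL)
		return NULL;		/* e.g. version 2 */
	return demo_versions[vers];
}

int main(void)
{
	printf("%s\n", demo_lookup(2) ? "found" : "no such version");
	printf("%u\n", demo_lookup(3)->number);	/* 3 */
	return 0;
}
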
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index ed0c59fe23ce..5f1bcb2f06f3 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -25,9 +25,22 @@
25#define NLM_HOST_EXPIRE (300 * HZ) 25#define NLM_HOST_EXPIRE (300 * HZ)
26#define NLM_HOST_COLLECT (120 * HZ) 26#define NLM_HOST_COLLECT (120 * HZ)
27 27
28static struct hlist_head nlm_hosts[NLM_HOST_NRHASH]; 28static struct hlist_head nlm_server_hosts[NLM_HOST_NRHASH];
29static struct hlist_head nlm_client_hosts[NLM_HOST_NRHASH];
30
31#define for_each_host(host, pos, chain, table) \
32 for ((chain) = (table); \
33 (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
34 hlist_for_each_entry((host), (pos), (chain), h_hash)
35
36#define for_each_host_safe(host, pos, next, chain, table) \
37 for ((chain) = (table); \
38 (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
39 hlist_for_each_entry_safe((host), (pos), (next), \
40 (chain), h_hash)
41
29static unsigned long next_gc; 42static unsigned long next_gc;
30static int nrhosts; 43static unsigned long nrhosts;
31static DEFINE_MUTEX(nlm_host_mutex); 44static DEFINE_MUTEX(nlm_host_mutex);
32 45
33static void nlm_gc_hosts(void); 46static void nlm_gc_hosts(void);
@@ -40,8 +53,6 @@ struct nlm_lookup_host_info {
40 const u32 version; /* NLM version to search for */ 53 const u32 version; /* NLM version to search for */
41 const char *hostname; /* remote's hostname */ 54 const char *hostname; /* remote's hostname */
42 const size_t hostname_len; /* its length */ 55 const size_t hostname_len; /* its length */
43 const struct sockaddr *src_sap; /* our address (optional) */
44 const size_t src_len; /* it's length */
45 const int noresvport; /* use non-priv port */ 56 const int noresvport; /* use non-priv port */
46}; 57};
47 58
@@ -88,127 +99,83 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
88} 99}
89 100
90/* 101/*
91 * Common host lookup routine for server & client 102 * Allocate and initialize an nlm_host. Common to both client and server.
92 */ 103 */
93static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) 104static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
105 struct nsm_handle *nsm)
94{ 106{
95 struct hlist_head *chain; 107 struct nlm_host *host = NULL;
96 struct hlist_node *pos; 108 unsigned long now = jiffies;
97 struct nlm_host *host;
98 struct nsm_handle *nsm = NULL;
99
100 mutex_lock(&nlm_host_mutex);
101 109
102 if (time_after_eq(jiffies, next_gc)) 110 if (nsm != NULL)
103 nlm_gc_hosts();
104
105 /* We may keep several nlm_host objects for a peer, because each
106 * nlm_host is identified by
107 * (address, protocol, version, server/client)
108 * We could probably simplify this a little by putting all those
109 * different NLM rpc_clients into one single nlm_host object.
110 * This would allow us to have one nlm_host per address.
111 */
112 chain = &nlm_hosts[nlm_hash_address(ni->sap)];
113 hlist_for_each_entry(host, pos, chain, h_hash) {
114 if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
115 continue;
116
117 /* See if we have an NSM handle for this client */
118 if (!nsm)
119 nsm = host->h_nsmhandle;
120
121 if (host->h_proto != ni->protocol)
122 continue;
123 if (host->h_version != ni->version)
124 continue;
125 if (host->h_server != ni->server)
126 continue;
127 if (ni->server && ni->src_len != 0 &&
128 !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
129 continue;
130
131 /* Move to head of hash chain. */
132 hlist_del(&host->h_hash);
133 hlist_add_head(&host->h_hash, chain);
134
135 nlm_get_host(host);
136 dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
137 host->h_name, host->h_addrbuf);
138 goto out;
139 }
140
141 /*
142 * The host wasn't in our hash table. If we don't
143 * have an NSM handle for it yet, create one.
144 */
145 if (nsm)
146 atomic_inc(&nsm->sm_count); 111 atomic_inc(&nsm->sm_count);
147 else { 112 else {
148 host = NULL; 113 host = NULL;
149 nsm = nsm_get_handle(ni->sap, ni->salen, 114 nsm = nsm_get_handle(ni->sap, ni->salen,
150 ni->hostname, ni->hostname_len); 115 ni->hostname, ni->hostname_len);
151 if (!nsm) { 116 if (unlikely(nsm == NULL)) {
152 dprintk("lockd: nlm_lookup_host failed; " 117 dprintk("lockd: %s failed; no nsm handle\n",
153 "no nsm handle\n"); 118 __func__);
154 goto out; 119 goto out;
155 } 120 }
156 } 121 }
157 122
158 host = kzalloc(sizeof(*host), GFP_KERNEL); 123 host = kmalloc(sizeof(*host), GFP_KERNEL);
159 if (!host) { 124 if (unlikely(host == NULL)) {
125 dprintk("lockd: %s failed; no memory\n", __func__);
160 nsm_release(nsm); 126 nsm_release(nsm);
161 dprintk("lockd: nlm_lookup_host failed; no memory\n");
162 goto out; 127 goto out;
163 } 128 }
164 host->h_name = nsm->sm_name; 129
165 host->h_addrbuf = nsm->sm_addrbuf;
166 memcpy(nlm_addr(host), ni->sap, ni->salen); 130 memcpy(nlm_addr(host), ni->sap, ni->salen);
167 host->h_addrlen = ni->salen; 131 host->h_addrlen = ni->salen;
168 rpc_set_port(nlm_addr(host), 0); 132 rpc_set_port(nlm_addr(host), 0);
169 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); 133 host->h_srcaddrlen = 0;
170 host->h_srcaddrlen = ni->src_len; 134
135 host->h_rpcclnt = NULL;
136 host->h_name = nsm->sm_name;
171 host->h_version = ni->version; 137 host->h_version = ni->version;
172 host->h_proto = ni->protocol; 138 host->h_proto = ni->protocol;
173 host->h_rpcclnt = NULL; 139 host->h_reclaiming = 0;
174 mutex_init(&host->h_mutex); 140 host->h_server = ni->server;
175 host->h_nextrebind = jiffies + NLM_HOST_REBIND; 141 host->h_noresvport = ni->noresvport;
176 host->h_expires = jiffies + NLM_HOST_EXPIRE; 142 host->h_inuse = 0;
177 atomic_set(&host->h_count, 1);
178 init_waitqueue_head(&host->h_gracewait); 143 init_waitqueue_head(&host->h_gracewait);
179 init_rwsem(&host->h_rwsem); 144 init_rwsem(&host->h_rwsem);
180 host->h_state = 0; /* pseudo NSM state */ 145 host->h_state = 0;
181 host->h_nsmstate = 0; /* real NSM state */ 146 host->h_nsmstate = 0;
182 host->h_nsmhandle = nsm; 147 host->h_pidcount = 0;
183 host->h_server = ni->server; 148 atomic_set(&host->h_count, 1);
184 host->h_noresvport = ni->noresvport; 149 mutex_init(&host->h_mutex);
185 hlist_add_head(&host->h_hash, chain); 150 host->h_nextrebind = now + NLM_HOST_REBIND;
151 host->h_expires = now + NLM_HOST_EXPIRE;
186 INIT_LIST_HEAD(&host->h_lockowners); 152 INIT_LIST_HEAD(&host->h_lockowners);
187 spin_lock_init(&host->h_lock); 153 spin_lock_init(&host->h_lock);
188 INIT_LIST_HEAD(&host->h_granted); 154 INIT_LIST_HEAD(&host->h_granted);
189 INIT_LIST_HEAD(&host->h_reclaim); 155 INIT_LIST_HEAD(&host->h_reclaim);
190 156 host->h_nsmhandle = nsm;
191 nrhosts++; 157 host->h_addrbuf = nsm->sm_addrbuf;
192
193 dprintk("lockd: nlm_lookup_host created host %s\n",
194 host->h_name);
195 158
196out: 159out:
197 mutex_unlock(&nlm_host_mutex);
198 return host; 160 return host;
199} 161}
200 162
201/* 163/*
202 * Destroy a host 164 * Destroy an nlm_host and free associated resources
165 *
166 * Caller must hold nlm_host_mutex.
203 */ 167 */
204static void 168static void nlm_destroy_host_locked(struct nlm_host *host)
205nlm_destroy_host(struct nlm_host *host)
206{ 169{
207 struct rpc_clnt *clnt; 170 struct rpc_clnt *clnt;
208 171
172 dprintk("lockd: destroy host %s\n", host->h_name);
173
209 BUG_ON(!list_empty(&host->h_lockowners)); 174 BUG_ON(!list_empty(&host->h_lockowners));
210 BUG_ON(atomic_read(&host->h_count)); 175 BUG_ON(atomic_read(&host->h_count));
211 176
177 hlist_del_init(&host->h_hash);
178
212 nsm_unmonitor(host); 179 nsm_unmonitor(host);
213 nsm_release(host->h_nsmhandle); 180 nsm_release(host->h_nsmhandle);
214 181
@@ -216,6 +183,8 @@ nlm_destroy_host(struct nlm_host *host)
216 if (clnt != NULL) 183 if (clnt != NULL)
217 rpc_shutdown_client(clnt); 184 rpc_shutdown_client(clnt);
218 kfree(host); 185 kfree(host);
186
187 nrhosts--;
219} 188}
220 189
221/** 190/**
@@ -249,12 +218,76 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
249 .hostname_len = strlen(hostname), 218 .hostname_len = strlen(hostname),
250 .noresvport = noresvport, 219 .noresvport = noresvport,
251 }; 220 };
221 struct hlist_head *chain;
222 struct hlist_node *pos;
223 struct nlm_host *host;
224 struct nsm_handle *nsm = NULL;
252 225
253 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, 226 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
254 (hostname ? hostname : "<none>"), version, 227 (hostname ? hostname : "<none>"), version,
255 (protocol == IPPROTO_UDP ? "udp" : "tcp")); 228 (protocol == IPPROTO_UDP ? "udp" : "tcp"));
256 229
257 return nlm_lookup_host(&ni); 230 mutex_lock(&nlm_host_mutex);
231
232 chain = &nlm_client_hosts[nlm_hash_address(sap)];
233 hlist_for_each_entry(host, pos, chain, h_hash) {
234 if (!rpc_cmp_addr(nlm_addr(host), sap))
235 continue;
236
237 /* Same address. Share an NSM handle if we already have one */
238 if (nsm == NULL)
239 nsm = host->h_nsmhandle;
240
241 if (host->h_proto != protocol)
242 continue;
243 if (host->h_version != version)
244 continue;
245
246 nlm_get_host(host);
247 dprintk("lockd: %s found host %s (%s)\n", __func__,
248 host->h_name, host->h_addrbuf);
249 goto out;
250 }
251
252 host = nlm_alloc_host(&ni, nsm);
253 if (unlikely(host == NULL))
254 goto out;
255
256 hlist_add_head(&host->h_hash, chain);
257 nrhosts++;
258
259 dprintk("lockd: %s created host %s (%s)\n", __func__,
260 host->h_name, host->h_addrbuf);
261
262out:
263 mutex_unlock(&nlm_host_mutex);
264 return host;
265}
266
267/**
268 * nlmclnt_release_host - release client nlm_host
269 * @host: nlm_host to release
270 *
271 */
272void nlmclnt_release_host(struct nlm_host *host)
273{
274 if (host == NULL)
275 return;
276
277 dprintk("lockd: release client host %s\n", host->h_name);
278
279 BUG_ON(atomic_read(&host->h_count) < 0);
280 BUG_ON(host->h_server);
281
282 if (atomic_dec_and_test(&host->h_count)) {
283 BUG_ON(!list_empty(&host->h_lockowners));
284 BUG_ON(!list_empty(&host->h_granted));
285 BUG_ON(!list_empty(&host->h_reclaim));
286
287 mutex_lock(&nlm_host_mutex);
288 nlm_destroy_host_locked(host);
289 mutex_unlock(&nlm_host_mutex);
290 }
258} 291}
259 292
260/** 293/**
@@ -279,12 +312,18 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
279 const char *hostname, 312 const char *hostname,
280 const size_t hostname_len) 313 const size_t hostname_len)
281{ 314{
315 struct hlist_head *chain;
316 struct hlist_node *pos;
317 struct nlm_host *host = NULL;
318 struct nsm_handle *nsm = NULL;
282 struct sockaddr_in sin = { 319 struct sockaddr_in sin = {
283 .sin_family = AF_INET, 320 .sin_family = AF_INET,
284 }; 321 };
285 struct sockaddr_in6 sin6 = { 322 struct sockaddr_in6 sin6 = {
286 .sin6_family = AF_INET6, 323 .sin6_family = AF_INET6,
287 }; 324 };
325 struct sockaddr *src_sap;
326 size_t src_len = rqstp->rq_addrlen;
288 struct nlm_lookup_host_info ni = { 327 struct nlm_lookup_host_info ni = {
289 .server = 1, 328 .server = 1,
290 .sap = svc_addr(rqstp), 329 .sap = svc_addr(rqstp),
@@ -293,27 +332,91 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
293 .version = rqstp->rq_vers, 332 .version = rqstp->rq_vers,
294 .hostname = hostname, 333 .hostname = hostname,
295 .hostname_len = hostname_len, 334 .hostname_len = hostname_len,
296 .src_len = rqstp->rq_addrlen,
297 }; 335 };
298 336
299 dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, 337 dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
300 (int)hostname_len, hostname, rqstp->rq_vers, 338 (int)hostname_len, hostname, rqstp->rq_vers,
301 (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); 339 (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
302 340
341 mutex_lock(&nlm_host_mutex);
342
303 switch (ni.sap->sa_family) { 343 switch (ni.sap->sa_family) {
304 case AF_INET: 344 case AF_INET:
305 sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; 345 sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
306 ni.src_sap = (struct sockaddr *)&sin; 346 src_sap = (struct sockaddr *)&sin;
307 break; 347 break;
308 case AF_INET6: 348 case AF_INET6:
309 ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); 349 ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
310 ni.src_sap = (struct sockaddr *)&sin6; 350 src_sap = (struct sockaddr *)&sin6;
311 break; 351 break;
312 default: 352 default:
313 return NULL; 353 dprintk("lockd: %s failed; unrecognized address family\n",
354 __func__);
355 goto out;
356 }
357
358 if (time_after_eq(jiffies, next_gc))
359 nlm_gc_hosts();
360
361 chain = &nlm_server_hosts[nlm_hash_address(ni.sap)];
362 hlist_for_each_entry(host, pos, chain, h_hash) {
363 if (!rpc_cmp_addr(nlm_addr(host), ni.sap))
364 continue;
365
366 /* Same address. Share an NSM handle if we already have one */
367 if (nsm == NULL)
368 nsm = host->h_nsmhandle;
369
370 if (host->h_proto != ni.protocol)
371 continue;
372 if (host->h_version != ni.version)
373 continue;
374 if (!rpc_cmp_addr(nlm_srcaddr(host), src_sap))
375 continue;
376
377 /* Move to head of hash chain. */
378 hlist_del(&host->h_hash);
379 hlist_add_head(&host->h_hash, chain);
380
381 nlm_get_host(host);
382 dprintk("lockd: %s found host %s (%s)\n",
383 __func__, host->h_name, host->h_addrbuf);
384 goto out;
314 } 385 }
315 386
316 return nlm_lookup_host(&ni); 387 host = nlm_alloc_host(&ni, nsm);
388 if (unlikely(host == NULL))
389 goto out;
390
391 memcpy(nlm_srcaddr(host), src_sap, src_len);
392 host->h_srcaddrlen = src_len;
393 hlist_add_head(&host->h_hash, chain);
394 nrhosts++;
395
396 dprintk("lockd: %s created host %s (%s)\n",
397 __func__, host->h_name, host->h_addrbuf);
398
399out:
400 mutex_unlock(&nlm_host_mutex);
401 return host;
402}
403
404/**
405 * nlmsvc_release_host - release server nlm_host
406 * @host: nlm_host to release
407 *
408 * Host is destroyed later in nlm_gc_hosts().
409 */
410void nlmsvc_release_host(struct nlm_host *host)
411{
412 if (host == NULL)
413 return;
414
415 dprintk("lockd: release server host %s\n", host->h_name);
416
417 BUG_ON(atomic_read(&host->h_count) < 0);
418 BUG_ON(!host->h_server);
419 atomic_dec(&host->h_count);
317} 420}
318 421
319/* 422/*
@@ -413,20 +516,28 @@ struct nlm_host * nlm_get_host(struct nlm_host *host)
413 return host; 516 return host;
414} 517}
415 518
416/* 519static struct nlm_host *next_host_state(struct hlist_head *cache,
417 * Release NLM host after use 520 struct nsm_handle *nsm,
418 */ 521 const struct nlm_reboot *info)
419void nlm_release_host(struct nlm_host *host)
420{ 522{
421 if (host != NULL) { 523 struct nlm_host *host = NULL;
422 dprintk("lockd: release host %s\n", host->h_name); 524 struct hlist_head *chain;
423 BUG_ON(atomic_read(&host->h_count) < 0); 525 struct hlist_node *pos;
424 if (atomic_dec_and_test(&host->h_count)) { 526
425 BUG_ON(!list_empty(&host->h_lockowners)); 527 mutex_lock(&nlm_host_mutex);
426 BUG_ON(!list_empty(&host->h_granted)); 528 for_each_host(host, pos, chain, cache) {
427 BUG_ON(!list_empty(&host->h_reclaim)); 529 if (host->h_nsmhandle == nsm
530 && host->h_nsmstate != info->state) {
531 host->h_nsmstate = info->state;
532 host->h_state++;
533
534 nlm_get_host(host);
535 goto out;
428 } 536 }
429 } 537 }
538out:
539 mutex_unlock(&nlm_host_mutex);
540 return host;
430} 541}
431 542
432/** 543/**
@@ -438,8 +549,6 @@ void nlm_release_host(struct nlm_host *host)
438 */ 549 */
439void nlm_host_rebooted(const struct nlm_reboot *info) 550void nlm_host_rebooted(const struct nlm_reboot *info)
440{ 551{
441 struct hlist_head *chain;
442 struct hlist_node *pos;
443 struct nsm_handle *nsm; 552 struct nsm_handle *nsm;
444 struct nlm_host *host; 553 struct nlm_host *host;
445 554
@@ -452,32 +561,15 @@ void nlm_host_rebooted(const struct nlm_reboot *info)
452 * lock for this. 561 * lock for this.
453 * To avoid processing a host several times, we match the nsmstate. 562 * To avoid processing a host several times, we match the nsmstate.
454 */ 563 */
455again: mutex_lock(&nlm_host_mutex); 564 while ((host = next_host_state(nlm_server_hosts, nsm, info)) != NULL) {
456 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 565 nlmsvc_free_host_resources(host);
457 hlist_for_each_entry(host, pos, chain, h_hash) { 566 nlmsvc_release_host(host);
458 if (host->h_nsmhandle == nsm
459 && host->h_nsmstate != info->state) {
460 host->h_nsmstate = info->state;
461 host->h_state++;
462
463 nlm_get_host(host);
464 mutex_unlock(&nlm_host_mutex);
465
466 if (host->h_server) {
467 /* We're server for this guy, just ditch
468 * all the locks he held. */
469 nlmsvc_free_host_resources(host);
470 } else {
471 /* He's the server, initiate lock recovery. */
472 nlmclnt_recovery(host);
473 }
474
475 nlm_release_host(host);
476 goto again;
477 }
478 }
479 } 567 }
480 mutex_unlock(&nlm_host_mutex); 568 while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) {
569 nlmclnt_recovery(host);
570 nlmclnt_release_host(host);
571 }
572
481 nsm_release(nsm); 573 nsm_release(nsm);
482} 574}
483 575
@@ -497,13 +589,11 @@ nlm_shutdown_hosts(void)
497 589
498 /* First, make all hosts eligible for gc */ 590 /* First, make all hosts eligible for gc */
499 dprintk("lockd: nuking all hosts...\n"); 591 dprintk("lockd: nuking all hosts...\n");
500 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 592 for_each_host(host, pos, chain, nlm_server_hosts) {
501 hlist_for_each_entry(host, pos, chain, h_hash) { 593 host->h_expires = jiffies - 1;
502 host->h_expires = jiffies - 1; 594 if (host->h_rpcclnt) {
503 if (host->h_rpcclnt) { 595 rpc_shutdown_client(host->h_rpcclnt);
504 rpc_shutdown_client(host->h_rpcclnt); 596 host->h_rpcclnt = NULL;
505 host->h_rpcclnt = NULL;
506 }
507 } 597 }
508 } 598 }
509 599
@@ -512,15 +602,13 @@ nlm_shutdown_hosts(void)
512 mutex_unlock(&nlm_host_mutex); 602 mutex_unlock(&nlm_host_mutex);
513 603
514 /* complain if any hosts are left */ 604 /* complain if any hosts are left */
515 if (nrhosts) { 605 if (nrhosts != 0) {
516 printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); 606 printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
517 dprintk("lockd: %d hosts left:\n", nrhosts); 607 dprintk("lockd: %lu hosts left:\n", nrhosts);
518 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 608 for_each_host(host, pos, chain, nlm_server_hosts) {
519 hlist_for_each_entry(host, pos, chain, h_hash) { 609 dprintk(" %s (cnt %d use %d exp %ld)\n",
520 dprintk(" %s (cnt %d use %d exp %ld)\n", 610 host->h_name, atomic_read(&host->h_count),
521 host->h_name, atomic_read(&host->h_count), 611 host->h_inuse, host->h_expires);
522 host->h_inuse, host->h_expires);
523 }
524 } 612 }
525 } 613 }
526} 614}
@@ -538,29 +626,22 @@ nlm_gc_hosts(void)
538 struct nlm_host *host; 626 struct nlm_host *host;
539 627
540 dprintk("lockd: host garbage collection\n"); 628 dprintk("lockd: host garbage collection\n");
541 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 629 for_each_host(host, pos, chain, nlm_server_hosts)
542 hlist_for_each_entry(host, pos, chain, h_hash) 630 host->h_inuse = 0;
543 host->h_inuse = 0;
544 }
545 631
546 /* Mark all hosts that hold locks, blocks or shares */ 632 /* Mark all hosts that hold locks, blocks or shares */
547 nlmsvc_mark_resources(); 633 nlmsvc_mark_resources();
548 634
549 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 635 for_each_host_safe(host, pos, next, chain, nlm_server_hosts) {
550 hlist_for_each_entry_safe(host, pos, next, chain, h_hash) { 636 if (atomic_read(&host->h_count) || host->h_inuse
551 if (atomic_read(&host->h_count) || host->h_inuse 637 || time_before(jiffies, host->h_expires)) {
552 || time_before(jiffies, host->h_expires)) { 638 dprintk("nlm_gc_hosts skipping %s "
553 dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n", 639 "(cnt %d use %d exp %ld)\n",
554 host->h_name, atomic_read(&host->h_count), 640 host->h_name, atomic_read(&host->h_count),
555 host->h_inuse, host->h_expires); 641 host->h_inuse, host->h_expires);
556 continue; 642 continue;
557 }
558 dprintk("lockd: delete host %s\n", host->h_name);
559 hlist_del_init(&host->h_hash);
560
561 nlm_destroy_host(host);
562 nrhosts--;
563 } 643 }
644 nlm_destroy_host_locked(host);
564 } 645 }
565 646
566 next_gc = jiffies + NLM_HOST_COLLECT; 647 next_gc = jiffies + NLM_HOST_COLLECT;
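The for_each_host() helper introduced above wraps an outer walk over the hash chains around an inner walk over each chain's members, so callers traverse the whole table with a single statement. A stand-alone sketch of the same double-loop macro over a toy table (plain singly linked lists in place of hlists):

#include <stdio.h>

#define NRHASH 4

struct demo_host {
	const char *name;
	struct demo_host *next;
};

/* Nested for loops: the whole construct governs one statement,
 * just like the kernel macro. */
#define demo_for_each_host(host, chain, table)			\
	for ((chain) = (table);					\
	     (chain) < (table) + NRHASH; ++(chain))		\
		for ((host) = *(chain); (host) != NULL;		\
		     (host) = (host)->next)

int main(void)
{
	struct demo_host a = { "alpha", NULL }, b = { "beta", NULL };
	struct demo_host *table[NRHASH] = { [0] = &a, [2] = &b };
	struct demo_host **chain;
	struct demo_host *host;

	demo_for_each_host(host, chain, table)
		printf("%s\n", host->name);
	return 0;
}
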
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e0c918949644..23d7451b2938 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -401,26 +401,22 @@ void nsm_release(struct nsm_handle *nsm)
401 * Status Monitor wire protocol. 401 * Status Monitor wire protocol.
402 */ 402 */
403 403
404static int encode_nsm_string(struct xdr_stream *xdr, const char *string) 404static void encode_nsm_string(struct xdr_stream *xdr, const char *string)
405{ 405{
406 const u32 len = strlen(string); 406 const u32 len = strlen(string);
407 __be32 *p; 407 __be32 *p;
408 408
409 if (unlikely(len > SM_MAXSTRLEN)) 409 BUG_ON(len > SM_MAXSTRLEN);
410 return -EIO; 410 p = xdr_reserve_space(xdr, 4 + len);
411 p = xdr_reserve_space(xdr, sizeof(u32) + len);
412 if (unlikely(p == NULL))
413 return -EIO;
414 xdr_encode_opaque(p, string, len); 411 xdr_encode_opaque(p, string, len);
415 return 0;
416} 412}
417 413
418/* 414/*
419 * "mon_name" specifies the host to be monitored. 415 * "mon_name" specifies the host to be monitored.
420 */ 416 */
421static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) 417static void encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
422{ 418{
423 return encode_nsm_string(xdr, argp->mon_name); 419 encode_nsm_string(xdr, argp->mon_name);
424} 420}
425 421
426/* 422/*
@@ -429,35 +425,25 @@ static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
429 * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name" 425 * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
430 * has changed. 426 * has changed.
431 */ 427 */
432static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) 428static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
433{ 429{
434 int status;
435 __be32 *p; 430 __be32 *p;
436 431
437 status = encode_nsm_string(xdr, utsname()->nodename); 432 encode_nsm_string(xdr, utsname()->nodename);
438 if (unlikely(status != 0)) 433 p = xdr_reserve_space(xdr, 4 + 4 + 4);
439 return status; 434 *p++ = cpu_to_be32(argp->prog);
440 p = xdr_reserve_space(xdr, 3 * sizeof(u32)); 435 *p++ = cpu_to_be32(argp->vers);
441 if (unlikely(p == NULL)) 436 *p = cpu_to_be32(argp->proc);
442 return -EIO;
443 *p++ = htonl(argp->prog);
444 *p++ = htonl(argp->vers);
445 *p++ = htonl(argp->proc);
446 return 0;
447} 437}
448 438
449/* 439/*
450 * The "mon_id" argument specifies the non-private arguments 440 * The "mon_id" argument specifies the non-private arguments
451 * of an NSMPROC_MON or NSMPROC_UNMON call. 441 * of an NSMPROC_MON or NSMPROC_UNMON call.
452 */ 442 */
453static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) 443static void encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
454{ 444{
455 int status; 445 encode_mon_name(xdr, argp);
456 446 encode_my_id(xdr, argp);
457 status = encode_mon_name(xdr, argp);
458 if (unlikely(status != 0))
459 return status;
460 return encode_my_id(xdr, argp);
461} 447}
462 448
463/* 449/*
@@ -465,68 +451,56 @@ static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
465 * by the NSMPROC_MON call. This information will be supplied in the 451 * by the NSMPROC_MON call. This information will be supplied in the
466 * NLMPROC_SM_NOTIFY call. 452 * NLMPROC_SM_NOTIFY call.
467 */ 453 */
468static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) 454static void encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
469{ 455{
470 __be32 *p; 456 __be32 *p;
471 457
472 p = xdr_reserve_space(xdr, SM_PRIV_SIZE); 458 p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
473 if (unlikely(p == NULL))
474 return -EIO;
475 xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE); 459 xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
476 return 0;
477} 460}
478 461
479static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p, 462static void nsm_xdr_enc_mon(struct rpc_rqst *req, struct xdr_stream *xdr,
480 const struct nsm_args *argp) 463 const struct nsm_args *argp)
481{ 464{
482 struct xdr_stream xdr; 465 encode_mon_id(xdr, argp);
483 int status; 466 encode_priv(xdr, argp);
484
485 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
486 status = encode_mon_id(&xdr, argp);
487 if (unlikely(status))
488 return status;
489 return encode_priv(&xdr, argp);
490} 467}
491 468
492static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p, 469static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr,
493 const struct nsm_args *argp) 470 const struct nsm_args *argp)
494{ 471{
495 struct xdr_stream xdr; 472 encode_mon_id(xdr, argp);
496
497 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
498 return encode_mon_id(&xdr, argp);
499} 473}
500 474
501static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p, 475static int nsm_xdr_dec_stat_res(struct rpc_rqst *rqstp,
502 struct nsm_res *resp) 476 struct xdr_stream *xdr,
477 struct nsm_res *resp)
503{ 478{
504 struct xdr_stream xdr; 479 __be32 *p;
505 480
506 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 481 p = xdr_inline_decode(xdr, 4 + 4);
507 p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
508 if (unlikely(p == NULL)) 482 if (unlikely(p == NULL))
509 return -EIO; 483 return -EIO;
510 resp->status = ntohl(*p++); 484 resp->status = be32_to_cpup(p++);
511 resp->state = ntohl(*p); 485 resp->state = be32_to_cpup(p);
512 486
513 dprintk("lockd: xdr_dec_stat_res status %d state %d\n", 487 dprintk("lockd: %s status %d state %d\n",
514 resp->status, resp->state); 488 __func__, resp->status, resp->state);
515 return 0; 489 return 0;
516} 490}
517 491
518static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p, 492static int nsm_xdr_dec_stat(struct rpc_rqst *rqstp,
519 struct nsm_res *resp) 493 struct xdr_stream *xdr,
494 struct nsm_res *resp)
520{ 495{
521 struct xdr_stream xdr; 496 __be32 *p;
522 497
523 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 498 p = xdr_inline_decode(xdr, 4);
524 p = xdr_inline_decode(&xdr, sizeof(u32));
525 if (unlikely(p == NULL)) 499 if (unlikely(p == NULL))
526 return -EIO; 500 return -EIO;
527 resp->state = ntohl(*p); 501 resp->state = be32_to_cpup(p);
528 502
529 dprintk("lockd: xdr_dec_stat state %d\n", resp->state); 503 dprintk("lockd: %s state %d\n", __func__, resp->state);
530 return 0; 504 return 0;
531} 505}
532 506
@@ -542,8 +516,8 @@ static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
542static struct rpc_procinfo nsm_procedures[] = { 516static struct rpc_procinfo nsm_procedures[] = {
543[NSMPROC_MON] = { 517[NSMPROC_MON] = {
544 .p_proc = NSMPROC_MON, 518 .p_proc = NSMPROC_MON,
545 .p_encode = (kxdrproc_t)xdr_enc_mon, 519 .p_encode = (kxdreproc_t)nsm_xdr_enc_mon,
546 .p_decode = (kxdrproc_t)xdr_dec_stat_res, 520 .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat_res,
547 .p_arglen = SM_mon_sz, 521 .p_arglen = SM_mon_sz,
548 .p_replen = SM_monres_sz, 522 .p_replen = SM_monres_sz,
549 .p_statidx = NSMPROC_MON, 523 .p_statidx = NSMPROC_MON,
@@ -551,8 +525,8 @@ static struct rpc_procinfo nsm_procedures[] = {
551 }, 525 },
552[NSMPROC_UNMON] = { 526[NSMPROC_UNMON] = {
553 .p_proc = NSMPROC_UNMON, 527 .p_proc = NSMPROC_UNMON,
554 .p_encode = (kxdrproc_t)xdr_enc_unmon, 528 .p_encode = (kxdreproc_t)nsm_xdr_enc_unmon,
555 .p_decode = (kxdrproc_t)xdr_dec_stat, 529 .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat,
556 .p_arglen = SM_mon_id_sz, 530 .p_arglen = SM_mon_id_sz,
557 .p_replen = SM_unmonres_sz, 531 .p_replen = SM_unmonres_sz,
558 .p_statidx = NSMPROC_UNMON, 532 .p_statidx = NSMPROC_UNMON,
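nsm_xdr_dec_stat_res() above pulls a fixed two-word body off the wire: a big-endian status followed by a big-endian state number. A user-space sketch of that decode, with the kernel's -EIO replaced by a plain error return:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_stat_res {
	uint32_t status;
	uint32_t state;
};

static int demo_dec_stat_res(const uint8_t *buf, size_t len,
			     struct demo_stat_res *res)
{
	uint32_t w[2];

	if (len < sizeof(w))
		return -1;		/* short reply */
	memcpy(w, buf, sizeof(w));	/* avoids unaligned loads */
	res->status = ntohl(w[0]);
	res->state = ntohl(w[1]);
	return 0;
}

int main(void)
{
	uint32_t wire[2] = { htonl(0), htonl(7) };
	struct demo_stat_res res;

	if (demo_dec_stat_res((const uint8_t *)wire, sizeof(wire), &res) == 0)
		printf("status %u state %u\n", res.status, res.state);
	return 0;
}
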
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 38d261192453..9a41fdc19511 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -51,7 +51,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
51 return 0; 51 return 0;
52 52
53no_locks: 53no_locks:
54 nlm_release_host(host); 54 nlmsvc_release_host(host);
55 if (error) 55 if (error)
56 return error; 56 return error;
57 return nlm_lck_denied_nolocks; 57 return nlm_lck_denied_nolocks;
@@ -92,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
92 else 92 else
93 dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); 93 dprintk("lockd: TEST4 status %d\n", ntohl(resp->status));
94 94
95 nlm_release_host(host); 95 nlmsvc_release_host(host);
96 nlm_release_file(file); 96 nlm_release_file(file);
97 return rc; 97 return rc;
98} 98}
@@ -134,7 +134,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
134 else 134 else
135 dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); 135 dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
136 136
137 nlm_release_host(host); 137 nlmsvc_release_host(host);
138 nlm_release_file(file); 138 nlm_release_file(file);
139 return rc; 139 return rc;
140} 140}
@@ -164,7 +164,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
164 resp->status = nlmsvc_cancel_blocked(file, &argp->lock); 164 resp->status = nlmsvc_cancel_blocked(file, &argp->lock);
165 165
166 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); 166 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status));
167 nlm_release_host(host); 167 nlmsvc_release_host(host);
168 nlm_release_file(file); 168 nlm_release_file(file);
169 return rpc_success; 169 return rpc_success;
170} 170}
@@ -197,7 +197,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
197 resp->status = nlmsvc_unlock(file, &argp->lock); 197 resp->status = nlmsvc_unlock(file, &argp->lock);
198 198
199 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); 199 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status));
200 nlm_release_host(host); 200 nlmsvc_release_host(host);
201 nlm_release_file(file); 201 nlm_release_file(file);
202 return rpc_success; 202 return rpc_success;
203} 203}
@@ -229,7 +229,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
229 229
230static void nlm4svc_callback_release(void *data) 230static void nlm4svc_callback_release(void *data)
231{ 231{
232 nlm_release_call(data); 232 nlmsvc_release_call(data);
233} 233}
234 234
235static const struct rpc_call_ops nlm4svc_callback_ops = { 235static const struct rpc_call_ops nlm4svc_callback_ops = {
@@ -261,7 +261,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
261 261
262 stat = func(rqstp, argp, &call->a_res); 262 stat = func(rqstp, argp, &call->a_res);
263 if (stat != 0) { 263 if (stat != 0) {
264 nlm_release_call(call); 264 nlmsvc_release_call(call);
265 return stat; 265 return stat;
266 } 266 }
267 267
@@ -334,7 +334,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
334 resp->status = nlmsvc_share_file(host, file, argp); 334 resp->status = nlmsvc_share_file(host, file, argp);
335 335
336 dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); 336 dprintk("lockd: SHARE status %d\n", ntohl(resp->status));
337 nlm_release_host(host); 337 nlmsvc_release_host(host);
338 nlm_release_file(file); 338 nlm_release_file(file);
339 return rpc_success; 339 return rpc_success;
340} 340}
@@ -367,7 +367,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
367 resp->status = nlmsvc_unshare_file(host, file, argp); 367 resp->status = nlmsvc_unshare_file(host, file, argp);
368 368
369 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); 369 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status));
370 nlm_release_host(host); 370 nlmsvc_release_host(host);
371 nlm_release_file(file); 371 nlm_release_file(file);
372 return rpc_success; 372 return rpc_success;
373} 373}
@@ -399,7 +399,7 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
399 return rpc_success; 399 return rpc_success;
400 400
401 nlmsvc_free_host_resources(host); 401 nlmsvc_free_host_resources(host);
402 nlm_release_host(host); 402 nlmsvc_release_host(host);
403 return rpc_success; 403 return rpc_success;
404} 404}
405 405
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index ef5659b211e9..6e31695d046f 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -46,6 +46,7 @@ static void nlmsvc_remove_block(struct nlm_block *block);
46static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); 46static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
47static void nlmsvc_freegrantargs(struct nlm_rqst *call); 47static void nlmsvc_freegrantargs(struct nlm_rqst *call);
48static const struct rpc_call_ops nlmsvc_grant_ops; 48static const struct rpc_call_ops nlmsvc_grant_ops;
49static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);
49 50
50/* 51/*
51 * The list of blocked locks to retry 52 * The list of blocked locks to retry
@@ -233,7 +234,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
233failed_free: 234failed_free:
234 kfree(block); 235 kfree(block);
235failed: 236failed:
236 nlm_release_call(call); 237 nlmsvc_release_call(call);
237 return NULL; 238 return NULL;
238} 239}
239 240
@@ -266,7 +267,7 @@ static void nlmsvc_free_block(struct kref *kref)
266 mutex_unlock(&file->f_mutex); 267 mutex_unlock(&file->f_mutex);
267 268
268 nlmsvc_freegrantargs(block->b_call); 269 nlmsvc_freegrantargs(block->b_call);
269 nlm_release_call(block->b_call); 270 nlmsvc_release_call(block->b_call);
270 nlm_release_file(block->b_file); 271 nlm_release_file(block->b_file);
271 kfree(block->b_fl); 272 kfree(block->b_fl);
272 kfree(block); 273 kfree(block);
@@ -934,3 +935,32 @@ nlmsvc_retry_blocked(void)
934 935
935 return timeout; 936 return timeout;
936} 937}
938
939#ifdef RPC_DEBUG
940static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
941{
942 /*
943 * We can get away with a static buffer because we're only
944 * called with BKL held.
945 */
946 static char buf[2*NLM_MAXCOOKIELEN+1];
947 unsigned int i, len = sizeof(buf);
948 char *p = buf;
949
950 len--; /* allow for trailing \0 */
951 if (len < 3)
952 return "???";
953 for (i = 0 ; i < cookie->len ; i++) {
954 if (len < 2) {
955 strcpy(p-3, "...");
956 break;
957 }
958 sprintf(p, "%02x", cookie->data[i]);
959 p += 2;
960 len -= 2;
961 }
962 *p = '\0';
963
964 return buf;
965}
966#endif
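
The nlmdbg_cookie2a() helper moved into svclock.c above renders an opaque NLM cookie as a hex string, truncating with "..." when the cookie would overflow the buffer. A standalone userspace sketch of the same logic, assuming a 32-byte cookie limit (the static buffer mirrors the kernel helper and, as its comment notes, is only safe because callers are serialized):

#include <stdio.h>
#include <string.h>

#define MAXCOOKIELEN 32			/* stand-in for NLM_MAXCOOKIELEN */

static const char *cookie2a(const unsigned char *data, unsigned int datalen)
{
	/* Static buffer: safe only if callers are serialized. */
	static char buf[2 * MAXCOOKIELEN + 1];
	unsigned int i, len = sizeof(buf) - 1;	/* reserve trailing '\0' */
	char *p = buf;

	if (len < 3)
		return "???";
	for (i = 0; i < datalen; i++) {
		if (len < 2) {
			strcpy(p - 3, "...");	/* mark the truncation */
			break;
		}
		sprintf(p, "%02x", data[i]);	/* two hex chars per byte */
		p += 2;
		len -= 2;
	}
	*p = '\0';
	return buf;
}

int main(void)
{
	unsigned char cookie[] = { 0xde, 0xad, 0xbe, 0xef };
	printf("%s\n", cookie2a(cookie, sizeof(cookie)));	/* deadbeef */
	return 0;
}
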
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 0caea5310ac3..d27aab11f324 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -80,7 +80,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
80 return 0; 80 return 0;
81 81
82no_locks: 82no_locks:
83 nlm_release_host(host); 83 nlmsvc_release_host(host);
84 if (error) 84 if (error)
85 return error; 85 return error;
86 return nlm_lck_denied_nolocks; 86 return nlm_lck_denied_nolocks;
@@ -122,7 +122,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
122 dprintk("lockd: TEST status %d vers %d\n", 122 dprintk("lockd: TEST status %d vers %d\n",
123 ntohl(resp->status), rqstp->rq_vers); 123 ntohl(resp->status), rqstp->rq_vers);
124 124
125 nlm_release_host(host); 125 nlmsvc_release_host(host);
126 nlm_release_file(file); 126 nlm_release_file(file);
127 return rc; 127 return rc;
128} 128}
@@ -164,7 +164,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
164 else 164 else
165 dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); 165 dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
166 166
167 nlm_release_host(host); 167 nlmsvc_release_host(host);
168 nlm_release_file(file); 168 nlm_release_file(file);
169 return rc; 169 return rc;
170} 170}
@@ -194,7 +194,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
194 resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock)); 194 resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock));
195 195
196 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); 196 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status));
197 nlm_release_host(host); 197 nlmsvc_release_host(host);
198 nlm_release_file(file); 198 nlm_release_file(file);
199 return rpc_success; 199 return rpc_success;
200} 200}
@@ -227,7 +227,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
227 resp->status = cast_status(nlmsvc_unlock(file, &argp->lock)); 227 resp->status = cast_status(nlmsvc_unlock(file, &argp->lock));
228 228
229 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); 229 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status));
230 nlm_release_host(host); 230 nlmsvc_release_host(host);
231 nlm_release_file(file); 231 nlm_release_file(file);
232 return rpc_success; 232 return rpc_success;
233} 233}
@@ -257,9 +257,17 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
257 -task->tk_status); 257 -task->tk_status);
258} 258}
259 259
260void nlmsvc_release_call(struct nlm_rqst *call)
261{
262 if (!atomic_dec_and_test(&call->a_count))
263 return;
264 nlmsvc_release_host(call->a_host);
265 kfree(call);
266}
267
260static void nlmsvc_callback_release(void *data) 268static void nlmsvc_callback_release(void *data)
261{ 269{
262 nlm_release_call(data); 270 nlmsvc_release_call(data);
263} 271}
264 272
265static const struct rpc_call_ops nlmsvc_callback_ops = { 273static const struct rpc_call_ops nlmsvc_callback_ops = {
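
The new nlmsvc_release_call() above is the standard drop-the-last-reference idiom: every holder decrements, and only the caller that takes the count to zero tears the object down, releasing the embedded host reference first. A minimal userspace model with C11 atomics (struct and function names here are illustrative stand-ins, not the kernel's):

#include <stdatomic.h>
#include <stdlib.h>

struct host;
static void release_host(struct host *h) { (void)h; /* drop host ref */ }

struct call {
	atomic_int a_count;
	struct host *a_host;
};

static void release_call(struct call *call)
{
	/* fetch_sub returns the old value: 1 means we dropped the last ref */
	if (atomic_fetch_sub(&call->a_count, 1) != 1)
		return;
	release_host(call->a_host);	/* drop the embedded reference */
	free(call);
}

int main(void)
{
	struct call *c = malloc(sizeof(*c));
	atomic_init(&c->a_count, 2);
	c->a_host = 0;
	release_call(c);	/* 2 -> 1: still referenced elsewhere */
	release_call(c);	/* 1 -> 0: host released, call freed */
	return 0;
}
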
@@ -291,7 +299,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
291 299
292 stat = func(rqstp, argp, &call->a_res); 300 stat = func(rqstp, argp, &call->a_res);
293 if (stat != 0) { 301 if (stat != 0) {
294 nlm_release_call(call); 302 nlmsvc_release_call(call);
295 return stat; 303 return stat;
296 } 304 }
297 305
@@ -366,7 +374,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
366 resp->status = cast_status(nlmsvc_share_file(host, file, argp)); 374 resp->status = cast_status(nlmsvc_share_file(host, file, argp));
367 375
368 dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); 376 dprintk("lockd: SHARE status %d\n", ntohl(resp->status));
369 nlm_release_host(host); 377 nlmsvc_release_host(host);
370 nlm_release_file(file); 378 nlm_release_file(file);
371 return rpc_success; 379 return rpc_success;
372} 380}
@@ -399,7 +407,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
399 resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); 407 resp->status = cast_status(nlmsvc_unshare_file(host, file, argp));
400 408
401 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); 409 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status));
402 nlm_release_host(host); 410 nlmsvc_release_host(host);
403 nlm_release_file(file); 411 nlm_release_file(file);
404 return rpc_success; 412 return rpc_success;
405} 413}
@@ -431,7 +439,7 @@ nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
431 return rpc_success; 439 return rpc_success;
432 440
433 nlmsvc_free_host_resources(host); 441 nlmsvc_free_host_resources(host);
434 nlm_release_host(host); 442 nlmsvc_release_host(host);
435 return rpc_success; 443 return rpc_success;
436} 444}
437 445
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index b583ab0a4cbb..964666c68a86 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -149,37 +149,6 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
149} 149}
150 150
151/* 151/*
152 * Encode a lock as part of an NLM call
153 */
154static __be32 *
155nlm_encode_lock(__be32 *p, struct nlm_lock *lock)
156{
157 struct file_lock *fl = &lock->fl;
158 __s32 start, len;
159
160 if (!(p = xdr_encode_string(p, lock->caller))
161 || !(p = nlm_encode_fh(p, &lock->fh))
162 || !(p = nlm_encode_oh(p, &lock->oh)))
163 return NULL;
164
165 if (fl->fl_start > NLM_OFFSET_MAX
166 || (fl->fl_end > NLM_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
167 return NULL;
168
169 start = loff_t_to_s32(fl->fl_start);
170 if (fl->fl_end == OFFSET_MAX)
171 len = 0;
172 else
173 len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
174
175 *p++ = htonl(lock->svid);
176 *p++ = htonl(start);
177 *p++ = htonl(len);
178
179 return p;
180}
181
182/*
183 * Encode result of a TEST/TEST_MSG call 152 * Encode result of a TEST/TEST_MSG call
184 */ 153 */
185static __be32 * 154static __be32 *
@@ -372,259 +341,3 @@ nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
372{ 341{
373 return xdr_ressize_check(rqstp, p); 342 return xdr_ressize_check(rqstp, p);
374} 343}
375
376/*
377 * Now, the client side XDR functions
378 */
379#ifdef NLMCLNT_SUPPORT_SHARES
380static int
381nlmclt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr)
382{
383 return 0;
384}
385#endif
386
387static int
388nlmclt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
389{
390 struct nlm_lock *lock = &argp->lock;
391
392 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
393 return -EIO;
394 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
395 if (!(p = nlm_encode_lock(p, lock)))
396 return -EIO;
397 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
398 return 0;
399}
400
401static int
402nlmclt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
403{
404 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
405 return -EIO;
406 resp->status = *p++;
407 if (resp->status == nlm_lck_denied) {
408 struct file_lock *fl = &resp->lock.fl;
409 u32 excl;
410 s32 start, len, end;
411
412 memset(&resp->lock, 0, sizeof(resp->lock));
413 locks_init_lock(fl);
414 excl = ntohl(*p++);
415 resp->lock.svid = ntohl(*p++);
416 fl->fl_pid = (pid_t)resp->lock.svid;
417 if (!(p = nlm_decode_oh(p, &resp->lock.oh)))
418 return -EIO;
419
420 fl->fl_flags = FL_POSIX;
421 fl->fl_type = excl? F_WRLCK : F_RDLCK;
422 start = ntohl(*p++);
423 len = ntohl(*p++);
424 end = start + len - 1;
425
426 fl->fl_start = s32_to_loff_t(start);
427 if (len == 0 || end < 0)
428 fl->fl_end = OFFSET_MAX;
429 else
430 fl->fl_end = s32_to_loff_t(end);
431 }
432 return 0;
433}
434
435
436static int
437nlmclt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
438{
439 struct nlm_lock *lock = &argp->lock;
440
441 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
442 return -EIO;
443 *p++ = argp->block? xdr_one : xdr_zero;
444 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
445 if (!(p = nlm_encode_lock(p, lock)))
446 return -EIO;
447 *p++ = argp->reclaim? xdr_one : xdr_zero;
448 *p++ = htonl(argp->state);
449 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
450 return 0;
451}
452
453static int
454nlmclt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
455{
456 struct nlm_lock *lock = &argp->lock;
457
458 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
459 return -EIO;
460 *p++ = argp->block? xdr_one : xdr_zero;
461 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
462 if (!(p = nlm_encode_lock(p, lock)))
463 return -EIO;
464 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
465 return 0;
466}
467
468static int
469nlmclt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
470{
471 struct nlm_lock *lock = &argp->lock;
472
473 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
474 return -EIO;
475 if (!(p = nlm_encode_lock(p, lock)))
476 return -EIO;
477 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
478 return 0;
479}
480
481static int
482nlmclt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
483{
484 if (!(p = nlm_encode_cookie(p, &resp->cookie)))
485 return -EIO;
486 *p++ = resp->status;
487 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
488 return 0;
489}
490
491static int
492nlmclt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
493{
494 if (!(p = nlm_encode_testres(p, resp)))
495 return -EIO;
496 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
497 return 0;
498}
499
500static int
501nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
502{
503 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
504 return -EIO;
505 resp->status = *p++;
506 return 0;
507}
508
509#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
510# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
511#endif
512
513/*
514 * Buffer requirements for NLM
515 */
516#define NLM_void_sz 0
517#define NLM_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
518#define NLM_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
519#define NLM_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
520#define NLM_fhandle_sz 1+XDR_QUADLEN(NFS2_FHSIZE)
521#define NLM_lock_sz 3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz
522#define NLM_holder_sz 4+NLM_owner_sz
523
524#define NLM_testargs_sz NLM_cookie_sz+1+NLM_lock_sz
525#define NLM_lockargs_sz NLM_cookie_sz+4+NLM_lock_sz
526#define NLM_cancargs_sz NLM_cookie_sz+2+NLM_lock_sz
527#define NLM_unlockargs_sz NLM_cookie_sz+NLM_lock_sz
528
529#define NLM_testres_sz NLM_cookie_sz+1+NLM_holder_sz
530#define NLM_res_sz NLM_cookie_sz+1
531#define NLM_norep_sz 0
532
533/*
534 * For NLM, a void procedure really returns nothing
535 */
536#define nlmclt_decode_norep NULL
537
538#define PROC(proc, argtype, restype) \
539[NLMPROC_##proc] = { \
540 .p_proc = NLMPROC_##proc, \
541 .p_encode = (kxdrproc_t) nlmclt_encode_##argtype, \
542 .p_decode = (kxdrproc_t) nlmclt_decode_##restype, \
543 .p_arglen = NLM_##argtype##_sz, \
544 .p_replen = NLM_##restype##_sz, \
545 .p_statidx = NLMPROC_##proc, \
546 .p_name = #proc, \
547 }
548
549static struct rpc_procinfo nlm_procedures[] = {
550 PROC(TEST, testargs, testres),
551 PROC(LOCK, lockargs, res),
552 PROC(CANCEL, cancargs, res),
553 PROC(UNLOCK, unlockargs, res),
554 PROC(GRANTED, testargs, res),
555 PROC(TEST_MSG, testargs, norep),
556 PROC(LOCK_MSG, lockargs, norep),
557 PROC(CANCEL_MSG, cancargs, norep),
558 PROC(UNLOCK_MSG, unlockargs, norep),
559 PROC(GRANTED_MSG, testargs, norep),
560 PROC(TEST_RES, testres, norep),
561 PROC(LOCK_RES, res, norep),
562 PROC(CANCEL_RES, res, norep),
563 PROC(UNLOCK_RES, res, norep),
564 PROC(GRANTED_RES, res, norep),
565#ifdef NLMCLNT_SUPPORT_SHARES
566 PROC(SHARE, shareargs, shareres),
567 PROC(UNSHARE, shareargs, shareres),
568 PROC(NM_LOCK, lockargs, res),
569 PROC(FREE_ALL, notify, void),
570#endif
571};
572
573static struct rpc_version nlm_version1 = {
574 .number = 1,
575 .nrprocs = 16,
576 .procs = nlm_procedures,
577};
578
579static struct rpc_version nlm_version3 = {
580 .number = 3,
581 .nrprocs = 24,
582 .procs = nlm_procedures,
583};
584
585static struct rpc_version * nlm_versions[] = {
586 [1] = &nlm_version1,
587 [3] = &nlm_version3,
588#ifdef CONFIG_LOCKD_V4
589 [4] = &nlm_version4,
590#endif
591};
592
593static struct rpc_stat nlm_stats;
594
595struct rpc_program nlm_program = {
596 .name = "lockd",
597 .number = NLM_PROGRAM,
598 .nrvers = ARRAY_SIZE(nlm_versions),
599 .version = nlm_versions,
600 .stats = &nlm_stats,
601};
602
603#ifdef RPC_DEBUG
604const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
605{
606 /*
607 * We can get away with a static buffer because we're only
608 * called with BKL held.
609 */
610 static char buf[2*NLM_MAXCOOKIELEN+1];
611 unsigned int i, len = sizeof(buf);
612 char *p = buf;
613
614 len--; /* allow for trailing \0 */
615 if (len < 3)
616 return "???";
617 for (i = 0 ; i < cookie->len ; i++) {
618 if (len < 2) {
619 strcpy(p-3, "...");
620 break;
621 }
622 sprintf(p, "%02x", cookie->data[i]);
623 p += 2;
624 len -= 2;
625 }
626 *p = '\0';
627
628 return buf;
629}
630#endif
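
The *_sz macros retained above count XDR words, not bytes: XDR_QUADLEN() rounds a byte length up to whole 32-bit words, and a variable-length opaque costs one extra word for its length prefix, which is where the leading 1+ in each macro comes from. A sketch of the arithmetic (the limit values below are representative assumptions, not necessarily the kernel's):

#include <stdio.h>

#define XDR_QUADLEN(l)	(((l) + 3) >> 2)	/* bytes -> 32-bit words */

/* Representative limits, for illustration only. */
#define NLM_MAXCOOKIELEN	32
#define NLMCLNT_OHSIZE		64
#define NFS2_FHSIZE		32

#define NLM_cookie_sz	(1 + XDR_QUADLEN(NLM_MAXCOOKIELEN))
#define NLM_caller_sz	(1 + XDR_QUADLEN(NLMCLNT_OHSIZE))
#define NLM_owner_sz	(1 + XDR_QUADLEN(NLMCLNT_OHSIZE))
#define NLM_fhandle_sz	(1 + XDR_QUADLEN(NFS2_FHSIZE))
#define NLM_lock_sz	(3 + NLM_caller_sz + NLM_owner_sz + NLM_fhandle_sz)

int main(void)
{
	/* TESTargs = cookie + one word for the exclusive flag + lock */
	int words = NLM_cookie_sz + 1 + NLM_lock_sz;
	printf("NLM_testargs_sz = %d words = %d bytes\n", words, 4 * words);
	return 0;
}
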
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index ad9dbbc9145d..dfa4789cd460 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -93,15 +93,6 @@ nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
93 return p + XDR_QUADLEN(f->size); 93 return p + XDR_QUADLEN(f->size);
94} 94}
95 95
96static __be32 *
97nlm4_encode_fh(__be32 *p, struct nfs_fh *f)
98{
99 *p++ = htonl(f->size);
100 if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */
101 memcpy(p, f->data, f->size);
102 return p + XDR_QUADLEN(f->size);
103}
104
105/* 96/*
106 * Encode and decode owner handle 97 * Encode and decode owner handle
107 */ 98 */
@@ -112,12 +103,6 @@ nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh)
112} 103}
113 104
114static __be32 * 105static __be32 *
115nlm4_encode_oh(__be32 *p, struct xdr_netobj *oh)
116{
117 return xdr_encode_netobj(p, oh);
118}
119
120static __be32 *
121nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) 106nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
122{ 107{
123 struct file_lock *fl = &lock->fl; 108 struct file_lock *fl = &lock->fl;
@@ -150,38 +135,6 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
150} 135}
151 136
152/* 137/*
153 * Encode a lock as part of an NLM call
154 */
155static __be32 *
156nlm4_encode_lock(__be32 *p, struct nlm_lock *lock)
157{
158 struct file_lock *fl = &lock->fl;
159 __s64 start, len;
160
161 if (!(p = xdr_encode_string(p, lock->caller))
162 || !(p = nlm4_encode_fh(p, &lock->fh))
163 || !(p = nlm4_encode_oh(p, &lock->oh)))
164 return NULL;
165
166 if (fl->fl_start > NLM4_OFFSET_MAX
167 || (fl->fl_end > NLM4_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
168 return NULL;
169
170 *p++ = htonl(lock->svid);
171
172 start = loff_t_to_s64(fl->fl_start);
173 if (fl->fl_end == OFFSET_MAX)
174 len = 0;
175 else
176 len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
177
178 p = xdr_encode_hyper(p, start);
179 p = xdr_encode_hyper(p, len);
180
181 return p;
182}
183
184/*
185 * Encode result of a TEST/TEST_MSG call 138 * Encode result of a TEST/TEST_MSG call
186 */ 139 */
187static __be32 * 140static __be32 *
@@ -379,211 +332,3 @@ nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
379{ 332{
380 return xdr_ressize_check(rqstp, p); 333 return xdr_ressize_check(rqstp, p);
381} 334}
382
383/*
384 * Now, the client side XDR functions
385 */
386#ifdef NLMCLNT_SUPPORT_SHARES
387static int
388nlm4clt_decode_void(struct rpc_rqst *req, __be32 *p, void *ptr)
389{
390 return 0;
391}
392#endif
393
394static int
395nlm4clt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
396{
397 struct nlm_lock *lock = &argp->lock;
398
399 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
400 return -EIO;
401 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
402 if (!(p = nlm4_encode_lock(p, lock)))
403 return -EIO;
404 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
405 return 0;
406}
407
408static int
409nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
410{
411 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
412 return -EIO;
413 resp->status = *p++;
414 if (resp->status == nlm_lck_denied) {
415 struct file_lock *fl = &resp->lock.fl;
416 u32 excl;
417 __u64 start, len;
418 __s64 end;
419
420 memset(&resp->lock, 0, sizeof(resp->lock));
421 locks_init_lock(fl);
422 excl = ntohl(*p++);
423 resp->lock.svid = ntohl(*p++);
424 fl->fl_pid = (pid_t)resp->lock.svid;
425 if (!(p = nlm4_decode_oh(p, &resp->lock.oh)))
426 return -EIO;
427
428 fl->fl_flags = FL_POSIX;
429 fl->fl_type = excl? F_WRLCK : F_RDLCK;
430 p = xdr_decode_hyper(p, &start);
431 p = xdr_decode_hyper(p, &len);
432 end = start + len - 1;
433
434 fl->fl_start = s64_to_loff_t(start);
435 if (len == 0 || end < 0)
436 fl->fl_end = OFFSET_MAX;
437 else
438 fl->fl_end = s64_to_loff_t(end);
439 }
440 return 0;
441}
442
443
444static int
445nlm4clt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
446{
447 struct nlm_lock *lock = &argp->lock;
448
449 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
450 return -EIO;
451 *p++ = argp->block? xdr_one : xdr_zero;
452 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
453 if (!(p = nlm4_encode_lock(p, lock)))
454 return -EIO;
455 *p++ = argp->reclaim? xdr_one : xdr_zero;
456 *p++ = htonl(argp->state);
457 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
458 return 0;
459}
460
461static int
462nlm4clt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
463{
464 struct nlm_lock *lock = &argp->lock;
465
466 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
467 return -EIO;
468 *p++ = argp->block? xdr_one : xdr_zero;
469 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
470 if (!(p = nlm4_encode_lock(p, lock)))
471 return -EIO;
472 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
473 return 0;
474}
475
476static int
477nlm4clt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
478{
479 struct nlm_lock *lock = &argp->lock;
480
481 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
482 return -EIO;
483 if (!(p = nlm4_encode_lock(p, lock)))
484 return -EIO;
485 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
486 return 0;
487}
488
489static int
490nlm4clt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
491{
492 if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
493 return -EIO;
494 *p++ = resp->status;
495 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
496 return 0;
497}
498
499static int
500nlm4clt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
501{
502 if (!(p = nlm4_encode_testres(p, resp)))
503 return -EIO;
504 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
505 return 0;
506}
507
508static int
509nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
510{
511 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
512 return -EIO;
513 resp->status = *p++;
514 return 0;
515}
516
517#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
518# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
519#endif
520
521#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
522# error "NLM host name cannot be larger than NLM's maximum string length!"
523#endif
524
525/*
526 * Buffer requirements for NLM
527 */
528#define NLM4_void_sz 0
529#define NLM4_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
530#define NLM4_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
531#define NLM4_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
532#define NLM4_fhandle_sz 1+XDR_QUADLEN(NFS3_FHSIZE)
533#define NLM4_lock_sz 5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz
534#define NLM4_holder_sz 6+NLM4_owner_sz
535
536#define NLM4_testargs_sz NLM4_cookie_sz+1+NLM4_lock_sz
537#define NLM4_lockargs_sz NLM4_cookie_sz+4+NLM4_lock_sz
538#define NLM4_cancargs_sz NLM4_cookie_sz+2+NLM4_lock_sz
539#define NLM4_unlockargs_sz NLM4_cookie_sz+NLM4_lock_sz
540
541#define NLM4_testres_sz NLM4_cookie_sz+1+NLM4_holder_sz
542#define NLM4_res_sz NLM4_cookie_sz+1
543#define NLM4_norep_sz 0
544
545/*
546 * For NLM, a void procedure really returns nothing
547 */
548#define nlm4clt_decode_norep NULL
549
550#define PROC(proc, argtype, restype) \
551[NLMPROC_##proc] = { \
552 .p_proc = NLMPROC_##proc, \
553 .p_encode = (kxdrproc_t) nlm4clt_encode_##argtype, \
554 .p_decode = (kxdrproc_t) nlm4clt_decode_##restype, \
555 .p_arglen = NLM4_##argtype##_sz, \
556 .p_replen = NLM4_##restype##_sz, \
557 .p_statidx = NLMPROC_##proc, \
558 .p_name = #proc, \
559 }
560
561static struct rpc_procinfo nlm4_procedures[] = {
562 PROC(TEST, testargs, testres),
563 PROC(LOCK, lockargs, res),
564 PROC(CANCEL, cancargs, res),
565 PROC(UNLOCK, unlockargs, res),
566 PROC(GRANTED, testargs, res),
567 PROC(TEST_MSG, testargs, norep),
568 PROC(LOCK_MSG, lockargs, norep),
569 PROC(CANCEL_MSG, cancargs, norep),
570 PROC(UNLOCK_MSG, unlockargs, norep),
571 PROC(GRANTED_MSG, testargs, norep),
572 PROC(TEST_RES, testres, norep),
573 PROC(LOCK_RES, res, norep),
574 PROC(CANCEL_RES, res, norep),
575 PROC(UNLOCK_RES, res, norep),
576 PROC(GRANTED_RES, res, norep),
577#ifdef NLMCLNT_SUPPORT_SHARES
578 PROC(SHARE, shareargs, shareres),
579 PROC(UNSHARE, shareargs, shareres),
580 PROC(NM_LOCK, lockargs, res),
581 PROC(FREE_ALL, notify, void),
582#endif
583};
584
585struct rpc_version nlm_version4 = {
586 .number = 4,
587 .nrprocs = 24,
588 .procs = nlm4_procedures,
589};
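
The deleted NLMv4 client encoders used xdr_encode_hyper() for the 64-bit offsets that distinguish v4 from v1/v3: a hyper goes on the wire as two big-endian 32-bit words, high word first, and a length of zero means "to end of file". A userspace sketch of that convention (encode_hyper/encode_range are illustrative names, and offset_max stands in for OFFSET_MAX):

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

/* A 64-bit "hyper": two big-endian words, most significant first. */
static uint32_t *encode_hyper(uint32_t *p, uint64_t val)
{
	*p++ = htonl((uint32_t)(val >> 32));
	*p++ = htonl((uint32_t)(val & 0xffffffffu));
	return p;
}

/* NLMv4 sends (start, len); len == 0 encodes "lock to end of file". */
static uint32_t *encode_range(uint32_t *p, uint64_t start, uint64_t end,
			      uint64_t offset_max)
{
	uint64_t len = (end == offset_max) ? 0 : end - start + 1;

	p = encode_hyper(p, start);
	return encode_hyper(p, len);
}

int main(void)
{
	uint32_t buf[4], *p = encode_range(buf, 100, 199, UINT64_MAX);

	printf("used %u words\n", (unsigned)(p - buf));	/* 4 */
	return 0;
}
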
diff --git a/fs/locks.c b/fs/locks.c
index 08415b2a6d36..0f3998291f78 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -444,15 +444,9 @@ static void lease_release_private_callback(struct file_lock *fl)
444 fl->fl_file->f_owner.signum = 0; 444 fl->fl_file->f_owner.signum = 0;
445} 445}
446 446
447static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
448{
449 return fl->fl_file == try->fl_file;
450}
451
452static const struct lock_manager_operations lease_manager_ops = { 447static const struct lock_manager_operations lease_manager_ops = {
453 .fl_break = lease_break_callback, 448 .fl_break = lease_break_callback,
454 .fl_release_private = lease_release_private_callback, 449 .fl_release_private = lease_release_private_callback,
455 .fl_mylease = lease_mylease_callback,
456 .fl_change = lease_modify, 450 .fl_change = lease_modify,
457}; 451};
458 452
@@ -1405,7 +1399,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1405 for (before = &inode->i_flock; 1399 for (before = &inode->i_flock;
1406 ((fl = *before) != NULL) && IS_LEASE(fl); 1400 ((fl = *before) != NULL) && IS_LEASE(fl);
1407 before = &fl->fl_next) { 1401 before = &fl->fl_next) {
1408 if (lease->fl_lmops->fl_mylease(fl, lease)) 1402 if (fl->fl_file == filp)
1409 my_before = before; 1403 my_before = before;
1410 else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) 1404 else if (fl->fl_type == (F_INPROGRESS | F_UNLCK))
1411 /* 1405 /*
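
The only fl_mylease() implementation compared struct file pointers, so generic_setlease() can do that comparison inline and the callback goes away. A cut-down model of the scan it replaces (types reduced to the two fields that matter; names are illustrative):

#include <stdio.h>
#include <stddef.h>

struct file { int id; };

struct file_lock {
	struct file_lock *fl_next;
	struct file *fl_file;
};

/* A lease on this inode is "mine" iff it refers to the same open file. */
static struct file_lock **find_my_lease(struct file_lock **before,
					struct file *filp)
{
	struct file_lock *fl;

	for (; (fl = *before) != NULL; before = &fl->fl_next)
		if (fl->fl_file == filp)
			return before;	/* slot holding my lease */
	return NULL;
}

int main(void)
{
	struct file a = { 1 }, b = { 2 };
	struct file_lock l2 = { NULL, &b }, l1 = { &l2, &a };
	struct file_lock *head = &l1;
	struct file_lock **slot = find_my_lease(&head, &b);

	printf("found file %d\n", (*slot)->fl_file->id);	/* 2 */
	return 0;
}
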
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 92ca6fbe09bd..723bc5bca09a 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -300,7 +300,7 @@ static int bdev_write_sb(struct super_block *sb, struct page *page)
300 300
301static void bdev_put_device(struct logfs_super *s) 301static void bdev_put_device(struct logfs_super *s)
302{ 302{
303 close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE); 303 blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
304} 304}
305 305
306static int bdev_can_write_buf(struct super_block *sb, u64 ofs) 306static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
@@ -325,13 +325,14 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
325{ 325{
326 struct block_device *bdev; 326 struct block_device *bdev;
327 327
328 bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type); 328 bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
329 type);
329 if (IS_ERR(bdev)) 330 if (IS_ERR(bdev))
330 return PTR_ERR(bdev); 331 return PTR_ERR(bdev);
331 332
332 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { 333 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
333 int mtdnr = MINOR(bdev->bd_dev); 334 int mtdnr = MINOR(bdev->bd_dev);
334 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 335 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
335 return logfs_get_sb_mtd(p, mtdnr); 336 return logfs_get_sb_mtd(p, mtdnr);
336 } 337 }
337 338
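
Here the open_bdev_exclusive()/close_bdev_exclusive() pair becomes blkdev_get_by_path()/blkdev_put() with FMODE_EXCL folded into the mode mask; the same mask, FMODE_EXCL included, has to be passed back at put time or the exclusive claim is not dropped. A kernel-context sketch of the pairing (not a standalone program; the holder cookie is whatever pointer identifies the claimant, the filesystem type in the hunk above):

#include <linux/blkdev.h>
#include <linux/err.h>

static int open_backing_dev(const char *devname, void *holder,
			    struct block_device **out)
{
	struct block_device *bdev;

	/* FMODE_EXCL makes this an exclusive claim keyed on 'holder'. */
	bdev = blkdev_get_by_path(devname,
				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				  holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	*out = bdev;
	return 0;
}

static void close_backing_dev(struct block_device *bdev)
{
	/* The mode mask must match the get, FMODE_EXCL included. */
	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
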
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 93444747237b..a25444ab2baf 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -76,18 +76,6 @@ EXPORT_SYMBOL(mb_cache_entry_find_first);
76EXPORT_SYMBOL(mb_cache_entry_find_next); 76EXPORT_SYMBOL(mb_cache_entry_find_next);
77#endif 77#endif
78 78
79struct mb_cache {
80 struct list_head c_cache_list;
81 const char *c_name;
82 atomic_t c_entry_count;
83 int c_max_entries;
84 int c_bucket_bits;
85 struct kmem_cache *c_entry_cache;
86 struct list_head *c_block_hash;
87 struct list_head *c_index_hash;
88};
89
90
91/* 79/*
92 * Global data: list of all mbcache's, lru list, and a spinlock for 80 * Global data: list of all mbcache's, lru list, and a spinlock for
93 * accessing cache data structures on SMP machines. The lru list is 81 * accessing cache data structures on SMP machines. The lru list is
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 1b9e07728a9f..ce7337ddfdbf 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -23,8 +23,6 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, st
23 struct inode * inode = NULL; 23 struct inode * inode = NULL;
24 ino_t ino; 24 ino_t ino;
25 25
26 d_set_d_op(dentry, dir->i_sb->s_root->d_op);
27
28 if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen) 26 if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen)
29 return ERR_PTR(-ENAMETOOLONG); 27 return ERR_PTR(-ENAMETOOLONG);
30 28
diff --git a/fs/mpage.c b/fs/mpage.c
index fd56ca2ea556..d78455a81ec9 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -40,7 +40,7 @@
40 * status of that page is hard. See end_buffer_async_read() for the details. 40 * status of that page is hard. See end_buffer_async_read() for the details.
41 * There is no point in duplicating all that complexity. 41 * There is no point in duplicating all that complexity.
42 */ 42 */
43static void mpage_end_io_read(struct bio *bio, int err) 43static void mpage_end_io(struct bio *bio, int err)
44{ 44{
45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err)
50 50
51 if (--bvec >= bio->bi_io_vec) 51 if (--bvec >= bio->bi_io_vec)
52 prefetchw(&bvec->bv_page->flags); 52 prefetchw(&bvec->bv_page->flags);
53 53 if (bio_data_dir(bio) == READ) {
54 if (uptodate) { 54 if (uptodate) {
55 SetPageUptodate(page); 55 SetPageUptodate(page);
56 } else { 56 } else {
57 ClearPageUptodate(page); 57 ClearPageUptodate(page);
58 SetPageError(page); 58 SetPageError(page);
59 } 59 }
60 unlock_page(page); 60 unlock_page(page);
61 } while (bvec >= bio->bi_io_vec); 61 } else { /* bio_data_dir(bio) == WRITE */
62 bio_put(bio); 62 if (!uptodate) {
63} 63 SetPageError(page);
64 64 if (page->mapping)
65static void mpage_end_io_write(struct bio *bio, int err) 65 set_bit(AS_EIO, &page->mapping->flags);
66{ 66 }
67 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 67 end_page_writeback(page);
68 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
69
70 do {
71 struct page *page = bvec->bv_page;
72
73 if (--bvec >= bio->bi_io_vec)
74 prefetchw(&bvec->bv_page->flags);
75
76 if (!uptodate){
77 SetPageError(page);
78 if (page->mapping)
79 set_bit(AS_EIO, &page->mapping->flags);
80 } 68 }
81 end_page_writeback(page);
82 } while (bvec >= bio->bi_io_vec); 69 } while (bvec >= bio->bi_io_vec);
83 bio_put(bio); 70 bio_put(bio);
84} 71}
85 72
86static struct bio *mpage_bio_submit(int rw, struct bio *bio) 73static struct bio *mpage_bio_submit(int rw, struct bio *bio)
87{ 74{
88 bio->bi_end_io = mpage_end_io_read; 75 bio->bi_end_io = mpage_end_io;
89 if (rw == WRITE)
90 bio->bi_end_io = mpage_end_io_write;
91 submit_bio(rw, bio); 76 submit_bio(rw, bio);
92 return NULL; 77 return NULL;
93} 78}
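
mpage_end_io_read() and mpage_end_io_write() walked the bio vector identically and differed only in the per-page action, so they collapse into one completion keyed on bio_data_dir(). A userspace model of the per-page branch (the page_state struct is a stand-in for the real page flags):

#include <stdbool.h>
#include <stdio.h>

enum dir { DIR_READ, DIR_WRITE };

struct page_state { bool uptodate, error, locked, writeback; };

static void end_io_one(struct page_state *pg, enum dir d, bool ok)
{
	if (d == DIR_READ) {
		pg->uptodate = ok;	/* SetPageUptodate / ClearPageUptodate */
		pg->error = !ok;
		pg->locked = false;	/* unlock_page() */
	} else {
		if (!ok)
			pg->error = true;	/* SetPageError + AS_EIO */
		pg->writeback = false;		/* end_page_writeback() */
	}
}

int main(void)
{
	struct page_state pg = { false, false, true, false };

	end_io_one(&pg, DIR_READ, true);
	printf("uptodate=%d locked=%d\n", pg.uptodate, pg.locked);
	return 0;
}
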
diff --git a/fs/namei.c b/fs/namei.c
index 19433cdba011..7d77f24d32a9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -202,7 +202,7 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
202 * @inode: inode to check access rights for 202 * @inode: inode to check access rights for
203 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 203 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
204 * @check_acl: optional callback to check for Posix ACLs 204 * @check_acl: optional callback to check for Posix ACLs
205 * @flags IPERM_FLAG_ flags. 205 * @flags: IPERM_FLAG_ flags.
206 * 206 *
207 * Used to check for read/write/execute permissions on a file. 207 * Used to check for read/write/execute permissions on a file.
208 * We use "fsuid" for this, letting us set arbitrary permissions 208 * We use "fsuid" for this, letting us set arbitrary permissions
@@ -368,18 +368,6 @@ void path_get(struct path *path)
368EXPORT_SYMBOL(path_get); 368EXPORT_SYMBOL(path_get);
369 369
370/** 370/**
371 * path_get_long - get a long reference to a path
372 * @path: path to get the reference to
373 *
374 * Given a path increment the reference count to the dentry and the vfsmount.
375 */
376void path_get_long(struct path *path)
377{
378 mntget_long(path->mnt);
379 dget(path->dentry);
380}
381
382/**
383 * path_put - put a reference to a path 371 * path_put - put a reference to a path
384 * @path: path to put the reference to 372 * @path: path to put the reference to
385 * 373 *
@@ -393,21 +381,9 @@ void path_put(struct path *path)
393EXPORT_SYMBOL(path_put); 381EXPORT_SYMBOL(path_put);
394 382
395/** 383/**
396 * path_put_long - put a long reference to a path
397 * @path: path to put the reference to
398 *
399 * Given a path decrement the reference count to the dentry and the vfsmount.
400 */
401void path_put_long(struct path *path)
402{
403 dput(path->dentry);
404 mntput_long(path->mnt);
405}
406
407/**
408 * nameidata_drop_rcu - drop this nameidata out of rcu-walk 384 * nameidata_drop_rcu - drop this nameidata out of rcu-walk
409 * @nd: nameidata pathwalk data to drop 385 * @nd: nameidata pathwalk data to drop
410 * @Returns: 0 on success, -ECHLID on failure 386 * Returns: 0 on success, -ECHILD on failure
411 * 387 *
412 * Path walking has 2 modes, rcu-walk and ref-walk (see 388 * Path walking has 2 modes, rcu-walk and ref-walk (see
413 * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt 389 * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
@@ -468,7 +444,7 @@ static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
468 * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk 444 * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
469 * @nd: nameidata pathwalk data to drop 445 * @nd: nameidata pathwalk data to drop
470 * @dentry: dentry to drop 446 * @dentry: dentry to drop
471 * @Returns: 0 on success, -ECHLID on failure 447 * Returns: 0 on success, -ECHILD on failure
472 * 448 *
473 * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root, 449 * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
474 * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on 450 * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
@@ -479,6 +455,14 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
479 struct fs_struct *fs = current->fs; 455 struct fs_struct *fs = current->fs;
480 struct dentry *parent = nd->path.dentry; 456 struct dentry *parent = nd->path.dentry;
481 457
458 /*
459 * It can be possible to revalidate the dentry that we started
460 * the path walk with. force_reval_path may also revalidate the
461 * dentry already committed to the nameidata.
462 */
463 if (unlikely(parent == dentry))
464 return nameidata_drop_rcu(nd);
465
482 BUG_ON(!(nd->flags & LOOKUP_RCU)); 466 BUG_ON(!(nd->flags & LOOKUP_RCU));
483 if (nd->root.mnt) { 467 if (nd->root.mnt) {
484 spin_lock(&fs->lock); 468 spin_lock(&fs->lock);
@@ -530,7 +514,7 @@ static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct d
530/** 514/**
531 * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk 515 * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
532 * @nd: nameidata pathwalk data to drop 516 * @nd: nameidata pathwalk data to drop
533 * @Returns: 0 on success, -ECHLID on failure 517 * Returns: 0 on success, -ECHILD on failure
534 * 518 *
535 * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk. 519 * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
536 * nd->path should be the final element of the lookup, so nd->root is discarded. 520 * nd->path should be the final element of the lookup, so nd->root is discarded.
@@ -583,6 +567,13 @@ void release_open_intent(struct nameidata *nd)
583 fput(nd->intent.open.file); 567 fput(nd->intent.open.file);
584} 568}
585 569
570/*
571 * Call d_revalidate and handle filesystems that request rcu-walk
572 * to be dropped. This may be called and return in rcu-walk mode,
573 * regardless of success or error. If -ECHILD is returned, the caller
574 * must return -ECHILD back up the path walk stack so path walk may
575 * be restarted in ref-walk mode.
576 */
586static int d_revalidate(struct dentry *dentry, struct nameidata *nd) 577static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
587{ 578{
588 int status; 579 int status;
@@ -673,6 +664,9 @@ force_reval_path(struct path *path, struct nameidata *nd)
673 return 0; 664 return 0;
674 665
675 if (!status) { 666 if (!status) {
667 /* Don't d_invalidate in rcu-walk mode */
668 if (nameidata_drop_rcu(nd))
669 return -ECHILD;
676 d_invalidate(dentry); 670 d_invalidate(dentry);
677 status = -ESTALE; 671 status = -ESTALE;
678 } 672 }
@@ -761,7 +755,8 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
761 mntput(path->mnt); 755 mntput(path->mnt);
762} 756}
763 757
764static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 758static inline void path_to_nameidata(const struct path *path,
759 struct nameidata *nd)
765{ 760{
766 if (!(nd->flags & LOOKUP_RCU)) { 761 if (!(nd->flags & LOOKUP_RCU)) {
767 dput(nd->path.dentry); 762 dput(nd->path.dentry);
@@ -773,20 +768,16 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
773} 768}
774 769
775static __always_inline int 770static __always_inline int
776__do_follow_link(struct path *path, struct nameidata *nd, void **p) 771__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
777{ 772{
778 int error; 773 int error;
779 struct dentry *dentry = path->dentry; 774 struct dentry *dentry = link->dentry;
780 775
781 touch_atime(path->mnt, dentry); 776 touch_atime(link->mnt, dentry);
782 nd_set_link(nd, NULL); 777 nd_set_link(nd, NULL);
783 778
784 if (path->mnt != nd->path.mnt) { 779 if (link->mnt == nd->path.mnt)
785 path_to_nameidata(path, nd); 780 mntget(link->mnt);
786 nd->inode = nd->path.dentry->d_inode;
787 dget(dentry);
788 }
789 mntget(path->mnt);
790 781
791 nd->last_type = LAST_BIND; 782 nd->last_type = LAST_BIND;
792 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 783 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
@@ -877,54 +868,148 @@ int follow_up(struct path *path)
877} 868}
878 869
879/* 870/*
880 * serialization is taken care of in namespace.c 871 * Perform an automount
872 * - return -EISDIR to tell follow_managed() to stop and return the path we
873 * were called with.
881 */ 874 */
882static void __follow_mount_rcu(struct nameidata *nd, struct path *path, 875static int follow_automount(struct path *path, unsigned flags,
883 struct inode **inode) 876 bool *need_mntput)
884{ 877{
885 while (d_mountpoint(path->dentry)) { 878 struct vfsmount *mnt;
886 struct vfsmount *mounted; 879 int err;
887 mounted = __lookup_mnt(path->mnt, path->dentry, 1); 880
888 if (!mounted) 881 if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
889 return; 882 return -EREMOTE;
890 path->mnt = mounted; 883
891 path->dentry = mounted->mnt_root; 884 /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
892 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 885 * and this is the terminal part of the path.
893 *inode = path->dentry->d_inode; 886 */
887 if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE))
888 return -EISDIR; /* we actually want to stop here */
889
890 /* We want to mount if someone is trying to open/create a file of any
891 * type under the mountpoint, wants to traverse through the mountpoint
892 * or wants to open the mounted directory.
893 *
894 * We don't want to mount if someone's just doing a stat and they've
895 * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and
896 * appended a '/' to the name.
897 */
898 if (!(flags & LOOKUP_FOLLOW) &&
899 !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY |
900 LOOKUP_OPEN | LOOKUP_CREATE)))
901 return -EISDIR;
902
903 current->total_link_count++;
904 if (current->total_link_count >= 40)
905 return -ELOOP;
906
907 mnt = path->dentry->d_op->d_automount(path);
908 if (IS_ERR(mnt)) {
909 /*
910 * The filesystem is allowed to return -EISDIR here to indicate
911 * it doesn't want to automount. For instance, autofs would do
912 * this so that its userspace daemon can mount on this dentry.
913 *
914 * However, we can only permit this if it's a terminal point in
915 * the path being looked up; if it wasn't then the remainder of
916 * the path is inaccessible and we should say so.
917 */
918 if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE))
919 return -EREMOTE;
920 return PTR_ERR(mnt);
894 } 921 }
895}
896 922
897static int __follow_mount(struct path *path) 923 if (!mnt) /* mount collision */
898{ 924 return 0;
899 int res = 0; 925
900 while (d_mountpoint(path->dentry)) { 926 err = finish_automount(mnt, path);
901 struct vfsmount *mounted = lookup_mnt(path); 927
902 if (!mounted) 928 switch (err) {
903 break; 929 case -EBUSY:
930 /* Someone else made a mount here whilst we were busy */
931 return 0;
932 case 0:
904 dput(path->dentry); 933 dput(path->dentry);
905 if (res) 934 if (*need_mntput)
906 mntput(path->mnt); 935 mntput(path->mnt);
907 path->mnt = mounted; 936 path->mnt = mnt;
908 path->dentry = dget(mounted->mnt_root); 937 path->dentry = dget(mnt->mnt_root);
909 res = 1; 938 *need_mntput = true;
939 return 0;
940 default:
941 return err;
910 } 942 }
911 return res; 943
912} 944}
913 945
914static void follow_mount(struct path *path) 946/*
947 * Handle a dentry that is managed in some way.
948 * - Flagged for transit management (autofs)
949 * - Flagged as mountpoint
950 * - Flagged as automount point
951 *
952 * This may only be called in refwalk mode.
953 *
954 * Serialization is taken care of in namespace.c
955 */
956static int follow_managed(struct path *path, unsigned flags)
915{ 957{
916 while (d_mountpoint(path->dentry)) { 958 unsigned managed;
917 struct vfsmount *mounted = lookup_mnt(path); 959 bool need_mntput = false;
918 if (!mounted) 960 int ret;
919 break; 961
920 dput(path->dentry); 962 /* Given that we're not holding a lock here, we retain the value in a
921 mntput(path->mnt); 963 * local variable for each dentry as we look at it so that we don't see
922 path->mnt = mounted; 964 * the components of that value change under us */
923 path->dentry = dget(mounted->mnt_root); 965 while (managed = ACCESS_ONCE(path->dentry->d_flags),
966 managed &= DCACHE_MANAGED_DENTRY,
967 unlikely(managed != 0)) {
968 /* Allow the filesystem to manage the transit without i_mutex
969 * being held. */
970 if (managed & DCACHE_MANAGE_TRANSIT) {
971 BUG_ON(!path->dentry->d_op);
972 BUG_ON(!path->dentry->d_op->d_manage);
973 ret = path->dentry->d_op->d_manage(path->dentry,
974 false, false);
975 if (ret < 0)
976 return ret == -EISDIR ? 0 : ret;
977 }
978
979 /* Transit to a mounted filesystem. */
980 if (managed & DCACHE_MOUNTED) {
981 struct vfsmount *mounted = lookup_mnt(path);
982 if (mounted) {
983 dput(path->dentry);
984 if (need_mntput)
985 mntput(path->mnt);
986 path->mnt = mounted;
987 path->dentry = dget(mounted->mnt_root);
988 need_mntput = true;
989 continue;
990 }
991
992 /* Something is mounted on this dentry in another
993 * namespace and/or whatever was mounted there in this
994 * namespace got unmounted before we managed to get the
995 * vfsmount_lock */
996 }
997
998 /* Handle an automount point */
999 if (managed & DCACHE_NEED_AUTOMOUNT) {
1000 ret = follow_automount(path, flags, &need_mntput);
1001 if (ret < 0)
1002 return ret == -EISDIR ? 0 : ret;
1003 continue;
1004 }
1005
1006 /* We didn't change the current path point */
1007 break;
924 } 1008 }
1009 return 0;
925} 1010}
926 1011
927int follow_down(struct path *path) 1012int follow_down_one(struct path *path)
928{ 1013{
929 struct vfsmount *mounted; 1014 struct vfsmount *mounted;
930 1015
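
Most of follow_automount() above is policy: decide whether this lookup should trigger d_automount() at all, and reuse the symlink loop counter as the recursion limit. The trigger policy can be modelled as a pure predicate (the flag values below are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag values. */
#define LOOKUP_FOLLOW		0x0001
#define LOOKUP_DIRECTORY	0x0002
#define LOOKUP_CONTINUE		0x0004
#define LOOKUP_OPEN		0x0100
#define LOOKUP_CREATE		0x0200
#define LOOKUP_NO_AUTOMOUNT	0x0800

static bool should_automount(unsigned flags)
{
	/* AT_NO_AUTOMOUNT on the terminal component: caller said stop. */
	if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE))
		return false;
	/* A bare stat-like lookup of the last component: don't mount. */
	if (!(flags & LOOKUP_FOLLOW) &&
	    !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY |
		       LOOKUP_OPEN | LOOKUP_CREATE)))
		return false;
	return true;
}

int main(void)
{
	printf("stat: %d, open: %d\n",
	       should_automount(0), should_automount(LOOKUP_OPEN));
	return 0;
}
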
@@ -939,13 +1024,41 @@ int follow_down(struct path *path)
939 return 0; 1024 return 0;
940} 1025}
941 1026
1027/*
1028 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we
1029 * meet a managed dentry and we're not walking to "..". True is returned to
1030 * continue, false to abort.
1031 */
1032static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1033 struct inode **inode, bool reverse_transit)
1034{
1035 while (d_mountpoint(path->dentry)) {
1036 struct vfsmount *mounted;
1037 if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
1038 !reverse_transit &&
1039 path->dentry->d_op->d_manage(path->dentry, false, true) < 0)
1040 return false;
1041 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
1042 if (!mounted)
1043 break;
1044 path->mnt = mounted;
1045 path->dentry = mounted->mnt_root;
1046 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1047 *inode = path->dentry->d_inode;
1048 }
1049
1050 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
1051 return reverse_transit;
1052 return true;
1053}
1054
942static int follow_dotdot_rcu(struct nameidata *nd) 1055static int follow_dotdot_rcu(struct nameidata *nd)
943{ 1056{
944 struct inode *inode = nd->inode; 1057 struct inode *inode = nd->inode;
945 1058
946 set_root_rcu(nd); 1059 set_root_rcu(nd);
947 1060
948 while(1) { 1061 while (1) {
949 if (nd->path.dentry == nd->root.dentry && 1062 if (nd->path.dentry == nd->root.dentry &&
950 nd->path.mnt == nd->root.mnt) { 1063 nd->path.mnt == nd->root.mnt) {
951 break; 1064 break;
@@ -968,12 +1081,80 @@ static int follow_dotdot_rcu(struct nameidata *nd)
968 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 1081 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
969 inode = nd->path.dentry->d_inode; 1082 inode = nd->path.dentry->d_inode;
970 } 1083 }
971 __follow_mount_rcu(nd, &nd->path, &inode); 1084 __follow_mount_rcu(nd, &nd->path, &inode, true);
972 nd->inode = inode; 1085 nd->inode = inode;
973 1086
974 return 0; 1087 return 0;
975} 1088}
976 1089
1090/*
1091 * Follow down to the covering mount currently visible to userspace. At each
1092 * point, the filesystem owning that dentry may be queried as to whether the
1093 * caller is permitted to proceed or not.
1094 *
1095 * Care must be taken as namespace_sem may be held (indicated by mounting_here
1096 * being true).
1097 */
1098int follow_down(struct path *path, bool mounting_here)
1099{
1100 unsigned managed;
1101 int ret;
1102
1103 while (managed = ACCESS_ONCE(path->dentry->d_flags),
1104 unlikely(managed & DCACHE_MANAGED_DENTRY)) {
1105 /* Allow the filesystem to manage the transit without i_mutex
1106 * being held.
1107 *
1108 * We indicate to the filesystem if someone is trying to mount
1109 * something here. This gives autofs the chance to deny anyone
1110 * other than its daemon the right to mount on its
1111 * superstructure.
1112 *
1113 * The filesystem may sleep at this point.
1114 */
1115 if (managed & DCACHE_MANAGE_TRANSIT) {
1116 BUG_ON(!path->dentry->d_op);
1117 BUG_ON(!path->dentry->d_op->d_manage);
1118 ret = path->dentry->d_op->d_manage(
1119 path->dentry, mounting_here, false);
1120 if (ret < 0)
1121 return ret == -EISDIR ? 0 : ret;
1122 }
1123
1124 /* Transit to a mounted filesystem. */
1125 if (managed & DCACHE_MOUNTED) {
1126 struct vfsmount *mounted = lookup_mnt(path);
1127 if (!mounted)
1128 break;
1129 dput(path->dentry);
1130 mntput(path->mnt);
1131 path->mnt = mounted;
1132 path->dentry = dget(mounted->mnt_root);
1133 continue;
1134 }
1135
1136 /* Don't handle automount points here */
1137 break;
1138 }
1139 return 0;
1140}
1141
1142/*
1143 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
1144 */
1145static void follow_mount(struct path *path)
1146{
1147 while (d_mountpoint(path->dentry)) {
1148 struct vfsmount *mounted = lookup_mnt(path);
1149 if (!mounted)
1150 break;
1151 dput(path->dentry);
1152 mntput(path->mnt);
1153 path->mnt = mounted;
1154 path->dentry = dget(mounted->mnt_root);
1155 }
1156}
1157
977static void follow_dotdot(struct nameidata *nd) 1158static void follow_dotdot(struct nameidata *nd)
978{ 1159{
979 set_root(nd); 1160 set_root(nd);
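
follow_managed() and the new follow_down() share one discipline: read d_flags once per pass (ACCESS_ONCE) so the three managed bits are tested against a consistent snapshot, then handle transit, mountpoint, and automount in that order, looping until nothing moves the path. A simplified userspace model (flag values and helpers are illustrative; the real code also distinguishes the error returns and re-loops after an automount collision, where this sketch just breaks):

#include <stdatomic.h>

/* Illustrative flag values; the kernel's DCACHE_* bits differ. */
#define DCACHE_MOUNTED		0x1
#define DCACHE_NEED_AUTOMOUNT	0x2
#define DCACHE_MANAGE_TRANSIT	0x4
#define DCACHE_MANAGED_DENTRY \
	(DCACHE_MOUNTED | DCACHE_NEED_AUTOMOUNT | DCACHE_MANAGE_TRANSIT)

struct dentry { _Atomic unsigned d_flags; };

static int transit(struct dentry *d) { (void)d; return 0; }
static int cross_mount(struct dentry **d) { (void)d; return 0; }	/* 1 = crossed */
static int automount(struct dentry **d) { (void)d; return 0; }		/* 1 = mounted */

static int walk_managed(struct dentry **dp)
{
	unsigned managed;

	/* One snapshot per pass: don't chase d_flags as it changes. */
	while (managed = atomic_load(&(*dp)->d_flags),
	       (managed &= DCACHE_MANAGED_DENTRY) != 0) {
		if (managed & DCACHE_MANAGE_TRANSIT) {
			int ret = transit(*dp);	/* fs may pause or veto */
			if (ret < 0)
				return ret;
		}
		if ((managed & DCACHE_MOUNTED) && cross_mount(dp))
			continue;	/* now at the mounted root: re-check */
		if ((managed & DCACHE_NEED_AUTOMOUNT) && automount(dp))
			continue;	/* new mount appeared: re-check */
		break;			/* nothing changed the path point */
	}
	return 0;
}
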
@@ -1038,12 +1219,14 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1038 struct vfsmount *mnt = nd->path.mnt; 1219 struct vfsmount *mnt = nd->path.mnt;
1039 struct dentry *dentry, *parent = nd->path.dentry; 1220 struct dentry *dentry, *parent = nd->path.dentry;
1040 struct inode *dir; 1221 struct inode *dir;
1222 int err;
1223
1041 /* 1224 /*
1042 * See if the low-level filesystem might want 1225 * See if the low-level filesystem might want
1043 * to use its own hash.. 1226 * to use its own hash..
1044 */ 1227 */
1045 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { 1228 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1046 int err = parent->d_op->d_hash(parent, nd->inode, name); 1229 err = parent->d_op->d_hash(parent, nd->inode, name);
1047 if (err < 0) 1230 if (err < 0)
1048 return err; 1231 return err;
1049 } 1232 }
@@ -1070,22 +1253,30 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1070 nd->seq = seq; 1253 nd->seq = seq;
1071 if (dentry->d_flags & DCACHE_OP_REVALIDATE) 1254 if (dentry->d_flags & DCACHE_OP_REVALIDATE)
1072 goto need_revalidate; 1255 goto need_revalidate;
1256done2:
1073 path->mnt = mnt; 1257 path->mnt = mnt;
1074 path->dentry = dentry; 1258 path->dentry = dentry;
1075 __follow_mount_rcu(nd, path, inode); 1259 if (likely(__follow_mount_rcu(nd, path, inode, false)))
1076 } else { 1260 return 0;
1077 dentry = __d_lookup(parent, name); 1261 if (nameidata_drop_rcu(nd))
1078 if (!dentry) 1262 return -ECHILD;
1079 goto need_lookup; 1263 /* fallthru */
1264 }
1265 dentry = __d_lookup(parent, name);
1266 if (!dentry)
1267 goto need_lookup;
1080found: 1268found:
1081 if (dentry->d_flags & DCACHE_OP_REVALIDATE) 1269 if (dentry->d_flags & DCACHE_OP_REVALIDATE)
1082 goto need_revalidate; 1270 goto need_revalidate;
1083done: 1271done:
1084 path->mnt = mnt; 1272 path->mnt = mnt;
1085 path->dentry = dentry; 1273 path->dentry = dentry;
1086 __follow_mount(path); 1274 err = follow_managed(path, nd->flags);
1087 *inode = path->dentry->d_inode; 1275 if (unlikely(err < 0)) {
1088 } 1276 path_put_conditional(path, nd);
1277 return err;
1278 }
1279 *inode = path->dentry->d_inode;
1089 return 0; 1280 return 0;
1090 1281
1091need_lookup: 1282need_lookup:
@@ -1124,6 +1315,8 @@ need_revalidate:
1124 goto need_lookup; 1315 goto need_lookup;
1125 if (IS_ERR(dentry)) 1316 if (IS_ERR(dentry))
1126 goto fail; 1317 goto fail;
1318 if (nd->flags & LOOKUP_RCU)
1319 goto done2;
1127 goto done; 1320 goto done;
1128 1321
1129fail: 1322fail:
@@ -1131,17 +1324,6 @@ fail:
1131} 1324}
1132 1325
1133/* 1326/*
1134 * This is a temporary kludge to deal with "automount" symlinks; proper
1135 * solution is to trigger them on follow_mount(), so that do_lookup()
1136 * would DTRT. To be killed before 2.6.34-final.
1137 */
1138static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
1139{
1140 return inode && unlikely(inode->i_op->follow_link) &&
1141 ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
1142}
1143
1144/*
1145 * Name resolution. 1327 * Name resolution.
1146 * This is the basic name resolution function, turning a pathname into 1328 * This is the basic name resolution function, turning a pathname into
1147 * the final dentry. We expect 'base' to be positive and a directory. 1329 * the final dentry. We expect 'base' to be positive and a directory.
@@ -1279,7 +1461,8 @@ last_component:
1279 err = do_lookup(nd, &this, &next, &inode); 1461 err = do_lookup(nd, &this, &next, &inode);
1280 if (err) 1462 if (err)
1281 break; 1463 break;
1282 if (follow_on_final(inode, lookup_flags)) { 1464 if (inode && unlikely(inode->i_op->follow_link) &&
1465 (lookup_flags & LOOKUP_FOLLOW)) {
1283 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) 1466 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1284 return -ECHILD; 1467 return -ECHILD;
1285 BUG_ON(inode != next.dentry->d_inode); 1468 BUG_ON(inode != next.dentry->d_inode);
@@ -1950,8 +2133,9 @@ int may_open(struct path *path, int acc_mode, int flag)
1950 return break_lease(inode, flag); 2133 return break_lease(inode, flag);
1951} 2134}
1952 2135
1953static int handle_truncate(struct path *path) 2136static int handle_truncate(struct file *filp)
1954{ 2137{
2138 struct path *path = &filp->f_path;
1955 struct inode *inode = path->dentry->d_inode; 2139 struct inode *inode = path->dentry->d_inode;
1956 int error = get_write_access(inode); 2140 int error = get_write_access(inode);
1957 if (error) 2141 if (error)
@@ -1965,7 +2149,7 @@ static int handle_truncate(struct path *path)
1965 if (!error) { 2149 if (!error) {
1966 error = do_truncate(path->dentry, 0, 2150 error = do_truncate(path->dentry, 0,
1967 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, 2151 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
1968 NULL); 2152 filp);
1969 } 2153 }
1970 put_write_access(inode); 2154 put_write_access(inode);
1971 return error; 2155 return error;
@@ -2063,7 +2247,7 @@ static struct file *finish_open(struct nameidata *nd,
2063 } 2247 }
2064 if (!IS_ERR(filp)) { 2248 if (!IS_ERR(filp)) {
2065 if (will_truncate) { 2249 if (will_truncate) {
2066 error = handle_truncate(&nd->path); 2250 error = handle_truncate(filp);
2067 if (error) { 2251 if (error) {
2068 fput(filp); 2252 fput(filp);
2069 filp = ERR_PTR(error); 2253 filp = ERR_PTR(error);
@@ -2104,11 +2288,13 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2104 dir = nd->path.dentry; 2288 dir = nd->path.dentry;
2105 case LAST_DOT: 2289 case LAST_DOT:
2106 if (need_reval_dot(dir)) { 2290 if (need_reval_dot(dir)) {
2107 error = d_revalidate(nd->path.dentry, nd); 2291 int status = d_revalidate(nd->path.dentry, nd);
2108 if (!error) 2292 if (!status)
2109 error = -ESTALE; 2293 status = -ESTALE;
2110 if (error < 0) 2294 if (status < 0) {
2295 error = status;
2111 goto exit; 2296 goto exit;
2297 }
2112 } 2298 }
2113 /* fallthrough */ 2299 /* fallthrough */
2114 case LAST_ROOT: 2300 case LAST_ROOT:
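
The LAST_DOT change above looks cosmetic but guards a convention: d_revalidate() returns positive for a valid dentry, 0 for invalid, or a negative errno, so folding its result straight into `error` could leave a positive "valid" value behind after the check. Keeping it in a local `status` means `error` only ever carries a failure. A small model of the corrected mapping (reval_status_to_error is an illustrative name):

#include <errno.h>

/* d_revalidate(): >0 valid, 0 invalid, <0 hard error. */
static int reval_status_to_error(int status)
{
	if (!status)
		status = -ESTALE;	/* invalid counts as stale */
	return status < 0 ? status : 0;	/* positive means "proceed" */
}
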
@@ -2178,11 +2364,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2178 if (open_flag & O_EXCL) 2364 if (open_flag & O_EXCL)
2179 goto exit_dput; 2365 goto exit_dput;
2180 2366
2181 if (__follow_mount(path)) { 2367 error = follow_managed(path, nd->flags);
2182 error = -ELOOP; 2368 if (error < 0)
2183 if (open_flag & O_NOFOLLOW) 2369 goto exit_dput;
2184 goto exit_dput;
2185 }
2186 2370
2187 error = -ENOENT; 2371 error = -ENOENT;
2188 if (!path->dentry->d_inode) 2372 if (!path->dentry->d_inode)
@@ -2327,11 +2511,11 @@ reval:
2327 nd.flags = flags; 2511 nd.flags = flags;
2328 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2512 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
2329 while (unlikely(!filp)) { /* trailing symlink */ 2513 while (unlikely(!filp)) { /* trailing symlink */
2330 struct path holder; 2514 struct path link = path;
2515 struct inode *linki = link.dentry->d_inode;
2331 void *cookie; 2516 void *cookie;
2332 error = -ELOOP; 2517 error = -ELOOP;
2333 /* S_ISDIR part is a temporary automount kludge */ 2518 if (!(nd.flags & LOOKUP_FOLLOW))
2334 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(nd.inode->i_mode))
2335 goto exit_dput; 2519 goto exit_dput;
2336 if (count++ == 32) 2520 if (count++ == 32)
2337 goto exit_dput; 2521 goto exit_dput;
@@ -2347,23 +2531,22 @@ reval:
2347 * just set LAST_BIND. 2531 * just set LAST_BIND.
2348 */ 2532 */
2349 nd.flags |= LOOKUP_PARENT; 2533 nd.flags |= LOOKUP_PARENT;
2350 error = security_inode_follow_link(path.dentry, &nd); 2534 error = security_inode_follow_link(link.dentry, &nd);
2351 if (error) 2535 if (error)
2352 goto exit_dput; 2536 goto exit_dput;
2353 error = __do_follow_link(&path, &nd, &cookie); 2537 error = __do_follow_link(&link, &nd, &cookie);
2354 if (unlikely(error)) { 2538 if (unlikely(error)) {
2355 if (!IS_ERR(cookie) && nd.inode->i_op->put_link) 2539 if (!IS_ERR(cookie) && linki->i_op->put_link)
2356 nd.inode->i_op->put_link(path.dentry, &nd, cookie); 2540 linki->i_op->put_link(link.dentry, &nd, cookie);
2357 /* nd.path had been dropped */ 2541 /* nd.path had been dropped */
2358 nd.path = path; 2542 nd.path = link;
2359 goto out_path; 2543 goto out_path;
2360 } 2544 }
2361 holder = path;
2362 nd.flags &= ~LOOKUP_PARENT; 2545 nd.flags &= ~LOOKUP_PARENT;
2363 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2546 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
2364 if (nd.inode->i_op->put_link) 2547 if (linki->i_op->put_link)
2365 nd.inode->i_op->put_link(holder.dentry, &nd, cookie); 2548 linki->i_op->put_link(link.dentry, &nd, cookie);
2366 path_put(&holder); 2549 path_put(&link);
2367 } 2550 }
2368out: 2551out:
2369 if (nd.root.mnt) 2552 if (nd.root.mnt)
@@ -3391,6 +3574,7 @@ const struct inode_operations page_symlink_inode_operations = {
3391}; 3574};
3392 3575
3393EXPORT_SYMBOL(user_path_at); 3576EXPORT_SYMBOL(user_path_at);
3577EXPORT_SYMBOL(follow_down_one);
3394EXPORT_SYMBOL(follow_down); 3578EXPORT_SYMBOL(follow_down);
3395EXPORT_SYMBOL(follow_up); 3579EXPORT_SYMBOL(follow_up);
3396EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ 3580EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
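[Editor's sketch] The reworked trailing-symlink loop above follows one link per pass (security check, __do_follow_link(), a retry of do_last(), then the paired put_link() on that same link) and bails out with -ELOOP once count reaches 32. Below is a minimal user-space model of that bound using POSIX readlink(2); it is an illustrative sketch, not kernel code, and it deliberately skips rebasing relative link targets.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int resolve_trailing_links(const char *path, char *out, size_t outsz)
{
	char cur[PATH_MAX];
	int count = 0;

	snprintf(cur, sizeof(cur), "%s", path);
	for (;;) {
		char target[PATH_MAX];
		ssize_t n = readlink(cur, target, sizeof(target) - 1);

		if (n < 0) {
			if (errno == EINVAL)	/* not a symlink: resolved */
				break;
			return -errno;
		}
		target[n] = '\0';
		if (count++ == 32)	/* same cap as the loop above */
			return -ELOOP;
		/* the kernel rebases relative targets; skipped here */
		snprintf(cur, sizeof(cur), "%s", target);
	}
	snprintf(out, outsz, "%s", cur);
	return 0;
}

int main(int argc, char **argv)
{
	char buf[PATH_MAX];
	int err = resolve_trailing_links(argc > 1 ? argv[1] : "/tmp", buf,
					 sizeof(buf));

	if (err)
		fprintf(stderr, "error: %s\n", strerror(-err));
	else
		printf("resolved: %s\n", buf);
	return err ? 1 : 0;
}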
diff --git a/fs/namespace.c b/fs/namespace.c
index 3ddfd9046c44..7b0b95371696 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -183,7 +183,7 @@ static inline void mnt_dec_count(struct vfsmount *mnt)
183unsigned int mnt_get_count(struct vfsmount *mnt) 183unsigned int mnt_get_count(struct vfsmount *mnt)
184{ 184{
185#ifdef CONFIG_SMP 185#ifdef CONFIG_SMP
186 unsigned int count = atomic_read(&mnt->mnt_longrefs); 186 unsigned int count = 0;
187 int cpu; 187 int cpu;
188 188
189 for_each_possible_cpu(cpu) { 189 for_each_possible_cpu(cpu) {
@@ -217,7 +217,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
217 if (!mnt->mnt_pcp) 217 if (!mnt->mnt_pcp)
218 goto out_free_devname; 218 goto out_free_devname;
219 219
220 atomic_set(&mnt->mnt_longrefs, 1); 220 this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
221#else 221#else
222 mnt->mnt_count = 1; 222 mnt->mnt_count = 1;
223 mnt->mnt_writers = 0; 223 mnt->mnt_writers = 0;
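[Editor's sketch] With mnt_longrefs gone, the hunk above makes mnt_get_count() a pure sum of the per-CPU counters (seeded with 0), and alloc_vfsmnt() now plants the first reference in the allocating CPU's slot. A minimal user-space model of such a distributed counter follows; the names and the fixed CPU array are illustrative stand-ins for real per-CPU data.

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 4

struct pcp_count {
	atomic_int count[NR_CPUS];	/* stands in for mnt_pcp->mnt_count */
};

static void pcp_add(struct pcp_count *c, int cpu, int n)
{
	atomic_fetch_add_explicit(&c->count[cpu], n, memory_order_relaxed);
}

static int pcp_sum(struct pcp_count *c)
{
	int total = 0;	/* the fix: no longer seeded from mnt_longrefs */
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		total += atomic_load_explicit(&c->count[cpu],
					      memory_order_relaxed);
	return total;
}

int main(void)
{
	struct pcp_count c = { { 0 } };

	pcp_add(&c, 0, 1);	/* alloc: first reference, local slot */
	pcp_add(&c, 2, 1);	/* a get on another CPU */
	pcp_add(&c, 1, -1);	/* a put on a third CPU */
	printf("count = %d\n", pcp_sum(&c));	/* 1 */
	return 0;
}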
@@ -611,6 +611,21 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
611 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); 611 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
612} 612}
613 613
614static inline void __mnt_make_longterm(struct vfsmount *mnt)
615{
616#ifdef CONFIG_SMP
617 atomic_inc(&mnt->mnt_longterm);
618#endif
619}
620
621/* needs vfsmount lock for write */
622static inline void __mnt_make_shortterm(struct vfsmount *mnt)
623{
624#ifdef CONFIG_SMP
625 atomic_dec(&mnt->mnt_longterm);
626#endif
627}
628
614/* 629/*
615 * vfsmount lock must be held for write 630 * vfsmount lock must be held for write
616 */ 631 */
@@ -624,8 +639,11 @@ static void commit_tree(struct vfsmount *mnt)
624 BUG_ON(parent == mnt); 639 BUG_ON(parent == mnt);
625 640
626 list_add_tail(&head, &mnt->mnt_list); 641 list_add_tail(&head, &mnt->mnt_list);
627 list_for_each_entry(m, &head, mnt_list) 642 list_for_each_entry(m, &head, mnt_list) {
628 m->mnt_ns = n; 643 m->mnt_ns = n;
644 __mnt_make_longterm(m);
645 }
646
629 list_splice(&head, n->list.prev); 647 list_splice(&head, n->list.prev);
630 648
631 list_add_tail(&mnt->mnt_hash, mount_hashtable + 649 list_add_tail(&mnt->mnt_hash, mount_hashtable +
@@ -734,51 +752,30 @@ static inline void mntfree(struct vfsmount *mnt)
734 deactivate_super(sb); 752 deactivate_super(sb);
735} 753}
736 754
737#ifdef CONFIG_SMP 755static void mntput_no_expire(struct vfsmount *mnt)
738static inline void __mntput(struct vfsmount *mnt, int longrefs)
739{ 756{
740 if (!longrefs) {
741put_again: 757put_again:
742 br_read_lock(vfsmount_lock); 758#ifdef CONFIG_SMP
743 if (likely(atomic_read(&mnt->mnt_longrefs))) { 759 br_read_lock(vfsmount_lock);
744 mnt_dec_count(mnt); 760 if (likely(atomic_read(&mnt->mnt_longterm))) {
745 br_read_unlock(vfsmount_lock); 761 mnt_dec_count(mnt);
746 return;
747 }
748 br_read_unlock(vfsmount_lock); 762 br_read_unlock(vfsmount_lock);
749 } else { 763 return;
750 BUG_ON(!atomic_read(&mnt->mnt_longrefs));
751 if (atomic_add_unless(&mnt->mnt_longrefs, -1, 1))
752 return;
753 } 764 }
765 br_read_unlock(vfsmount_lock);
754 766
755 br_write_lock(vfsmount_lock); 767 br_write_lock(vfsmount_lock);
756 if (!longrefs) 768 mnt_dec_count(mnt);
757 mnt_dec_count(mnt);
758 else
759 atomic_dec(&mnt->mnt_longrefs);
760 if (mnt_get_count(mnt)) { 769 if (mnt_get_count(mnt)) {
761 br_write_unlock(vfsmount_lock); 770 br_write_unlock(vfsmount_lock);
762 return; 771 return;
763 } 772 }
764 if (unlikely(mnt->mnt_pinned)) {
765 mnt_add_count(mnt, mnt->mnt_pinned + 1);
766 mnt->mnt_pinned = 0;
767 br_write_unlock(vfsmount_lock);
768 acct_auto_close_mnt(mnt);
769 goto put_again;
770 }
771 br_write_unlock(vfsmount_lock);
772 mntfree(mnt);
773}
774#else 773#else
775static inline void __mntput(struct vfsmount *mnt, int longrefs)
776{
777put_again:
778 mnt_dec_count(mnt); 774 mnt_dec_count(mnt);
779 if (likely(mnt_get_count(mnt))) 775 if (likely(mnt_get_count(mnt)))
780 return; 776 return;
781 br_write_lock(vfsmount_lock); 777 br_write_lock(vfsmount_lock);
778#endif
782 if (unlikely(mnt->mnt_pinned)) { 779 if (unlikely(mnt->mnt_pinned)) {
783 mnt_add_count(mnt, mnt->mnt_pinned + 1); 780 mnt_add_count(mnt, mnt->mnt_pinned + 1);
784 mnt->mnt_pinned = 0; 781 mnt->mnt_pinned = 0;
@@ -789,12 +786,6 @@ put_again:
789 br_write_unlock(vfsmount_lock); 786 br_write_unlock(vfsmount_lock);
790 mntfree(mnt); 787 mntfree(mnt);
791} 788}
792#endif
793
794static void mntput_no_expire(struct vfsmount *mnt)
795{
796 __mntput(mnt, 0);
797}
798 789
799void mntput(struct vfsmount *mnt) 790void mntput(struct vfsmount *mnt)
800{ 791{
@@ -802,7 +793,7 @@ void mntput(struct vfsmount *mnt)
802 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ 793 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
803 if (unlikely(mnt->mnt_expiry_mark)) 794 if (unlikely(mnt->mnt_expiry_mark))
804 mnt->mnt_expiry_mark = 0; 795 mnt->mnt_expiry_mark = 0;
805 __mntput(mnt, 0); 796 mntput_no_expire(mnt);
806 } 797 }
807} 798}
808EXPORT_SYMBOL(mntput); 799EXPORT_SYMBOL(mntput);
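[Editor's sketch] mntput_no_expire() above is left with a single SMP fast path: while mnt_longterm is non-zero the mount is pinned (by a namespace, fs->root, or fs->pwd), so a put only decrements under br_read_lock(); only when no long-term reference exists does it take the write lock and re-check the exact count. A rough user-space rendition with a rwlock, names illustrative only:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct mnt {
	pthread_rwlock_t lock;	/* stands in for vfsmount_lock */
	atomic_int longterm;	/* cf. mnt_longterm */
	atomic_int count;	/* stands in for the per-CPU sum */
};

static bool mnt_put(struct mnt *m)
{
	pthread_rwlock_rdlock(&m->lock);
	if (atomic_load(&m->longterm)) {	/* pinned: cheap path */
		atomic_fetch_sub(&m->count, 1);
		pthread_rwlock_unlock(&m->lock);
		return false;
	}
	pthread_rwlock_unlock(&m->lock);

	pthread_rwlock_wrlock(&m->lock);	/* maybe the last reference */
	if (atomic_fetch_sub(&m->count, 1) > 1) {
		pthread_rwlock_unlock(&m->lock);
		return false;
	}
	pthread_rwlock_unlock(&m->lock);
	return true;	/* caller frees, cf. mntfree() */
}

int main(void)
{
	struct mnt m = { PTHREAD_RWLOCK_INITIALIZER, 0, 2 };

	printf("freed? %d\n", mnt_put(&m));	/* 0: one reference left */
	printf("freed? %d\n", mnt_put(&m));	/* 1: last reference */
	return 0;
}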
@@ -815,33 +806,6 @@ struct vfsmount *mntget(struct vfsmount *mnt)
815} 806}
816EXPORT_SYMBOL(mntget); 807EXPORT_SYMBOL(mntget);
817 808
818void mntput_long(struct vfsmount *mnt)
819{
820#ifdef CONFIG_SMP
821 if (mnt) {
822 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
823 if (unlikely(mnt->mnt_expiry_mark))
824 mnt->mnt_expiry_mark = 0;
825 __mntput(mnt, 1);
826 }
827#else
828 mntput(mnt);
829#endif
830}
831EXPORT_SYMBOL(mntput_long);
832
833struct vfsmount *mntget_long(struct vfsmount *mnt)
834{
835#ifdef CONFIG_SMP
836 if (mnt)
837 atomic_inc(&mnt->mnt_longrefs);
838 return mnt;
839#else
840 return mntget(mnt);
841#endif
842}
843EXPORT_SYMBOL(mntget_long);
844
845void mnt_pin(struct vfsmount *mnt) 809void mnt_pin(struct vfsmount *mnt)
846{ 810{
847 br_write_lock(vfsmount_lock); 811 br_write_lock(vfsmount_lock);
@@ -1216,7 +1180,7 @@ void release_mounts(struct list_head *head)
1216 dput(dentry); 1180 dput(dentry);
1217 mntput(m); 1181 mntput(m);
1218 } 1182 }
1219 mntput_long(mnt); 1183 mntput(mnt);
1220 } 1184 }
1221} 1185}
1222 1186
@@ -1226,19 +1190,21 @@ void release_mounts(struct list_head *head)
1226 */ 1190 */
1227void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) 1191void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
1228{ 1192{
1193 LIST_HEAD(tmp_list);
1229 struct vfsmount *p; 1194 struct vfsmount *p;
1230 1195
1231 for (p = mnt; p; p = next_mnt(p, mnt)) 1196 for (p = mnt; p; p = next_mnt(p, mnt))
1232 list_move(&p->mnt_hash, kill); 1197 list_move(&p->mnt_hash, &tmp_list);
1233 1198
1234 if (propagate) 1199 if (propagate)
1235 propagate_umount(kill); 1200 propagate_umount(&tmp_list);
1236 1201
1237 list_for_each_entry(p, kill, mnt_hash) { 1202 list_for_each_entry(p, &tmp_list, mnt_hash) {
1238 list_del_init(&p->mnt_expire); 1203 list_del_init(&p->mnt_expire);
1239 list_del_init(&p->mnt_list); 1204 list_del_init(&p->mnt_list);
1240 __touch_mnt_namespace(p->mnt_ns); 1205 __touch_mnt_namespace(p->mnt_ns);
1241 p->mnt_ns = NULL; 1206 p->mnt_ns = NULL;
1207 __mnt_make_shortterm(p);
1242 list_del_init(&p->mnt_child); 1208 list_del_init(&p->mnt_child);
1243 if (p->mnt_parent != p) { 1209 if (p->mnt_parent != p) {
1244 p->mnt_parent->mnt_ghosts++; 1210 p->mnt_parent->mnt_ghosts++;
@@ -1246,6 +1212,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
1246 } 1212 }
1247 change_mnt_propagation(p, MS_PRIVATE); 1213 change_mnt_propagation(p, MS_PRIVATE);
1248 } 1214 }
1215 list_splice(&tmp_list, kill);
1249} 1216}
1250 1217
1251static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); 1218static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);
@@ -1844,9 +1811,10 @@ static int do_move_mount(struct path *path, char *old_name)
1844 return err; 1811 return err;
1845 1812
1846 down_write(&namespace_sem); 1813 down_write(&namespace_sem);
1847 while (d_mountpoint(path->dentry) && 1814 err = follow_down(path, true);
1848 follow_down(path)) 1815 if (err < 0)
1849 ; 1816 goto out;
1817
1850 err = -EINVAL; 1818 err = -EINVAL;
1851 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1819 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
1852 goto out; 1820 goto out;
@@ -1904,6 +1872,8 @@ out:
1904 return err; 1872 return err;
1905} 1873}
1906 1874
1875static int do_add_mount(struct vfsmount *, struct path *, int);
1876
1907/* 1877/*
1908 * create a new mount for userspace and request it to be added into the 1878 * create a new mount for userspace and request it to be added into the
1909 * namespace's tree 1879 * namespace's tree
@@ -1912,6 +1882,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
1912 int mnt_flags, char *name, void *data) 1882 int mnt_flags, char *name, void *data)
1913{ 1883{
1914 struct vfsmount *mnt; 1884 struct vfsmount *mnt;
1885 int err;
1915 1886
1916 if (!type) 1887 if (!type)
1917 return -EINVAL; 1888 return -EINVAL;
@@ -1924,15 +1895,47 @@ static int do_new_mount(struct path *path, char *type, int flags,
1924 if (IS_ERR(mnt)) 1895 if (IS_ERR(mnt))
1925 return PTR_ERR(mnt); 1896 return PTR_ERR(mnt);
1926 1897
1927 return do_add_mount(mnt, path, mnt_flags, NULL); 1898 err = do_add_mount(mnt, path, mnt_flags);
1899 if (err)
1900 mntput(mnt);
1901 return err;
1902}
1903
1904int finish_automount(struct vfsmount *m, struct path *path)
1905{
1906 int err;
1907 /* The new mount record should have at least 2 refs to prevent it being
1908 * expired before we get a chance to add it
1909 */
1910 BUG_ON(mnt_get_count(m) < 2);
1911
1912 if (m->mnt_sb == path->mnt->mnt_sb &&
1913 m->mnt_root == path->dentry) {
1914 err = -ELOOP;
1915 goto fail;
1916 }
1917
1918 err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
1919 if (!err)
1920 return 0;
1921fail:
1922 /* remove m from any expiration list it may be on */
1923 if (!list_empty(&m->mnt_expire)) {
1924 down_write(&namespace_sem);
1925 br_write_lock(vfsmount_lock);
1926 list_del_init(&m->mnt_expire);
1927 br_write_unlock(vfsmount_lock);
1928 up_write(&namespace_sem);
1929 }
1930 mntput(m);
1931 mntput(m);
1932 return err;
1928} 1933}
1929 1934
1930/* 1935/*
1931 * add a mount into a namespace's mount tree 1936 * add a mount into a namespace's mount tree
1932 * - provide the option of adding the new mount to an expiration list
1933 */ 1937 */
1934int do_add_mount(struct vfsmount *newmnt, struct path *path, 1938static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
1935 int mnt_flags, struct list_head *fslist)
1936{ 1939{
1937 int err; 1940 int err;
1938 1941
@@ -1940,9 +1943,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1940 1943
1941 down_write(&namespace_sem); 1944 down_write(&namespace_sem);
1942 /* Something was mounted here while we slept */ 1945 /* Something was mounted here while we slept */
1943 while (d_mountpoint(path->dentry) && 1946 err = follow_down(path, true);
1944 follow_down(path)) 1947 if (err < 0)
1945 ; 1948 goto unlock;
1949
1946 err = -EINVAL; 1950 err = -EINVAL;
1947 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) 1951 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
1948 goto unlock; 1952 goto unlock;
@@ -1958,22 +1962,29 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1958 goto unlock; 1962 goto unlock;
1959 1963
1960 newmnt->mnt_flags = mnt_flags; 1964 newmnt->mnt_flags = mnt_flags;
1961 if ((err = graft_tree(newmnt, path))) 1965 err = graft_tree(newmnt, path);
1962 goto unlock;
1963
1964 if (fslist) /* add to the specified expiration list */
1965 list_add_tail(&newmnt->mnt_expire, fslist);
1966
1967 up_write(&namespace_sem);
1968 return 0;
1969 1966
1970unlock: 1967unlock:
1971 up_write(&namespace_sem); 1968 up_write(&namespace_sem);
1972 mntput_long(newmnt);
1973 return err; 1969 return err;
1974} 1970}
1975 1971
1976EXPORT_SYMBOL_GPL(do_add_mount); 1972/**
1973 * mnt_set_expiry - Put a mount on an expiration list
1974 * @mnt: The mount to list.
1975 * @expiry_list: The list to add the mount to.
1976 */
1977void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
1978{
1979 down_write(&namespace_sem);
1980 br_write_lock(vfsmount_lock);
1981
1982 list_add_tail(&mnt->mnt_expire, expiry_list);
1983
1984 br_write_unlock(vfsmount_lock);
1985 up_write(&namespace_sem);
1986}
1987EXPORT_SYMBOL(mnt_set_expiry);
1977 1988
1978/* 1989/*
1979 * process a list of expirable mountpoints with the intent of discarding any 1990 * process a list of expirable mountpoints with the intent of discarding any
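[Editor's sketch] finish_automount() above leans on a two-reference convention: the mount arrives holding at least two references (hence the BUG_ON), one for the caller and one guarding against premature expiry, so the failure path drops mntput(m) twice after unhooking any expiry-list linkage; mnt_set_expiry() is now the exported way to queue a mount for expiry. A toy refcount model of that failure path, illustrative only:

#include <stdio.h>
#include <stdlib.h>

struct mount_rec {
	int refs;
};

static struct mount_rec *alloc_automount(void)
{
	struct mount_rec *m = malloc(sizeof(*m));

	if (m)
		m->refs = 2;	/* caller's ref + anti-expiry ref */
	return m;
}

static void rec_put(struct mount_rec *m)
{
	if (--m->refs == 0) {
		printf("freeing record\n");
		free(m);
	}
}

int main(void)
{
	struct mount_rec *m = alloc_automount();

	if (!m)
		return 1;
	/* graft failed (say, -ELOOP): undo both references */
	rec_put(m);
	rec_put(m);	/* mirrors the doubled mntput(m) above */
	return 0;
}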
@@ -2262,6 +2273,22 @@ static struct mnt_namespace *alloc_mnt_ns(void)
2262 return new_ns; 2273 return new_ns;
2263} 2274}
2264 2275
2276void mnt_make_longterm(struct vfsmount *mnt)
2277{
2278 __mnt_make_longterm(mnt);
2279}
2280
2281void mnt_make_shortterm(struct vfsmount *mnt)
2282{
2283#ifdef CONFIG_SMP
2284 if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
2285 return;
2286 br_write_lock(vfsmount_lock);
2287 atomic_dec(&mnt->mnt_longterm);
2288 br_write_unlock(vfsmount_lock);
2289#endif
2290}
2291
2265/* 2292/*
2266 * Allocate a new namespace structure and populate it with contents 2293 * Allocate a new namespace structure and populate it with contents
2267 * copied from the namespace of the passed in task structure. 2294 * copied from the namespace of the passed in task structure.
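[Editor's sketch] mnt_make_shortterm() above uses atomic_add_unless() so that dropping a long-term mark stays lock-free unless it is the last one; only that final transition takes the vfsmount write lock, keeping it ordered against the mntput() fast path. A self-contained sketch of the same pattern, with a C11 compare-exchange loop and a plain mutex standing in for the lock:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t write_lock = PTHREAD_MUTEX_INITIALIZER;

/* atomic_add_unless(v, a, u): add a to *v unless *v == u; true if added */
static bool add_unless(atomic_int *v, int a, int u)
{
	int old = atomic_load(v);

	while (old != u) {
		if (atomic_compare_exchange_weak(v, &old, old + a))
			return true;
	}
	return false;
}

static void make_shortterm(atomic_int *longterm)
{
	if (add_unless(longterm, -1, 1))
		return;		/* not the last mark: lock-free */
	pthread_mutex_lock(&write_lock);
	atomic_fetch_sub(longterm, 1);	/* last mark drops under the lock */
	pthread_mutex_unlock(&write_lock);
}

int main(void)
{
	atomic_int lt = 2;

	make_shortterm(&lt);	/* lock-free */
	make_shortterm(&lt);	/* takes the lock */
	printf("longterm = %d\n", atomic_load(&lt));	/* 0 */
	return 0;
}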
@@ -2299,14 +2326,19 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2299 q = new_ns->root; 2326 q = new_ns->root;
2300 while (p) { 2327 while (p) {
2301 q->mnt_ns = new_ns; 2328 q->mnt_ns = new_ns;
2329 __mnt_make_longterm(q);
2302 if (fs) { 2330 if (fs) {
2303 if (p == fs->root.mnt) { 2331 if (p == fs->root.mnt) {
2332 fs->root.mnt = mntget(q);
2333 __mnt_make_longterm(q);
2334 mnt_make_shortterm(p);
2304 rootmnt = p; 2335 rootmnt = p;
2305 fs->root.mnt = mntget_long(q);
2306 } 2336 }
2307 if (p == fs->pwd.mnt) { 2337 if (p == fs->pwd.mnt) {
2338 fs->pwd.mnt = mntget(q);
2339 __mnt_make_longterm(q);
2340 mnt_make_shortterm(p);
2308 pwdmnt = p; 2341 pwdmnt = p;
2309 fs->pwd.mnt = mntget_long(q);
2310 } 2342 }
2311 } 2343 }
2312 p = next_mnt(p, mnt_ns->root); 2344 p = next_mnt(p, mnt_ns->root);
@@ -2315,9 +2347,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2315 up_write(&namespace_sem); 2347 up_write(&namespace_sem);
2316 2348
2317 if (rootmnt) 2349 if (rootmnt)
2318 mntput_long(rootmnt); 2350 mntput(rootmnt);
2319 if (pwdmnt) 2351 if (pwdmnt)
2320 mntput_long(pwdmnt); 2352 mntput(pwdmnt);
2321 2353
2322 return new_ns; 2354 return new_ns;
2323} 2355}
@@ -2350,6 +2382,7 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
2350 new_ns = alloc_mnt_ns(); 2382 new_ns = alloc_mnt_ns();
2351 if (!IS_ERR(new_ns)) { 2383 if (!IS_ERR(new_ns)) {
2352 mnt->mnt_ns = new_ns; 2384 mnt->mnt_ns = new_ns;
2385 __mnt_make_longterm(mnt);
2353 new_ns->root = mnt; 2386 new_ns->root = mnt;
2354 list_add(&new_ns->list, &new_ns->root->mnt_list); 2387 list_add(&new_ns->list, &new_ns->root->mnt_list);
2355 } 2388 }
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 28f136d4aaec..f6946bb5cb55 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -21,9 +21,7 @@
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22#include <asm/byteorder.h> 22#include <asm/byteorder.h>
23 23
24#include <linux/ncp_fs.h> 24#include "ncp_fs.h"
25
26#include "ncplib_kernel.h"
27 25
28static void ncp_read_volume_list(struct file *, void *, filldir_t, 26static void ncp_read_volume_list(struct file *, void *, filldir_t,
29 struct ncp_cache_control *); 27 struct ncp_cache_control *);
@@ -82,7 +80,7 @@ static int ncp_compare_dentry(const struct dentry *, const struct inode *,
82 unsigned int, const char *, const struct qstr *); 80 unsigned int, const char *, const struct qstr *);
83static int ncp_delete_dentry(const struct dentry *); 81static int ncp_delete_dentry(const struct dentry *);
84 82
85static const struct dentry_operations ncp_dentry_operations = 83const struct dentry_operations ncp_dentry_operations =
86{ 84{
87 .d_revalidate = ncp_lookup_validate, 85 .d_revalidate = ncp_lookup_validate,
88 .d_hash = ncp_hash_dentry, 86 .d_hash = ncp_hash_dentry,
@@ -90,14 +88,6 @@ static const struct dentry_operations ncp_dentry_operations =
90 .d_delete = ncp_delete_dentry, 88 .d_delete = ncp_delete_dentry,
91}; 89};
92 90
93const struct dentry_operations ncp_root_dentry_operations =
94{
95 .d_hash = ncp_hash_dentry,
96 .d_compare = ncp_compare_dentry,
97 .d_delete = ncp_delete_dentry,
98};
99
100
101#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) 91#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
102 92
103static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator) 93static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
@@ -309,6 +299,9 @@ ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
309 int res, val = 0, len; 299 int res, val = 0, len;
310 __u8 __name[NCP_MAXPATHLEN + 1]; 300 __u8 __name[NCP_MAXPATHLEN + 1];
311 301
302 if (dentry == dentry->d_sb->s_root)
303 return 1;
304
312 if (nd->flags & LOOKUP_RCU) 305 if (nd->flags & LOOKUP_RCU)
313 return -ECHILD; 306 return -ECHILD;
314 307
@@ -637,7 +630,6 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
637 entry->ino = iunique(dir->i_sb, 2); 630 entry->ino = iunique(dir->i_sb, 2);
638 inode = ncp_iget(dir->i_sb, entry); 631 inode = ncp_iget(dir->i_sb, entry);
639 if (inode) { 632 if (inode) {
640 d_set_d_op(newdent, &ncp_dentry_operations);
641 d_instantiate(newdent, inode); 633 d_instantiate(newdent, inode);
642 if (!hashed) 634 if (!hashed)
643 d_rehash(newdent); 635 d_rehash(newdent);
@@ -893,7 +885,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
893 if (inode) { 885 if (inode) {
894 ncp_new_dentry(dentry); 886 ncp_new_dentry(dentry);
895add_entry: 887add_entry:
896 d_set_d_op(dentry, &ncp_dentry_operations);
897 d_add(dentry, inode); 888 d_add(dentry, inode);
898 error = 0; 889 error = 0;
899 } 890 }
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index cb50aaf981df..0ed65e0c3dfe 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -18,8 +18,7 @@
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20 20
21#include <linux/ncp_fs.h> 21#include "ncp_fs.h"
22#include "ncplib_kernel.h"
23 22
24static int ncp_fsync(struct file *file, int datasync) 23static int ncp_fsync(struct file *file, int datasync)
25{ 24{
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 9b39a5dd4131..00a1d1c3d3a4 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -31,11 +31,9 @@
31#include <linux/seq_file.h> 31#include <linux/seq_file.h>
32#include <linux/namei.h> 32#include <linux/namei.h>
33 33
34#include <linux/ncp_fs.h>
35
36#include <net/sock.h> 34#include <net/sock.h>
37 35
38#include "ncplib_kernel.h" 36#include "ncp_fs.h"
39#include "getopt.h" 37#include "getopt.h"
40 38
41#define NCP_DEFAULT_FILE_MODE 0600 39#define NCP_DEFAULT_FILE_MODE 0600
@@ -544,6 +542,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
544 sb->s_blocksize_bits = 10; 542 sb->s_blocksize_bits = 10;
545 sb->s_magic = NCP_SUPER_MAGIC; 543 sb->s_magic = NCP_SUPER_MAGIC;
546 sb->s_op = &ncp_sops; 544 sb->s_op = &ncp_sops;
545 sb->s_d_op = &ncp_dentry_operations;
547 sb->s_bdi = &server->bdi; 546 sb->s_bdi = &server->bdi;
548 547
549 server = NCP_SBP(sb); 548 server = NCP_SBP(sb);
@@ -723,7 +722,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
723 sb->s_root = d_alloc_root(root_inode); 722 sb->s_root = d_alloc_root(root_inode);
724 if (!sb->s_root) 723 if (!sb->s_root)
725 goto out_no_root; 724 goto out_no_root;
726 d_set_d_op(sb->s_root, &ncp_root_dentry_operations);
727 return 0; 725 return 0;
728 726
729out_no_root: 727out_no_root:
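[Editor's sketch] Taken together, the dir.c and inode.c hunks replace per-dentry d_set_d_op() calls (and the special root-only ops table) with a single superblock-wide default, sb->s_d_op, plus an explicit root check at the top of ncp_lookup_validate(). A hypothetical user-space rendition of that shape, not the kernel API:

#include <stdio.h>

struct dentry;

struct dentry_ops {
	int (*revalidate)(struct dentry *);
};

struct super {
	const struct dentry_ops *d_op;	/* cf. sb->s_d_op */
	struct dentry *root;
};

struct dentry {
	struct super *sb;
	const char *name;
};

static int model_revalidate(struct dentry *d)
{
	if (d == d->sb->root) {		/* cf. the new root check */
		printf("%s: root, trivially valid\n", d->name);
		return 1;
	}
	printf("%s: would ask the server\n", d->name);
	return 1;
}

static const struct dentry_ops default_ops = {
	.revalidate = model_revalidate,
};

int main(void)
{
	struct super sb = { .d_op = &default_ops };
	struct dentry root = { .sb = &sb, .name = "/" };
	struct dentry file = { .sb = &sb, .name = "file" };

	sb.root = &root;	/* every dentry inherits sb.d_op */
	sb.d_op->revalidate(&root);
	sb.d_op->revalidate(&file);
	return 0;
}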
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index d40a547e3377..790e92a9ec63 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -20,11 +20,9 @@
20#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22 22
23#include <linux/ncp_fs.h>
24
25#include <asm/uaccess.h> 23#include <asm/uaccess.h>
26 24
27#include "ncplib_kernel.h" 25#include "ncp_fs.h"
28 26
29/* maximum limit for ncp_objectname_ioctl */ 27/* maximum limit for ncp_objectname_ioctl */
30#define NCP_OBJECT_NAME_MAX_LEN 4096 28#define NCP_OBJECT_NAME_MAX_LEN 4096
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 56f5b3a0e1ee..a7c07b44b100 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -16,12 +16,12 @@
16#include <linux/mman.h> 16#include <linux/mman.h>
17#include <linux/string.h> 17#include <linux/string.h>
18#include <linux/fcntl.h> 18#include <linux/fcntl.h>
19#include <linux/ncp_fs.h>
20 19
21#include "ncplib_kernel.h"
22#include <asm/uaccess.h> 20#include <asm/uaccess.h>
23#include <asm/system.h> 21#include <asm/system.h>
24 22
23#include "ncp_fs.h"
24
25/* 25/*
26 * Fill in the supplied page for mmap 26 * Fill in the supplied page for mmap
27 * XXX: how are we excluding truncate/invalidate here? Maybe need to lock 27 * XXX: how are we excluding truncate/invalidate here? Maybe need to lock
diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h
new file mode 100644
index 000000000000..31831afe1c3b
--- /dev/null
+++ b/fs/ncpfs/ncp_fs.h
@@ -0,0 +1,98 @@
1#include <linux/ncp_fs.h>
2#include "ncp_fs_i.h"
3#include "ncp_fs_sb.h"
4
5/* define because it is easy to change PRINTK to {*}PRINTK */
6#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args)
7
8#undef NCPFS_PARANOIA
9#ifdef NCPFS_PARANOIA
10#define PPRINTK(format, args...) PRINTK(format , ## args)
11#else
12#define PPRINTK(format, args...)
13#endif
14
15#ifndef DEBUG_NCP
16#define DEBUG_NCP 0
17#endif
18#if DEBUG_NCP > 0
19#define DPRINTK(format, args...) PRINTK(format , ## args)
20#else
21#define DPRINTK(format, args...)
22#endif
23#if DEBUG_NCP > 1
24#define DDPRINTK(format, args...) PRINTK(format , ## args)
25#else
26#define DDPRINTK(format, args...)
27#endif
28
29#define NCP_MAX_RPC_TIMEOUT (6*HZ)
30
31
32struct ncp_entry_info {
33 struct nw_info_struct i;
34 ino_t ino;
35 int opened;
36 int access;
37 unsigned int volume;
38 __u8 file_handle[6];
39};
40
41static inline struct ncp_server *NCP_SBP(const struct super_block *sb)
42{
43 return sb->s_fs_info;
44}
45
46#define NCP_SERVER(inode) NCP_SBP((inode)->i_sb)
47static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode)
48{
49 return container_of(inode, struct ncp_inode_info, vfs_inode);
50}
51
52/* linux/fs/ncpfs/inode.c */
53int ncp_notify_change(struct dentry *, struct iattr *);
54struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *);
55void ncp_update_inode(struct inode *, struct ncp_entry_info *);
56void ncp_update_inode2(struct inode *, struct ncp_entry_info *);
57
58/* linux/fs/ncpfs/dir.c */
59extern const struct inode_operations ncp_dir_inode_operations;
60extern const struct file_operations ncp_dir_operations;
61extern const struct dentry_operations ncp_dentry_operations;
62int ncp_conn_logged_in(struct super_block *);
63int ncp_date_dos2unix(__le16 time, __le16 date);
64void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date);
65
66/* linux/fs/ncpfs/ioctl.c */
67long ncp_ioctl(struct file *, unsigned int, unsigned long);
68long ncp_compat_ioctl(struct file *, unsigned int, unsigned long);
69
70/* linux/fs/ncpfs/sock.c */
71int ncp_request2(struct ncp_server *server, int function,
72 void* reply, int max_reply_size);
73static inline int ncp_request(struct ncp_server *server, int function) {
74 return ncp_request2(server, function, server->packet, server->packet_size);
75}
76int ncp_connect(struct ncp_server *server);
77int ncp_disconnect(struct ncp_server *server);
78void ncp_lock_server(struct ncp_server *server);
79void ncp_unlock_server(struct ncp_server *server);
80
81/* linux/fs/ncpfs/symlink.c */
82#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
83extern const struct address_space_operations ncp_symlink_aops;
84int ncp_symlink(struct inode*, struct dentry*, const char*);
85#endif
86
87/* linux/fs/ncpfs/file.c */
88extern const struct inode_operations ncp_file_inode_operations;
89extern const struct file_operations ncp_file_operations;
90int ncp_make_open(struct inode *, int);
91
92/* linux/fs/ncpfs/mmap.c */
93int ncp_mmap(struct file *, struct vm_area_struct *);
94
95/* linux/fs/ncpfs/ncplib_kernel.c */
96int ncp_make_closed(struct inode *);
97
98#include "ncplib_kernel.h"
diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h
new file mode 100644
index 000000000000..4b0bec477846
--- /dev/null
+++ b/fs/ncpfs/ncp_fs_i.h
@@ -0,0 +1,29 @@
1/*
2 * ncp_fs_i.h
3 *
4 * Copyright (C) 1995 Volker Lendecke
5 *
6 */
7
8#ifndef _LINUX_NCP_FS_I
9#define _LINUX_NCP_FS_I
10
11/*
12 * This is the ncpfs part of the inode structure. This must contain
13 * all the information we need to work with an inode after creation.
14 */
15struct ncp_inode_info {
16 __le32 dirEntNum;
17 __le32 DosDirNum;
18 __u8 volNumber;
19 __le32 nwattr;
20 struct mutex open_mutex;
21 atomic_t opened;
22 int access;
23 int flags;
24#define NCPI_KLUDGE_SYMLINK 0x0001
25 __u8 file_handle[6];
26 struct inode vfs_inode;
27};
28
29#endif /* _LINUX_NCP_FS_I */
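[Editor's sketch] The struct ends with an embedded struct inode, which is what lets NCP_FINFO() in ncp_fs.h map a generic inode pointer back to its containing ncp_inode_info via container_of(). A minimal runnable demo of that idiom, using trimmed-down stand-in structs:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct inode {
	unsigned long ino;
};

struct ncp_inode_info_model {
	unsigned char volNumber;
	struct inode vfs_inode;		/* embedded generic part */
};

int main(void)
{
	struct ncp_inode_info_model info = {
		.volNumber = 3,
		.vfs_inode = { .ino = 42 },
	};
	struct inode *inode = &info.vfs_inode;	/* what the VFS hands out */
	struct ncp_inode_info_model *fi =
		container_of(inode, struct ncp_inode_info_model, vfs_inode);

	printf("ino %lu lives on volume %d\n", inode->ino, fi->volNumber);
	return 0;
}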
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
new file mode 100644
index 000000000000..4af803f13516
--- /dev/null
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -0,0 +1,176 @@
1/*
2 * ncp_fs_sb.h
3 *
4 * Copyright (C) 1995, 1996 by Volker Lendecke
5 *
6 */
7
8#ifndef _NCP_FS_SB
9#define _NCP_FS_SB
10
11#include <linux/types.h>
12#include <linux/ncp_mount.h>
13#include <linux/net.h>
14#include <linux/mutex.h>
15#include <linux/backing-dev.h>
16#include <linux/workqueue.h>
17
18#define NCP_DEFAULT_OPTIONS 0 /* 2 for packet signatures */
19
20struct sock;
21
22struct ncp_mount_data_kernel {
23 unsigned long flags; /* NCP_MOUNT_* flags */
24 unsigned int int_flags; /* internal flags */
25#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001
26 __kernel_uid32_t mounted_uid; /* Who may umount() this filesystem? */
27 struct pid *wdog_pid; /* Who cares for our watchdog packets? */
28 unsigned int ncp_fd; /* The socket to the ncp port */
29 unsigned int time_out; /* How long should I wait after
30 sending a NCP request? */
31 unsigned int retry_count; /* And how often should I retry? */
32 unsigned char mounted_vol[NCP_VOLNAME_LEN + 1];
33 __kernel_uid32_t uid;
34 __kernel_gid32_t gid;
35 __kernel_mode_t file_mode;
36 __kernel_mode_t dir_mode;
37 int info_fd;
38};
39
40struct ncp_server {
41
42 struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of
43 interest for us later, so we store
44 it completely. */
45
46 __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2];
47
48 struct file *ncp_filp; /* File pointer to ncp socket */
49 struct socket *ncp_sock;/* ncp socket */
50 struct file *info_filp;
51 struct socket *info_sock;
52
53 u8 sequence;
54 u8 task;
55 u16 connection; /* Remote connection number */
56
57 u8 completion; /* Status message from server */
58 u8 conn_status; /* Bit 4 = 1 ==> Server going down, no
59 requests allowed anymore.
60 Bit 0 = 1 ==> Server is down. */
61
62 int buffer_size; /* Negotiated bufsize */
63
64 int reply_size; /* Size of last reply */
65
66 int packet_size;
67 unsigned char *packet; /* Here we prepare requests and
68 receive replies */
69 unsigned char *txbuf; /* Storage for current request */
70 unsigned char *rxbuf; /* Storage for reply to current request */
71
72 int lock; /* To prevent mismatch in protocols. */
73 struct mutex mutex;
74
75 int current_size; /* for packet preparation */
76 int has_subfunction;
77 int ncp_reply_size;
78
79 int root_setuped;
80 struct mutex root_setup_lock;
81
82 /* info for packet signing */
83 int sign_wanted; /* 1=Server needs signed packets */
84 int sign_active; /* 0=don't do signing, 1=do */
85 char sign_root[8]; /* generated from password and encr. key */
86 char sign_last[16];
87
88 /* Authentication info: NDS or BINDERY, username */
89 struct {
90 int auth_type;
91 size_t object_name_len;
92 void* object_name;
93 int object_type;
94 } auth;
95 /* Password info */
96 struct {
97 size_t len;
98 void* data;
99 } priv;
100 struct rw_semaphore auth_rwsem;
101
102 /* nls info: codepage for volume and charset for I/O */
103 struct nls_table *nls_vol;
104 struct nls_table *nls_io;
105
106 /* maximum age in jiffies */
107 atomic_t dentry_ttl;
108
109 /* miscellaneous */
110 unsigned int flags;
111
112 spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */
113
114 void (*data_ready)(struct sock* sk, int len);
115 void (*error_report)(struct sock* sk);
116 void (*write_space)(struct sock* sk); /* STREAM mode only */
117 struct {
118 struct work_struct tq; /* STREAM/DGRAM: data/error ready */
119 struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */
120 struct mutex creq_mutex; /* DGRAM only: lock accesses to rcv.creq */
121
122 unsigned int state; /* STREAM only: receiver state */
123 struct {
124 __u32 magic __packed;
125 __u32 len __packed;
126 __u16 type __packed;
127 __u16 p1 __packed;
128 __u16 p2 __packed;
129 __u16 p3 __packed;
130 __u16 type2 __packed;
131 } buf; /* STREAM only: temporary buffer */
132 unsigned char* ptr; /* STREAM only: pointer to data */
133 size_t len; /* STREAM only: length of data to receive */
134 } rcv;
135 struct {
136 struct list_head requests; /* STREAM only: queued requests */
137 struct work_struct tq; /* STREAM only: transmitter ready */
138 struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */
139 } tx;
140 struct timer_list timeout_tm; /* DGRAM only: timeout timer */
141 struct work_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */
142 int timeout_last; /* DGRAM only: current timeout length */
143 int timeout_retries; /* DGRAM only: retries left */
144 struct {
145 size_t len;
146 __u8 data[128];
147 } unexpected_packet;
148 struct backing_dev_info bdi;
149};
150
151extern void ncp_tcp_rcv_proc(struct work_struct *work);
152extern void ncp_tcp_tx_proc(struct work_struct *work);
153extern void ncpdgram_rcv_proc(struct work_struct *work);
154extern void ncpdgram_timeout_proc(struct work_struct *work);
155extern void ncpdgram_timeout_call(unsigned long server);
156extern void ncp_tcp_data_ready(struct sock* sk, int len);
157extern void ncp_tcp_write_space(struct sock* sk);
158extern void ncp_tcp_error_report(struct sock* sk);
159
160#define NCP_FLAG_UTF8 1
161
162#define NCP_CLR_FLAG(server, flag) ((server)->flags &= ~(flag))
163#define NCP_SET_FLAG(server, flag) ((server)->flags |= (flag))
164#define NCP_IS_FLAG(server, flag) ((server)->flags & (flag))
165
166static inline int ncp_conn_valid(struct ncp_server *server)
167{
168 return ((server->conn_status & 0x11) == 0);
169}
170
171static inline void ncp_invalidate_conn(struct ncp_server *server)
172{
173 server->conn_status |= 0x01;
174}
175
176#endif
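[Editor's sketch] Per the conn_status comment above, bit 0 means the server is down and bit 4 means it is going down, so ncp_conn_valid() tests mask 0x11 and ncp_invalidate_conn() sets bit 0. The same checks in a stand-alone form, with named constants added purely for illustration:

#include <stdio.h>

#define NCP_CONN_DOWN		0x01	/* bit 0: server is down */
#define NCP_CONN_GOING_DOWN	0x10	/* bit 4: going down */

static int conn_valid(unsigned char conn_status)
{
	/* same test as ncp_conn_valid(): (conn_status & 0x11) == 0 */
	return (conn_status & (NCP_CONN_DOWN | NCP_CONN_GOING_DOWN)) == 0;
}

int main(void)
{
	unsigned char status = 0;

	printf("fresh:      %d\n", conn_valid(status));	/* 1 */
	status |= NCP_CONN_GOING_DOWN;
	printf("going down: %d\n", conn_valid(status));	/* 0 */
	status |= NCP_CONN_DOWN;	/* cf. ncp_invalidate_conn() */
	printf("down:       %d\n", conn_valid(status));	/* 0 */
	return 0;
}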
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index a95615a0b6ac..981a95617fc9 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -11,7 +11,7 @@
11 11
12 12
13 13
14#include "ncplib_kernel.h" 14#include "ncp_fs.h"
15 15
16static inline void assert_server_locked(struct ncp_server *server) 16static inline void assert_server_locked(struct ncp_server *server)
17{ 17{
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 1220df75ff22..09881e6aa5ad 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -32,8 +32,6 @@
32#include <linux/ctype.h> 32#include <linux/ctype.h>
33#endif /* CONFIG_NCPFS_NLS */ 33#endif /* CONFIG_NCPFS_NLS */
34 34
35#include <linux/ncp_fs.h>
36
37#define NCP_MIN_SYMLINK_SIZE 8 35#define NCP_MIN_SYMLINK_SIZE 8
38#define NCP_MAX_SYMLINK_SIZE 512 36#define NCP_MAX_SYMLINK_SIZE 512
39 37
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index d8b2d7e6910b..08907599dcd2 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -11,6 +11,7 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/ncp.h> 12#include <linux/ncp.h>
13#include <linux/bitops.h> 13#include <linux/bitops.h>
14#include "ncp_fs.h"
14#include "ncpsign_kernel.h" 15#include "ncpsign_kernel.h"
15 16
16/* i386: 32-bit, little endian, handles mis-alignment */ 17/* i386: 32-bit, little endian, handles mis-alignment */
diff --git a/fs/ncpfs/ncpsign_kernel.h b/fs/ncpfs/ncpsign_kernel.h
index 6451a68381cc..d9a1438bb1f6 100644
--- a/fs/ncpfs/ncpsign_kernel.h
+++ b/fs/ncpfs/ncpsign_kernel.h
@@ -8,8 +8,6 @@
8#ifndef _NCPSIGN_KERNEL_H 8#ifndef _NCPSIGN_KERNEL_H
9#define _NCPSIGN_KERNEL_H 9#define _NCPSIGN_KERNEL_H
10 10
11#include <linux/ncp_fs.h>
12
13#ifdef CONFIG_NCPFS_PACKET_SIGNING 11#ifdef CONFIG_NCPFS_PACKET_SIGNING
14void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff); 12void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff);
15int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff); 13int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff);
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index 668bd267346e..3a1587222c8a 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -28,7 +28,7 @@
28#include <linux/poll.h> 28#include <linux/poll.h>
29#include <linux/file.h> 29#include <linux/file.h>
30 30
31#include <linux/ncp_fs.h> 31#include "ncp_fs.h"
32 32
33#include "ncpsign_kernel.h" 33#include "ncpsign_kernel.h"
34 34
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index c634fd17b337..661f861d80c6 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -25,13 +25,11 @@
25 25
26#include <linux/errno.h> 26#include <linux/errno.h>
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/ncp_fs.h>
29#include <linux/time.h> 28#include <linux/time.h>
30#include <linux/slab.h> 29#include <linux/slab.h>
31#include <linux/mm.h> 30#include <linux/mm.h>
32#include <linux/stat.h> 31#include <linux/stat.h>
33#include "ncplib_kernel.h" 32#include "ncp_fs.h"
34
35 33
36/* these magic numbers must appear in the symlink file -- this makes it a bit 34/* these magic numbers must appear in the symlink file -- this makes it a bit
37 more resilient against the magic attributes being set on random files. */ 35 more resilient against the magic attributes being set on random files. */
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 93a8b3bd69e3..199016528fcb 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -16,9 +16,7 @@
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/kthread.h> 17#include <linux/kthread.h>
18#include <linux/sunrpc/svcauth_gss.h> 18#include <linux/sunrpc/svcauth_gss.h>
19#if defined(CONFIG_NFS_V4_1)
20#include <linux/sunrpc/bc_xprt.h> 19#include <linux/sunrpc/bc_xprt.h>
21#endif
22 20
23#include <net/inet_sock.h> 21#include <net/inet_sock.h>
24 22
@@ -137,6 +135,33 @@ out_err:
137 135
138#if defined(CONFIG_NFS_V4_1) 136#if defined(CONFIG_NFS_V4_1)
139/* 137/*
138 * CB_SEQUENCE operations will fail until the callback sessionid is set.
139 */
138 * CB_SEQUENCE operations will fail until the callback sessionid is set.
140int nfs4_set_callback_sessionid(struct nfs_client *clp)
141{
142 struct svc_serv *serv = clp->cl_rpcclient->cl_xprt->bc_serv;
143 struct nfs4_sessionid *bc_sid;
144
145 if (!serv->sv_bc_xprt)
146 return -EINVAL;
147
148 /* on success freed in xprt_free */
149 bc_sid = kmalloc(sizeof(struct nfs4_sessionid), GFP_KERNEL);
150 if (!bc_sid)
151 return -ENOMEM;
152 memcpy(bc_sid->data, &clp->cl_session->sess_id.data,
153 NFS4_MAX_SESSIONID_LEN);
154 spin_lock_bh(&serv->sv_cb_lock);
155 serv->sv_bc_xprt->xpt_bc_sid = bc_sid;
156 spin_unlock_bh(&serv->sv_cb_lock);
157 dprintk("%s set xpt_bc_sid=%u:%u:%u:%u for sv_bc_xprt %p\n", __func__,
158 ((u32 *)bc_sid->data)[0], ((u32 *)bc_sid->data)[1],
159 ((u32 *)bc_sid->data)[2], ((u32 *)bc_sid->data)[3],
160 serv->sv_bc_xprt);
161 return 0;
162}
163
164/*
140 * The callback service for NFSv4.1 callbacks 165 * The callback service for NFSv4.1 callbacks
141 */ 166 */
142static int 167static int
@@ -177,30 +202,38 @@ nfs41_callback_svc(void *vrqstp)
177struct svc_rqst * 202struct svc_rqst *
178nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) 203nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
179{ 204{
180 struct svc_xprt *bc_xprt; 205 struct svc_rqst *rqstp;
181 struct svc_rqst *rqstp = ERR_PTR(-ENOMEM); 206 int ret;
182 207
183 dprintk("--> %s\n", __func__); 208 /*
184 /* Create a svc_sock for the service */ 209 * Create an svc_sock for the back channel service that shares the
185 bc_xprt = svc_sock_create(serv, xprt->prot); 210 * fore channel connection.
186 if (!bc_xprt) 211 * Returns the input port (0) and sets the svc_serv bc_xprt on success
212 */
213 ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0,
214 SVC_SOCK_ANONYMOUS);
215 if (ret < 0) {
216 rqstp = ERR_PTR(ret);
187 goto out; 217 goto out;
218 }
188 219
189 /* 220 /*
190 * Save the svc_serv in the transport so that it can 221 * Save the svc_serv in the transport so that it can
191 * be referenced when the session backchannel is initialized 222 * be referenced when the session backchannel is initialized
192 */ 223 */
193 serv->bc_xprt = bc_xprt;
194 xprt->bc_serv = serv; 224 xprt->bc_serv = serv;
195 225
196 INIT_LIST_HEAD(&serv->sv_cb_list); 226 INIT_LIST_HEAD(&serv->sv_cb_list);
197 spin_lock_init(&serv->sv_cb_lock); 227 spin_lock_init(&serv->sv_cb_lock);
198 init_waitqueue_head(&serv->sv_cb_waitq); 228 init_waitqueue_head(&serv->sv_cb_waitq);
199 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); 229 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
200 if (IS_ERR(rqstp)) 230 if (IS_ERR(rqstp)) {
201 svc_sock_destroy(bc_xprt); 231 svc_xprt_put(serv->sv_bc_xprt);
232 serv->sv_bc_xprt = NULL;
233 }
202out: 234out:
203 dprintk("--> %s return %p\n", __func__, rqstp); 235 dprintk("--> %s return %ld\n", __func__,
236 IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
204 return rqstp; 237 return rqstp;
205} 238}
206 239
@@ -233,6 +266,10 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
233 struct nfs_callback_data *cb_info) 266 struct nfs_callback_data *cb_info)
234{ 267{
235} 268}
269int nfs4_set_callback_sessionid(struct nfs_client *clp)
270{
271 return 0;
272}
236#endif /* CONFIG_NFS_V4_1 */ 273#endif /* CONFIG_NFS_V4_1 */
237 274
238/* 275/*
@@ -328,6 +365,9 @@ static int check_gss_callback_principal(struct nfs_client *clp,
328 struct rpc_clnt *r = clp->cl_rpcclient; 365 struct rpc_clnt *r = clp->cl_rpcclient;
329 char *p = svc_gss_principal(rqstp); 366 char *p = svc_gss_principal(rqstp);
330 367
368 /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
369 if (clp->cl_minorversion != 0)
370 return SVC_DROP;
331 /* 371 /*
332 * It might just be a normal user principal, in which case 372 * It might just be a normal user principal, in which case
333 * userspace won't bother to tell us the name at all. 373 * userspace won't bother to tell us the name at all.
@@ -345,6 +385,23 @@ static int check_gss_callback_principal(struct nfs_client *clp,
345 return SVC_OK; 385 return SVC_OK;
346} 386}
347 387
388/* pg_authenticate method helper */
389static struct nfs_client *nfs_cb_find_client(struct svc_rqst *rqstp)
390{
391 struct nfs4_sessionid *sessionid = bc_xprt_sid(rqstp);
392 int is_cb_compound = rqstp->rq_proc == CB_COMPOUND ? 1 : 0;
393
394 dprintk("--> %s rq_proc %d\n", __func__, rqstp->rq_proc);
395 if (svc_is_backchannel(rqstp))
396 /* Sessionid (usually) set after CB_NULL ping */
397 return nfs4_find_client_sessionid(svc_addr(rqstp), sessionid,
398 is_cb_compound);
399 else
400 /* No callback identifier in pg_authenticate */
401 return nfs4_find_client_no_ident(svc_addr(rqstp));
402}
403
404/* pg_authenticate method for nfsv4 callback threads. */
348static int nfs_callback_authenticate(struct svc_rqst *rqstp) 405static int nfs_callback_authenticate(struct svc_rqst *rqstp)
349{ 406{
350 struct nfs_client *clp; 407 struct nfs_client *clp;
@@ -352,7 +409,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
352 int ret = SVC_OK; 409 int ret = SVC_OK;
353 410
354 /* Don't talk to strangers */ 411 /* Don't talk to strangers */
355 clp = nfs_find_client(svc_addr(rqstp), 4); 412 clp = nfs_cb_find_client(rqstp);
356 if (clp == NULL) 413 if (clp == NULL)
357 return SVC_DROP; 414 return SVC_DROP;
358 415
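[Editor's sketch] The callback.c changes make the v4.1 back channel share the fore-channel connection and record the 16-byte session ID on the shared transport; nfs4_callback_sequence() (in callback_proc.c, below) then accepts a CB_SEQUENCE only if its session ID memcmp()-matches. A compact user-space model of that handshake, with sizes matching NFS4_MAX_SESSIONID_LEN and everything else illustrative:

#include <stdio.h>
#include <string.h>

#define SESSIONID_LEN 16	/* mirrors NFS4_MAX_SESSIONID_LEN */

struct bc_xprt_model {
	unsigned char sid[SESSIONID_LEN];	/* cf. xpt_bc_sid */
	int sid_set;
};

static void set_callback_sessionid(struct bc_xprt_model *x,
				   const unsigned char *sid)
{
	memcpy(x->sid, sid, SESSIONID_LEN);
	x->sid_set = 1;
}

static int cb_sequence_accept(const struct bc_xprt_model *x,
			      const unsigned char *sid)
{
	if (!x->sid_set || memcmp(x->sid, sid, SESSIONID_LEN))
		return -1;	/* cf. NFS4ERR_BADSESSION */
	return 0;
}

int main(void)
{
	struct bc_xprt_model x = { { 0 }, 0 };
	unsigned char good[SESSIONID_LEN] = "abcdefghijklmno";
	unsigned char bad[SESSIONID_LEN] = { 0 };

	set_callback_sessionid(&x, good);
	printf("matching id: %d\n", cb_sequence_accept(&x, good));	/* 0 */
	printf("foreign id:  %d\n", cb_sequence_accept(&x, bad));	/* -1 */
	return 0;
}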
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 85a7cfd1b8dd..d3b44f9bd747 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -34,10 +34,17 @@ enum nfs4_callback_opnum {
34 OP_CB_ILLEGAL = 10044, 34 OP_CB_ILLEGAL = 10044,
35}; 35};
36 36
37struct cb_process_state {
38 __be32 drc_status;
39 struct nfs_client *clp;
40 struct nfs4_sessionid *svc_sid; /* v4.1 callback service sessionid */
41};
42
37struct cb_compound_hdr_arg { 43struct cb_compound_hdr_arg {
38 unsigned int taglen; 44 unsigned int taglen;
39 const char *tag; 45 const char *tag;
40 unsigned int minorversion; 46 unsigned int minorversion;
47 unsigned int cb_ident; /* v4.0 callback identifier */
41 unsigned nops; 48 unsigned nops;
42}; 49};
43 50
@@ -103,14 +110,23 @@ struct cb_sequenceres {
103 uint32_t csr_target_highestslotid; 110 uint32_t csr_target_highestslotid;
104}; 111};
105 112
106extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, 113extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
107 struct cb_sequenceres *res); 114 struct cb_sequenceres *res,
115 struct cb_process_state *cps);
108 116
109extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, 117extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
110 const nfs4_stateid *stateid); 118 const nfs4_stateid *stateid);
111 119
112#define RCA4_TYPE_MASK_RDATA_DLG 0 120#define RCA4_TYPE_MASK_RDATA_DLG 0
113#define RCA4_TYPE_MASK_WDATA_DLG 1 121#define RCA4_TYPE_MASK_WDATA_DLG 1
122#define RCA4_TYPE_MASK_DIR_DLG 2
123#define RCA4_TYPE_MASK_FILE_LAYOUT 3
124#define RCA4_TYPE_MASK_BLK_LAYOUT 4
125#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8
126#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9
127#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
128#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
129#define RCA4_TYPE_MASK_ALL 0xf31f
114 130
115struct cb_recallanyargs { 131struct cb_recallanyargs {
116 struct sockaddr *craa_addr; 132 struct sockaddr *craa_addr;
@@ -118,25 +134,52 @@ struct cb_recallanyargs {
118 uint32_t craa_type_mask; 134 uint32_t craa_type_mask;
119}; 135};
120 136
121extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy); 137extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
138 void *dummy,
139 struct cb_process_state *cps);
122 140
123struct cb_recallslotargs { 141struct cb_recallslotargs {
124 struct sockaddr *crsa_addr; 142 struct sockaddr *crsa_addr;
125 uint32_t crsa_target_max_slots; 143 uint32_t crsa_target_max_slots;
126}; 144};
127extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, 145extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
128 void *dummy); 146 void *dummy,
147 struct cb_process_state *cps);
148
149struct cb_layoutrecallargs {
150 struct sockaddr *cbl_addr;
151 uint32_t cbl_recall_type;
152 uint32_t cbl_layout_type;
153 uint32_t cbl_layoutchanged;
154 union {
155 struct {
156 struct nfs_fh cbl_fh;
157 struct pnfs_layout_range cbl_range;
158 nfs4_stateid cbl_stateid;
159 };
160 struct nfs_fsid cbl_fsid;
161 };
162};
129 163
130#endif /* CONFIG_NFS_V4_1 */ 164extern unsigned nfs4_callback_layoutrecall(
165 struct cb_layoutrecallargs *args,
166 void *dummy, struct cb_process_state *cps);
131 167
132extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); 168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
133extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy); 169extern void nfs4_cb_take_slot(struct nfs_client *clp);
170#endif /* CONFIG_NFS_V4_1 */
134 171
172extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
173 struct cb_getattrres *res,
174 struct cb_process_state *cps);
175extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
176 struct cb_process_state *cps);
135#ifdef CONFIG_NFS_V4 177#ifdef CONFIG_NFS_V4
136extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); 178extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
137extern void nfs_callback_down(int minorversion); 179extern void nfs_callback_down(int minorversion);
138extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, 180extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
139 const nfs4_stateid *stateid); 181 const nfs4_stateid *stateid);
182extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
140#endif /* CONFIG_NFS_V4 */ 183#endif /* CONFIG_NFS_V4 */
141/* 184/*
142 * nfs41: Callbacks are expected to not cause substantial latency, 185 * nfs41: Callbacks are expected to not cause substantial latency,
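[Editor's sketch] RCA4_TYPE_MASK_ALL (0xf31f) above is exactly the union of the defined recall-any bit positions: 0-4 for the delegation and file/block layout types, 8-9 for the object-layout range, and 12-15 for the "other" layout range. The short program below recomputes the mask and mirrors the validate_bitmap_values() check added in callback_proc.c:

#include <stdio.h>

#define BIT(n)	(1UL << (n))

static unsigned long bit_range(int lo, int hi)
{
	unsigned long mask = 0;
	int b;

	for (b = lo; b <= hi; b++)
		mask |= BIT(b);
	return mask;
}

int main(void)
{
	/* delegations + file/block layouts, object layouts, "other" range */
	unsigned long all = bit_range(0, 4) | bit_range(8, 9) |
			    bit_range(12, 15);

	printf("mask = %#lx\n", all);	/* 0xf31f */
	/* validate_bitmap_values(): reject any undefined bit */
	printf("0x3 valid:  %d\n", (0x3UL & ~all) == 0);	/* 1 */
	printf("0x40 valid: %d\n", (0x40UL & ~all) == 0);	/* 0 */
	return 0;
}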
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2950fca0c61b..4bb91cb2620d 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -12,30 +12,33 @@
12#include "callback.h" 12#include "callback.h"
13#include "delegation.h" 13#include "delegation.h"
14#include "internal.h" 14#include "internal.h"
15#include "pnfs.h"
15 16
16#ifdef NFS_DEBUG 17#ifdef NFS_DEBUG
17#define NFSDBG_FACILITY NFSDBG_CALLBACK 18#define NFSDBG_FACILITY NFSDBG_CALLBACK
18#endif 19#endif
19 20
20__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) 21__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
22 struct cb_getattrres *res,
23 struct cb_process_state *cps)
21{ 24{
22 struct nfs_client *clp;
23 struct nfs_delegation *delegation; 25 struct nfs_delegation *delegation;
24 struct nfs_inode *nfsi; 26 struct nfs_inode *nfsi;
25 struct inode *inode; 27 struct inode *inode;
26 28
29 res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
30 if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
31 goto out;
32
27 res->bitmap[0] = res->bitmap[1] = 0; 33 res->bitmap[0] = res->bitmap[1] = 0;
28 res->status = htonl(NFS4ERR_BADHANDLE); 34 res->status = htonl(NFS4ERR_BADHANDLE);
29 clp = nfs_find_client(args->addr, 4);
30 if (clp == NULL)
31 goto out;
32 35
33 dprintk("NFS: GETATTR callback request from %s\n", 36 dprintk("NFS: GETATTR callback request from %s\n",
34 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 37 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
35 38
36 inode = nfs_delegation_find_inode(clp, &args->fh); 39 inode = nfs_delegation_find_inode(cps->clp, &args->fh);
37 if (inode == NULL) 40 if (inode == NULL)
38 goto out_putclient; 41 goto out;
39 nfsi = NFS_I(inode); 42 nfsi = NFS_I(inode);
40 rcu_read_lock(); 43 rcu_read_lock();
41 delegation = rcu_dereference(nfsi->delegation); 44 delegation = rcu_dereference(nfsi->delegation);
@@ -55,49 +58,41 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
55out_iput: 58out_iput:
56 rcu_read_unlock(); 59 rcu_read_unlock();
57 iput(inode); 60 iput(inode);
58out_putclient:
59 nfs_put_client(clp);
60out: 61out:
61 dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); 62 dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
62 return res->status; 63 return res->status;
63} 64}
64 65
65__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) 66__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
67 struct cb_process_state *cps)
66{ 68{
67 struct nfs_client *clp;
68 struct inode *inode; 69 struct inode *inode;
69 __be32 res; 70 __be32 res;
70 71
71 res = htonl(NFS4ERR_BADHANDLE); 72 res = htonl(NFS4ERR_OP_NOT_IN_SESSION);
72 clp = nfs_find_client(args->addr, 4); 73 if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
73 if (clp == NULL)
74 goto out; 74 goto out;
75 75
76 dprintk("NFS: RECALL callback request from %s\n", 76 dprintk("NFS: RECALL callback request from %s\n",
77 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 77 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
78 78
79 do { 79 res = htonl(NFS4ERR_BADHANDLE);
80 struct nfs_client *prev = clp; 80 inode = nfs_delegation_find_inode(cps->clp, &args->fh);
81 81 if (inode == NULL)
82 inode = nfs_delegation_find_inode(clp, &args->fh); 82 goto out;
83 if (inode != NULL) { 83 /* Set up a helper thread to actually return the delegation */
84 /* Set up a helper thread to actually return the delegation */ 84 switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
85 switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { 85 case 0:
86 case 0: 86 res = 0;
87 res = 0; 87 break;
88 break; 88 case -ENOENT:
89 case -ENOENT: 89 if (res != 0)
90 if (res != 0) 90 res = htonl(NFS4ERR_BAD_STATEID);
91 res = htonl(NFS4ERR_BAD_STATEID); 91 break;
92 break; 92 default:
93 default: 93 res = htonl(NFS4ERR_RESOURCE);
94 res = htonl(NFS4ERR_RESOURCE); 94 }
95 } 95 iput(inode);
96 iput(inode);
97 }
98 clp = nfs_find_client_next(prev);
99 nfs_put_client(prev);
100 } while (clp != NULL);
101out: 96out:
102 dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); 97 dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
103 return res; 98 return res;
@@ -113,6 +108,139 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
113 108
114#if defined(CONFIG_NFS_V4_1) 109#if defined(CONFIG_NFS_V4_1)
115 110
111static u32 initiate_file_draining(struct nfs_client *clp,
112 struct cb_layoutrecallargs *args)
113{
114 struct pnfs_layout_hdr *lo;
115 struct inode *ino;
116 bool found = false;
117 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
118 LIST_HEAD(free_me_list);
119
120 spin_lock(&clp->cl_lock);
121 list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
122 if (nfs_compare_fh(&args->cbl_fh,
123 &NFS_I(lo->plh_inode)->fh))
124 continue;
125 ino = igrab(lo->plh_inode);
126 if (!ino)
127 continue;
128 found = true;
129 /* Without this, layout can be freed as soon
130 * as we release cl_lock.
131 */
132 get_layout_hdr(lo);
133 break;
134 }
135 spin_unlock(&clp->cl_lock);
136 if (!found)
137 return NFS4ERR_NOMATCHING_LAYOUT;
138
139 spin_lock(&ino->i_lock);
140 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
141 mark_matching_lsegs_invalid(lo, &free_me_list,
142 args->cbl_range.iomode))
143 rv = NFS4ERR_DELAY;
144 else
145 rv = NFS4ERR_NOMATCHING_LAYOUT;
146 pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
147 spin_unlock(&ino->i_lock);
148 pnfs_free_lseg_list(&free_me_list);
149 put_layout_hdr(lo);
150 iput(ino);
151 return rv;
152}
153
154static u32 initiate_bulk_draining(struct nfs_client *clp,
155 struct cb_layoutrecallargs *args)
156{
157 struct pnfs_layout_hdr *lo;
158 struct inode *ino;
159 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
160 struct pnfs_layout_hdr *tmp;
161 LIST_HEAD(recall_list);
162 LIST_HEAD(free_me_list);
163 struct pnfs_layout_range range = {
164 .iomode = IOMODE_ANY,
165 .offset = 0,
166 .length = NFS4_MAX_UINT64,
167 };
168
169 spin_lock(&clp->cl_lock);
170 list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
171 if ((args->cbl_recall_type == RETURN_FSID) &&
172 memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
173 &args->cbl_fsid, sizeof(struct nfs_fsid)))
174 continue;
175 if (!igrab(lo->plh_inode))
176 continue;
177 get_layout_hdr(lo);
178 BUG_ON(!list_empty(&lo->plh_bulk_recall));
179 list_add(&lo->plh_bulk_recall, &recall_list);
180 }
181 spin_unlock(&clp->cl_lock);
182 list_for_each_entry_safe(lo, tmp,
183 &recall_list, plh_bulk_recall) {
184 ino = lo->plh_inode;
185 spin_lock(&ino->i_lock);
186 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
187 if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
188 rv = NFS4ERR_DELAY;
189 list_del_init(&lo->plh_bulk_recall);
190 spin_unlock(&ino->i_lock);
191 put_layout_hdr(lo);
192 iput(ino);
193 }
194 pnfs_free_lseg_list(&free_me_list);
195 return rv;
196}
197
198static u32 do_callback_layoutrecall(struct nfs_client *clp,
199 struct cb_layoutrecallargs *args)
200{
201 u32 res = NFS4ERR_DELAY;
202
203 dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
204 if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
205 goto out;
206 if (args->cbl_recall_type == RETURN_FILE)
207 res = initiate_file_draining(clp, args);
208 else
209 res = initiate_bulk_draining(clp, args);
210 clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
211out:
212 dprintk("%s returning %i\n", __func__, res);
213 return res;
214
215}
216
217__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
218 void *dummy, struct cb_process_state *cps)
219{
220 u32 res;
221
222 dprintk("%s: -->\n", __func__);
223
224 if (cps->clp)
225 res = do_callback_layoutrecall(cps->clp, args);
226 else
227 res = NFS4ERR_OP_NOT_IN_SESSION;
228
229 dprintk("%s: exit with status = %d\n", __func__, res);
230 return cpu_to_be32(res);
231}
232
233static void pnfs_recall_all_layouts(struct nfs_client *clp)
234{
235 struct cb_layoutrecallargs args;
236
237 /* Pretend we got a CB_LAYOUTRECALL(ALL) */
238 memset(&args, 0, sizeof(args));
239 args.cbl_recall_type = RETURN_ALL;
240 /* FIXME we ignore errors, what should we do? */
241 do_callback_layoutrecall(clp, &args);
242}
243
116int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) 244int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
117{ 245{
118 if (delegation == NULL) 246 if (delegation == NULL)
@@ -185,42 +313,6 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
185} 313}
186 314
187/* 315/*
188 * Returns a pointer to a held 'struct nfs_client' that matches the server's
189 * address, major version number, and session ID. It is the caller's
190 * responsibility to release the returned reference.
191 *
192 * Returns NULL if there are no connections with sessions, or if no session
193 * matches the one of interest.
194 */
195 static struct nfs_client *find_client_with_session(
196 const struct sockaddr *addr, u32 nfsversion,
197 struct nfs4_sessionid *sessionid)
198{
199 struct nfs_client *clp;
200
201 clp = nfs_find_client(addr, 4);
202 if (clp == NULL)
203 return NULL;
204
205 do {
206 struct nfs_client *prev = clp;
207
208 if (clp->cl_session != NULL) {
209 if (memcmp(clp->cl_session->sess_id.data,
210 sessionid->data,
211 NFS4_MAX_SESSIONID_LEN) == 0) {
212 /* Returns a held reference to clp */
213 return clp;
214 }
215 }
216 clp = nfs_find_client_next(prev);
217 nfs_put_client(prev);
218 } while (clp != NULL);
219
220 return NULL;
221}
222
223/*
224 * For each referring call triple, check the session's slot table for 316 * For each referring call triple, check the session's slot table for
225 * a match. If the slot is in use and the sequence numbers match, the 317 * a match. If the slot is in use and the sequence numbers match, the
226 * client is still waiting for a response to the original request. 318 * client is still waiting for a response to the original request.
@@ -276,20 +368,34 @@ out:
276} 368}
277 369
278__be32 nfs4_callback_sequence(struct cb_sequenceargs *args, 370__be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
279 struct cb_sequenceres *res) 371 struct cb_sequenceres *res,
372 struct cb_process_state *cps)
280{ 373{
281 struct nfs_client *clp; 374 struct nfs_client *clp;
282 int i; 375 int i;
283 __be32 status; 376 __be32 status;
284 377
378 cps->clp = NULL;
379
285 status = htonl(NFS4ERR_BADSESSION); 380 status = htonl(NFS4ERR_BADSESSION);
286 clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); 381 /* Incoming session must match the callback session */
382 if (memcmp(&args->csa_sessionid, cps->svc_sid, NFS4_MAX_SESSIONID_LEN))
383 goto out;
384
385 clp = nfs4_find_client_sessionid(args->csa_addr,
386 &args->csa_sessionid, 1);
287 if (clp == NULL) 387 if (clp == NULL)
288 goto out; 388 goto out;
289 389
390 /* state manager is resetting the session */
391 if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
 392 status = htonl(NFS4ERR_DELAY);
393 goto out;
394 }
395
290 status = validate_seqid(&clp->cl_session->bc_slot_table, args); 396 status = validate_seqid(&clp->cl_session->bc_slot_table, args);
291 if (status) 397 if (status)
292 goto out_putclient; 398 goto out;
293 399
294 /* 400 /*
295 * Check for pending referring calls. If a match is found, a 401 * Check for pending referring calls. If a match is found, a
@@ -298,7 +404,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
298 */ 404 */
299 if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { 405 if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
300 status = htonl(NFS4ERR_DELAY); 406 status = htonl(NFS4ERR_DELAY);
301 goto out_putclient; 407 goto out;
302 } 408 }
303 409
304 memcpy(&res->csr_sessionid, &args->csa_sessionid, 410 memcpy(&res->csr_sessionid, &args->csa_sessionid,
@@ -307,83 +413,93 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
307 res->csr_slotid = args->csa_slotid; 413 res->csr_slotid = args->csa_slotid;
308 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 414 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
309 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 415 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
416 nfs4_cb_take_slot(clp);
417 cps->clp = clp; /* put in nfs4_callback_compound */
310 418
311out_putclient:
312 nfs_put_client(clp);
313out: 419out:
314 for (i = 0; i < args->csa_nrclists; i++) 420 for (i = 0; i < args->csa_nrclists; i++)
315 kfree(args->csa_rclists[i].rcl_refcalls); 421 kfree(args->csa_rclists[i].rcl_refcalls);
316 kfree(args->csa_rclists); 422 kfree(args->csa_rclists);
317 423
318 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) 424 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
319 res->csr_status = 0; 425 cps->drc_status = status;
320 else 426 status = 0;
427 } else
321 res->csr_status = status; 428 res->csr_status = status;
429
322 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, 430 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
323 ntohl(status), ntohl(res->csr_status)); 431 ntohl(status), ntohl(res->csr_status));
324 return status; 432 return status;
325} 433}
326 434
327__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) 435static bool
436validate_bitmap_values(unsigned long mask)
437{
438 return (mask & ~RCA4_TYPE_MASK_ALL) == 0;
439}
440
441__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
442 struct cb_process_state *cps)
328{ 443{
329 struct nfs_client *clp;
330 __be32 status; 444 __be32 status;
331 fmode_t flags = 0; 445 fmode_t flags = 0;
332 446
333 status = htonl(NFS4ERR_OP_NOT_IN_SESSION); 447 status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
334 clp = nfs_find_client(args->craa_addr, 4); 448 if (!cps->clp) /* set in cb_sequence */
335 if (clp == NULL)
336 goto out; 449 goto out;
337 450
338 dprintk("NFS: RECALL_ANY callback request from %s\n", 451 dprintk("NFS: RECALL_ANY callback request from %s\n",
339 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 452 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
453
454 status = cpu_to_be32(NFS4ERR_INVAL);
455 if (!validate_bitmap_values(args->craa_type_mask))
456 goto out;
340 457
458 status = cpu_to_be32(NFS4_OK);
341 if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) 459 if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
342 &args->craa_type_mask)) 460 &args->craa_type_mask))
343 flags = FMODE_READ; 461 flags = FMODE_READ;
344 if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) 462 if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
345 &args->craa_type_mask)) 463 &args->craa_type_mask))
346 flags |= FMODE_WRITE; 464 flags |= FMODE_WRITE;
347 465 if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
466 &args->craa_type_mask))
467 pnfs_recall_all_layouts(cps->clp);
348 if (flags) 468 if (flags)
349 nfs_expire_all_delegation_types(clp, flags); 469 nfs_expire_all_delegation_types(cps->clp, flags);
350 status = htonl(NFS4_OK);
351out: 470out:
352 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 471 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
353 return status; 472 return status;
354} 473}
355 474
356/* Reduce the fore channel's max_slots to the target value */ 475/* Reduce the fore channel's max_slots to the target value */
357__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy) 476__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
477 struct cb_process_state *cps)
358{ 478{
359 struct nfs_client *clp;
360 struct nfs4_slot_table *fc_tbl; 479 struct nfs4_slot_table *fc_tbl;
361 __be32 status; 480 __be32 status;
362 481
363 status = htonl(NFS4ERR_OP_NOT_IN_SESSION); 482 status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
364 clp = nfs_find_client(args->crsa_addr, 4); 483 if (!cps->clp) /* set in cb_sequence */
365 if (clp == NULL)
366 goto out; 484 goto out;
367 485
368 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", 486 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
369 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), 487 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
370 args->crsa_target_max_slots); 488 args->crsa_target_max_slots);
371 489
372 fc_tbl = &clp->cl_session->fc_slot_table; 490 fc_tbl = &cps->clp->cl_session->fc_slot_table;
373 491
374 status = htonl(NFS4ERR_BAD_HIGH_SLOT); 492 status = htonl(NFS4ERR_BAD_HIGH_SLOT);
375 if (args->crsa_target_max_slots > fc_tbl->max_slots || 493 if (args->crsa_target_max_slots > fc_tbl->max_slots ||
376 args->crsa_target_max_slots < 1) 494 args->crsa_target_max_slots < 1)
377 goto out_putclient; 495 goto out;
378 496
379 status = htonl(NFS4_OK); 497 status = htonl(NFS4_OK);
380 if (args->crsa_target_max_slots == fc_tbl->max_slots) 498 if (args->crsa_target_max_slots == fc_tbl->max_slots)
381 goto out_putclient; 499 goto out;
382 500
383 fc_tbl->target_max_slots = args->crsa_target_max_slots; 501 fc_tbl->target_max_slots = args->crsa_target_max_slots;
384 nfs41_handle_recall_slot(clp); 502 nfs41_handle_recall_slot(cps->clp);
385out_putclient:
386 nfs_put_client(clp); /* balance nfs_find_client */
387out: 503out:
388 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 504 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
389 return status; 505 return status;
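
The callback_proc.c changes above all follow one pattern: CB_SEQUENCE resolves and references the client once per CB_COMPOUND, stashes it in struct cb_process_state, and every later op consumes cps->clp instead of doing its own nfs_find_client()/nfs_put_client() pair. A minimal sketch of that shape in plain C, with stand-in types and error values (none of this is kernel code):

#include <stddef.h>

struct client;				/* stand-in for struct nfs_client */

struct cb_process_state {
	int drc_status;			/* replay status cached by CB_SEQUENCE */
	struct client *clp;		/* set by CB_SEQUENCE, put when compound ends */
};

/* CB_SEQUENCE is the only op allowed to establish the client binding. */
static int op_sequence(struct cb_process_state *cps, struct client *found)
{
	cps->clp = found;		/* reference held until the compound completes */
	return found ? 0 : -1;		/* -1 stands in for NFS4ERR_BADSESSION */
}

/* Every other op refuses to run unless CB_SEQUENCE came first. */
static int op_recall_any(struct cb_process_state *cps)
{
	if (cps->clp == NULL)
		return -2;		/* stands in for NFS4ERR_OP_NOT_IN_SESSION */
	/* ... act on cps->clp ... */
	return 0;
}

In the patch itself, nfs4_callback_compound() owns the final nfs_put_client(cps.clp), which is why the per-op out_putclient labels disappear above.
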
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 05af212f0edf..23112c263f81 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -10,8 +10,10 @@
10#include <linux/nfs4.h> 10#include <linux/nfs4.h>
11#include <linux/nfs_fs.h> 11#include <linux/nfs_fs.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/sunrpc/bc_xprt.h>
13#include "nfs4_fs.h" 14#include "nfs4_fs.h"
14#include "callback.h" 15#include "callback.h"
16#include "internal.h"
15 17
16#define CB_OP_TAGLEN_MAXSZ (512) 18#define CB_OP_TAGLEN_MAXSZ (512)
17#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) 19#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ)
@@ -22,6 +24,7 @@
22#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 24#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
23 25
24#if defined(CONFIG_NFS_V4_1) 26#if defined(CONFIG_NFS_V4_1)
27#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
25#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 28#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
26 4 + 1 + 3) 29 4 + 1 + 3)
27#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 30#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
@@ -33,7 +36,8 @@
33/* Internal error code */ 36/* Internal error code */
34#define NFS4ERR_RESOURCE_HDR 11050 37#define NFS4ERR_RESOURCE_HDR 11050
35 38
36typedef __be32 (*callback_process_op_t)(void *, void *); 39typedef __be32 (*callback_process_op_t)(void *, void *,
40 struct cb_process_state *);
37typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); 41typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
38typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); 42typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
39 43
@@ -160,7 +164,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
160 hdr->minorversion = ntohl(*p++); 164 hdr->minorversion = ntohl(*p++);
161 /* Check minor version is zero or one. */ 165 /* Check minor version is zero or one. */
162 if (hdr->minorversion <= 1) { 166 if (hdr->minorversion <= 1) {
163 p++; /* skip callback_ident */ 167 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
164 } else { 168 } else {
165 printk(KERN_WARNING "%s: NFSv4 server callback with " 169 printk(KERN_WARNING "%s: NFSv4 server callback with "
166 "illegal minor version %u!\n", 170 "illegal minor version %u!\n",
@@ -220,6 +224,66 @@ out:
220 224
221#if defined(CONFIG_NFS_V4_1) 225#if defined(CONFIG_NFS_V4_1)
222 226
227static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
228 struct xdr_stream *xdr,
229 struct cb_layoutrecallargs *args)
230{
231 __be32 *p;
232 __be32 status = 0;
233 uint32_t iomode;
234
235 args->cbl_addr = svc_addr(rqstp);
236 p = read_buf(xdr, 4 * sizeof(uint32_t));
237 if (unlikely(p == NULL)) {
238 status = htonl(NFS4ERR_BADXDR);
239 goto out;
240 }
241
242 args->cbl_layout_type = ntohl(*p++);
 243 /* Despite the spec's xdr, iomode really belongs in the FILE switch,
 244 * as it is unusable and ignored with the other types.
245 */
246 iomode = ntohl(*p++);
247 args->cbl_layoutchanged = ntohl(*p++);
248 args->cbl_recall_type = ntohl(*p++);
249
250 if (args->cbl_recall_type == RETURN_FILE) {
251 args->cbl_range.iomode = iomode;
252 status = decode_fh(xdr, &args->cbl_fh);
253 if (unlikely(status != 0))
254 goto out;
255
256 p = read_buf(xdr, 2 * sizeof(uint64_t));
257 if (unlikely(p == NULL)) {
258 status = htonl(NFS4ERR_BADXDR);
259 goto out;
260 }
261 p = xdr_decode_hyper(p, &args->cbl_range.offset);
262 p = xdr_decode_hyper(p, &args->cbl_range.length);
263 status = decode_stateid(xdr, &args->cbl_stateid);
264 if (unlikely(status != 0))
265 goto out;
266 } else if (args->cbl_recall_type == RETURN_FSID) {
267 p = read_buf(xdr, 2 * sizeof(uint64_t));
268 if (unlikely(p == NULL)) {
269 status = htonl(NFS4ERR_BADXDR);
270 goto out;
271 }
272 p = xdr_decode_hyper(p, &args->cbl_fsid.major);
273 p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
274 } else if (args->cbl_recall_type != RETURN_ALL) {
275 status = htonl(NFS4ERR_BADXDR);
276 goto out;
277 }
278 dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n",
279 __func__,
280 args->cbl_layout_type, iomode,
281 args->cbl_layoutchanged, args->cbl_recall_type);
282out:
283 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
284 return status;
285}
286
223static __be32 decode_sessionid(struct xdr_stream *xdr, 287static __be32 decode_sessionid(struct xdr_stream *xdr,
224 struct nfs4_sessionid *sid) 288 struct nfs4_sessionid *sid)
225{ 289{
@@ -574,10 +638,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
574 case OP_CB_SEQUENCE: 638 case OP_CB_SEQUENCE:
575 case OP_CB_RECALL_ANY: 639 case OP_CB_RECALL_ANY:
576 case OP_CB_RECALL_SLOT: 640 case OP_CB_RECALL_SLOT:
641 case OP_CB_LAYOUTRECALL:
577 *op = &callback_ops[op_nr]; 642 *op = &callback_ops[op_nr];
578 break; 643 break;
579 644
580 case OP_CB_LAYOUTRECALL:
581 case OP_CB_NOTIFY_DEVICEID: 645 case OP_CB_NOTIFY_DEVICEID:
582 case OP_CB_NOTIFY: 646 case OP_CB_NOTIFY:
583 case OP_CB_PUSH_DELEG: 647 case OP_CB_PUSH_DELEG:
@@ -593,6 +657,37 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
593 return htonl(NFS_OK); 657 return htonl(NFS_OK);
594} 658}
595 659
660static void nfs4_callback_free_slot(struct nfs4_session *session)
661{
662 struct nfs4_slot_table *tbl = &session->bc_slot_table;
663
664 spin_lock(&tbl->slot_tbl_lock);
665 /*
 666 * Let the state manager know callback processing is done.
667 * A single slot, so highest used slotid is either 0 or -1
668 */
669 tbl->highest_used_slotid--;
670 nfs4_check_drain_bc_complete(session);
671 spin_unlock(&tbl->slot_tbl_lock);
672}
673
674static void nfs4_cb_free_slot(struct nfs_client *clp)
675{
676 if (clp && clp->cl_session)
677 nfs4_callback_free_slot(clp->cl_session);
678}
679
680/* A single slot, so highest used slotid is either 0 or -1 */
681void nfs4_cb_take_slot(struct nfs_client *clp)
682{
683 struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table;
684
685 spin_lock(&tbl->slot_tbl_lock);
686 tbl->highest_used_slotid++;
687 BUG_ON(tbl->highest_used_slotid != 0);
688 spin_unlock(&tbl->slot_tbl_lock);
689}
690
596#else /* CONFIG_NFS_V4_1 */ 691#else /* CONFIG_NFS_V4_1 */
597 692
598static __be32 693static __be32
@@ -601,6 +696,9 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
601 return htonl(NFS4ERR_MINOR_VERS_MISMATCH); 696 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
602} 697}
603 698
699static void nfs4_cb_free_slot(struct nfs_client *clp)
700{
701}
604#endif /* CONFIG_NFS_V4_1 */ 702#endif /* CONFIG_NFS_V4_1 */
605 703
606static __be32 704static __be32
@@ -621,7 +719,8 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
621static __be32 process_op(uint32_t minorversion, int nop, 719static __be32 process_op(uint32_t minorversion, int nop,
622 struct svc_rqst *rqstp, 720 struct svc_rqst *rqstp,
623 struct xdr_stream *xdr_in, void *argp, 721 struct xdr_stream *xdr_in, void *argp,
624 struct xdr_stream *xdr_out, void *resp, int* drc_status) 722 struct xdr_stream *xdr_out, void *resp,
723 struct cb_process_state *cps)
625{ 724{
626 struct callback_op *op = &callback_ops[0]; 725 struct callback_op *op = &callback_ops[0];
627 unsigned int op_nr; 726 unsigned int op_nr;
@@ -644,8 +743,8 @@ static __be32 process_op(uint32_t minorversion, int nop,
644 if (status) 743 if (status)
645 goto encode_hdr; 744 goto encode_hdr;
646 745
647 if (*drc_status) { 746 if (cps->drc_status) {
648 status = *drc_status; 747 status = cps->drc_status;
649 goto encode_hdr; 748 goto encode_hdr;
650 } 749 }
651 750
@@ -653,16 +752,10 @@ static __be32 process_op(uint32_t minorversion, int nop,
653 if (maxlen > 0 && maxlen < PAGE_SIZE) { 752 if (maxlen > 0 && maxlen < PAGE_SIZE) {
654 status = op->decode_args(rqstp, xdr_in, argp); 753 status = op->decode_args(rqstp, xdr_in, argp);
655 if (likely(status == 0)) 754 if (likely(status == 0))
656 status = op->process_op(argp, resp); 755 status = op->process_op(argp, resp, cps);
657 } else 756 } else
658 status = htonl(NFS4ERR_RESOURCE); 757 status = htonl(NFS4ERR_RESOURCE);
659 758
660 /* Only set by OP_CB_SEQUENCE processing */
661 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
662 *drc_status = status;
663 status = 0;
664 }
665
666encode_hdr: 759encode_hdr:
667 res = encode_op_hdr(xdr_out, op_nr, status); 760 res = encode_op_hdr(xdr_out, op_nr, status);
668 if (unlikely(res)) 761 if (unlikely(res))
@@ -681,8 +774,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
681 struct cb_compound_hdr_arg hdr_arg = { 0 }; 774 struct cb_compound_hdr_arg hdr_arg = { 0 };
682 struct cb_compound_hdr_res hdr_res = { NULL }; 775 struct cb_compound_hdr_res hdr_res = { NULL };
683 struct xdr_stream xdr_in, xdr_out; 776 struct xdr_stream xdr_in, xdr_out;
684 __be32 *p; 777 __be32 *p, status;
685 __be32 status, drc_status = 0; 778 struct cb_process_state cps = {
779 .drc_status = 0,
780 .clp = NULL,
781 };
686 unsigned int nops = 0; 782 unsigned int nops = 0;
687 783
688 dprintk("%s: start\n", __func__); 784 dprintk("%s: start\n", __func__);
@@ -696,6 +792,13 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
696 if (status == __constant_htonl(NFS4ERR_RESOURCE)) 792 if (status == __constant_htonl(NFS4ERR_RESOURCE))
697 return rpc_garbage_args; 793 return rpc_garbage_args;
698 794
795 if (hdr_arg.minorversion == 0) {
796 cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident);
797 if (!cps.clp)
798 return rpc_drop_reply;
799 } else
800 cps.svc_sid = bc_xprt_sid(rqstp);
801
699 hdr_res.taglen = hdr_arg.taglen; 802 hdr_res.taglen = hdr_arg.taglen;
700 hdr_res.tag = hdr_arg.tag; 803 hdr_res.tag = hdr_arg.tag;
701 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) 804 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
@@ -703,7 +806,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
703 806
704 while (status == 0 && nops != hdr_arg.nops) { 807 while (status == 0 && nops != hdr_arg.nops) {
705 status = process_op(hdr_arg.minorversion, nops, rqstp, 808 status = process_op(hdr_arg.minorversion, nops, rqstp,
706 &xdr_in, argp, &xdr_out, resp, &drc_status); 809 &xdr_in, argp, &xdr_out, resp, &cps);
707 nops++; 810 nops++;
708 } 811 }
709 812
@@ -716,6 +819,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
716 819
717 *hdr_res.status = status; 820 *hdr_res.status = status;
718 *hdr_res.nops = htonl(nops); 821 *hdr_res.nops = htonl(nops);
822 nfs4_cb_free_slot(cps.clp);
823 nfs_put_client(cps.clp);
719 dprintk("%s: done, status = %u\n", __func__, ntohl(status)); 824 dprintk("%s: done, status = %u\n", __func__, ntohl(status));
720 return rpc_success; 825 return rpc_success;
721} 826}
@@ -739,6 +844,12 @@ static struct callback_op callback_ops[] = {
739 .res_maxsize = CB_OP_RECALL_RES_MAXSZ, 844 .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
740 }, 845 },
741#if defined(CONFIG_NFS_V4_1) 846#if defined(CONFIG_NFS_V4_1)
847 [OP_CB_LAYOUTRECALL] = {
848 .process_op = (callback_process_op_t)nfs4_callback_layoutrecall,
849 .decode_args =
850 (callback_decode_arg_t)decode_layoutrecall_args,
851 .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
852 },
742 [OP_CB_SEQUENCE] = { 853 [OP_CB_SEQUENCE] = {
743 .process_op = (callback_process_op_t)nfs4_callback_sequence, 854 .process_op = (callback_process_op_t)nfs4_callback_sequence,
744 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, 855 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
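
The nfs4_cb_take_slot()/nfs4_callback_free_slot() pair added above maintains a one-slot back channel, so highest_used_slotid only ever toggles between -1 (slot free) and 0 (slot in use). A self-contained sketch of that invariant, with a plain int standing in for the spinlock-protected slot table (illustrative, not kernel code):

#include <assert.h>

static int highest_used_slotid = -1;	/* -1: slot free, 0: slot in use */

static void cb_take_slot(void)
{
	highest_used_slotid++;
	assert(highest_used_slotid == 0);	/* mirrors the BUG_ON() above */
}

static void cb_free_slot(void)
{
	highest_used_slotid--;
	assert(highest_used_slotid == -1);	/* slot is free again */
}

nfs4_callback_compound() calls nfs4_cb_free_slot() on exit, so take and free stay paired once per compound.
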
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 0870d0d4efc0..192f2f860265 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -56,6 +56,30 @@ static DEFINE_SPINLOCK(nfs_client_lock);
56static LIST_HEAD(nfs_client_list); 56static LIST_HEAD(nfs_client_list);
57static LIST_HEAD(nfs_volume_list); 57static LIST_HEAD(nfs_volume_list);
58static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); 58static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
59#ifdef CONFIG_NFS_V4
60static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */
61
62/*
63 * Get a unique NFSv4.0 callback identifier which will be used
 64 * by the V4.0 callback service to look up the nfs_client struct
65 */
66static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
67{
68 int ret = 0;
69
70 if (clp->rpc_ops->version != 4 || minorversion != 0)
71 return ret;
72retry:
73 if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL))
74 return -ENOMEM;
75 spin_lock(&nfs_client_lock);
76 ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident);
77 spin_unlock(&nfs_client_lock);
78 if (ret == -EAGAIN)
79 goto retry;
80 return ret;
81}
82#endif /* CONFIG_NFS_V4 */
59 83
60/* 84/*
61 * RPC cruft for NFS 85 * RPC cruft for NFS
@@ -144,7 +168,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
144 clp->cl_proto = cl_init->proto; 168 clp->cl_proto = cl_init->proto;
145 169
146#ifdef CONFIG_NFS_V4 170#ifdef CONFIG_NFS_V4
147 INIT_LIST_HEAD(&clp->cl_delegations); 171 err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
172 if (err)
173 goto error_cleanup;
174
148 spin_lock_init(&clp->cl_lock); 175 spin_lock_init(&clp->cl_lock);
149 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 176 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
150 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); 177 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -170,21 +197,17 @@ error_0:
170} 197}
171 198
172#ifdef CONFIG_NFS_V4 199#ifdef CONFIG_NFS_V4
173/*
174 * Clears/puts all minor version specific parts from an nfs_client struct
175 * reverting it to minorversion 0.
176 */
177static void nfs4_clear_client_minor_version(struct nfs_client *clp)
178{
179#ifdef CONFIG_NFS_V4_1 200#ifdef CONFIG_NFS_V4_1
180 if (nfs4_has_session(clp)) { 201static void nfs4_shutdown_session(struct nfs_client *clp)
202{
203 if (nfs4_has_session(clp))
181 nfs4_destroy_session(clp->cl_session); 204 nfs4_destroy_session(clp->cl_session);
182 clp->cl_session = NULL;
183 }
184
185 clp->cl_mvops = nfs_v4_minor_ops[0];
186#endif /* CONFIG_NFS_V4_1 */
187} 205}
206#else /* CONFIG_NFS_V4_1 */
207static void nfs4_shutdown_session(struct nfs_client *clp)
208{
209}
210#endif /* CONFIG_NFS_V4_1 */
188 211
189/* 212/*
190 * Destroy the NFS4 callback service 213 * Destroy the NFS4 callback service
@@ -199,17 +222,49 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
199{ 222{
200 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) 223 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
201 nfs4_kill_renewd(clp); 224 nfs4_kill_renewd(clp);
202 nfs4_clear_client_minor_version(clp); 225 nfs4_shutdown_session(clp);
203 nfs4_destroy_callback(clp); 226 nfs4_destroy_callback(clp);
204 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) 227 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
205 nfs_idmap_delete(clp); 228 nfs_idmap_delete(clp);
206 229
207 rpc_destroy_wait_queue(&clp->cl_rpcwaitq); 230 rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
208} 231}
232
 233/* idr_remove_all is not needed as all ids are removed by nfs_put_client */
234void nfs_cleanup_cb_ident_idr(void)
235{
236 idr_destroy(&cb_ident_idr);
237}
238
239/* nfs_client_lock held */
240static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
241{
242 if (clp->cl_cb_ident)
243 idr_remove(&cb_ident_idr, clp->cl_cb_ident);
244}
245
246static void pnfs_init_server(struct nfs_server *server)
247{
248 rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
249}
250
209#else 251#else
210static void nfs4_shutdown_client(struct nfs_client *clp) 252static void nfs4_shutdown_client(struct nfs_client *clp)
211{ 253{
212} 254}
255
256void nfs_cleanup_cb_ident_idr(void)
257{
258}
259
260static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
261{
262}
263
264static void pnfs_init_server(struct nfs_server *server)
265{
266}
267
213#endif /* CONFIG_NFS_V4 */ 268#endif /* CONFIG_NFS_V4 */
214 269
215/* 270/*
@@ -248,6 +303,7 @@ void nfs_put_client(struct nfs_client *clp)
248 303
249 if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { 304 if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) {
250 list_del(&clp->cl_share_link); 305 list_del(&clp->cl_share_link);
306 nfs_cb_idr_remove_locked(clp);
251 spin_unlock(&nfs_client_lock); 307 spin_unlock(&nfs_client_lock);
252 308
253 BUG_ON(!list_empty(&clp->cl_superblocks)); 309 BUG_ON(!list_empty(&clp->cl_superblocks));
@@ -363,70 +419,28 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
363 return 0; 419 return 0;
364} 420}
365 421
366/* 422/* Common match routine for v4.0 and v4.1 callback services */
367 * Find a client by IP address and protocol version 423bool
368 * - returns NULL if no such client 424nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp,
369 */ 425 u32 minorversion)
370struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
371{
372 struct nfs_client *clp;
373
374 spin_lock(&nfs_client_lock);
375 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
376 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
377
378 /* Don't match clients that failed to initialise properly */
379 if (!(clp->cl_cons_state == NFS_CS_READY ||
380 clp->cl_cons_state == NFS_CS_SESSION_INITING))
381 continue;
382
383 /* Different NFS versions cannot share the same nfs_client */
384 if (clp->rpc_ops->version != nfsversion)
385 continue;
386
387 /* Match only the IP address, not the port number */
388 if (!nfs_sockaddr_match_ipaddr(addr, clap))
389 continue;
390
391 atomic_inc(&clp->cl_count);
392 spin_unlock(&nfs_client_lock);
393 return clp;
394 }
395 spin_unlock(&nfs_client_lock);
396 return NULL;
397}
398
399/*
400 * Find a client by IP address and protocol version
401 * - returns NULL if no such client
402 */
403struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
404{ 426{
405 struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr; 427 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
406 u32 nfsvers = clp->rpc_ops->version;
407 428
408 spin_lock(&nfs_client_lock); 429 /* Don't match clients that failed to initialise */
409 list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) { 430 if (!(clp->cl_cons_state == NFS_CS_READY ||
410 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; 431 clp->cl_cons_state == NFS_CS_SESSION_INITING))
432 return false;
411 433
412 /* Don't match clients that failed to initialise properly */ 434 /* Match the version and minorversion */
413 if (clp->cl_cons_state != NFS_CS_READY) 435 if (clp->rpc_ops->version != 4 ||
414 continue; 436 clp->cl_minorversion != minorversion)
437 return false;
415 438
416 /* Different NFS versions cannot share the same nfs_client */ 439 /* Match only the IP address, not the port number */
417 if (clp->rpc_ops->version != nfsvers) 440 if (!nfs_sockaddr_match_ipaddr(addr, clap))
418 continue; 441 return false;
419 442
420 /* Match only the IP address, not the port number */ 443 return true;
421 if (!nfs_sockaddr_match_ipaddr(sap, clap))
422 continue;
423
424 atomic_inc(&clp->cl_count);
425 spin_unlock(&nfs_client_lock);
426 return clp;
427 }
428 spin_unlock(&nfs_client_lock);
429 return NULL;
430} 444}
431 445
432/* 446/*
@@ -988,6 +1002,27 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
988 target->options = source->options; 1002 target->options = source->options;
989} 1003}
990 1004
1005static void nfs_server_insert_lists(struct nfs_server *server)
1006{
1007 struct nfs_client *clp = server->nfs_client;
1008
1009 spin_lock(&nfs_client_lock);
1010 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
1011 list_add_tail(&server->master_link, &nfs_volume_list);
1012 spin_unlock(&nfs_client_lock);
1013
1014}
1015
1016static void nfs_server_remove_lists(struct nfs_server *server)
1017{
1018 spin_lock(&nfs_client_lock);
1019 list_del_rcu(&server->client_link);
1020 list_del(&server->master_link);
1021 spin_unlock(&nfs_client_lock);
1022
1023 synchronize_rcu();
1024}
1025
991/* 1026/*
992 * Allocate and initialise a server record 1027 * Allocate and initialise a server record
993 */ 1028 */
@@ -1004,6 +1039,7 @@ static struct nfs_server *nfs_alloc_server(void)
1004 /* Zero out the NFS state stuff */ 1039 /* Zero out the NFS state stuff */
1005 INIT_LIST_HEAD(&server->client_link); 1040 INIT_LIST_HEAD(&server->client_link);
1006 INIT_LIST_HEAD(&server->master_link); 1041 INIT_LIST_HEAD(&server->master_link);
1042 INIT_LIST_HEAD(&server->delegations);
1007 1043
1008 atomic_set(&server->active, 0); 1044 atomic_set(&server->active, 0);
1009 1045
@@ -1019,6 +1055,8 @@ static struct nfs_server *nfs_alloc_server(void)
1019 return NULL; 1055 return NULL;
1020 } 1056 }
1021 1057
1058 pnfs_init_server(server);
1059
1022 return server; 1060 return server;
1023} 1061}
1024 1062
@@ -1029,11 +1067,8 @@ void nfs_free_server(struct nfs_server *server)
1029{ 1067{
1030 dprintk("--> nfs_free_server()\n"); 1068 dprintk("--> nfs_free_server()\n");
1031 1069
1070 nfs_server_remove_lists(server);
1032 unset_pnfs_layoutdriver(server); 1071 unset_pnfs_layoutdriver(server);
1033 spin_lock(&nfs_client_lock);
1034 list_del(&server->client_link);
1035 list_del(&server->master_link);
1036 spin_unlock(&nfs_client_lock);
1037 1072
1038 if (server->destroy != NULL) 1073 if (server->destroy != NULL)
1039 server->destroy(server); 1074 server->destroy(server);
@@ -1108,11 +1143,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1108 (unsigned long long) server->fsid.major, 1143 (unsigned long long) server->fsid.major,
1109 (unsigned long long) server->fsid.minor); 1144 (unsigned long long) server->fsid.minor);
1110 1145
1111 spin_lock(&nfs_client_lock); 1146 nfs_server_insert_lists(server);
1112 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1113 list_add_tail(&server->master_link, &nfs_volume_list);
1114 spin_unlock(&nfs_client_lock);
1115
1116 server->mount_time = jiffies; 1147 server->mount_time = jiffies;
1117 nfs_free_fattr(fattr); 1148 nfs_free_fattr(fattr);
1118 return server; 1149 return server;
@@ -1125,6 +1156,101 @@ error:
1125 1156
1126#ifdef CONFIG_NFS_V4 1157#ifdef CONFIG_NFS_V4
1127/* 1158/*
1159 * NFSv4.0 callback thread helper
1160 *
1161 * Find a client by IP address, protocol version, and minorversion
1162 *
1163 * Called from the pg_authenticate method. The callback identifier
1164 * is not used as it has not been decoded.
1165 *
1166 * Returns NULL if no such client
1167 */
1168struct nfs_client *
1169nfs4_find_client_no_ident(const struct sockaddr *addr)
1170{
1171 struct nfs_client *clp;
1172
1173 spin_lock(&nfs_client_lock);
1174 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
1175 if (nfs4_cb_match_client(addr, clp, 0) == false)
1176 continue;
1177 atomic_inc(&clp->cl_count);
1178 spin_unlock(&nfs_client_lock);
1179 return clp;
1180 }
1181 spin_unlock(&nfs_client_lock);
1182 return NULL;
1183}
1184
1185/*
1186 * NFSv4.0 callback thread helper
1187 *
1188 * Find a client by callback identifier
1189 */
1190struct nfs_client *
1191nfs4_find_client_ident(int cb_ident)
1192{
1193 struct nfs_client *clp;
1194
1195 spin_lock(&nfs_client_lock);
1196 clp = idr_find(&cb_ident_idr, cb_ident);
1197 if (clp)
1198 atomic_inc(&clp->cl_count);
1199 spin_unlock(&nfs_client_lock);
1200 return clp;
1201}
1202
1203#if defined(CONFIG_NFS_V4_1)
1204/*
1205 * NFSv4.1 callback thread helper
1206 * For CB_COMPOUND calls, find a client by IP address, protocol version,
1207 * minorversion, and sessionID
1208 *
1209 * CREATE_SESSION triggers a CB_NULL ping from servers. The callback service
1210 * sessionid can only be set after the CREATE_SESSION return, so a CB_NULL
1211 * can arrive before the callback sessionid is set. For CB_NULL calls,
 1212 * find a client by IP address, protocol version, and minorversion.
1213 *
1214 * Returns NULL if no such client
1215 */
1216struct nfs_client *
1217nfs4_find_client_sessionid(const struct sockaddr *addr,
1218 struct nfs4_sessionid *sid, int is_cb_compound)
1219{
1220 struct nfs_client *clp;
1221
1222 spin_lock(&nfs_client_lock);
1223 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
1224 if (nfs4_cb_match_client(addr, clp, 1) == false)
1225 continue;
1226
1227 if (!nfs4_has_session(clp))
1228 continue;
1229
 1230 /* Match sessionid unless cb_null call */
1231 if (is_cb_compound && (memcmp(clp->cl_session->sess_id.data,
1232 sid->data, NFS4_MAX_SESSIONID_LEN) != 0))
1233 continue;
1234
1235 atomic_inc(&clp->cl_count);
1236 spin_unlock(&nfs_client_lock);
1237 return clp;
1238 }
1239 spin_unlock(&nfs_client_lock);
1240 return NULL;
1241}
1242
1243#else /* CONFIG_NFS_V4_1 */
1244
1245struct nfs_client *
1246nfs4_find_client_sessionid(const struct sockaddr *addr,
1247 struct nfs4_sessionid *sid, int is_cb_compound)
1248{
1249 return NULL;
1250}
1251#endif /* CONFIG_NFS_V4_1 */
1252
1253/*
1128 * Initialize the NFS4 callback service 1254 * Initialize the NFS4 callback service
1129 */ 1255 */
1130static int nfs4_init_callback(struct nfs_client *clp) 1256static int nfs4_init_callback(struct nfs_client *clp)
@@ -1342,11 +1468,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
1342 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) 1468 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
1343 server->namelen = NFS4_MAXNAMLEN; 1469 server->namelen = NFS4_MAXNAMLEN;
1344 1470
1345 spin_lock(&nfs_client_lock); 1471 nfs_server_insert_lists(server);
1346 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1347 list_add_tail(&server->master_link, &nfs_volume_list);
1348 spin_unlock(&nfs_client_lock);
1349
1350 server->mount_time = jiffies; 1472 server->mount_time = jiffies;
1351out: 1473out:
1352 nfs_free_fattr(fattr); 1474 nfs_free_fattr(fattr);
@@ -1551,11 +1673,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1551 if (error < 0) 1673 if (error < 0)
1552 goto out_free_server; 1674 goto out_free_server;
1553 1675
1554 spin_lock(&nfs_client_lock); 1676 nfs_server_insert_lists(server);
1555 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1556 list_add_tail(&server->master_link, &nfs_volume_list);
1557 spin_unlock(&nfs_client_lock);
1558
1559 server->mount_time = jiffies; 1677 server->mount_time = jiffies;
1560 1678
1561 nfs_free_fattr(fattr_fsinfo); 1679 nfs_free_fattr(fattr_fsinfo);
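
With the cb_ident IDR in place, the v4.0 callback service resolves a client with one ID lookup plus a reference bump under the lock, instead of walking nfs_client_list by source address. A toy sketch of the lookup side (a fixed array stands in for the IDR; nothing here is kernel code):

#include <stddef.h>

struct client {
	int refcount;
};

#define MAX_IDENTS 64
static struct client *ident_table[MAX_IDENTS];	/* toy stand-in for the IDR */

static struct client *find_client_ident(int cb_ident)
{
	struct client *clp = NULL;

	if (cb_ident < 0 || cb_ident >= MAX_IDENTS)
		return NULL;
	/* nfs_client_lock would be held across the next two steps */
	clp = ident_table[cb_ident];
	if (clp)
		clp->refcount++;	/* caller must drop this reference */
	return clp;
}

nfs_put_client() removes the ID under the same lock (nfs_cb_idr_remove_locked), so an entry found in the table can always be referenced safely.
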
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 1fd62fc49be3..364e4328f392 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -40,11 +40,23 @@ static void nfs_free_delegation(struct nfs_delegation *delegation)
40 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 40 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
41} 41}
42 42
43/**
44 * nfs_mark_delegation_referenced - set delegation's REFERENCED flag
45 * @delegation: delegation to process
46 *
47 */
43void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) 48void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
44{ 49{
45 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); 50 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
46} 51}
47 52
53/**
54 * nfs_have_delegation - check if inode has a delegation
55 * @inode: inode to check
56 * @flags: delegation types to check for
57 *
58 * Returns one if inode has the indicated delegation, otherwise zero.
59 */
48int nfs_have_delegation(struct inode *inode, fmode_t flags) 60int nfs_have_delegation(struct inode *inode, fmode_t flags)
49{ 61{
50 struct nfs_delegation *delegation; 62 struct nfs_delegation *delegation;
@@ -119,10 +131,15 @@ again:
119 return 0; 131 return 0;
120} 132}
121 133
122/* 134/**
123 * Set up a delegation on an inode 135 * nfs_inode_reclaim_delegation - process a delegation reclaim request
136 * @inode: inode to process
137 * @cred: credential to use for request
138 * @res: new delegation state from server
139 *
124 */ 140 */
125void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 141void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
142 struct nfs_openres *res)
126{ 143{
127 struct nfs_delegation *delegation; 144 struct nfs_delegation *delegation;
128 struct rpc_cred *oldcred = NULL; 145 struct rpc_cred *oldcred = NULL;
@@ -175,38 +192,52 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
175 return inode; 192 return inode;
176} 193}
177 194
178static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, 195static struct nfs_delegation *
179 const nfs4_stateid *stateid, 196nfs_detach_delegation_locked(struct nfs_inode *nfsi,
180 struct nfs_client *clp) 197 struct nfs_server *server)
181{ 198{
182 struct nfs_delegation *delegation = 199 struct nfs_delegation *delegation =
183 rcu_dereference_protected(nfsi->delegation, 200 rcu_dereference_protected(nfsi->delegation,
184 lockdep_is_held(&clp->cl_lock)); 201 lockdep_is_held(&server->nfs_client->cl_lock));
185 202
186 if (delegation == NULL) 203 if (delegation == NULL)
187 goto nomatch; 204 goto nomatch;
205
188 spin_lock(&delegation->lock); 206 spin_lock(&delegation->lock);
189 if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
190 sizeof(delegation->stateid.data)) != 0)
191 goto nomatch_unlock;
192 list_del_rcu(&delegation->super_list); 207 list_del_rcu(&delegation->super_list);
193 delegation->inode = NULL; 208 delegation->inode = NULL;
194 nfsi->delegation_state = 0; 209 nfsi->delegation_state = 0;
195 rcu_assign_pointer(nfsi->delegation, NULL); 210 rcu_assign_pointer(nfsi->delegation, NULL);
196 spin_unlock(&delegation->lock); 211 spin_unlock(&delegation->lock);
197 return delegation; 212 return delegation;
198nomatch_unlock:
199 spin_unlock(&delegation->lock);
200nomatch: 213nomatch:
201 return NULL; 214 return NULL;
202} 215}
203 216
204/* 217static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi,
205 * Set up a delegation on an inode 218 struct nfs_server *server)
219{
220 struct nfs_client *clp = server->nfs_client;
221 struct nfs_delegation *delegation;
222
223 spin_lock(&clp->cl_lock);
224 delegation = nfs_detach_delegation_locked(nfsi, server);
225 spin_unlock(&clp->cl_lock);
226 return delegation;
227}
228
229/**
230 * nfs_inode_set_delegation - set up a delegation on an inode
231 * @inode: inode to which delegation applies
232 * @cred: cred to use for subsequent delegation processing
233 * @res: new delegation state from server
234 *
235 * Returns zero on success, or a negative errno value.
206 */ 236 */
207int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 237int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
208{ 238{
209 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 239 struct nfs_server *server = NFS_SERVER(inode);
240 struct nfs_client *clp = server->nfs_client;
210 struct nfs_inode *nfsi = NFS_I(inode); 241 struct nfs_inode *nfsi = NFS_I(inode);
211 struct nfs_delegation *delegation, *old_delegation; 242 struct nfs_delegation *delegation, *old_delegation;
212 struct nfs_delegation *freeme = NULL; 243 struct nfs_delegation *freeme = NULL;
@@ -227,7 +258,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
227 258
228 spin_lock(&clp->cl_lock); 259 spin_lock(&clp->cl_lock);
229 old_delegation = rcu_dereference_protected(nfsi->delegation, 260 old_delegation = rcu_dereference_protected(nfsi->delegation,
230 lockdep_is_held(&clp->cl_lock)); 261 lockdep_is_held(&clp->cl_lock));
231 if (old_delegation != NULL) { 262 if (old_delegation != NULL) {
232 if (memcmp(&delegation->stateid, &old_delegation->stateid, 263 if (memcmp(&delegation->stateid, &old_delegation->stateid,
233 sizeof(old_delegation->stateid)) == 0 && 264 sizeof(old_delegation->stateid)) == 0 &&
@@ -246,9 +277,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
246 delegation = NULL; 277 delegation = NULL;
247 goto out; 278 goto out;
248 } 279 }
249 freeme = nfs_detach_delegation_locked(nfsi, NULL, clp); 280 freeme = nfs_detach_delegation_locked(nfsi, server);
250 } 281 }
251 list_add_rcu(&delegation->super_list, &clp->cl_delegations); 282 list_add_rcu(&delegation->super_list, &server->delegations);
252 nfsi->delegation_state = delegation->type; 283 nfsi->delegation_state = delegation->type;
253 rcu_assign_pointer(nfsi->delegation, delegation); 284 rcu_assign_pointer(nfsi->delegation, delegation);
254 delegation = NULL; 285 delegation = NULL;
@@ -290,73 +321,85 @@ out:
290 return err; 321 return err;
291} 322}
292 323
293/* 324/**
294 * Return all delegations that have been marked for return 325 * nfs_client_return_marked_delegations - return previously marked delegations
326 * @clp: nfs_client to process
327 *
328 * Returns zero on success, or a negative errno value.
295 */ 329 */
296int nfs_client_return_marked_delegations(struct nfs_client *clp) 330int nfs_client_return_marked_delegations(struct nfs_client *clp)
297{ 331{
298 struct nfs_delegation *delegation; 332 struct nfs_delegation *delegation;
333 struct nfs_server *server;
299 struct inode *inode; 334 struct inode *inode;
300 int err = 0; 335 int err = 0;
301 336
302restart: 337restart:
303 rcu_read_lock(); 338 rcu_read_lock();
304 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 339 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
305 if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) 340 list_for_each_entry_rcu(delegation, &server->delegations,
306 continue; 341 super_list) {
307 inode = nfs_delegation_grab_inode(delegation); 342 if (!test_and_clear_bit(NFS_DELEGATION_RETURN,
308 if (inode == NULL) 343 &delegation->flags))
309 continue; 344 continue;
310 spin_lock(&clp->cl_lock); 345 inode = nfs_delegation_grab_inode(delegation);
311 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); 346 if (inode == NULL)
312 spin_unlock(&clp->cl_lock); 347 continue;
313 rcu_read_unlock(); 348 delegation = nfs_detach_delegation(NFS_I(inode),
314 if (delegation != NULL) { 349 server);
315 filemap_flush(inode->i_mapping); 350 rcu_read_unlock();
316 err = __nfs_inode_return_delegation(inode, delegation, 0); 351
352 if (delegation != NULL) {
353 filemap_flush(inode->i_mapping);
354 err = __nfs_inode_return_delegation(inode,
355 delegation, 0);
356 }
357 iput(inode);
358 if (!err)
359 goto restart;
360 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
361 return err;
317 } 362 }
318 iput(inode);
319 if (!err)
320 goto restart;
321 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
322 return err;
323 } 363 }
324 rcu_read_unlock(); 364 rcu_read_unlock();
325 return 0; 365 return 0;
326} 366}
327 367
328/* 368/**
329 * This function returns the delegation without reclaiming opens 369 * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens
330 * or protecting against delegation reclaims. 370 * @inode: inode to process
331 * It is therefore really only safe to be called from 371 *
332 * nfs4_clear_inode() 372 * Does not protect against delegation reclaims, therefore really only safe
373 * to be called from nfs4_clear_inode().
333 */ 374 */
334void nfs_inode_return_delegation_noreclaim(struct inode *inode) 375void nfs_inode_return_delegation_noreclaim(struct inode *inode)
335{ 376{
336 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 377 struct nfs_server *server = NFS_SERVER(inode);
337 struct nfs_inode *nfsi = NFS_I(inode); 378 struct nfs_inode *nfsi = NFS_I(inode);
338 struct nfs_delegation *delegation; 379 struct nfs_delegation *delegation;
339 380
340 if (rcu_access_pointer(nfsi->delegation) != NULL) { 381 if (rcu_access_pointer(nfsi->delegation) != NULL) {
341 spin_lock(&clp->cl_lock); 382 delegation = nfs_detach_delegation(nfsi, server);
342 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
343 spin_unlock(&clp->cl_lock);
344 if (delegation != NULL) 383 if (delegation != NULL)
345 nfs_do_return_delegation(inode, delegation, 0); 384 nfs_do_return_delegation(inode, delegation, 0);
346 } 385 }
347} 386}
348 387
388/**
389 * nfs_inode_return_delegation - synchronously return a delegation
390 * @inode: inode to process
391 *
392 * Returns zero on success, or a negative errno value.
393 */
349int nfs_inode_return_delegation(struct inode *inode) 394int nfs_inode_return_delegation(struct inode *inode)
350{ 395{
351 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 396 struct nfs_server *server = NFS_SERVER(inode);
352 struct nfs_inode *nfsi = NFS_I(inode); 397 struct nfs_inode *nfsi = NFS_I(inode);
353 struct nfs_delegation *delegation; 398 struct nfs_delegation *delegation;
354 int err = 0; 399 int err = 0;
355 400
356 if (rcu_access_pointer(nfsi->delegation) != NULL) { 401 if (rcu_access_pointer(nfsi->delegation) != NULL) {
357 spin_lock(&clp->cl_lock); 402 delegation = nfs_detach_delegation(nfsi, server);
358 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
359 spin_unlock(&clp->cl_lock);
360 if (delegation != NULL) { 403 if (delegation != NULL) {
361 nfs_wb_all(inode); 404 nfs_wb_all(inode);
362 err = __nfs_inode_return_delegation(inode, delegation, 1); 405 err = __nfs_inode_return_delegation(inode, delegation, 1);
@@ -365,46 +408,61 @@ int nfs_inode_return_delegation(struct inode *inode)
365 return err; 408 return err;
366} 409}
367 410
368static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation) 411static void nfs_mark_return_delegation(struct nfs_delegation *delegation)
369{ 412{
413 struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client;
414
370 set_bit(NFS_DELEGATION_RETURN, &delegation->flags); 415 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
371 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); 416 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
372} 417}
373 418
374/* 419/**
375 * Return all delegations associated to a super block 420 * nfs_super_return_all_delegations - return delegations for one superblock
421 * @sb: sb to process
422 *
376 */ 423 */
377void nfs_super_return_all_delegations(struct super_block *sb) 424void nfs_super_return_all_delegations(struct super_block *sb)
378{ 425{
379 struct nfs_client *clp = NFS_SB(sb)->nfs_client; 426 struct nfs_server *server = NFS_SB(sb);
427 struct nfs_client *clp = server->nfs_client;
380 struct nfs_delegation *delegation; 428 struct nfs_delegation *delegation;
381 429
382 if (clp == NULL) 430 if (clp == NULL)
383 return; 431 return;
432
384 rcu_read_lock(); 433 rcu_read_lock();
385 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 434 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
386 spin_lock(&delegation->lock); 435 spin_lock(&delegation->lock);
387 if (delegation->inode != NULL && delegation->inode->i_sb == sb) 436 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
388 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
389 spin_unlock(&delegation->lock); 437 spin_unlock(&delegation->lock);
390 } 438 }
391 rcu_read_unlock(); 439 rcu_read_unlock();
440
392 if (nfs_client_return_marked_delegations(clp) != 0) 441 if (nfs_client_return_marked_delegations(clp) != 0)
393 nfs4_schedule_state_manager(clp); 442 nfs4_schedule_state_manager(clp);
394} 443}
395 444
396static 445static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
397void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags) 446 fmode_t flags)
398{ 447{
399 struct nfs_delegation *delegation; 448 struct nfs_delegation *delegation;
400 449
401 rcu_read_lock(); 450 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
402 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
403 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) 451 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
404 continue; 452 continue;
405 if (delegation->type & flags) 453 if (delegation->type & flags)
406 nfs_mark_return_delegation(clp, delegation); 454 nfs_mark_return_delegation(delegation);
407 } 455 }
456}
457
458static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp,
459 fmode_t flags)
460{
461 struct nfs_server *server;
462
463 rcu_read_lock();
464 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
465 nfs_mark_return_all_delegation_types(server, flags);
408 rcu_read_unlock(); 466 rcu_read_unlock();
409} 467}
410 468
@@ -419,19 +477,32 @@ static void nfs_delegation_run_state_manager(struct nfs_client *clp)
419 nfs4_schedule_state_manager(clp); 477 nfs4_schedule_state_manager(clp);
420} 478}
421 479
480/**
 481 * nfs_expire_all_delegation_types - expire delegations of the given types
482 * @clp: client to process
483 * @flags: delegation types to expire
484 *
485 */
422void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) 486void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags)
423{ 487{
424 nfs_client_mark_return_all_delegation_types(clp, flags); 488 nfs_client_mark_return_all_delegation_types(clp, flags);
425 nfs_delegation_run_state_manager(clp); 489 nfs_delegation_run_state_manager(clp);
426} 490}
427 491
492/**
 493 * nfs_expire_all_delegations - expire all delegations held by a client
494 * @clp: client to process
495 *
496 */
428void nfs_expire_all_delegations(struct nfs_client *clp) 497void nfs_expire_all_delegations(struct nfs_client *clp)
429{ 498{
430 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); 499 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
431} 500}
432 501
433/* 502/**
434 * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. 503 * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
504 * @clp: client to process
505 *
435 */ 506 */
436void nfs_handle_cb_pathdown(struct nfs_client *clp) 507void nfs_handle_cb_pathdown(struct nfs_client *clp)
437{ 508{
@@ -440,29 +511,43 @@ void nfs_handle_cb_pathdown(struct nfs_client *clp)
440 nfs_client_mark_return_all_delegations(clp); 511 nfs_client_mark_return_all_delegations(clp);
441} 512}
442 513
443static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp) 514static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
444{ 515{
445 struct nfs_delegation *delegation; 516 struct nfs_delegation *delegation;
446 517
447 rcu_read_lock(); 518 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
448 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
449 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) 519 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
450 continue; 520 continue;
451 nfs_mark_return_delegation(clp, delegation); 521 nfs_mark_return_delegation(delegation);
452 } 522 }
453 rcu_read_unlock();
454} 523}
455 524
525/**
526 * nfs_expire_unreferenced_delegations - Eliminate unused delegations
527 * @clp: nfs_client to process
528 *
529 */
456void nfs_expire_unreferenced_delegations(struct nfs_client *clp) 530void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
457{ 531{
458 nfs_client_mark_return_unreferenced_delegations(clp); 532 struct nfs_server *server;
533
534 rcu_read_lock();
535 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
536 nfs_mark_return_unreferenced_delegations(server);
537 rcu_read_unlock();
538
459 nfs_delegation_run_state_manager(clp); 539 nfs_delegation_run_state_manager(clp);
460} 540}
461 541
462/* 542/**
463 * Asynchronous delegation recall! 543 * nfs_async_inode_return_delegation - asynchronously return a delegation
544 * @inode: inode to process
545 * @stateid: state ID information from CB_RECALL arguments
546 *
547 * Returns zero on success, or a negative errno value.
464 */ 548 */
465int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) 549int nfs_async_inode_return_delegation(struct inode *inode,
550 const nfs4_stateid *stateid)
466{ 551{
467 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 552 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
468 struct nfs_delegation *delegation; 553 struct nfs_delegation *delegation;
@@ -474,22 +559,21 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
474 rcu_read_unlock(); 559 rcu_read_unlock();
475 return -ENOENT; 560 return -ENOENT;
476 } 561 }
477 562 nfs_mark_return_delegation(delegation);
478 nfs_mark_return_delegation(clp, delegation);
479 rcu_read_unlock(); 563 rcu_read_unlock();
564
480 nfs_delegation_run_state_manager(clp); 565 nfs_delegation_run_state_manager(clp);
481 return 0; 566 return 0;
482} 567}
483 568
484/* 569static struct inode *
485 * Retrieve the inode associated with a delegation 570nfs_delegation_find_inode_server(struct nfs_server *server,
486 */ 571 const struct nfs_fh *fhandle)
487struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle)
488{ 572{
489 struct nfs_delegation *delegation; 573 struct nfs_delegation *delegation;
490 struct inode *res = NULL; 574 struct inode *res = NULL;
491 rcu_read_lock(); 575
492 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 576 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
493 spin_lock(&delegation->lock); 577 spin_lock(&delegation->lock);
494 if (delegation->inode != NULL && 578 if (delegation->inode != NULL &&
495 nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { 579 nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
@@ -499,49 +583,121 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs
499 if (res != NULL) 583 if (res != NULL)
500 break; 584 break;
501 } 585 }
586 return res;
587}
588
589/**
590 * nfs_delegation_find_inode - retrieve the inode associated with a delegation
591 * @clp: client state handle
592 * @fhandle: filehandle from a delegation recall
593 *
594 * Returns pointer to inode matching "fhandle," or NULL if a matching inode
595 * cannot be found.
596 */
597struct inode *nfs_delegation_find_inode(struct nfs_client *clp,
598 const struct nfs_fh *fhandle)
599{
600 struct nfs_server *server;
601 struct inode *res = NULL;
602
603 rcu_read_lock();
604 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		res = nfs_delegation_find_inode_server(server, fhandle);
+		if (res != NULL)
+			break;
+	}
 	rcu_read_unlock();
 	return res;
 }
 
-/*
- * Mark all delegations as needing to be reclaimed
+static void nfs_delegation_mark_reclaim_server(struct nfs_server *server)
+{
+	struct nfs_delegation *delegation;
+
+	list_for_each_entry_rcu(delegation, &server->delegations, super_list)
+		set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+}
+
+/**
+ * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed
+ * @clp: nfs_client to process
+ *
  */
 void nfs_delegation_mark_reclaim(struct nfs_client *clp)
 {
-	struct nfs_delegation *delegation;
+	struct nfs_server *server;
+
 	rcu_read_lock();
-	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list)
-		set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		nfs_delegation_mark_reclaim_server(server);
 	rcu_read_unlock();
 }
 
-/*
- * Reap all unclaimed delegations after reboot recovery is done
+/**
+ * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
+ * @clp: nfs_client to process
+ *
  */
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
 {
 	struct nfs_delegation *delegation;
+	struct nfs_server *server;
 	struct inode *inode;
+
 restart:
 	rcu_read_lock();
-	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
-		if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0)
-			continue;
-		inode = nfs_delegation_grab_inode(delegation);
-		if (inode == NULL)
-			continue;
-		spin_lock(&clp->cl_lock);
-		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
-		spin_unlock(&clp->cl_lock);
-		rcu_read_unlock();
-		if (delegation != NULL)
-			nfs_free_delegation(delegation);
-		iput(inode);
-		goto restart;
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		list_for_each_entry_rcu(delegation, &server->delegations,
+								super_list) {
+			if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
+						&delegation->flags) == 0)
+				continue;
+			inode = nfs_delegation_grab_inode(delegation);
+			if (inode == NULL)
+				continue;
+			delegation = nfs_detach_delegation(NFS_I(inode),
+								server);
+			rcu_read_unlock();
+
+			if (delegation != NULL)
+				nfs_free_delegation(delegation);
+			iput(inode);
+			goto restart;
+		}
 	}
 	rcu_read_unlock();
 }
 
+/**
+ * nfs_delegations_present - check for existence of delegations
+ * @clp: client state handle
+ *
+ * Returns one if there are any nfs_delegation structures attached
+ * to this nfs_client.
+ */
+int nfs_delegations_present(struct nfs_client *clp)
+{
+	struct nfs_server *server;
+	int ret = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		if (!list_empty(&server->delegations)) {
+			ret = 1;
+			break;
+		}
+	rcu_read_unlock();
+	return ret;
+}
+
+/**
+ * nfs4_copy_delegation_stateid - Copy inode's state ID information
+ * @dst: stateid data structure to fill in
+ * @inode: inode to check
+ *
+ * Returns one and fills in "dst->data" if inode had a delegation,
+ * otherwise zero is returned.
+ */
 int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
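The reap loop above is built on a common RCU idiom: iput() and nfs_free_delegation() can block, and blocking is forbidden under rcu_read_lock(), so the walk drops the RCU lock before the blocking calls and restarts from scratch. A distilled sketch of the pattern (names as in the patch; illustration only, not additional patch code):

	static void example_reap(struct nfs_client *clp)
	{
		struct nfs_delegation *delegation;
		struct nfs_server *server;
		struct inode *inode;

	restart:
		rcu_read_lock();
		list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
			list_for_each_entry_rcu(delegation, &server->delegations,
						super_list) {
				inode = nfs_delegation_grab_inode(delegation);
				if (inode == NULL)
					continue;
				delegation = nfs_detach_delegation(NFS_I(inode), server);
				rcu_read_unlock();	/* must not block while held */
				if (delegation != NULL)
					nfs_free_delegation(delegation);
				iput(inode);
				goto restart;	/* lists may have changed meanwhile */
			}
		}
		rcu_read_unlock();
	}

Restarting after every detach is quadratic in the worst case, but it is the only safe option once the RCU read lock has been dropped mid-walk.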
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 2026304bda19..d9322e490c56 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -44,6 +44,7 @@ void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
44void nfs_expire_unreferenced_delegations(struct nfs_client *clp); 44void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
45void nfs_handle_cb_pathdown(struct nfs_client *clp); 45void nfs_handle_cb_pathdown(struct nfs_client *clp);
46int nfs_client_return_marked_delegations(struct nfs_client *clp); 46int nfs_client_return_marked_delegations(struct nfs_client *clp);
47int nfs_delegations_present(struct nfs_client *clp);
47 48
48void nfs_delegation_mark_reclaim(struct nfs_client *clp); 49void nfs_delegation_mark_reclaim(struct nfs_client *clp);
49void nfs_delegation_reap_unclaimed(struct nfs_client *clp); 50void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d33da530097a..2c3eb33b904d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,8 +33,8 @@
33#include <linux/namei.h> 33#include <linux/namei.h>
34#include <linux/mount.h> 34#include <linux/mount.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/vmalloc.h>
37#include <linux/kmemleak.h> 36#include <linux/kmemleak.h>
37#include <linux/xattr.h>
38 38
39#include "delegation.h" 39#include "delegation.h"
40#include "iostat.h" 40#include "iostat.h"
@@ -125,9 +125,10 @@ const struct inode_operations nfs4_dir_inode_operations = {
125 .permission = nfs_permission, 125 .permission = nfs_permission,
126 .getattr = nfs_getattr, 126 .getattr = nfs_getattr,
127 .setattr = nfs_setattr, 127 .setattr = nfs_setattr,
128 .getxattr = nfs4_getxattr, 128 .getxattr = generic_getxattr,
129 .setxattr = nfs4_setxattr, 129 .setxattr = generic_setxattr,
130 .listxattr = nfs4_listxattr, 130 .listxattr = generic_listxattr,
131 .removexattr = generic_removexattr,
131}; 132};
132 133
133#endif /* CONFIG_NFS_V4 */ 134#endif /* CONFIG_NFS_V4 */
@@ -172,7 +173,7 @@ struct nfs_cache_array {
172 struct nfs_cache_array_entry array[0]; 173 struct nfs_cache_array_entry array[0];
173}; 174};
174 175
175typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); 176typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
176typedef struct { 177typedef struct {
177 struct file *file; 178 struct file *file;
178 struct page *page; 179 struct page *page;
@@ -378,14 +379,14 @@ error:
 	return error;
 }
 
-/* Fill in an entry based on the xdr code stored in desc->page */
-static
-int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
+static int xdr_decode(nfs_readdir_descriptor_t *desc,
+		      struct nfs_entry *entry, struct xdr_stream *xdr)
 {
-	__be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus);
-	if (IS_ERR(p))
-		return PTR_ERR(p);
+	int error;
 
+	error = desc->decode(xdr, entry, desc->plus);
+	if (error)
+		return error;
 	entry->fattr->time_start = desc->timestamp;
 	entry->fattr->gencount = desc->gencount;
 	return 0;
@@ -438,7 +439,6 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
438 if (dentry == NULL) 439 if (dentry == NULL)
439 return; 440 return;
440 441
441 d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
442 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); 442 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
443 if (IS_ERR(inode)) 443 if (IS_ERR(inode))
444 goto out; 444 goto out;
@@ -459,25 +459,26 @@ out:
 /* Perform conversion from xdr to cache array */
 static
 int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
-				void *xdr_page, struct page *page, unsigned int buflen)
+				struct page **xdr_pages, struct page *page, unsigned int buflen)
 {
 	struct xdr_stream stream;
-	struct xdr_buf buf;
-	__be32 *ptr = xdr_page;
+	struct xdr_buf buf = {
+		.pages = xdr_pages,
+		.page_len = buflen,
+		.buflen = buflen,
+		.len = buflen,
+	};
+	struct page *scratch;
 	struct nfs_cache_array *array;
 	unsigned int count = 0;
 	int status;
 
-	buf.head->iov_base = xdr_page;
-	buf.head->iov_len = buflen;
-	buf.tail->iov_len = 0;
-	buf.page_base = 0;
-	buf.page_len = 0;
-	buf.buflen = buf.head->iov_len;
-	buf.len = buf.head->iov_len;
-
-	xdr_init_decode(&stream, &buf, ptr);
+	scratch = alloc_page(GFP_KERNEL);
+	if (scratch == NULL)
+		return -ENOMEM;
 
+	xdr_init_decode(&stream, &buf, NULL);
+	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
 
 	do {
 		status = xdr_decode(desc, entry, &stream);
@@ -506,6 +507,8 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
 	} else
 		status = PTR_ERR(array);
 	}
+
+	put_page(scratch);
 	return status;
 }
511 514
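The new nfs_readdir_page_filler() decodes straight out of the page array instead of a vm_map_ram() mapping. The key is the scratch page: xdr_inline_decode() copies any item that straddles a page boundary into it, so decode callbacks always see contiguous bytes. A minimal sketch of the pattern, using only the xdr_stream calls visible in this patch:

	static int example_decode_paged(struct page **pages, unsigned int buflen)
	{
		struct xdr_buf buf = {
			.pages		= pages,
			.page_len	= buflen,
			.buflen		= buflen,
			.len		= buflen,
		};
		struct xdr_stream stream;
		struct page *scratch;

		scratch = alloc_page(GFP_KERNEL);
		if (scratch == NULL)
			return -ENOMEM;
		xdr_init_decode(&stream, &buf, NULL);
		xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
		/* ... xdr_inline_decode()-based helpers run here ... */
		put_page(scratch);
		return 0;
	}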
@@ -521,7 +524,6 @@ static
521void nfs_readdir_free_large_page(void *ptr, struct page **pages, 524void nfs_readdir_free_large_page(void *ptr, struct page **pages,
522 unsigned int npages) 525 unsigned int npages)
523{ 526{
524 vm_unmap_ram(ptr, npages);
525 nfs_readdir_free_pagearray(pages, npages); 527 nfs_readdir_free_pagearray(pages, npages);
526} 528}
527 529
@@ -530,9 +532,8 @@ void nfs_readdir_free_large_page(void *ptr, struct page **pages,
  * to nfs_readdir_free_large_page
  */
 static
-void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
+int nfs_readdir_large_page(struct page **pages, unsigned int npages)
 {
-	void *ptr;
 	unsigned int i;
 
 	for (i = 0; i < npages; i++) {
@@ -541,13 +542,11 @@ void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
 			goto out_freepages;
 		pages[i] = page;
 	}
+	return 0;
 
-	ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
-	if (!IS_ERR_OR_NULL(ptr))
-		return ptr;
 out_freepages:
 	nfs_readdir_free_pagearray(pages, i);
-	return NULL;
+	return -ENOMEM;
 }
552 551
553static 552static
@@ -566,6 +565,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
566 entry.eof = 0; 565 entry.eof = 0;
567 entry.fh = nfs_alloc_fhandle(); 566 entry.fh = nfs_alloc_fhandle();
568 entry.fattr = nfs_alloc_fattr(); 567 entry.fattr = nfs_alloc_fattr();
568 entry.server = NFS_SERVER(inode);
569 if (entry.fh == NULL || entry.fattr == NULL) 569 if (entry.fh == NULL || entry.fattr == NULL)
570 goto out; 570 goto out;
571 571
@@ -577,8 +577,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
577 memset(array, 0, sizeof(struct nfs_cache_array)); 577 memset(array, 0, sizeof(struct nfs_cache_array));
578 array->eof_index = -1; 578 array->eof_index = -1;
579 579
580 pages_ptr = nfs_readdir_large_page(pages, array_size); 580 status = nfs_readdir_large_page(pages, array_size);
581 if (!pages_ptr) 581 if (status < 0)
582 goto out_release_array; 582 goto out_release_array;
583 do { 583 do {
584 unsigned int pglen; 584 unsigned int pglen;
@@ -587,7 +587,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
587 if (status < 0) 587 if (status < 0)
588 break; 588 break;
589 pglen = status; 589 pglen = status;
590 status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen); 590 status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
591 if (status < 0) { 591 if (status < 0) {
592 if (status == -ENOSPC) 592 if (status == -ENOSPC)
593 status = 0; 593 status = 0;
@@ -970,7 +970,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd)
970{ 970{
971 struct nfs_server *server = NFS_SERVER(inode); 971 struct nfs_server *server = NFS_SERVER(inode);
972 972
973 if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags)) 973 if (IS_AUTOMOUNT(inode))
974 return 0; 974 return 0;
975 if (nd != NULL) { 975 if (nd != NULL) {
976 /* VFS wants an on-the-wire revalidation */ 976 /* VFS wants an on-the-wire revalidation */
@@ -1173,6 +1173,7 @@ const struct dentry_operations nfs_dentry_operations = {
1173 .d_revalidate = nfs_lookup_revalidate, 1173 .d_revalidate = nfs_lookup_revalidate,
1174 .d_delete = nfs_dentry_delete, 1174 .d_delete = nfs_dentry_delete,
1175 .d_iput = nfs_dentry_iput, 1175 .d_iput = nfs_dentry_iput,
1176 .d_automount = nfs_d_automount,
1176}; 1177};
1177 1178
1178static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 1179static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -1192,8 +1193,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
1192 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 1193 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
1193 goto out; 1194 goto out;
1194 1195
1195 d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
1196
1197 /* 1196 /*
1198 * If we're doing an exclusive create, optimize away the lookup 1197 * If we're doing an exclusive create, optimize away the lookup
1199 * but don't hash the dentry. 1198 * but don't hash the dentry.
@@ -1221,7 +1220,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
1221 goto out_unblock_sillyrename; 1220 goto out_unblock_sillyrename;
1222 } 1221 }
1223 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1222 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
1224 res = (struct dentry *)inode; 1223 res = ERR_CAST(inode);
1225 if (IS_ERR(res)) 1224 if (IS_ERR(res))
1226 goto out_unblock_sillyrename; 1225 goto out_unblock_sillyrename;
1227 1226
@@ -1248,6 +1247,7 @@ const struct dentry_operations nfs4_dentry_operations = {
1248 .d_revalidate = nfs_open_revalidate, 1247 .d_revalidate = nfs_open_revalidate,
1249 .d_delete = nfs_dentry_delete, 1248 .d_delete = nfs_dentry_delete,
1250 .d_iput = nfs_dentry_iput, 1249 .d_iput = nfs_dentry_iput,
1250 .d_automount = nfs_d_automount,
1251}; 1251};
1252 1252
1253/* 1253/*
@@ -1337,7 +1337,6 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1337 res = ERR_PTR(-ENAMETOOLONG); 1337 res = ERR_PTR(-ENAMETOOLONG);
1338 goto out; 1338 goto out;
1339 } 1339 }
1340 d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
1341 1340
1342 /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash 1341 /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
1343 * the dentry. */ 1342 * the dentry. */
@@ -1355,8 +1354,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1355 if (nd->flags & LOOKUP_CREATE) { 1354 if (nd->flags & LOOKUP_CREATE) {
1356 attr.ia_mode = nd->intent.open.create_mode; 1355 attr.ia_mode = nd->intent.open.create_mode;
1357 attr.ia_valid = ATTR_MODE; 1356 attr.ia_valid = ATTR_MODE;
1358 if (!IS_POSIXACL(dir)) 1357 attr.ia_mode &= ~current_umask();
1359 attr.ia_mode &= ~current_umask();
1360 } else { 1358 } else {
1361 open_flags &= ~(O_EXCL | O_CREAT); 1359 open_flags &= ~(O_EXCL | O_CREAT);
1362 attr.ia_valid = 0; 1360 attr.ia_valid = 0;
@@ -1410,11 +1408,15 @@ no_open:
 static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct dentry *parent = NULL;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode;
 	struct inode *dir;
 	struct nfs_open_context *ctx;
 	int openflags, ret = 0;
 
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
 	if (!is_atomic_open(nd) || d_mountpoint(dentry))
 		goto no_open;
 
@@ -1583,6 +1585,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 {
 	struct iattr attr;
 	int error;
+	int open_flags = 0;
 
 	dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
 			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1590,7 +1593,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	attr.ia_mode = mode;
 	attr.ia_valid = ATTR_MODE;
 
-	error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
+	if ((nd->flags & LOOKUP_CREATE) != 0)
+		open_flags = nd->intent.open.flags;
+
+	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
 	if (error != 0)
 		goto out_err;
 	return 0;
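The d_set_d_op() calls removed in this file (and in getroot.c below) are not just dropped; they are replaced by the 2.6.38 superblock-wide default. A sketch of the other half of the arrangement, assuming the usual fill_super shape (this assignment is not part of the diff shown here):

	static int example_fill_super(struct super_block *sb)
	{
		/* Every dentry allocated under this superblock inherits these
		 * operations, so no per-dentry d_set_d_op() is needed. */
		sb->s_d_op = &nfs_dentry_operations;
		return 0;
	}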
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 5596c6a2881e..b5ffe8fa291f 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -119,9 +119,6 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
119 } 119 }
120 120
121 security_d_instantiate(ret, inode); 121 security_d_instantiate(ret, inode);
122
123 if (ret->d_op == NULL)
124 d_set_d_op(ret, server->nfs_client->rpc_ops->dentry_ops);
125out: 122out:
126 nfs_free_fattr(fsinfo.fattr); 123 nfs_free_fattr(fsinfo.fattr);
127 return ret; 124 return ret;
@@ -227,9 +224,6 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
227 224
228 security_d_instantiate(ret, inode); 225 security_d_instantiate(ret, inode);
229 226
230 if (ret->d_op == NULL)
231 d_set_d_op(ret, server->nfs_client->rpc_ops->dentry_ops);
232
233out: 227out:
234 nfs_free_fattr(fattr); 228 nfs_free_fattr(fattr);
235 dprintk("<-- nfs4_get_root()\n"); 229 dprintk("<-- nfs4_get_root()\n");
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 4e2d9b6b1380..18696882f1c6 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -238,7 +238,7 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu
238 return nfs_idmap_lookup_name(gid, "group", buf, buflen); 238 return nfs_idmap_lookup_name(gid, "group", buf, buflen);
239} 239}
240 240
241#else /* CONFIG_NFS_USE_IDMAPPER not defined */ 241#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
242 242
243#include <linux/module.h> 243#include <linux/module.h>
244#include <linux/mutex.h> 244#include <linux/mutex.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 017daa3bed38..d8512423ba72 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -300,7 +300,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
300 else 300 else
301 inode->i_op = &nfs_mountpoint_inode_operations; 301 inode->i_op = &nfs_mountpoint_inode_operations;
302 inode->i_fop = NULL; 302 inode->i_fop = NULL;
303 set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags); 303 inode->i_flags |= S_AUTOMOUNT;
304 } 304 }
305 } else if (S_ISLNK(inode->i_mode)) 305 } else if (S_ISLNK(inode->i_mode))
306 inode->i_op = &nfs_symlink_inode_operations; 306 inode->i_op = &nfs_symlink_inode_operations;
@@ -1208,7 +1208,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1208 /* Update the fsid? */ 1208 /* Update the fsid? */
1209 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && 1209 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
1210 !nfs_fsid_equal(&server->fsid, &fattr->fsid) && 1210 !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
1211 !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) 1211 !IS_AUTOMOUNT(inode))
1212 server->fsid = fattr->fsid; 1212 server->fsid = fattr->fsid;
1213 1213
1214 /* 1214 /*
@@ -1410,9 +1410,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1410 */ 1410 */
1411void nfs4_evict_inode(struct inode *inode) 1411void nfs4_evict_inode(struct inode *inode)
1412{ 1412{
1413 pnfs_destroy_layout(NFS_I(inode));
1413 truncate_inode_pages(&inode->i_data, 0); 1414 truncate_inode_pages(&inode->i_data, 0);
1414 end_writeback(inode); 1415 end_writeback(inode);
1415 pnfs_destroy_layout(NFS_I(inode));
1416 /* If we are holding a delegation, return it! */ 1416 /* If we are holding a delegation, return it! */
1417 nfs_inode_return_delegation_noreclaim(inode); 1417 nfs_inode_return_delegation_noreclaim(inode);
1418 /* First call standard NFS clear_inode() code */ 1418 /* First call standard NFS clear_inode() code */
@@ -1619,6 +1619,7 @@ static void __exit exit_nfs_fs(void)
1619#ifdef CONFIG_PROC_FS 1619#ifdef CONFIG_PROC_FS
1620 rpc_proc_unregister("nfs"); 1620 rpc_proc_unregister("nfs");
1621#endif 1621#endif
1622 nfs_cleanup_cb_ident_idr();
1622 unregister_nfs_fs(); 1623 unregister_nfs_fs();
1623 nfs_fs_proc_exit(); 1624 nfs_fs_proc_exit();
1624 nfsiod_stop(); 1625 nfsiod_stop();
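With NFS_INO_MOUNTPOINT gone, the VFS owns the mountpoint-crossing state: S_AUTOMOUNT on the inode (surfaced through the IS_AUTOMOUNT() test above) tells the path walker to call the dentry's new ->d_automount() op. Roughly, as a paraphrase of the VFS side rather than code from this diff:

	static struct vfsmount *example_cross_mountpoint(struct path *path)
	{
		struct inode *inode = path->dentry->d_inode;

		if (inode != NULL && IS_AUTOMOUNT(inode) &&
		    path->dentry->d_op != NULL &&
		    path->dentry->d_op->d_automount != NULL)
			return path->dentry->d_op->d_automount(path);
		return NULL;
	}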
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e6356b750b77..4644f04b4b46 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -128,9 +128,13 @@ extern void nfs_umount(const struct nfs_mount_request *info);
128/* client.c */ 128/* client.c */
129extern struct rpc_program nfs_program; 129extern struct rpc_program nfs_program;
130 130
131extern void nfs_cleanup_cb_ident_idr(void);
131extern void nfs_put_client(struct nfs_client *); 132extern void nfs_put_client(struct nfs_client *);
132extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32); 133extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *);
133extern struct nfs_client *nfs_find_client_next(struct nfs_client *); 134extern struct nfs_client *nfs4_find_client_ident(int);
135extern struct nfs_client *
136nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *,
137 int);
134extern struct nfs_server *nfs_create_server( 138extern struct nfs_server *nfs_create_server(
135 const struct nfs_parsed_mount_data *, 139 const struct nfs_parsed_mount_data *,
136 struct nfs_fh *); 140 struct nfs_fh *);
@@ -185,17 +189,20 @@ extern int __init nfs_init_directcache(void);
185extern void nfs_destroy_directcache(void); 189extern void nfs_destroy_directcache(void);
186 190
187/* nfs2xdr.c */ 191/* nfs2xdr.c */
188extern int nfs_stat_to_errno(int); 192extern int nfs_stat_to_errno(enum nfs_stat);
189extern struct rpc_procinfo nfs_procedures[]; 193extern struct rpc_procinfo nfs_procedures[];
190extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); 194extern int nfs2_decode_dirent(struct xdr_stream *,
195 struct nfs_entry *, int);
191 196
192/* nfs3xdr.c */ 197/* nfs3xdr.c */
193extern struct rpc_procinfo nfs3_procedures[]; 198extern struct rpc_procinfo nfs3_procedures[];
194extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); 199extern int nfs3_decode_dirent(struct xdr_stream *,
200 struct nfs_entry *, int);
195 201
196/* nfs4xdr.c */ 202/* nfs4xdr.c */
197#ifdef CONFIG_NFS_V4 203#ifdef CONFIG_NFS_V4
198extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); 204extern int nfs4_decode_dirent(struct xdr_stream *,
205 struct nfs_entry *, int);
199#endif 206#endif
200#ifdef CONFIG_NFS_V4_1 207#ifdef CONFIG_NFS_V4_1
201extern const u32 nfs41_maxread_overhead; 208extern const u32 nfs41_maxread_overhead;
@@ -245,6 +252,7 @@ extern char *nfs_path(const char *base,
245 const struct dentry *droot, 252 const struct dentry *droot,
246 const struct dentry *dentry, 253 const struct dentry *dentry,
247 char *buffer, ssize_t buflen); 254 char *buffer, ssize_t buflen);
255extern struct vfsmount *nfs_d_automount(struct path *path);
248 256
249/* getroot.c */ 257/* getroot.c */
250extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); 258extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 4f981f1f6689..d4c2d6b7507e 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -236,10 +236,8 @@ void nfs_umount(const struct nfs_mount_request *info)
236 .authflavor = RPC_AUTH_UNIX, 236 .authflavor = RPC_AUTH_UNIX,
237 .flags = RPC_CLNT_CREATE_NOPING, 237 .flags = RPC_CLNT_CREATE_NOPING,
238 }; 238 };
239 struct mountres result;
240 struct rpc_message msg = { 239 struct rpc_message msg = {
241 .rpc_argp = info->dirpath, 240 .rpc_argp = info->dirpath,
242 .rpc_resp = &result,
243 }; 241 };
244 struct rpc_clnt *clnt; 242 struct rpc_clnt *clnt;
245 int status; 243 int status;
@@ -248,7 +246,7 @@ void nfs_umount(const struct nfs_mount_request *info)
248 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 246 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
249 247
250 clnt = rpc_create(&args); 248 clnt = rpc_create(&args);
251 if (unlikely(IS_ERR(clnt))) 249 if (IS_ERR(clnt))
252 goto out_clnt_err; 250 goto out_clnt_err;
253 251
254 dprintk("NFS: sending UMNT request for %s:%s\n", 252 dprintk("NFS: sending UMNT request for %s:%s\n",
@@ -280,29 +278,20 @@ out_call_err:
  * XDR encode/decode functions for MOUNT
  */
 
-static int encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
+static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
 {
 	const u32 pathname_len = strlen(pathname);
 	__be32 *p;
 
-	if (unlikely(pathname_len > MNTPATHLEN))
-		return -EIO;
-
-	p = xdr_reserve_space(xdr, sizeof(u32) + pathname_len);
-	if (unlikely(p == NULL))
-		return -EIO;
+	BUG_ON(pathname_len > MNTPATHLEN);
+	p = xdr_reserve_space(xdr, 4 + pathname_len);
 	xdr_encode_opaque(p, pathname, pathname_len);
-
-	return 0;
 }
 
-static int mnt_enc_dirpath(struct rpc_rqst *req, __be32 *p,
-			   const char *dirpath)
+static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const char *dirpath)
 {
-	struct xdr_stream xdr;
-
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	return encode_mntdirpath(&xdr, dirpath);
+	encode_mntdirpath(xdr, dirpath);
 }
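Note the behavioral shift above: encode_mntdirpath() used to return -EIO for an oversized path, while the rewrite BUG_ON()s. That is only safe if every caller screens the length before the request is encoded; a hypothetical caller-side guard would be:

	static int example_validate_dirpath(const char *dirpath)
	{
		if (strlen(dirpath) > MNTPATHLEN)
			return -ENAMETOOLONG;
		return 0;
	}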
307 296
308/* 297/*
@@ -320,10 +309,10 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)
320 u32 status; 309 u32 status;
321 __be32 *p; 310 __be32 *p;
322 311
323 p = xdr_inline_decode(xdr, sizeof(status)); 312 p = xdr_inline_decode(xdr, 4);
324 if (unlikely(p == NULL)) 313 if (unlikely(p == NULL))
325 return -EIO; 314 return -EIO;
326 status = ntohl(*p); 315 status = be32_to_cpup(p);
327 316
328 for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { 317 for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
329 if (mnt_errtbl[i].status == status) { 318 if (mnt_errtbl[i].status == status) {
@@ -351,18 +340,16 @@ static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)
 	return 0;
 }
 
-static int mnt_dec_mountres(struct rpc_rqst *req, __be32 *p,
-			    struct mountres *res)
+static int mnt_xdr_dec_mountres(struct rpc_rqst *req,
+				struct xdr_stream *xdr,
+				struct mountres *res)
 {
-	struct xdr_stream xdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-
-	status = decode_status(&xdr, res);
+	status = decode_status(xdr, res);
 	if (unlikely(status != 0 || res->errno != 0))
 		return status;
-	return decode_fhandle(&xdr, res);
+	return decode_fhandle(xdr, res);
 }
 
368static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) 355static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
@@ -371,10 +358,10 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
371 u32 status; 358 u32 status;
372 __be32 *p; 359 __be32 *p;
373 360
374 p = xdr_inline_decode(xdr, sizeof(status)); 361 p = xdr_inline_decode(xdr, 4);
375 if (unlikely(p == NULL)) 362 if (unlikely(p == NULL))
376 return -EIO; 363 return -EIO;
377 status = ntohl(*p); 364 status = be32_to_cpup(p);
378 365
379 for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { 366 for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
380 if (mnt3_errtbl[i].status == status) { 367 if (mnt3_errtbl[i].status == status) {
@@ -394,11 +381,11 @@ static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)
394 u32 size; 381 u32 size;
395 __be32 *p; 382 __be32 *p;
396 383
397 p = xdr_inline_decode(xdr, sizeof(size)); 384 p = xdr_inline_decode(xdr, 4);
398 if (unlikely(p == NULL)) 385 if (unlikely(p == NULL))
399 return -EIO; 386 return -EIO;
400 387
401 size = ntohl(*p++); 388 size = be32_to_cpup(p);
402 if (size > NFS3_FHSIZE || size == 0) 389 if (size > NFS3_FHSIZE || size == 0)
403 return -EIO; 390 return -EIO;
404 391
@@ -421,15 +408,15 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
421 if (*count == 0) 408 if (*count == 0)
422 return 0; 409 return 0;
423 410
424 p = xdr_inline_decode(xdr, sizeof(entries)); 411 p = xdr_inline_decode(xdr, 4);
425 if (unlikely(p == NULL)) 412 if (unlikely(p == NULL))
426 return -EIO; 413 return -EIO;
427 entries = ntohl(*p); 414 entries = be32_to_cpup(p);
428 dprintk("NFS: received %u auth flavors\n", entries); 415 dprintk("NFS: received %u auth flavors\n", entries);
429 if (entries > NFS_MAX_SECFLAVORS) 416 if (entries > NFS_MAX_SECFLAVORS)
430 entries = NFS_MAX_SECFLAVORS; 417 entries = NFS_MAX_SECFLAVORS;
431 418
432 p = xdr_inline_decode(xdr, sizeof(u32) * entries); 419 p = xdr_inline_decode(xdr, 4 * entries);
433 if (unlikely(p == NULL)) 420 if (unlikely(p == NULL))
434 return -EIO; 421 return -EIO;
435 422
@@ -437,7 +424,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
437 entries = *count; 424 entries = *count;
438 425
439 for (i = 0; i < entries; i++) { 426 for (i = 0; i < entries; i++) {
440 flavors[i] = ntohl(*p++); 427 flavors[i] = be32_to_cpup(p++);
441 dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]); 428 dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]);
442 } 429 }
443 *count = i; 430 *count = i;
@@ -445,30 +432,28 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
 	return 0;
 }
 
-static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p,
-			     struct mountres *res)
+static int mnt_xdr_dec_mountres3(struct rpc_rqst *req,
+				 struct xdr_stream *xdr,
+				 struct mountres *res)
 {
-	struct xdr_stream xdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-
-	status = decode_fhs_status(&xdr, res);
+	status = decode_fhs_status(xdr, res);
 	if (unlikely(status != 0 || res->errno != 0))
 		return status;
-	status = decode_fhandle3(&xdr, res);
+	status = decode_fhandle3(xdr, res);
 	if (unlikely(status != 0)) {
 		res->errno = -EBADHANDLE;
 		return 0;
 	}
-	return decode_auth_flavors(&xdr, res);
+	return decode_auth_flavors(xdr, res);
 }
466 451
467static struct rpc_procinfo mnt_procedures[] = { 452static struct rpc_procinfo mnt_procedures[] = {
468 [MOUNTPROC_MNT] = { 453 [MOUNTPROC_MNT] = {
469 .p_proc = MOUNTPROC_MNT, 454 .p_proc = MOUNTPROC_MNT,
470 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 455 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
471 .p_decode = (kxdrproc_t)mnt_dec_mountres, 456 .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres,
472 .p_arglen = MNT_enc_dirpath_sz, 457 .p_arglen = MNT_enc_dirpath_sz,
473 .p_replen = MNT_dec_mountres_sz, 458 .p_replen = MNT_dec_mountres_sz,
474 .p_statidx = MOUNTPROC_MNT, 459 .p_statidx = MOUNTPROC_MNT,
@@ -476,7 +461,7 @@ static struct rpc_procinfo mnt_procedures[] = {
476 }, 461 },
477 [MOUNTPROC_UMNT] = { 462 [MOUNTPROC_UMNT] = {
478 .p_proc = MOUNTPROC_UMNT, 463 .p_proc = MOUNTPROC_UMNT,
479 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 464 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
480 .p_arglen = MNT_enc_dirpath_sz, 465 .p_arglen = MNT_enc_dirpath_sz,
481 .p_statidx = MOUNTPROC_UMNT, 466 .p_statidx = MOUNTPROC_UMNT,
482 .p_name = "UMOUNT", 467 .p_name = "UMOUNT",
@@ -486,8 +471,8 @@ static struct rpc_procinfo mnt_procedures[] = {
486static struct rpc_procinfo mnt3_procedures[] = { 471static struct rpc_procinfo mnt3_procedures[] = {
487 [MOUNTPROC3_MNT] = { 472 [MOUNTPROC3_MNT] = {
488 .p_proc = MOUNTPROC3_MNT, 473 .p_proc = MOUNTPROC3_MNT,
489 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 474 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
490 .p_decode = (kxdrproc_t)mnt_dec_mountres3, 475 .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres3,
491 .p_arglen = MNT_enc_dirpath_sz, 476 .p_arglen = MNT_enc_dirpath_sz,
492 .p_replen = MNT_dec_mountres3_sz, 477 .p_replen = MNT_dec_mountres3_sz,
493 .p_statidx = MOUNTPROC3_MNT, 478 .p_statidx = MOUNTPROC3_MNT,
@@ -495,7 +480,7 @@ static struct rpc_procinfo mnt3_procedures[] = {
495 }, 480 },
496 [MOUNTPROC3_UMNT] = { 481 [MOUNTPROC3_UMNT] = {
497 .p_proc = MOUNTPROC3_UMNT, 482 .p_proc = MOUNTPROC3_UMNT,
498 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 483 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
499 .p_arglen = MNT_enc_dirpath_sz, 484 .p_arglen = MNT_enc_dirpath_sz,
500 .p_statidx = MOUNTPROC3_UMNT, 485 .p_statidx = MOUNTPROC3_UMNT,
501 .p_name = "UMOUNT", 486 .p_name = "UMOUNT",
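The (kxdreproc_t)/(kxdrdproc_t) casts above are the visible edge of a sunrpc-wide conversion: the transport now initializes the xdr_stream, so per-procedure code never touches raw __be32 pointers or rq_snd_buf/rq_rcv_buf itself. For reference, the hook types are approximately the following (see include/linux/sunrpc/xdr.h in this kernel for the authoritative definitions):

	typedef void	(*kxdreproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
				       const void *obj);
	typedef int	(*kxdrdproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
				       void *obj);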
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 74aaf3963c10..f32b8603dca8 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -97,9 +97,8 @@ Elong:
 }
 
 /*
- * nfs_follow_mountpoint - handle crossing a mountpoint on the server
- * @dentry - dentry of mountpoint
- * @nd - nameidata info
+ * nfs_d_automount - Handle crossing a mountpoint on the server
+ * @path - The mountpoint
  *
  * When we encounter a mountpoint on the server, we want to set up
  * a mountpoint on the client too, to prevent inode numbers from
@@ -109,87 +108,65 @@ Elong:
  * situation, and that different filesystems may want to use
  * different security flavours.
  */
-static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
+struct vfsmount *nfs_d_automount(struct path *path)
 {
 	struct vfsmount *mnt;
-	struct nfs_server *server = NFS_SERVER(dentry->d_inode);
+	struct nfs_server *server = NFS_SERVER(path->dentry->d_inode);
 	struct dentry *parent;
 	struct nfs_fh *fh = NULL;
 	struct nfs_fattr *fattr = NULL;
 	int err;
 
-	dprintk("--> nfs_follow_mountpoint()\n");
+	dprintk("--> nfs_d_automount()\n");
 
-	err = -ESTALE;
-	if (IS_ROOT(dentry))
-		goto out_err;
+	mnt = ERR_PTR(-ESTALE);
+	if (IS_ROOT(path->dentry))
+		goto out_nofree;
 
-	err = -ENOMEM;
+	mnt = ERR_PTR(-ENOMEM);
 	fh = nfs_alloc_fhandle();
 	fattr = nfs_alloc_fattr();
 	if (fh == NULL || fattr == NULL)
-		goto out_err;
+		goto out;
 
 	dprintk("%s: enter\n", __func__);
-	dput(nd->path.dentry);
-	nd->path.dentry = dget(dentry);
 
-	/* Look it up again */
-	parent = dget_parent(nd->path.dentry);
+	/* Look it up again to get its attributes */
+	parent = dget_parent(path->dentry);
 	err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
-						  &nd->path.dentry->d_name,
+						  &path->dentry->d_name,
 						  fh, fattr);
 	dput(parent);
-	if (err != 0)
-		goto out_err;
+	if (err != 0) {
+		mnt = ERR_PTR(err);
+		goto out;
+	}
 
 	if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
-		mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry);
+		mnt = nfs_do_refmount(path->mnt, path->dentry);
 	else
-		mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh,
-				      fattr);
-	err = PTR_ERR(mnt);
+		mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr);
 	if (IS_ERR(mnt))
-		goto out_err;
+		goto out;
 
-	mntget(mnt);
-	err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE,
-			   &nfs_automount_list);
-	if (err < 0) {
-		mntput(mnt);
-		if (err == -EBUSY)
-			goto out_follow;
-		goto out_err;
-	}
-	path_put(&nd->path);
-	nd->path.mnt = mnt;
-	nd->path.dentry = dget(mnt->mnt_root);
+	dprintk("%s: done, success\n", __func__);
+	mntget(mnt); /* prevent immediate expiration */
+	mnt_set_expiry(mnt, &nfs_automount_list);
 	schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+
 out:
 	nfs_free_fattr(fattr);
 	nfs_free_fhandle(fh);
-	dprintk("%s: done, returned %d\n", __func__, err);
-
-	dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
-	return ERR_PTR(err);
-out_err:
-	path_put(&nd->path);
-	goto out;
-out_follow:
-	while (d_mountpoint(nd->path.dentry) &&
-	       follow_down(&nd->path))
-		;
-	err = 0;
-	goto out;
+out_nofree:
+	dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt);
+	return mnt;
 }
 
 const struct inode_operations nfs_mountpoint_inode_operations = {
-	.follow_link	= nfs_follow_mountpoint,
 	.getattr	= nfs_getattr,
 };
 
 const struct inode_operations nfs_referral_inode_operations = {
-	.follow_link	= nfs_follow_mountpoint,
 };
 
 static void nfs_expire_automounts(struct work_struct *work)
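Two things are worth flagging in the converted function. First, the exit dprintk still prints the old name nfs_follow_mountpoint, a leftover from the mechanical conversion. Second, the ->d_automount() contract it demonstrates: return the vfsmount to attach (the VFS performs the actual mount), take an extra reference before parking it on an expiry list, and return an ERR_PTR on failure. A minimal handler shape, with hypothetical helper names:

	static LIST_HEAD(example_expiry_list);
	static struct vfsmount *example_build_submount(struct path *path);

	static struct vfsmount *example_d_automount(struct path *path)
	{
		/* example_build_submount() stands in for the
		 * nfs_do_refmount()/nfs_do_submount() pair above. */
		struct vfsmount *mnt = example_build_submount(path);

		if (IS_ERR(mnt))
			return mnt;
		mntget(mnt);	/* hold the mount past the next expiry scan */
		mnt_set_expiry(mnt, &example_expiry_list);
		return mnt;
	}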
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5914a1911c95..792cb13a4304 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -61,584 +61,1008 @@
 #define NFS_readdirres_sz	(1)
 #define NFS_statfsres_sz	(1+NFS_info_sz)
 
+
 /*
- * Common NFS XDR functions as inlines
+ * While encoding arguments, set up the reply buffer in advance to
+ * receive reply data directly into the page cache.
  */
-static inline __be32 *
-xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fhandle)
+static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
+				 unsigned int base, unsigned int len,
+				 unsigned int bufsize)
 {
-	memcpy(p, fhandle->data, NFS2_FHSIZE);
-	return p + XDR_QUADLEN(NFS2_FHSIZE);
+	struct rpc_auth *auth = req->rq_cred->cr_auth;
+	unsigned int replen;
+
+	replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
+	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
 }
 
-static inline __be32 *
-xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle)
+/*
+ * Handle decode buffer overflows out-of-line.
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
 {
-	/* NFSv2 handles have a fixed length */
-	fhandle->size = NFS2_FHSIZE;
-	memcpy(fhandle->data, p, NFS2_FHSIZE);
-	return p + XDR_QUADLEN(NFS2_FHSIZE);
+	dprintk("NFS: %s prematurely hit the end of our receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
+}
+
+
+/*
+ * Encode/decode NFSv2 basic data types
+ *
+ * Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+
+/*
+ * typedef opaque nfsdata<>;
+ */
+static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result)
+{
+	u32 recvd, count;
+	size_t hdrlen;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p);
+	hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+	recvd = xdr->buf->len - hdrlen;
+	if (unlikely(count > recvd))
+		goto out_cheating;
+out:
+	xdr_read_pages(xdr, count);
+	result->eof = 0;	/* NFSv2 does not pass EOF flag on the wire. */
+	result->count = count;
+	return count;
+out_cheating:
+	dprintk("NFS: server cheating in read result: "
+		"count %u > recvd %u\n", count, recvd);
+	count = recvd;
+	goto out;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * enum stat {
+ *	NFS_OK = 0,
+ *	NFSERR_PERM = 1,
+ *	NFSERR_NOENT = 2,
+ *	NFSERR_IO = 5,
+ *	NFSERR_NXIO = 6,
+ *	NFSERR_ACCES = 13,
+ *	NFSERR_EXIST = 17,
+ *	NFSERR_NODEV = 19,
+ *	NFSERR_NOTDIR = 20,
+ *	NFSERR_ISDIR = 21,
+ *	NFSERR_FBIG = 27,
+ *	NFSERR_NOSPC = 28,
+ *	NFSERR_ROFS = 30,
+ *	NFSERR_NAMETOOLONG = 63,
+ *	NFSERR_NOTEMPTY = 66,
+ *	NFSERR_DQUOT = 69,
+ *	NFSERR_STALE = 70,
+ *	NFSERR_WFLUSH = 99
+ * };
+ */
+static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*status = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
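prepare_reply_buffer() is now the single place this file computes reply offsets. All sunrpc size constants (RPC_REPHDRSIZE, au_rslack, the NFS_*res_sz macros) count 32-bit XDR words, hence the `replen << 2` conversion to bytes before xdr_inline_pages() splices the caller's pages into the receive buffer. A usage sketch that mirrors the readlink encoder appearing further down:

	static void example_enc_readlinkargs(struct rpc_rqst *req,
					     struct xdr_stream *xdr,
					     const struct nfs_readlinkargs *args)
	{
		encode_fhandle(xdr, args->fh);
		/* Reply page data will start at byte offset
		 * (RPC_REPHDRSIZE + au_rslack + NFS_readlinkres_sz) << 2. */
		prepare_reply_buffer(req, args->pages, args->pgbase,
				     args->pglen, NFS_readlinkres_sz);
	}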
 
-static inline __be32*
-xdr_encode_time(__be32 *p, struct timespec *timep)
+/*
+ * 2.3.2. ftype
+ *
+ * enum ftype {
+ *	NFNON = 0,
+ *	NFREG = 1,
+ *	NFDIR = 2,
+ *	NFBLK = 3,
+ *	NFCHR = 4,
+ *	NFLNK = 5
+ * };
+ *
+ */
+static __be32 *xdr_decode_ftype(__be32 *p, u32 *type)
 {
-	*p++ = htonl(timep->tv_sec);
-	/* Convert nanoseconds into microseconds */
-	*p++ = htonl(timep->tv_nsec ? timep->tv_nsec / 1000 : 0);
+	*type = be32_to_cpup(p++);
+	if (unlikely(*type > NF2FIFO))
+		*type = NFBAD;
 	return p;
 }
 
-static inline __be32*
-xdr_encode_current_server_time(__be32 *p, struct timespec *timep)
+/*
+ * 2.3.3. fhandle
+ *
+ * typedef opaque fhandle[FHSIZE];
+ */
+static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)
 {
-	/*
-	 * Passing the invalid value useconds=1000000 is a
-	 * Sun convention for "set to current server time".
-	 * It's needed to make permissions checks for the
-	 * "touch" program across v2 mounts to Solaris and
-	 * Irix boxes work correctly. See description of
-	 * sattr in section 6.1 of "NFS Illustrated" by
-	 * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5
-	 */
-	*p++ = htonl(timep->tv_sec);
-	*p++ = htonl(1000000);
+	__be32 *p;
+
+	BUG_ON(fh->size != NFS2_FHSIZE);
+	p = xdr_reserve_space(xdr, NFS2_FHSIZE);
+	memcpy(p, fh->data, NFS2_FHSIZE);
+}
+
+static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS2_FHSIZE);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	fh->size = NFS2_FHSIZE;
+	memcpy(fh->data, p, NFS2_FHSIZE);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * 2.3.4. timeval
+ *
+ * struct timeval {
+ *	unsigned int seconds;
+ *	unsigned int useconds;
+ * };
+ */
+static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep)
+{
+	*p++ = cpu_to_be32(timep->tv_sec);
+	if (timep->tv_nsec != 0)
+		*p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC);
+	else
+		*p++ = cpu_to_be32(0);
 	return p;
 }
 
-static inline __be32*
-xdr_decode_time(__be32 *p, struct timespec *timep)
+/*
+ * Passing the invalid value useconds=1000000 is a Sun convention for
+ * "set to current server time". It's needed to make permissions checks
+ * for the "touch" program across v2 mounts to Solaris and Irix servers
+ * work correctly. See description of sattr in section 6.1 of "NFS
+ * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5.
+ */
+static __be32 *xdr_encode_current_server_time(__be32 *p,
+					      const struct timespec *timep)
 {
-	timep->tv_sec = ntohl(*p++);
-	/* Convert microseconds into nanoseconds */
-	timep->tv_nsec = ntohl(*p++) * 1000;
+	*p++ = cpu_to_be32(timep->tv_sec);
+	*p++ = cpu_to_be32(1000000);
 	return p;
 }
 
-static __be32 *
-xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
+static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep)
+{
+	timep->tv_sec = be32_to_cpup(p++);
+	timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC;
+	return p;
+}
+
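A quick round trip pins down the unit conversion above: NFSv2 timevals carry microseconds on the wire, struct timespec carries nanoseconds, so the encoder divides by NSEC_PER_USEC and the decoder multiplies it back. Illustrative helper, not part of the patch:

	static void example_time_roundtrip(void)
	{
		struct timespec in = { .tv_sec = 5, .tv_nsec = 1500 * NSEC_PER_USEC };
		struct timespec out;
		__be32 wire[2];

		xdr_encode_time(wire, &in);	/* seconds = 5, useconds = 1500 */
		xdr_decode_time(wire, &out);	/* out.tv_nsec == 1500000 again */
	}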
+/*
+ * 2.3.5. fattr
+ *
+ * struct fattr {
+ *	ftype        type;
+ *	unsigned int mode;
+ *	unsigned int nlink;
+ *	unsigned int uid;
+ *	unsigned int gid;
+ *	unsigned int size;
+ *	unsigned int blocksize;
+ *	unsigned int rdev;
+ *	unsigned int blocks;
+ *	unsigned int fsid;
+ *	unsigned int fileid;
+ *	timeval      atime;
+ *	timeval      mtime;
+ *	timeval      ctime;
+ * };
+ *
+ */
+static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
 	u32 rdev, type;
-	type = ntohl(*p++);
-	fattr->mode = ntohl(*p++);
-	fattr->nlink = ntohl(*p++);
-	fattr->uid = ntohl(*p++);
-	fattr->gid = ntohl(*p++);
-	fattr->size = ntohl(*p++);
-	fattr->du.nfs2.blocksize = ntohl(*p++);
-	rdev = ntohl(*p++);
-	fattr->du.nfs2.blocks = ntohl(*p++);
-	fattr->fsid.major = ntohl(*p++);
-	fattr->fsid.minor = 0;
-	fattr->fileid = ntohl(*p++);
-	p = xdr_decode_time(p, &fattr->atime);
-	p = xdr_decode_time(p, &fattr->mtime);
-	p = xdr_decode_time(p, &fattr->ctime);
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+
 	fattr->valid |= NFS_ATTR_FATTR_V2;
+
+	p = xdr_decode_ftype(p, &type);
+
+	fattr->mode = be32_to_cpup(p++);
+	fattr->nlink = be32_to_cpup(p++);
+	fattr->uid = be32_to_cpup(p++);
+	fattr->gid = be32_to_cpup(p++);
+	fattr->size = be32_to_cpup(p++);
+	fattr->du.nfs2.blocksize = be32_to_cpup(p++);
+
+	rdev = be32_to_cpup(p++);
 	fattr->rdev = new_decode_dev(rdev);
-	if (type == NFCHR && rdev == NFS2_FIFO_DEV) {
+	if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) {
 		fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
 		fattr->rdev = 0;
 	}
+
+	fattr->du.nfs2.blocks = be32_to_cpup(p++);
+	fattr->fsid.major = be32_to_cpup(p++);
+	fattr->fsid.minor = 0;
+	fattr->fileid = be32_to_cpup(p++);
+
+	p = xdr_decode_time(p, &fattr->atime);
+	p = xdr_decode_time(p, &fattr->mtime);
+	xdr_decode_time(p, &fattr->ctime);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
321
322/*
323 * 2.3.6. sattr
324 *
325 * struct sattr {
326 * unsigned int mode;
327 * unsigned int uid;
328 * unsigned int gid;
329 * unsigned int size;
330 * timeval atime;
331 * timeval mtime;
332 * };
333 */
334
335#define NFS2_SATTR_NOT_SET (0xffffffff)
336
337static __be32 *xdr_time_not_set(__be32 *p)
338{
339 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
340 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
143 return p; 341 return p;
144} 342}
145 343
146static inline __be32 * 344static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr)
147xdr_encode_sattr(__be32 *p, struct iattr *attr)
148{ 345{
149 const __be32 not_set = __constant_htonl(0xFFFFFFFF); 346 __be32 *p;
150 347
151 *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set; 348 p = xdr_reserve_space(xdr, NFS_sattr_sz << 2);
152 *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set;
153 *p++ = (attr->ia_valid & ATTR_GID) ? htonl(attr->ia_gid) : not_set;
154 *p++ = (attr->ia_valid & ATTR_SIZE) ? htonl(attr->ia_size) : not_set;
155 349
156 if (attr->ia_valid & ATTR_ATIME_SET) { 350 if (attr->ia_valid & ATTR_MODE)
351 *p++ = cpu_to_be32(attr->ia_mode);
352 else
353 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
354 if (attr->ia_valid & ATTR_UID)
355 *p++ = cpu_to_be32(attr->ia_uid);
356 else
357 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
358 if (attr->ia_valid & ATTR_GID)
359 *p++ = cpu_to_be32(attr->ia_gid);
360 else
361 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
362 if (attr->ia_valid & ATTR_SIZE)
363 *p++ = cpu_to_be32((u32)attr->ia_size);
364 else
365 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
366
367 if (attr->ia_valid & ATTR_ATIME_SET)
157 p = xdr_encode_time(p, &attr->ia_atime); 368 p = xdr_encode_time(p, &attr->ia_atime);
158 } else if (attr->ia_valid & ATTR_ATIME) { 369 else if (attr->ia_valid & ATTR_ATIME)
159 p = xdr_encode_current_server_time(p, &attr->ia_atime); 370 p = xdr_encode_current_server_time(p, &attr->ia_atime);
160 } else { 371 else
161 *p++ = not_set; 372 p = xdr_time_not_set(p);
162 *p++ = not_set; 373 if (attr->ia_valid & ATTR_MTIME_SET)
163 } 374 xdr_encode_time(p, &attr->ia_mtime);
164 375 else if (attr->ia_valid & ATTR_MTIME)
165 if (attr->ia_valid & ATTR_MTIME_SET) { 376 xdr_encode_current_server_time(p, &attr->ia_mtime);
166 p = xdr_encode_time(p, &attr->ia_mtime); 377 else
167 } else if (attr->ia_valid & ATTR_MTIME) { 378 xdr_time_not_set(p);
168 p = xdr_encode_current_server_time(p, &attr->ia_mtime);
169 } else {
170 *p++ = not_set;
171 *p++ = not_set;
172 }
173 return p;
174} 379}
175 380
176/* 381/*
177 * NFS encode functions 382 * 2.3.7. filename
383 *
384 * typedef string filename<MAXNAMLEN>;
178 */ 385 */
386static void encode_filename(struct xdr_stream *xdr,
387 const char *name, u32 length)
388{
389 __be32 *p;
390
391 BUG_ON(length > NFS2_MAXNAMLEN);
392 p = xdr_reserve_space(xdr, 4 + length);
393 xdr_encode_opaque(p, name, length);
394}
395
396static int decode_filename_inline(struct xdr_stream *xdr,
397 const char **name, u32 *length)
398{
399 __be32 *p;
400 u32 count;
401
402 p = xdr_inline_decode(xdr, 4);
403 if (unlikely(p == NULL))
404 goto out_overflow;
405 count = be32_to_cpup(p);
406 if (count > NFS3_MAXNAMLEN)
407 goto out_nametoolong;
408 p = xdr_inline_decode(xdr, count);
409 if (unlikely(p == NULL))
410 goto out_overflow;
411 *name = (const char *)p;
412 *length = count;
413 return 0;
414out_nametoolong:
415 dprintk("NFS: returned filename too long: %u\n", count);
416 return -ENAMETOOLONG;
417out_overflow:
418 print_overflow_msg(__func__, xdr);
419 return -EIO;
420}
421
179/* 422/*
180 * Encode file handle argument 423 * 2.3.8. path
181 * GETATTR, READLINK, STATFS 424 *
425 * typedef string path<MAXPATHLEN>;
182 */ 426 */
183static int 427static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)
184nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
185{ 428{
186 p = xdr_encode_fhandle(p, fh); 429 __be32 *p;
187 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 430
431 BUG_ON(length > NFS2_MAXPATHLEN);
432 p = xdr_reserve_space(xdr, 4);
433 *p = cpu_to_be32(length);
434 xdr_write_pages(xdr, pages, 0, length);
435}
436
437static int decode_path(struct xdr_stream *xdr)
438{
439 u32 length, recvd;
440 size_t hdrlen;
441 __be32 *p;
442
443 p = xdr_inline_decode(xdr, 4);
444 if (unlikely(p == NULL))
445 goto out_overflow;
446 length = be32_to_cpup(p);
447 if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN))
448 goto out_size;
449 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
450 recvd = xdr->buf->len - hdrlen;
451 if (unlikely(length > recvd))
452 goto out_cheating;
453
454 xdr_read_pages(xdr, length);
455 xdr_terminate_string(xdr->buf, length);
188 return 0; 456 return 0;
457out_size:
458 dprintk("NFS: returned pathname too long: %u\n", length);
459 return -ENAMETOOLONG;
460out_cheating:
461 dprintk("NFS: server cheating in pathname result: "
462 "length %u > received %u\n", length, recvd);
463 return -EIO;
464out_overflow:
465 print_overflow_msg(__func__, xdr);
466 return -EIO;
189} 467}
190 468
191/* 469/*
192 * Encode SETATTR arguments 470 * 2.3.9. attrstat
471 *
472 * union attrstat switch (stat status) {
473 * case NFS_OK:
474 * fattr attributes;
475 * default:
476 * void;
477 * };
193 */ 478 */
194static int 479static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
195nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args)
196{ 480{
197 p = xdr_encode_fhandle(p, args->fh); 481 enum nfs_stat status;
198 p = xdr_encode_sattr(p, args->sattr); 482 int error;
199 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 483
200 return 0; 484 error = decode_stat(xdr, &status);
485 if (unlikely(error))
486 goto out;
487 if (status != NFS_OK)
488 goto out_default;
489 error = decode_fattr(xdr, result);
490out:
491 return error;
492out_default:
493 return nfs_stat_to_errno(status);
201} 494}
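decode_attrstat() sets the template every result decoder in this rewrite follows, and the two failure paths are distinct on purpose: a decode_stat() failure means the reply itself was short or garbled (-EIO), while a non-NFS_OK status is a well-formed protocol error mapped through nfs_stat_to_errno(). As a sketch, with decode_result_body() standing in for the per-procedure payload decoder:

	/* Hypothetical stand-in for decode_fattr(), decode_diropok(), etc. */
	static int decode_result_body(struct xdr_stream *xdr, void *result);

	static int example_dec_result(struct xdr_stream *xdr, void *result)
	{
		enum nfs_stat status;
		int error;

		error = decode_stat(xdr, &status);
		if (unlikely(error))
			return error;			/* transport/buffer problem */
		if (status != NFS_OK)
			return nfs_stat_to_errno(status);	/* NFS-level error */
		return decode_result_body(xdr, result);
	}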
202 495
203/* 496/*
204 * Encode directory ops argument 497 * 2.3.10. diropargs
205 * LOOKUP, RMDIR 498 *
499 * struct diropargs {
500 * fhandle dir;
501 * filename name;
502 * };
206 */ 503 */
207static int 504static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh,
208nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args) 505 const char *name, u32 length)
209{ 506{
210 p = xdr_encode_fhandle(p, args->fh); 507 encode_fhandle(xdr, fh);
211 p = xdr_encode_array(p, args->name, args->len); 508 encode_filename(xdr, name, length);
212 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
213 return 0;
214} 509}
215 510
216/* 511/*
217 * Encode REMOVE argument 512 * 2.3.11. diropres
513 *
514 * union diropres switch (stat status) {
515 * case NFS_OK:
516 * struct {
517 * fhandle file;
518 * fattr attributes;
519 * } diropok;
520 * default:
521 * void;
522 * };
218 */ 523 */
219static int 524static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result)
220nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
221{ 525{
222 p = xdr_encode_fhandle(p, args->fh); 526 int error;
223 p = xdr_encode_array(p, args->name.name, args->name.len); 527
224 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 528 error = decode_fhandle(xdr, result->fh);
225 return 0; 529 if (unlikely(error))
530 goto out;
531 error = decode_fattr(xdr, result->fattr);
532out:
533 return error;
534}
535
536static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result)
537{
538 enum nfs_stat status;
539 int error;
540
541 error = decode_stat(xdr, &status);
542 if (unlikely(error))
543 goto out;
544 if (status != NFS_OK)
545 goto out_default;
546 error = decode_diropok(xdr, result);
547out:
548 return error;
549out_default:
550 return nfs_stat_to_errno(status);
226} 551}
227 552
553
228/* 554/*
229 * Arguments to a READ call. Since we read data directly into the page 555 * NFSv2 XDR encode functions
230 * cache, we also set up the reply iovec here so that iov[1] points 556 *
231 * exactly to the page we want to fetch. 557 * NFSv2 argument types are defined in section 2.2 of RFC 1094:
558 * "NFS: Network File System Protocol Specification".
232 */ 559 */
233static int 560
234nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 561static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req,
562 struct xdr_stream *xdr,
563 const struct nfs_fh *fh)
235{ 564{
236 struct rpc_auth *auth = req->rq_cred->cr_auth; 565 encode_fhandle(xdr, fh);
237 unsigned int replen; 566}
238 u32 offset = (u32)args->offset; 567
568/*
569 * 2.2.3. sattrargs
570 *
571 * struct sattrargs {
572 * fhandle file;
573 * sattr attributes;
574 * };
575 */
576static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req,
577 struct xdr_stream *xdr,
578 const struct nfs_sattrargs *args)
579{
580 encode_fhandle(xdr, args->fh);
581 encode_sattr(xdr, args->sattr);
582}
583
584static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req,
585 struct xdr_stream *xdr,
586 const struct nfs_diropargs *args)
587{
588 encode_diropargs(xdr, args->fh, args->name, args->len);
589}
590
591static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
592 struct xdr_stream *xdr,
593 const struct nfs_readlinkargs *args)
594{
595 encode_fhandle(xdr, args->fh);
596 prepare_reply_buffer(req, args->pages, args->pgbase,
597 args->pglen, NFS_readlinkres_sz);
598}
599
600/*
601 * 2.2.7. readargs
602 *
603 * struct readargs {
604 * fhandle file;
605 * unsigned offset;
606 * unsigned count;
607 * unsigned totalcount;
608 * };
609 */
610static void encode_readargs(struct xdr_stream *xdr,
611 const struct nfs_readargs *args)
612{
613 u32 offset = args->offset;
239 u32 count = args->count; 614 u32 count = args->count;
615 __be32 *p;
240 616
241 p = xdr_encode_fhandle(p, args->fh); 617 encode_fhandle(xdr, args->fh);
242 *p++ = htonl(offset);
243 *p++ = htonl(count);
244 *p++ = htonl(count);
245 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
246 618
247 /* Inline the page array */ 619 p = xdr_reserve_space(xdr, 4 + 4 + 4);
248 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2; 620 *p++ = cpu_to_be32(offset);
249 xdr_inline_pages(&req->rq_rcv_buf, replen, 621 *p++ = cpu_to_be32(count);
250 args->pages, args->pgbase, count); 622 *p = cpu_to_be32(count);
623}
624
625static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
626 struct xdr_stream *xdr,
627 const struct nfs_readargs *args)
628{
629 encode_readargs(xdr, args);
630 prepare_reply_buffer(req, args->pages, args->pgbase,
631 args->count, NFS_readres_sz);
251 req->rq_rcv_buf.flags |= XDRBUF_READ; 632 req->rq_rcv_buf.flags |= XDRBUF_READ;
252 return 0;
253} 633}
 
 /*
- * Decode READ reply
+ * 2.2.9. writeargs
+ *
+ * struct writeargs {
+ *	fhandle file;
+ *	unsigned beginoffset;
+ *	unsigned offset;
+ *	unsigned totalcount;
+ *	nfsdata data;
+ * };
  */
-static int
-nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
+static void encode_writeargs(struct xdr_stream *xdr,
+			     const struct nfs_writeargs *args)
 {
-	struct kvec *iov = req->rq_rcv_buf.head;
-	size_t hdrlen;
-	u32 count, recvd;
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-	p = xdr_decode_fattr(p, res->fattr);
-
-	count = ntohl(*p++);
-	res->eof = 0;
-	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
-	if (iov->iov_len < hdrlen) {
-		dprintk("NFS: READ reply header overflowed:"
-				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
-		return -errno_NFSERR_IO;
-	} else if (iov->iov_len != hdrlen) {
-		dprintk("NFS: READ header is short. iovec will be shifted.\n");
-		xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen);
-	}
+	u32 offset = args->offset;
+	u32 count = args->count;
+	__be32 *p;
 
-	recvd = req->rq_rcv_buf.len - hdrlen;
-	if (count > recvd) {
-		dprintk("NFS: server cheating in read reply: "
-				"count %u > recvd %u\n", count, recvd);
-		count = recvd;
-	}
+	encode_fhandle(xdr, args->fh);
 
-	dprintk("RPC: readres OK count %u\n", count);
-	if (count < res->count)
-		res->count = count;
-
-	return count;
+	p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
+	*p++ = cpu_to_be32(offset);
+	*p++ = cpu_to_be32(offset);
+	*p++ = cpu_to_be32(count);
+
+	/* nfsdata */
+	*p = cpu_to_be32(count);
+	xdr_write_pages(xdr, args->pages, args->pgbase, count);
 }
 
+static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfs_writeargs *args)
+{
+	encode_writeargs(xdr, args);
+	xdr->buf->flags |= XDRBUF_WRITE;
+}
296 672
 /*
- * Write arguments. Splice the buffer to be written into the iovec.
+ * 2.2.10. createargs
+ *
+ *	struct createargs {
+ *		diropargs where;
+ *		sattr attributes;
+ *	};
  */
-static int
-nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void nfs2_xdr_enc_createargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_createargs *args)
 {
-	struct xdr_buf *sndbuf = &req->rq_snd_buf;
-	u32 offset = (u32)args->offset;
-	u32 count = args->count;
-
-	p = xdr_encode_fhandle(p, args->fh);
-	*p++ = htonl(offset);
-	*p++ = htonl(offset);
-	*p++ = htonl(count);
-	*p++ = htonl(count);
-	sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
+	encode_diropargs(xdr, args->fh, args->name, args->len);
+	encode_sattr(xdr, args->sattr);
+}
 
-	/* Copy the page array */
-	xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
-	sndbuf->flags |= XDRBUF_WRITE;
-	return 0;
+static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_removeargs *args)
+{
+	encode_diropargs(xdr, args->fh, args->name.name, args->name.len);
 }
 
 /*
- * Encode create arguments
- * CREATE, MKDIR
+ * 2.2.12. renameargs
+ *
+ *	struct renameargs {
+ *		diropargs from;
+ *		diropargs to;
+ *	};
  */
-static int
-nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
+static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_renameargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-	p = xdr_encode_sattr(p, args->sattr);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	const struct qstr *old = args->old_name;
+	const struct qstr *new = args->new_name;
+
+	encode_diropargs(xdr, args->old_dir, old->name, old->len);
+	encode_diropargs(xdr, args->new_dir, new->name, new->len);
 }
 
 /*
- * Encode RENAME arguments
+ * 2.2.13. linkargs
+ *
+ *	struct linkargs {
+ *		fhandle from;
+ *		diropargs to;
+ *	};
  */
-static int
-nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
+static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req,
+				  struct xdr_stream *xdr,
+				  const struct nfs_linkargs *args)
 {
-	p = xdr_encode_fhandle(p, args->old_dir);
-	p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
-	p = xdr_encode_fhandle(p, args->new_dir);
-	p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_fhandle(xdr, args->fromfh);
+	encode_diropargs(xdr, args->tofh, args->toname, args->tolen);
 }
 
 /*
- * Encode LINK arguments
+ * 2.2.14. symlinkargs
+ *
+ *	struct symlinkargs {
+ *		diropargs from;
+ *		path to;
+ *		sattr attributes;
+ *	};
  */
-static int
-nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args)
+static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs_symlinkargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fromfh);
-	p = xdr_encode_fhandle(p, args->tofh);
-	p = xdr_encode_array(p, args->toname, args->tolen);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen);
+	encode_path(xdr, args->pages, args->pathlen);
+	encode_sattr(xdr, args->sattr);
 }
 
 /*
- * Encode SYMLINK arguments
+ * 2.2.17. readdirargs
+ *
+ *	struct readdirargs {
+ *		fhandle dir;
+ *		nfscookie cookie;
+ *		unsigned count;
+ *	};
  */
-static int
-nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args)
+static void encode_readdirargs(struct xdr_stream *xdr,
+			       const struct nfs_readdirargs *args)
 {
-	struct xdr_buf *sndbuf = &req->rq_snd_buf;
-	size_t pad;
+	__be32 *p;
 
-	p = xdr_encode_fhandle(p, args->fromfh);
-	p = xdr_encode_array(p, args->fromname, args->fromlen);
-	*p++ = htonl(args->pathlen);
-	sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
+	encode_fhandle(xdr, args->fh);
 
-	xdr_encode_pages(sndbuf, args->pages, 0, args->pathlen);
+	p = xdr_reserve_space(xdr, 4 + 4);
+	*p++ = cpu_to_be32(args->cookie);
+	*p = cpu_to_be32(args->count);
+}
 
-	/*
-	 * xdr_encode_pages may have added a few bytes to ensure the
-	 * pathname ends on a 4-byte boundary.  Start encoding the
-	 * attributes after the pad bytes.
-	 */
-	pad = sndbuf->tail->iov_len;
-	if (pad > 0)
-		p++;
-	p = xdr_encode_sattr(p, args->sattr);
-	sndbuf->len += xdr_adjust_iovec(sndbuf->tail, p) - pad;
-	return 0;
+static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs_readdirargs *args)
+{
+	encode_readdirargs(xdr, args);
+	prepare_reply_buffer(req, args->pages, 0,
+			     args->count, NFS_readdirres_sz);
 }
 
 /*
- * Encode arguments to readdir call
+ * NFSv2 XDR decode functions
+ *
+ * NFSv2 result types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
  */
-static int
-nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
+
+static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr,
+			     void *__unused)
 {
-	struct rpc_auth *auth = req->rq_cred->cr_auth;
-	unsigned int replen;
-	u32 count = args->count;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
+}
 
-	p = xdr_encode_fhandle(p, args->fh);
-	*p++ = htonl(args->cookie);
-	*p++ = htonl(count); /* see above */
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_fattr *result)
+{
+	return decode_attrstat(xdr, result);
+}
 
-	/* Inline the page array */
-	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readdirres_sz) << 2;
-	xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count);
-	return 0;
+static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_diropok *result)
+{
+	return decode_diropres(xdr, result);
 }
 
 /*
- * Decode the result of a readdir call.
- * We're not really decoding anymore, we just leave the buffer untouched
- * and only check that it is syntactically correct.
- * The real decoding happens in nfs_decode_entry below, called directly
- * from nfs_readdir for each entry.
+ * 2.2.6. readlinkres
+ *
+ *	union readlinkres switch (stat status) {
+ *	case NFS_OK:
+ *		path data;
+ *	default:
+ *		void;
+ *	};
  */
-static int
-nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
+static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req,
+				    struct xdr_stream *xdr, void *__unused)
 {
-	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
-	struct kvec *iov = rcvbuf->head;
-	struct page **page;
-	size_t hdrlen;
-	unsigned int pglen, recvd;
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-
-	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
-	if (iov->iov_len < hdrlen) {
-		dprintk("NFS: READDIR reply header overflowed:"
-				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
-		return -errno_NFSERR_IO;
-	} else if (iov->iov_len != hdrlen) {
-		dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
-		xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
-	}
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_path(xdr);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
+}
 
-	pglen = rcvbuf->page_len;
-	recvd = rcvbuf->len - hdrlen;
-	if (pglen > recvd)
-		pglen = recvd;
-	page = rcvbuf->pages;
-	return pglen;
+/*
+ * 2.2.7. readres
+ *
+ *	union readres switch (stat status) {
+ *	case NFS_OK:
+ *		fattr attributes;
+ *		nfsdata data;
+ *	default:
+ *		void;
+ *	};
+ */
+static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_readres *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_fattr(xdr, result->fattr);
+	if (unlikely(error))
+		goto out;
+	error = decode_nfsdata(xdr, result);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
 }
 
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_writeres *result)
 {
-	dprintk("nfs: %s: prematurely hit end of receive buffer. "
-		"Remaining buffer length is %tu words.\n",
-		func, xdr->end - xdr->p);
+	/* All NFSv2 writes are "file sync" writes */
+	result->verf->committed = NFS_FILE_SYNC;
+	return decode_attrstat(xdr, result->fattr);
 }
 
-__be32 *
-nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
+/**
+ * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in
+ *			the local page cache.
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ *
+ * 2.2.17. entry
+ *
+ *	struct entry {
+ *		unsigned fileid;
+ *		filename name;
+ *		nfscookie cookie;
+ *		entry *nextentry;
+ *	};
+ */
+int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+		       int plus)
 {
 	__be32 *p;
+	int error;
+
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
+	if (unlikely(p == NULL))
 		goto out_overflow;
-	if (!ntohl(*p++)) {
+	if (*p++ == xdr_zero) {
 		p = xdr_inline_decode(xdr, 4);
-		if (unlikely(!p))
+		if (unlikely(p == NULL))
 			goto out_overflow;
-		if (!ntohl(*p++))
-			return ERR_PTR(-EAGAIN);
+		if (*p++ == xdr_zero)
+			return -EAGAIN;
 		entry->eof = 1;
-		return ERR_PTR(-EBADCOOKIE);
+		return -EBADCOOKIE;
 	}
 
-	p = xdr_inline_decode(xdr, 8);
-	if (unlikely(!p))
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
 		goto out_overflow;
+	entry->ino = be32_to_cpup(p);
 
-	entry->ino = ntohl(*p++);
-	entry->len = ntohl(*p++);
+	error = decode_filename_inline(xdr, &entry->name, &entry->len);
+	if (unlikely(error))
+		return error;
 
-	p = xdr_inline_decode(xdr, entry->len + 4);
-	if (unlikely(!p))
+	/*
+	 * The type (size and byte order) of nfscookie isn't defined in
+	 * RFC 1094.  This implementation assumes that it's an XDR uint32.
+	 */
+	entry->prev_cookie = entry->cookie;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
 		goto out_overflow;
-	entry->name = (const char *) p;
-	p += XDR_QUADLEN(entry->len);
-	entry->prev_cookie = entry->cookie;
-	entry->cookie = ntohl(*p++);
+	entry->cookie = be32_to_cpup(p);
 
 	entry->d_type = DT_UNKNOWN;
 
-	p = xdr_inline_peek(xdr, 8);
-	if (p != NULL)
-		entry->eof = !p[0] && p[1];
-	else
-		entry->eof = 0;
-
-	return p;
+	return 0;
 
 out_overflow:
 	print_overflow_msg(__func__, xdr);
-	return ERR_PTR(-EAGAIN);
-}
-
-/*
- * NFS XDR decode functions
- */
-/*
- * Decode simple status reply
- */
-static int
-nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy)
-{
-	int status;
-
-	if ((status = ntohl(*p++)) != 0)
-		status = nfs_stat_to_errno(status);
-	return status;
+	return -EAGAIN;
 }
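
The return contract documented above (0 per decoded entry, -EAGAIN when no further entry follows in the current page, -EBADCOOKIE once the eof flag is set) implies the shape of the caller's loop. A minimal sketch, assuming an xdr_stream already positioned over cached READDIR data; this is not the kernel's actual nfs_readdir() logic, and emit_entry() is a hypothetical consumer:

	/*
	 * Illustration only: walking cached READDIR data using the
	 * nfs2_decode_dirent() return contract.  emit_entry() is
	 * hypothetical; the real consumer is nfs_readdir().
	 */
	static int walk_cached_entries(struct xdr_stream *xdr,
				       struct nfs_entry *entry)
	{
		for (;;) {
			int error = nfs2_decode_dirent(xdr, entry, 0);

			if (error == -EAGAIN)
				break;		/* no more entries in this page */
			if (error == -EBADCOOKIE && entry->eof)
				break;		/* true end of directory */
			if (error)
				return error;
			emit_entry(entry);	/* hypothetical consumer */
		}
		return 0;
	}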
 
 /*
- * Decode attrstat reply
- * GETATTR, SETATTR, WRITE
+ * 2.2.17. readdirres
+ *
+ *	union readdirres switch (stat status) {
+ *	case NFS_OK:
+ *		struct {
+ *			entry *entries;
+ *			bool eof;
+ *		} readdirok;
+ *	default:
+ *		void;
+ *	};
+ *
+ * Read the directory contents into the page cache, but don't
+ * touch them.  The actual decoding is done by nfs2_decode_dirent()
+ * during subsequent nfs_readdir() calls.
  */
-static int
-nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
+static int decode_readdirok(struct xdr_stream *xdr)
 {
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-	xdr_decode_fattr(p, fattr);
-	return 0;
-}
-
-/*
- * Decode diropres reply
- * LOOKUP, CREATE, MKDIR
- */
-static int
-nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
-{
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-	p = xdr_decode_fhandle(p, res->fh);
-	xdr_decode_fattr(p, res->fattr);
-	return 0;
+	u32 recvd, pglen;
+	size_t hdrlen;
+
+	pglen = xdr->buf->page_len;
+	hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+	recvd = xdr->buf->len - hdrlen;
+	if (unlikely(pglen > recvd))
+		goto out_cheating;
+out:
+	xdr_read_pages(xdr, pglen);
+	return pglen;
+out_cheating:
+	dprintk("NFS: server cheating in readdir result: "
+		"pglen %u > recvd %u\n", pglen, recvd);
+	pglen = recvd;
+	goto out;
 }
 
-/*
- * Encode READLINK args
- */
-static int
-nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
+static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req,
+				   struct xdr_stream *xdr, void *__unused)
 {
-	struct rpc_auth *auth = req->rq_cred->cr_auth;
-	unsigned int replen;
-
-	p = xdr_encode_fhandle(p, args->fh);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-
-	/* Inline the page array */
-	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readlinkres_sz) << 2;
-	xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen);
-	return 0;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_readdirok(xdr);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
 }
 
 /*
- * Decode READLINK reply
+ * 2.2.18. statfsres
+ *
+ *	union statfsres (stat status) {
+ *	case NFS_OK:
+ *		struct {
+ *			unsigned tsize;
+ *			unsigned bsize;
+ *			unsigned blocks;
+ *			unsigned bfree;
+ *			unsigned bavail;
+ *		} info;
+ *	default:
+ *		void;
+ *	};
  */
-static int
-nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
+static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)
 {
-	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
-	struct kvec *iov = rcvbuf->head;
-	size_t hdrlen;
-	u32 len, recvd;
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-	/* Convert length of symlink */
-	len = ntohl(*p++);
-	if (len >= rcvbuf->page_len) {
-		dprintk("nfs: server returned giant symlink!\n");
-		return -ENAMETOOLONG;
-	}
-	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
-	if (iov->iov_len < hdrlen) {
-		dprintk("NFS: READLINK reply header overflowed:"
-				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
-		return -errno_NFSERR_IO;
-	} else if (iov->iov_len != hdrlen) {
-		dprintk("NFS: READLINK header is short. iovec will be shifted.\n");
-		xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
-	}
-	recvd = req->rq_rcv_buf.len - hdrlen;
-	if (recvd < len) {
-		dprintk("NFS: server cheating in readlink reply: "
-				"count %u > recvd %u\n", len, recvd);
-		return -EIO;
-	}
+	__be32 *p;
 
-	xdr_terminate_string(rcvbuf, len);
+	p = xdr_inline_decode(xdr, NFS_info_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	result->tsize = be32_to_cpup(p++);
+	result->bsize = be32_to_cpup(p++);
+	result->blocks = be32_to_cpup(p++);
+	result->bfree = be32_to_cpup(p++);
+	result->bavail = be32_to_cpup(p);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
-/*
- * Decode WRITE reply
- */
-static int
-nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
+static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  struct nfs2_fsstat *result)
 {
-	res->verf->committed = NFS_FILE_SYNC;
-	return nfs_xdr_attrstat(req, p, res->fattr);
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_info(xdr, result);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
 }
 
-/*
- * Decode STATFS reply
- */
-static int
-nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res)
-{
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-
-	res->tsize = ntohl(*p++);
-	res->bsize = ntohl(*p++);
-	res->blocks = ntohl(*p++);
-	res->bfree = ntohl(*p++);
-	res->bavail = ntohl(*p++);
-	return 0;
-}
-
 
 /*
  * We need to translate between nfs status return values and
  * the local errno values which may not be the same.
  */
-static struct {
+static const struct {
 	int stat;
 	int errno;
 } nfs_errtbl[] = {
@@ -678,28 +1102,30 @@ static struct {
 	{ -1,			-EIO		}
 };
 
-/*
- * Convert an NFS error code to a local one.
- * This one is used jointly by NFSv2 and NFSv3.
+/**
+ * nfs_stat_to_errno - convert an NFS status code to a local errno
+ * @status: NFS status code to convert
+ *
+ * Returns a local errno value, or -EIO if the NFS status code is
+ * not recognized.  This function is used jointly by NFSv2 and NFSv3.
  */
-int
-nfs_stat_to_errno(int stat)
+int nfs_stat_to_errno(enum nfs_stat status)
 {
 	int i;
 
 	for (i = 0; nfs_errtbl[i].stat != -1; i++) {
-		if (nfs_errtbl[i].stat == stat)
+		if (nfs_errtbl[i].stat == (int)status)
 			return nfs_errtbl[i].errno;
 	}
-	dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat);
+	dprintk("NFS: Unrecognized nfs status value: %u\n", status);
 	return nfs_errtbl[i].errno;
 }
 
 #define PROC(proc, argtype, restype, timer)				\
 [NFSPROC_##proc] = {							\
 	.p_proc = NFSPROC_##proc,					\
-	.p_encode = (kxdrproc_t) nfs_xdr_##argtype,			\
-	.p_decode = (kxdrproc_t) nfs_xdr_##restype,			\
+	.p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype,		\
+	.p_decode = (kxdrdproc_t)nfs2_xdr_dec_##restype,		\
 	.p_arglen = NFS_##argtype##_sz,					\
 	.p_replen = NFS_##restype##_sz,					\
 	.p_timer = timer,						\
@@ -707,21 +1133,21 @@ nfs_stat_to_errno(int stat)
 	.p_name = #proc,						\
 	}
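
Expanded by hand, a single table entry shows how the naming convention ties the encoders, decoders, and size macros together. PROC(GETATTR, fhandle, attrstat, 1) becomes roughly the following (one field between .p_timer and .p_name is elided by the hunk header above, so it is omitted here too):

	/* Illustration: manual expansion of PROC(GETATTR, fhandle, attrstat, 1) */
	[NFSPROC_GETATTR] = {
		.p_proc   = NFSPROC_GETATTR,
		.p_encode = (kxdreproc_t)nfs2_xdr_enc_fhandle,
		.p_decode = (kxdrdproc_t)nfs2_xdr_dec_attrstat,
		.p_arglen = NFS_fhandle_sz,
		.p_replen = NFS_attrstat_sz,
		.p_timer  = 1,
		/* one field elided here by the hunk header above */
		.p_name   = "GETATTR",
	},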
 struct rpc_procinfo	nfs_procedures[] = {
 	PROC(GETATTR,	fhandle,	attrstat, 1),
 	PROC(SETATTR,	sattrargs,	attrstat, 0),
 	PROC(LOOKUP,	diropargs,	diropres, 2),
 	PROC(READLINK,	readlinkargs,	readlinkres, 3),
 	PROC(READ,	readargs,	readres, 3),
 	PROC(WRITE,	writeargs,	writeres, 4),
 	PROC(CREATE,	createargs,	diropres, 0),
 	PROC(REMOVE,	removeargs,	stat, 0),
 	PROC(RENAME,	renameargs,	stat, 0),
 	PROC(LINK,	linkargs,	stat, 0),
 	PROC(SYMLINK,	symlinkargs,	stat, 0),
 	PROC(MKDIR,	createargs,	diropres, 0),
 	PROC(RMDIR,	diropargs,	stat, 0),
 	PROC(READDIR,	readdirargs,	readdirres, 3),
 	PROC(STATFS,	fhandle,	statfsres, 0),
 };
 
 struct rpc_version	nfs_version2 = {
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index f6cc60f06dac..01c5e8b1941d 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -37,18 +37,16 @@
 #define NFS3_filename_sz	(1+(NFS3_MAXNAMLEN>>2))
 #define NFS3_path_sz		(1+(NFS3_MAXPATHLEN>>2))
 #define NFS3_fattr_sz		(21)
+#define NFS3_cookieverf_sz	(NFS3_COOKIEVERFSIZE>>2)
 #define NFS3_wcc_attr_sz	(6)
 #define NFS3_pre_op_attr_sz	(1+NFS3_wcc_attr_sz)
 #define NFS3_post_op_attr_sz	(1+NFS3_fattr_sz)
 #define NFS3_wcc_data_sz	(NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz)
-#define NFS3_fsstat_sz
-#define NFS3_fsinfo_sz
-#define NFS3_pathconf_sz
-#define NFS3_entry_sz		(NFS3_filename_sz+3)
-
-#define NFS3_sattrargs_sz	(NFS3_fh_sz+NFS3_sattr_sz+3)
 #define NFS3_diropargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
-#define NFS3_removeargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
+
+#define NFS3_getattrargs_sz	(NFS3_fh_sz)
+#define NFS3_setattrargs_sz	(NFS3_fh_sz+NFS3_sattr_sz+3)
+#define NFS3_lookupargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
 #define NFS3_accessargs_sz	(NFS3_fh_sz+1)
 #define NFS3_readlinkargs_sz	(NFS3_fh_sz)
 #define NFS3_readargs_sz	(NFS3_fh_sz+3)
@@ -57,14 +55,16 @@
 #define NFS3_mkdirargs_sz	(NFS3_diropargs_sz+NFS3_sattr_sz)
 #define NFS3_symlinkargs_sz	(NFS3_diropargs_sz+1+NFS3_sattr_sz)
 #define NFS3_mknodargs_sz	(NFS3_diropargs_sz+2+NFS3_sattr_sz)
+#define NFS3_removeargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
 #define NFS3_renameargs_sz	(NFS3_diropargs_sz+NFS3_diropargs_sz)
 #define NFS3_linkargs_sz	(NFS3_fh_sz+NFS3_diropargs_sz)
-#define NFS3_readdirargs_sz	(NFS3_fh_sz+2)
+#define NFS3_readdirargs_sz	(NFS3_fh_sz+NFS3_cookieverf_sz+3)
+#define NFS3_readdirplusargs_sz	(NFS3_fh_sz+NFS3_cookieverf_sz+4)
 #define NFS3_commitargs_sz	(NFS3_fh_sz+3)
 
-#define NFS3_attrstat_sz	(1+NFS3_fattr_sz)
-#define NFS3_wccstat_sz		(1+NFS3_wcc_data_sz)
-#define NFS3_removeres_sz	(NFS3_wccstat_sz)
+#define NFS3_getattrres_sz	(1+NFS3_fattr_sz)
+#define NFS3_setattrres_sz	(1+NFS3_wcc_data_sz)
+#define NFS3_removeres_sz	(NFS3_setattrres_sz)
 #define NFS3_lookupres_sz	(1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
 #define NFS3_accessres_sz	(1+NFS3_post_op_attr_sz+1)
 #define NFS3_readlinkres_sz	(1+NFS3_post_op_attr_sz+1)
@@ -100,1079 +100,2362 @@ static const umode_t nfs_type2fmt[] = {
 	[NF3FIFO] = S_IFIFO,
 };
 
+/*
+ * While encoding arguments, set up the reply buffer in advance to
+ * receive reply data directly into the page cache.
+ */
+static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
+				 unsigned int base, unsigned int len,
+				 unsigned int bufsize)
+{
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
+	unsigned int replen;
+
+	replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
+	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
+}
+
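Every *_sz constant in these files counts 32-bit XDR words rather than bytes, which is why prepare_reply_buffer() shifts the sum left by two before handing it to xdr_inline_pages(). A minimal sketch of the arithmetic, kept symbolic since the numeric values of the constants are not shown in this hunk:

	/*
	 * Illustration only: deriving the byte offset at which reply page
	 * data begins.  All *_sz macros count 4-byte XDR words.
	 */
	static unsigned int reply_data_offset(unsigned int rephdr_words,
					      unsigned int rslack_words,
					      unsigned int bufsize_words)
	{
		unsigned int words = rephdr_words + rslack_words + bufsize_words;

		return words << 2;	/* one XDR word == 4 bytes */
	}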
+/*
+ * Handle decode buffer overflows out-of-line.
+ */
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
 {
-	dprintk("nfs: %s: prematurely hit end of receive buffer. "
+	dprintk("NFS: %s prematurely hit the end of our receive buffer. "
 		"Remaining buffer length is %tu words.\n",
 		func, xdr->end - xdr->p);
 }
 
+
 /*
- * Common NFS XDR functions as inlines
+ * Encode/decode NFSv3 basic data types
+ *
+ * Basic NFSv3 data types are defined in section 2.5 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
  */
-static inline __be32 *
-xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fh)
+
+static void encode_uint32(struct xdr_stream *xdr, u32 value)
 {
-	return xdr_encode_array(p, fh->data, fh->size);
+	__be32 *p = xdr_reserve_space(xdr, 4);
+	*p = cpu_to_be32(value);
 }
 
-static inline __be32 *
-xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
+static int decode_uint32(struct xdr_stream *xdr, u32 *value)
 {
-	if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) {
-		memcpy(fh->data, p, fh->size);
-		return p + XDR_QUADLEN(fh->size);
-	}
-	return NULL;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*value = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_uint64(struct xdr_stream *xdr, u64 *value)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	xdr_decode_hyper(p, value);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
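
decode_uint64() leans on xdr_decode_hyper() to reassemble an XDR "hyper": per the XDR standard (RFC 4506), a 64-bit value travels as two big-endian 32-bit words, most significant word first. A sketch of the equivalent open-coded logic, for illustration only (the real helper lives in the sunrpc XDR headers):

	/* Sketch: what xdr_decode_hyper() does for an XDR hyper. */
	static inline u64 decode_hyper_sketch(const __be32 *p)
	{
		return ((u64)be32_to_cpup(p) << 32) | be32_to_cpup(p + 1);
	}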
 
+/*
+ * fileid3
+ *
+ *	typedef uint64 fileid3;
+ */
+static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid)
+{
+	return xdr_decode_hyper(p, fileid);
+}
+
+static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid)
+{
+	return decode_uint64(xdr, fileid);
+}
+
+/*
+ * filename3
+ *
+ *	typedef string filename3<>;
+ */
+static void encode_filename3(struct xdr_stream *xdr,
+			     const char *name, u32 length)
+{
+	__be32 *p;
+
+	BUG_ON(length > NFS3_MAXNAMLEN);
+	p = xdr_reserve_space(xdr, 4 + length);
+	xdr_encode_opaque(p, name, length);
+}
 
-static inline __be32 *
-xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh)
+static int decode_inline_filename3(struct xdr_stream *xdr,
+				   const char **name, u32 *length)
 {
 	__be32 *p;
+	u32 count;
+
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p);
+	if (count > NFS3_MAXNAMLEN)
+		goto out_nametoolong;
+	p = xdr_inline_decode(xdr, count);
+	if (unlikely(p == NULL))
 		goto out_overflow;
-	fh->size = ntohl(*p++);
+	*name = (const char *)p;
+	*length = count;
+	return 0;
 
-	if (fh->size <= NFS3_FHSIZE) {
-		p = xdr_inline_decode(xdr, fh->size);
-		if (unlikely(!p))
-			goto out_overflow;
-		memcpy(fh->data, p, fh->size);
-		return p + XDR_QUADLEN(fh->size);
-	}
-	return NULL;
+out_nametoolong:
+	dprintk("NFS: returned filename too long: %u\n", count);
+	return -ENAMETOOLONG;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * nfspath3
+ *
+ *	typedef string nfspath3<>;
+ */
+static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages,
+			    const u32 length)
+{
+	BUG_ON(length > NFS3_MAXPATHLEN);
+	encode_uint32(xdr, length);
+	xdr_write_pages(xdr, pages, 0, length);
+}
 
+static int decode_nfspath3(struct xdr_stream *xdr)
+{
+	u32 recvd, count;
+	size_t hdrlen;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p);
+	if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN))
+		goto out_nametoolong;
+	hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+	recvd = xdr->buf->len - hdrlen;
+	if (unlikely(count > recvd))
+		goto out_cheating;
+
+	xdr_read_pages(xdr, count);
+	xdr_terminate_string(xdr->buf, count);
+	return 0;
+
+out_nametoolong:
+	dprintk("NFS: returned pathname too long: %u\n", count);
+	return -ENAMETOOLONG;
+out_cheating:
+	dprintk("NFS: server cheating in pathname result: "
+		"count %u > recvd %u\n", count, recvd);
+	return -EIO;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
-	return ERR_PTR(-EIO);
+	return -EIO;
 }
 
 /*
- * Encode/decode time.
+ * cookie3
+ *
+ *	typedef uint64 cookie3
  */
-static inline __be32 *
-xdr_encode_time3(__be32 *p, struct timespec *timep)
+static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie)
 {
-	*p++ = htonl(timep->tv_sec);
-	*p++ = htonl(timep->tv_nsec);
-	return p;
+	return xdr_encode_hyper(p, cookie);
 }
 
-static inline __be32 *
-xdr_decode_time3(__be32 *p, struct timespec *timep)
+static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie)
 {
-	timep->tv_sec = ntohl(*p++);
-	timep->tv_nsec = ntohl(*p++);
-	return p;
+	return decode_uint64(xdr, cookie);
+}
+
+/*
+ * cookieverf3
+ *
+ *	typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE];
+ */
+static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier)
+{
+	memcpy(p, verifier, NFS3_COOKIEVERFSIZE);
+	return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE);
+}
+
+static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	memcpy(verifier, p, NFS3_COOKIEVERFSIZE);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * createverf3
+ *
+ *	typedef opaque createverf3[NFS3_CREATEVERFSIZE];
+ */
+static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE);
+	memcpy(p, verifier, NFS3_CREATEVERFSIZE);
+}
+
+static int decode_writeverf3(struct xdr_stream *xdr, __be32 *verifier)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	memcpy(verifier, p, NFS3_WRITEVERFSIZE);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * size3
+ *
+ *	typedef uint64 size3;
+ */
+static __be32 *xdr_decode_size3(__be32 *p, u64 *size)
+{
+	return xdr_decode_hyper(p, size);
+}
+
+/*
+ * nfsstat3
+ *
+ *	enum nfsstat3 {
+ *		NFS3_OK = 0,
+ *		...
+ *	}
+ */
+#define NFS3_OK		NFS_OK
+
+static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*status = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * ftype3
+ *
+ *	enum ftype3 {
+ *		NF3REG	= 1,
+ *		NF3DIR	= 2,
+ *		NF3BLK	= 3,
+ *		NF3CHR	= 4,
+ *		NF3LNK	= 5,
+ *		NF3SOCK	= 6,
+ *		NF3FIFO	= 7
+ *	};
+ */
+static void encode_ftype3(struct xdr_stream *xdr, const u32 type)
+{
+	BUG_ON(type > NF3FIFO);
+	encode_uint32(xdr, type);
 }
 
-static __be32 *
-xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
-{
-	unsigned int	type, major, minor;
-	umode_t fmode;
-
-	type = ntohl(*p++);
-	if (type > NF3FIFO)
-		type = NF3NON;
-	fmode = nfs_type2fmt[type];
-	fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode;
-	fattr->nlink = ntohl(*p++);
-	fattr->uid = ntohl(*p++);
-	fattr->gid = ntohl(*p++);
-	p = xdr_decode_hyper(p, &fattr->size);
-	p = xdr_decode_hyper(p, &fattr->du.nfs3.used);
-
-	/* Turn remote device info into Linux-specific dev_t */
-	major = ntohl(*p++);
-	minor = ntohl(*p++);
-	fattr->rdev = MKDEV(major, minor);
-	if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor)
-		fattr->rdev = 0;
-
-	p = xdr_decode_hyper(p, &fattr->fsid.major);
-	fattr->fsid.minor = 0;
-	p = xdr_decode_hyper(p, &fattr->fileid);
-	p = xdr_decode_time3(p, &fattr->atime);
-	p = xdr_decode_time3(p, &fattr->mtime);
-	p = xdr_decode_time3(p, &fattr->ctime);
-
-	/* Update the mode bits */
-	fattr->valid |= NFS_ATTR_FATTR_V3;
+static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode)
+{
+	u32 type;
+
+	type = be32_to_cpup(p++);
+	if (type > NF3FIFO)
+		type = NF3NON;
+	*mode = nfs_type2fmt[type];
+	return p;
+}
+
+/*
+ * specdata3
+ *
+ *	struct specdata3 {
+ *		uint32	specdata1;
+ *		uint32	specdata2;
+ *	};
+ */
+static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 8);
+	*p++ = cpu_to_be32(MAJOR(rdev));
+	*p = cpu_to_be32(MINOR(rdev));
+}
+
+static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev)
+{
+	unsigned int major, minor;
+
+	major = be32_to_cpup(p++);
+	minor = be32_to_cpup(p++);
+	*rdev = MKDEV(major, minor);
+	if (MAJOR(*rdev) != major || MINOR(*rdev) != minor)
+		*rdev = 0;
+	return p;
+}
+
+/*
+ * nfs_fh3
+ *
+ *	struct nfs_fh3 {
+ *		opaque	data<NFS3_FHSIZE>;
+ *	};
+ */
+static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+	__be32 *p;
+
+	BUG_ON(fh->size > NFS3_FHSIZE);
+	p = xdr_reserve_space(xdr, 4 + fh->size);
+	xdr_encode_opaque(p, fh->data, fh->size);
+}
+
+static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	u32 length;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	length = be32_to_cpup(p++);
+	if (unlikely(length > NFS3_FHSIZE))
+		goto out_toobig;
+	p = xdr_inline_decode(xdr, length);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	fh->size = length;
+	memcpy(fh->data, p, length);
+	return 0;
+out_toobig:
+	dprintk("NFS: file handle size (%u) too big\n", length);
+	return -E2BIG;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static void zero_nfs_fh3(struct nfs_fh *fh)
+{
+	memset(fh, 0, sizeof(*fh));
+}
+
+/*
+ * nfstime3
+ *
+ *	struct nfstime3 {
+ *		uint32	seconds;
+ *		uint32	nseconds;
+ *	};
+ */
+static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep)
+{
+	*p++ = cpu_to_be32(timep->tv_sec);
+	*p++ = cpu_to_be32(timep->tv_nsec);
 	return p;
 }
 
-static inline __be32 *
-xdr_encode_sattr(__be32 *p, struct iattr *attr)
+static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep)
 {
+	timep->tv_sec = be32_to_cpup(p++);
+	timep->tv_nsec = be32_to_cpup(p++);
+	return p;
+}
+
+/*
+ * sattr3
+ *
+ *	enum time_how {
+ *		DONT_CHANGE		= 0,
+ *		SET_TO_SERVER_TIME	= 1,
+ *		SET_TO_CLIENT_TIME	= 2
+ *	};
+ *
+ *	union set_mode3 switch (bool set_it) {
+ *	case TRUE:
+ *		mode3	mode;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	union set_uid3 switch (bool set_it) {
+ *	case TRUE:
+ *		uid3	uid;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	union set_gid3 switch (bool set_it) {
+ *	case TRUE:
+ *		gid3	gid;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	union set_size3 switch (bool set_it) {
+ *	case TRUE:
+ *		size3	size;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	union set_atime switch (time_how set_it) {
+ *	case SET_TO_CLIENT_TIME:
+ *		nfstime3	atime;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	union set_mtime switch (time_how set_it) {
+ *	case SET_TO_CLIENT_TIME:
+ *		nfstime3	mtime;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	struct sattr3 {
+ *		set_mode3	mode;
+ *		set_uid3	uid;
+ *		set_gid3	gid;
+ *		set_size3	size;
+ *		set_atime	atime;
+ *		set_mtime	mtime;
+ *	};
+ */
+static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr)
+{
+	u32 nbytes;
+	__be32 *p;
+
+	/*
+	 * In order to make only a single xdr_reserve_space() call,
+	 * pre-compute the total number of bytes to be reserved.
+	 * Six boolean values, one for each set_foo field, are always
+	 * present in the encoded result, so start there.
+	 */
+	nbytes = 6 * 4;
+	if (attr->ia_valid & ATTR_MODE)
+		nbytes += 4;
+	if (attr->ia_valid & ATTR_UID)
+		nbytes += 4;
+	if (attr->ia_valid & ATTR_GID)
+		nbytes += 4;
+	if (attr->ia_valid & ATTR_SIZE)
+		nbytes += 8;
+	if (attr->ia_valid & ATTR_ATIME_SET)
+		nbytes += 8;
+	if (attr->ia_valid & ATTR_MTIME_SET)
+		nbytes += 8;
+	p = xdr_reserve_space(xdr, nbytes);
+
 	if (attr->ia_valid & ATTR_MODE) {
 		*p++ = xdr_one;
-		*p++ = htonl(attr->ia_mode & S_IALLUGO);
-	} else {
+		*p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO);
+	} else
 		*p++ = xdr_zero;
-	}
+
 	if (attr->ia_valid & ATTR_UID) {
 		*p++ = xdr_one;
-		*p++ = htonl(attr->ia_uid);
-	} else {
+		*p++ = cpu_to_be32(attr->ia_uid);
+	} else
 		*p++ = xdr_zero;
-	}
+
 	if (attr->ia_valid & ATTR_GID) {
 		*p++ = xdr_one;
-		*p++ = htonl(attr->ia_gid);
-	} else {
+		*p++ = cpu_to_be32(attr->ia_gid);
+	} else
 		*p++ = xdr_zero;
-	}
+
 	if (attr->ia_valid & ATTR_SIZE) {
 		*p++ = xdr_one;
-		p = xdr_encode_hyper(p, (__u64) attr->ia_size);
-	} else {
+		p = xdr_encode_hyper(p, (u64)attr->ia_size);
+	} else
 		*p++ = xdr_zero;
-	}
+
 	if (attr->ia_valid & ATTR_ATIME_SET) {
 		*p++ = xdr_two;
-		p = xdr_encode_time3(p, &attr->ia_atime);
+		p = xdr_encode_nfstime3(p, &attr->ia_atime);
 	} else if (attr->ia_valid & ATTR_ATIME) {
 		*p++ = xdr_one;
-	} else {
+	} else
 		*p++ = xdr_zero;
-	}
+
 	if (attr->ia_valid & ATTR_MTIME_SET) {
 		*p++ = xdr_two;
-		p = xdr_encode_time3(p, &attr->ia_mtime);
+		xdr_encode_nfstime3(p, &attr->ia_mtime);
 	} else if (attr->ia_valid & ATTR_MTIME) {
-		*p++ = xdr_one;
-	} else {
-		*p++ = xdr_zero;
-	}
+		*p = xdr_one;
+	} else
+		*p = xdr_zero;
 }
-	return p;
-}
+
+/*
+ * fattr3
+ *
+ *	struct fattr3 {
+ *		ftype3		type;
+ *		mode3		mode;
+ *		uint32		nlink;
+ *		uid3		uid;
+ *		gid3		gid;
+ *		size3		size;
+ *		size3		used;
+ *		specdata3	rdev;
+ *		uint64		fsid;
+ *		fileid3		fileid;
+ *		nfstime3	atime;
+ *		nfstime3	mtime;
+ *		nfstime3	ctime;
+ *	};
+ */
+static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	umode_t fmode;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+
+	p = xdr_decode_ftype3(p, &fmode);
+
+	fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode;
+	fattr->nlink = be32_to_cpup(p++);
+	fattr->uid = be32_to_cpup(p++);
+	fattr->gid = be32_to_cpup(p++);
+
+	p = xdr_decode_size3(p, &fattr->size);
+	p = xdr_decode_size3(p, &fattr->du.nfs3.used);
+	p = xdr_decode_specdata3(p, &fattr->rdev);
+
+	p = xdr_decode_hyper(p, &fattr->fsid.major);
+	fattr->fsid.minor = 0;
+
+	p = xdr_decode_fileid3(p, &fattr->fileid);
+	p = xdr_decode_nfstime3(p, &fattr->atime);
+	p = xdr_decode_nfstime3(p, &fattr->mtime);
+	xdr_decode_nfstime3(p, &fattr->ctime);
+
+	fattr->valid |= NFS_ATTR_FATTR_V3;
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
 
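To make the pre-computation in encode_sattr3() above concrete: for an iattr that sets only the mode and the size, the reservation works out to 6 * 4 + 4 + 8 = 36 bytes. A worked sketch of that arithmetic, for illustration only:

	/* Worked example (illustration): ATTR_MODE | ATTR_SIZE. */
	u32 nbytes = 6 * 4;	/* six set_foo discriminator words = 24 */
	nbytes += 4;		/* ATTR_MODE: one extra mode3 word  = 28 */
	nbytes += 8;		/* ATTR_SIZE: one extra size3 hyper = 36 */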
-static inline __be32 *
-xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
+/*
+ * post_op_attr
+ *
+ *	union post_op_attr switch (bool attributes_follow) {
+ *	case TRUE:
+ *		fattr3	attributes;
+ *	case FALSE:
+ *		void;
+ *	};
+ */
+static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
-	p = xdr_decode_hyper(p, &fattr->pre_size);
-	p = xdr_decode_time3(p, &fattr->pre_mtime);
-	p = xdr_decode_time3(p, &fattr->pre_ctime);
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	if (*p != xdr_zero)
+		return decode_fattr3(xdr, fattr);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * wcc_attr
+ *	struct wcc_attr {
+ *		size3		size;
+ *		nfstime3	mtime;
+ *		nfstime3	ctime;
+ *	};
+ */
+static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+
 	fattr->valid |= NFS_ATTR_FATTR_PRESIZE
-			| NFS_ATTR_FATTR_PREMTIME
-			| NFS_ATTR_FATTR_PRECTIME;
-	return p;
-}
+		| NFS_ATTR_FATTR_PREMTIME
+		| NFS_ATTR_FATTR_PRECTIME;
 
-static inline __be32 *
-xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
-{
-	if (*p++)
-		p = xdr_decode_fattr(p, fattr);
-	return p;
+	p = xdr_decode_size3(p, &fattr->pre_size);
+	p = xdr_decode_nfstime3(p, &fattr->pre_mtime);
+	xdr_decode_nfstime3(p, &fattr->pre_ctime);
+
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
-static inline __be32 *
-xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+/*
+ * pre_op_attr
+ *	union pre_op_attr switch (bool attributes_follow) {
+ *	case TRUE:
+ *		wcc_attr	attributes;
+ *	case FALSE:
+ *		void;
+ *	};
+ *
+ * wcc_data
+ *
+ *	struct wcc_data {
+ *		pre_op_attr	before;
+ *		post_op_attr	after;
+ *	};
+ */
+static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
+	if (unlikely(p == NULL))
 		goto out_overflow;
-	if (ntohl(*p++)) {
-		p = xdr_inline_decode(xdr, 84);
-		if (unlikely(!p))
-			goto out_overflow;
-		p = xdr_decode_fattr(p, fattr);
-	}
-	return p;
+	if (*p != xdr_zero)
+		return decode_wcc_attr(xdr, fattr);
+	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
-	return ERR_PTR(-EIO);
+	return -EIO;
 }
 
-static inline __be32 *
-xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
+static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
-	if (*p++)
-		return xdr_decode_wcc_attr(p, fattr);
-	return p;
+	int error;
+
+	error = decode_pre_op_attr(xdr, fattr);
+	if (unlikely(error))
+		goto out;
+	error = decode_post_op_attr(xdr, fattr);
+out:
+	return error;
 }
 
+/*
+ * post_op_fh3
+ *
+ *	union post_op_fh3 switch (bool handle_follows) {
+ *	case TRUE:
+ *		nfs_fh3	handle;
+ *	case FALSE:
+ *		void;
+ *	};
+ */
+static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	if (*p != xdr_zero)
+		return decode_nfs_fh3(xdr, fh);
+	zero_nfs_fh3(fh);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
 
-static inline __be32 *
-xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr)
+/*
+ * diropargs3
+ *
+ *	struct diropargs3 {
+ *		nfs_fh3		dir;
+ *		filename3	name;
+ *	};
+ */
+static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh,
+			      const char *name, u32 length)
 {
-	p = xdr_decode_pre_op_attr(p, fattr);
-	return xdr_decode_post_op_attr(p, fattr);
+	encode_nfs_fh3(xdr, fh);
+	encode_filename3(xdr, name, length);
 }
 
+
 /*
- * NFS encode functions
+ * NFSv3 XDR encode functions
+ *
+ * NFSv3 argument types are defined in section 3.3 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
  */
 
 /*
- * Encode file handle argument
+ * 3.3.1  GETATTR3args
+ *
+ *	struct GETATTR3args {
+ *		nfs_fh3	object;
+ *	};
  */
-static int
-nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
+static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs_fh *fh)
 {
-	p = xdr_encode_fhandle(p, fh);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_nfs_fh3(xdr, fh);
 }
 
 /*
- * Encode SETATTR arguments
+ * 3.3.2  SETATTR3args
+ *
+ *	union sattrguard3 switch (bool check) {
+ *	case TRUE:
+ *		nfstime3	obj_ctime;
+ *	case FALSE:
+ *		void;
+ *	};
+ *
+ *	struct SETATTR3args {
+ *		nfs_fh3		object;
+ *		sattr3		new_attributes;
+ *		sattrguard3	guard;
+ *	};
  */
-static int
-nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args)
+static void encode_sattrguard3(struct xdr_stream *xdr,
+			       const struct nfs3_sattrargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_sattr(p, args->sattr);
-	*p++ = htonl(args->guard);
-	if (args->guard)
-		p = xdr_encode_time3(p, &args->guardtime);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	__be32 *p;
+
+	if (args->guard) {
+		p = xdr_reserve_space(xdr, 4 + 8);
+		*p++ = xdr_one;
+		xdr_encode_nfstime3(p, &args->guardtime);
+	} else {
+		p = xdr_reserve_space(xdr, 4);
+		*p = xdr_zero;
+	}
+}
+
+static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs3_sattrargs *args)
+{
+	encode_nfs_fh3(xdr, args->fh);
+	encode_sattr3(xdr, args->sattr);
+	encode_sattrguard3(xdr, args);
 }
 
 /*
- * Encode directory ops argument
+ * 3.3.3  LOOKUP3args
+ *
+ *	struct LOOKUP3args {
+ *		diropargs3  what;
+ *	};
  */
-static int
-nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args)
+static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_diropargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
 }
 
 /*
- * Encode REMOVE argument
+ * 3.3.4  ACCESS3args
+ *
+ *	struct ACCESS3args {
+ *		nfs_fh3	object;
+ *		uint32	access;
+ *	};
  */
-static int
-nfs3_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
+static void encode_access3args(struct xdr_stream *xdr,
+			       const struct nfs3_accessargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name.name, args->name.len);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_nfs_fh3(xdr, args->fh);
+	encode_uint32(xdr, args->access);
+}
+
+static void nfs3_xdr_enc_access3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_accessargs *args)
+{
+	encode_access3args(xdr, args);
 }
 
 /*
- * Encode access() argument
+ * 3.3.5  READLINK3args
+ *
+ *	struct READLINK3args {
+ *		nfs_fh3	symlink;
+ *	};
  */
-static int
-nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args)
+static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
+				       struct xdr_stream *xdr,
+				       const struct nfs3_readlinkargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	*p++ = htonl(args->access);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_nfs_fh3(xdr, args->fh);
+	prepare_reply_buffer(req, args->pages, args->pgbase,
+			     args->pglen, NFS3_readlinkres_sz);
 }
 
 /*
- * Arguments to a READ call. Since we read data directly into the page
- * cache, we also set up the reply iovec here so that iov[1] points
- * exactly to the page we want to fetch.
+ * 3.3.6  READ3args
+ *
+ *	struct READ3args {
+ *		nfs_fh3	file;
+ *		offset3	offset;
+ *		count3	count;
+ *	};
  */
-static int
-nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
+static void encode_read3args(struct xdr_stream *xdr,
+			     const struct nfs_readargs *args)
 {
-	struct rpc_auth	*auth = req->rq_cred->cr_auth;
-	unsigned int replen;
-	u32 count = args->count;
+	__be32 *p;
+
+	encode_nfs_fh3(xdr, args->fh);
 
-	p = xdr_encode_fhandle(p, args->fh);
+	p = xdr_reserve_space(xdr, 8 + 4);
 	p = xdr_encode_hyper(p, args->offset);
-	*p++ = htonl(count);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+	*p = cpu_to_be32(args->count);
+}
 
-	/* Inline the page array */
-	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2;
-	xdr_inline_pages(&req->rq_rcv_buf, replen,
-			 args->pages, args->pgbase, count);
+static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfs_readargs *args)
+{
+	encode_read3args(xdr, args);
+	prepare_reply_buffer(req, args->pages, args->pgbase,
+			     args->count, NFS3_readres_sz);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
-	return 0;
 }
 
 /*
- * Write arguments. Splice the buffer to be written into the iovec.
+ * 3.3.7  WRITE3args
+ *
+ *	enum stable_how {
+ *		UNSTABLE  = 0,
+ *		DATA_SYNC = 1,
+ *		FILE_SYNC = 2
+ *	};
+ *
+ *	struct WRITE3args {
+ *		nfs_fh3		file;
+ *		offset3		offset;
+ *		count3		count;
+ *		stable_how	stable;
+ *		opaque		data<>;
+ *	};
  */
-static int
-nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void encode_write3args(struct xdr_stream *xdr,
+			      const struct nfs_writeargs *args)
 {
-	struct xdr_buf *sndbuf = &req->rq_snd_buf;
-	u32 count = args->count;
+	__be32 *p;
+
+	encode_nfs_fh3(xdr, args->fh);
 
-	p = xdr_encode_fhandle(p, args->fh);
+	p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4);
 	p = xdr_encode_hyper(p, args->offset);
-	*p++ = htonl(count);
-	*p++ = htonl(args->stable);
-	*p++ = htonl(count);
-	sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
+	*p++ = cpu_to_be32(args->count);
+	*p++ = cpu_to_be32(args->stable);
+	*p = cpu_to_be32(args->count);
+	xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
+}
 
-	/* Copy the page array */
-	xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
-	sndbuf->flags |= XDRBUF_WRITE;
-	return 0;
+static void nfs3_xdr_enc_write3args(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_writeargs *args)
+{
+	encode_write3args(xdr, args);
+	xdr->buf->flags |= XDRBUF_WRITE;
 }
 
 /*
- * Encode CREATE arguments
+ * 3.3.8  CREATE3args
+ *
+ *	enum createmode3 {
+ *		UNCHECKED = 0,
+ *		GUARDED   = 1,
+ *		EXCLUSIVE = 2
+ *	};
+ *
+ *	union createhow3 switch (createmode3 mode) {
+ *	case UNCHECKED:
+ *	case GUARDED:
+ *		sattr3		obj_attributes;
+ *	case EXCLUSIVE:
+ *		createverf3	verf;
+ *	};
+ *
+ *	struct CREATE3args {
+ *		diropargs3	where;
+ *		createhow3	how;
+ *	};
  */
-static int
-nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args)
+static void encode_createhow3(struct xdr_stream *xdr,
+			      const struct nfs3_createargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-
-	*p++ = htonl(args->createmode);
-	if (args->createmode == NFS3_CREATE_EXCLUSIVE) {
-		*p++ = args->verifier[0];
-		*p++ = args->verifier[1];
-	} else
-		p = xdr_encode_sattr(p, args->sattr);
+	encode_uint32(xdr, args->createmode);
+	switch (args->createmode) {
+	case NFS3_CREATE_UNCHECKED:
+	case NFS3_CREATE_GUARDED:
+		encode_sattr3(xdr, args->sattr);
+		break;
+	case NFS3_CREATE_EXCLUSIVE:
+		encode_createverf3(xdr, args->verifier);
+		break;
+	default:
+		BUG();
+	}
+}
 
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+static void nfs3_xdr_enc_create3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_createargs *args)
+{
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
+	encode_createhow3(xdr, args);
 }
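
The EXCLUSIVE arm of createhow3 is what gives O_EXCL its over-the-wire atomicity: instead of attributes, the client sends an opaque verifier that the server records with the new file, so a retransmitted CREATE can be recognized rather than failed (RFC 1813, section 3.3.8). A hedged sketch of how a caller might pick the mode; the helper below is purely illustrative and not part of this file:

	/* Illustration only: selecting an NFSv3 createmode for open(O_CREAT). */
	static u32 choose_createmode3(int open_flags)
	{
		if (open_flags & O_EXCL)
			return NFS3_CREATE_EXCLUSIVE;	/* verifier-based, replay-safe */
		return NFS3_CREATE_UNCHECKED;		/* sattr3 sent instead */
	}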
 
 /*
- * Encode MKDIR arguments
+ * 3.3.9  MKDIR3args
+ *
+ *	struct MKDIR3args {
+ *		diropargs3	where;
+ *		sattr3		attributes;
+ *	};
  */
-static int
-nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args)
+static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs3_mkdirargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-	p = xdr_encode_sattr(p, args->sattr);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
+	encode_sattr3(xdr, args->sattr);
 }
 
 /*
- * Encode SYMLINK arguments
+ * 3.3.10  SYMLINK3args
+ *
+ *	struct symlinkdata3 {
+ *		sattr3		symlink_attributes;
+ *		nfspath3	symlink_data;
+ *	};
+ *
+ *	struct SYMLINK3args {
+ *		diropargs3	where;
+ *		symlinkdata3	symlink;
+ *	};
  */
-static int
-nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args)
+static void encode_symlinkdata3(struct xdr_stream *xdr,
+				const struct nfs3_symlinkargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fromfh);
-	p = xdr_encode_array(p, args->fromname, args->fromlen);
-	p = xdr_encode_sattr(p, args->sattr);
-	*p++ = htonl(args->pathlen);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+	encode_sattr3(xdr, args->sattr);
+	encode_nfspath3(xdr, args->pages, args->pathlen);
+}
 
-	/* Copy the page */
-	xdr_encode_pages(&req->rq_snd_buf, args->pages, 0, args->pathlen);
-	return 0;
+static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs3_symlinkargs *args)
+{
+	encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
+	encode_symlinkdata3(xdr, args);
 }
 
 /*
- * Encode MKNOD arguments
+ * 3.3.11  MKNOD3args
+ *
+ *	struct devicedata3 {
+ *		sattr3		dev_attributes;
+ *		specdata3	spec;
+ *	};
+ *
+ *	union mknoddata3 switch (ftype3 type) {
+ *	case NF3CHR:
+ *	case NF3BLK:
+ *		devicedata3	device;
+ *	case NF3SOCK:
+ *	case NF3FIFO:
+ *		sattr3		pipe_attributes;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	struct MKNOD3args {
+ *		diropargs3	where;
+ *		mknoddata3	what;
+ *	};
  */
-static int
-nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
+static void encode_devicedata3(struct xdr_stream *xdr,
+			       const struct nfs3_mknodargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-	*p++ = htonl(args->type);
-	p = xdr_encode_sattr(p, args->sattr);
-	if (args->type == NF3CHR || args->type == NF3BLK) {
-		*p++ = htonl(MAJOR(args->rdev));
-		*p++ = htonl(MINOR(args->rdev));
+	encode_sattr3(xdr, args->sattr);
+	encode_specdata3(xdr, args->rdev);
+}
+
+static void encode_mknoddata3(struct xdr_stream *xdr,
+			      const struct nfs3_mknodargs *args)
+{
+	encode_ftype3(xdr, args->type);
+	switch (args->type) {
+	case NF3CHR:
+	case NF3BLK:
+		encode_devicedata3(xdr, args);
+		break;
+	case NF3SOCK:
+	case NF3FIFO:
+		encode_sattr3(xdr, args->sattr);
+		break;
+	case NF3REG:
+	case NF3DIR:
+		break;
+	default:
+		BUG();
 	}
+}
 
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs3_mknodargs *args)
+{
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
+	encode_mknoddata3(xdr, args);
 }
 
 /*
- * Encode RENAME arguments
+ * 3.3.12  REMOVE3args
+ *
+ *	struct REMOVE3args {
+ *		diropargs3  object;
+ *	};
  */
-static int
-nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
+static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
496{ 1168 const struct nfs_removeargs *args)
497 p = xdr_encode_fhandle(p, args->old_dir); 1169{
498 p = xdr_encode_array(p, args->old_name->name, args->old_name->len); 1170 encode_diropargs3(xdr, args->fh, args->name.name, args->name.len);
499 p = xdr_encode_fhandle(p, args->new_dir);
500 p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
501 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
502 return 0;
503} 1171}
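Annotation: every name passed to encode_diropargs3() travels as an XDR opaque — a 4-byte length word followed by the bytes rounded up to a 4-byte boundary. A small sketch of the wire-size arithmetic; the helper name is invented for illustration:

	#include <stdio.h>
	#include <string.h>

	/* XDR opaque sizing: length word plus bytes padded to 4. */
	static size_t xdr_opaque_size(size_t len)
	{
		return 4 + ((len + 3) & ~(size_t)3);
	}

	int main(void)
	{
		const char *name = "a.txt";	/* 5 bytes -> padded to 8 */

		printf("filename3 on the wire: %zu bytes\n",
		       xdr_opaque_size(strlen(name)));	/* prints 12 */
		return 0;
	}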
504 1172
505/* 1173/*
506 * Encode LINK arguments 1174 * 3.3.14 RENAME3args
1175 *
1176 * struct RENAME3args {
1177 * diropargs3 from;
1178 * diropargs3 to;
1179 * };
507 */ 1180 */
508static int 1181static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req,
509nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args) 1182 struct xdr_stream *xdr,
1183 const struct nfs_renameargs *args)
510{ 1184{
511 p = xdr_encode_fhandle(p, args->fromfh); 1185 const struct qstr *old = args->old_name;
512 p = xdr_encode_fhandle(p, args->tofh); 1186 const struct qstr *new = args->new_name;
513 p = xdr_encode_array(p, args->toname, args->tolen); 1187
514 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 1188 encode_diropargs3(xdr, args->old_dir, old->name, old->len);
515 return 0; 1189 encode_diropargs3(xdr, args->new_dir, new->name, new->len);
516} 1190}
517 1191
518/* 1192/*
519 * Encode arguments to readdir call 1193 * 3.3.15 LINK3args
1194 *
1195 * struct LINK3args {
1196 * nfs_fh3 file;
1197 * diropargs3 link;
1198 * };
520 */ 1199 */
521static int 1200static void nfs3_xdr_enc_link3args(struct rpc_rqst *req,
522nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) 1201 struct xdr_stream *xdr,
1202 const struct nfs3_linkargs *args)
523{ 1203{
524 struct rpc_auth *auth = req->rq_cred->cr_auth; 1204 encode_nfs_fh3(xdr, args->fromfh);
525 unsigned int replen; 1205 encode_diropargs3(xdr, args->tofh, args->toname, args->tolen);
526 u32 count = args->count;
527
528 p = xdr_encode_fhandle(p, args->fh);
529 p = xdr_encode_hyper(p, args->cookie);
530 *p++ = args->verf[0];
531 *p++ = args->verf[1];
532 if (args->plus) {
533 /* readdirplus: need dircount + buffer size.
534 * We just make sure we make dircount big enough */
535 *p++ = htonl(count >> 3);
536 }
537 *p++ = htonl(count);
538 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
539
540 /* Inline the page array */
541 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readdirres_sz) << 2;
542 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count);
543 return 0;
544} 1206}
545 1207
546/* 1208/*
547 * Decode the result of a readdir call. 1209 * 3.3.16 READDIR3args
548 * We just check for syntactical correctness. 1210 *
1211 * struct READDIR3args {
1212 * nfs_fh3 dir;
1213 * cookie3 cookie;
1214 * cookieverf3 cookieverf;
1215 * count3 count;
1216 * };
549 */ 1217 */
550static int 1218static void encode_readdir3args(struct xdr_stream *xdr,
551nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res) 1219 const struct nfs3_readdirargs *args)
552{ 1220{
553 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 1221 __be32 *p;
554 struct kvec *iov = rcvbuf->head;
555 struct page **page;
556 size_t hdrlen;
557 u32 recvd, pglen;
558 int status;
559
560 status = ntohl(*p++);
561 /* Decode post_op_attrs */
562 p = xdr_decode_post_op_attr(p, res->dir_attr);
563 if (status)
564 return nfs_stat_to_errno(status);
565 /* Decode verifier cookie */
566 if (res->verf) {
567 res->verf[0] = *p++;
568 res->verf[1] = *p++;
569 } else {
570 p += 2;
571 }
572 1222
573 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 1223 encode_nfs_fh3(xdr, args->fh);
574 if (iov->iov_len < hdrlen) {
575 dprintk("NFS: READDIR reply header overflowed:"
576 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
577 return -errno_NFSERR_IO;
578 } else if (iov->iov_len != hdrlen) {
579 dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
580 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
581 }
582 1224
583 pglen = rcvbuf->page_len; 1225 p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4);
584 recvd = rcvbuf->len - hdrlen; 1226 p = xdr_encode_cookie3(p, args->cookie);
585 if (pglen > recvd) 1227 p = xdr_encode_cookieverf3(p, args->verf);
586 pglen = recvd; 1228 *p = cpu_to_be32(args->count);
587 page = rcvbuf->pages; 1229}
588 1230
589 return pglen; 1231static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
1232 struct xdr_stream *xdr,
1233 const struct nfs3_readdirargs *args)
1234{
1235 encode_readdir3args(xdr, args);
1236 prepare_reply_buffer(req, args->pages, 0,
1237 args->count, NFS3_readdirres_sz);
590} 1238}
591 1239
592__be32 * 1240/*
593nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus) 1241 * 3.3.17 READDIRPLUS3args
1242 *
1243 * struct READDIRPLUS3args {
1244 * nfs_fh3 dir;
1245 * cookie3 cookie;
1246 * cookieverf3 cookieverf;
1247 * count3 dircount;
1248 * count3 maxcount;
1249 * };
1250 */
1251static void encode_readdirplus3args(struct xdr_stream *xdr,
1252 const struct nfs3_readdirargs *args)
594{ 1253{
595 __be32 *p; 1254 __be32 *p;
596 struct nfs_entry old = *entry;
597
598 p = xdr_inline_decode(xdr, 4);
599 if (unlikely(!p))
600 goto out_overflow;
601 if (!ntohl(*p++)) {
602 p = xdr_inline_decode(xdr, 4);
603 if (unlikely(!p))
604 goto out_overflow;
605 if (!ntohl(*p++))
606 return ERR_PTR(-EAGAIN);
607 entry->eof = 1;
608 return ERR_PTR(-EBADCOOKIE);
609 }
610 1255
611 p = xdr_inline_decode(xdr, 12); 1256 encode_nfs_fh3(xdr, args->fh);
612 if (unlikely(!p))
613 goto out_overflow;
614 p = xdr_decode_hyper(p, &entry->ino);
615 entry->len = ntohl(*p++);
616 1257
617 p = xdr_inline_decode(xdr, entry->len + 8); 1258 p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4);
618 if (unlikely(!p)) 1259 p = xdr_encode_cookie3(p, args->cookie);
619 goto out_overflow; 1260 p = xdr_encode_cookieverf3(p, args->verf);
620 entry->name = (const char *) p;
621 p += XDR_QUADLEN(entry->len);
622 entry->prev_cookie = entry->cookie;
623 p = xdr_decode_hyper(p, &entry->cookie);
624
625 entry->d_type = DT_UNKNOWN;
626 if (plus) {
627 entry->fattr->valid = 0;
628 p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
629 if (IS_ERR(p))
630 goto out_overflow_exit;
631 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
632 /* In fact, a post_op_fh3: */
633 p = xdr_inline_decode(xdr, 4);
634 if (unlikely(!p))
635 goto out_overflow;
636 if (*p++) {
637 p = xdr_decode_fhandle_stream(xdr, entry->fh);
638 if (IS_ERR(p))
639 goto out_overflow_exit;
640 /* Ugh -- server reply was truncated */
641 if (p == NULL) {
642 dprintk("NFS: FH truncated\n");
643 *entry = old;
644 return ERR_PTR(-EAGAIN);
645 }
646 } else
647 memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
648 }
649 1261
650 p = xdr_inline_peek(xdr, 8); 1262 /*
651 if (p != NULL) 1263 * readdirplus: need dircount + buffer size.
652 entry->eof = !p[0] && p[1]; 1264 * We just make sure we make dircount big enough
653 else 1265 */
654 entry->eof = 0; 1266 *p++ = cpu_to_be32(args->count >> 3);
655 1267
656 return p; 1268 *p = cpu_to_be32(args->count);
1269}
657 1270
658out_overflow: 1271static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
659 print_overflow_msg(__func__, xdr); 1272 struct xdr_stream *xdr,
660out_overflow_exit: 1273 const struct nfs3_readdirargs *args)
661 return ERR_PTR(-EAGAIN); 1274{
1275 encode_readdirplus3args(xdr, args);
1276 prepare_reply_buffer(req, args->pages, 0,
1277 args->count, NFS3_readdirres_sz);
662} 1278}
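Annotation: READDIRPLUS carries two limits where READDIR has one — dircount bounds just the names and cookies, maxcount the whole reply. As the kernel comment notes, the client derives dircount as one eighth of the caller's buffer rather than tracking it exactly. A tiny illustration of the resulting values:

	#include <stdio.h>

	int main(void)
	{
		unsigned int count = 32768;	/* caller's reply-buffer size */

		/* Matches encode_readdirplus3args(): dircount is a heuristic
		 * one eighth of the buffer, maxcount is the buffer itself. */
		printf("dircount=%u maxcount=%u\n", count >> 3, count);
		return 0;
	}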
663 1279
664/* 1280/*
665 * Encode COMMIT arguments 1281 * 3.3.21 COMMIT3args
1282 *
1283 * struct COMMIT3args {
1284 * nfs_fh3 file;
1285 * offset3 offset;
1286 * count3 count;
1287 * };
666 */ 1288 */
667static int 1289static void encode_commit3args(struct xdr_stream *xdr,
668nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) 1290 const struct nfs_writeargs *args)
669{ 1291{
670 p = xdr_encode_fhandle(p, args->fh); 1292 __be32 *p;
1293
1294 encode_nfs_fh3(xdr, args->fh);
1295
1296 p = xdr_reserve_space(xdr, 8 + 4);
671 p = xdr_encode_hyper(p, args->offset); 1297 p = xdr_encode_hyper(p, args->offset);
672 *p++ = htonl(args->count); 1298 *p = cpu_to_be32(args->count);
673 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
674 return 0;
675} 1299}
676 1300
677#ifdef CONFIG_NFS_V3_ACL 1301static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req,
678/* 1302 struct xdr_stream *xdr,
679 * Encode GETACL arguments 1303 const struct nfs_writeargs *args)
680 */
681static int
682nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
683 struct nfs3_getaclargs *args)
684{ 1304{
685 struct rpc_auth *auth = req->rq_cred->cr_auth; 1305 encode_commit3args(xdr, args);
686 unsigned int replen; 1306}
687 1307
688 p = xdr_encode_fhandle(p, args->fh); 1308#ifdef CONFIG_NFS_V3_ACL
689 *p++ = htonl(args->mask);
690 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
691 1309
692 if (args->mask & (NFS_ACL | NFS_DFACL)) { 1310static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
693 /* Inline the page array */ 1311 struct xdr_stream *xdr,
694 replen = (RPC_REPHDRSIZE + auth->au_rslack + 1312 const struct nfs3_getaclargs *args)
695 ACL3_getaclres_sz) << 2; 1313{
696 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, 1314 encode_nfs_fh3(xdr, args->fh);
697 NFSACL_MAXPAGES << PAGE_SHIFT); 1315 encode_uint32(xdr, args->mask);
698 } 1316 if (args->mask & (NFS_ACL | NFS_DFACL))
699 return 0; 1317 prepare_reply_buffer(req, args->pages, 0,
1318 NFSACL_MAXPAGES << PAGE_SHIFT,
1319 ACL3_getaclres_sz);
700} 1320}
701 1321
702/* 1322static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
703 * Encode SETACL arguments 1323 struct xdr_stream *xdr,
704 */ 1324 const struct nfs3_setaclargs *args)
705static int
706nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
707 struct nfs3_setaclargs *args)
708{ 1325{
709 struct xdr_buf *buf = &req->rq_snd_buf;
710 unsigned int base; 1326 unsigned int base;
711 int err; 1327 int error;
712
713 p = xdr_encode_fhandle(p, NFS_FH(args->inode));
714 *p++ = htonl(args->mask);
715 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
716 base = req->rq_slen;
717 1328
1329 encode_nfs_fh3(xdr, NFS_FH(args->inode));
1330 encode_uint32(xdr, args->mask);
718 if (args->npages != 0) 1331 if (args->npages != 0)
719 xdr_encode_pages(buf, args->pages, 0, args->len); 1332 xdr_write_pages(xdr, args->pages, 0, args->len);
720 else
721 req->rq_slen = xdr_adjust_iovec(req->rq_svec,
722 p + XDR_QUADLEN(args->len));
723 1333
724 err = nfsacl_encode(buf, base, args->inode, 1334 base = req->rq_slen;
1335 error = nfsacl_encode(xdr->buf, base, args->inode,
725 (args->mask & NFS_ACL) ? 1336 (args->mask & NFS_ACL) ?
726 args->acl_access : NULL, 1, 0); 1337 args->acl_access : NULL, 1, 0);
727 if (err > 0) 1338 BUG_ON(error < 0);
728 err = nfsacl_encode(buf, base + err, args->inode, 1339 error = nfsacl_encode(xdr->buf, base + error, args->inode,
729 (args->mask & NFS_DFACL) ? 1340 (args->mask & NFS_DFACL) ?
730 args->acl_default : NULL, 1, 1341 args->acl_default : NULL, 1,
731 NFS_ACL_DEFAULT); 1342 NFS_ACL_DEFAULT);
732 return (err > 0) ? 0 : err; 1343 BUG_ON(error < 0);
733} 1344}
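Annotation: nfsacl_encode() returns the number of bytes it wrote, which is what lets the SETACL encoder above chain the default ACL directly after the access ACL at base + error. A sketch of that offset-chaining pattern; encode_section is a placeholder invented for illustration:

	#include <stdio.h>

	/* Hypothetical stand-in for nfsacl_encode(): reports where it wrote
	 * and returns the length, so a second section can chain after it. */
	static int encode_section(unsigned int base, unsigned int len)
	{
		printf("section at offset %u, %u bytes\n", base, len);
		return (int)len;
	}

	int main(void)
	{
		unsigned int base = 128;	/* bytes already in the send buffer */
		int n;

		/* Same chaining as nfs3_xdr_enc_setacl3args(): the default ACL
		 * lands at base plus whatever the access ACL consumed. */
		n = encode_section(base, 44);
		if (n > 0)
			encode_section(base + (unsigned int)n, 28);
		return 0;
	}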
1345
734#endif /* CONFIG_NFS_V3_ACL */ 1346#endif /* CONFIG_NFS_V3_ACL */
735 1347
736/* 1348/*
737 * NFS XDR decode functions 1349 * NFSv3 XDR decode functions
1350 *
1351 * NFSv3 result types are defined in section 3.3 of RFC 1813:
1352 * "NFS Version 3 Protocol Specification".
738 */ 1353 */
739 1354
740/* 1355/*
741 * Decode attrstat reply. 1356 * 3.3.1 GETATTR3res
1357 *
1358 * struct GETATTR3resok {
1359 * fattr3 obj_attributes;
1360 * };
1361 *
1362 * union GETATTR3res switch (nfsstat3 status) {
1363 * case NFS3_OK:
1364 * GETATTR3resok resok;
1365 * default:
1366 * void;
1367 * };
742 */ 1368 */
743static int 1369static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req,
744nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1370 struct xdr_stream *xdr,
1371 struct nfs_fattr *result)
745{ 1372{
746 int status; 1373 enum nfs_stat status;
747 1374 int error;
748 if ((status = ntohl(*p++))) 1375
749 return nfs_stat_to_errno(status); 1376 error = decode_nfsstat3(xdr, &status);
750 xdr_decode_fattr(p, fattr); 1377 if (unlikely(error))
751 return 0; 1378 goto out;
1379 if (status != NFS3_OK)
1380 goto out_default;
1381 error = decode_fattr3(xdr, result);
1382out:
1383 return error;
1384out_default:
1385 return nfs_stat_to_errno(status);
752} 1386}
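Annotation: every decoder that follows repeats the same skeleton — decode the nfsstat3 word, let stream errors win outright, map NFS3ERR_* through nfs_stat_to_errno(), and only then decode the resok body. A self-contained sketch of that control flow; the arguments stand in for what decode_nfsstat3() would pull off the wire, and the mapping is a placeholder:

	#include <stdio.h>

	#define NFS3_OK 0

	static int map_nfs_status(unsigned int status)
	{
		return -(int)status;	/* placeholder for nfs_stat_to_errno() */
	}

	/* Shared shape of every nfs3_xdr_dec_*3res() above. */
	static int decode_result(int stream_error, unsigned int status)
	{
		if (stream_error)
			return stream_error;	/* short or corrupt reply */
		if (status != NFS3_OK)
			return map_nfs_status(status);
		puts("decode resok body");
		return 0;
	}

	int main(void)
	{
		printf("%d\n", decode_result(0, 0));	/* resok path */
		printf("%d\n", decode_result(0, 2));	/* NFS3ERR_NOENT, mapped */
		return 0;
	}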
753 1387
754/* 1388/*
755 * Decode status+wcc_data reply 1389 * 3.3.2 SETATTR3res
756 * SATTR, REMOVE, RMDIR 1390 *
1391 * struct SETATTR3resok {
1392 * wcc_data obj_wcc;
1393 * };
1394 *
1395 * struct SETATTR3resfail {
1396 * wcc_data obj_wcc;
1397 * };
1398 *
1399 * union SETATTR3res switch (nfsstat3 status) {
1400 * case NFS3_OK:
1401 * SETATTR3resok resok;
1402 * default:
1403 * SETATTR3resfail resfail;
1404 * };
757 */ 1405 */
758static int 1406static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req,
759nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1407 struct xdr_stream *xdr,
1408 struct nfs_fattr *result)
760{ 1409{
761 int status; 1410 enum nfs_stat status;
762 1411 int error;
763 if ((status = ntohl(*p++))) 1412
764 status = nfs_stat_to_errno(status); 1413 error = decode_nfsstat3(xdr, &status);
765 xdr_decode_wcc_data(p, fattr); 1414 if (unlikely(error))
766 return status; 1415 goto out;
1416 error = decode_wcc_data(xdr, result);
1417 if (unlikely(error))
1418 goto out;
1419 if (status != NFS3_OK)
1420 goto out_status;
1421out:
1422 return error;
1423out_status:
1424 return nfs_stat_to_errno(status);
767} 1425}
768 1426
769static int 1427/*
770nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) 1428 * 3.3.3 LOOKUP3res
1429 *
1430 * struct LOOKUP3resok {
1431 * nfs_fh3 object;
1432 * post_op_attr obj_attributes;
1433 * post_op_attr dir_attributes;
1434 * };
1435 *
1436 * struct LOOKUP3resfail {
1437 * post_op_attr dir_attributes;
1438 * };
1439 *
1440 * union LOOKUP3res switch (nfsstat3 status) {
1441 * case NFS3_OK:
1442 * LOOKUP3resok resok;
1443 * default:
1444 * LOOKUP3resfail resfail;
1445 * };
1446 */
1447static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req,
1448 struct xdr_stream *xdr,
1449 struct nfs3_diropres *result)
771{ 1450{
772 return nfs3_xdr_wccstat(req, p, res->dir_attr); 1451 enum nfs_stat status;
1452 int error;
1453
1454 error = decode_nfsstat3(xdr, &status);
1455 if (unlikely(error))
1456 goto out;
1457 if (status != NFS3_OK)
1458 goto out_default;
1459 error = decode_nfs_fh3(xdr, result->fh);
1460 if (unlikely(error))
1461 goto out;
1462 error = decode_post_op_attr(xdr, result->fattr);
1463 if (unlikely(error))
1464 goto out;
1465 error = decode_post_op_attr(xdr, result->dir_attr);
1466out:
1467 return error;
1468out_default:
1469 error = decode_post_op_attr(xdr, result->dir_attr);
1470 if (unlikely(error))
1471 goto out;
1472 return nfs_stat_to_errno(status);
773} 1473}
774 1474
775/* 1475/*
776 * Decode LOOKUP reply 1476 * 3.3.4 ACCESS3res
1477 *
1478 * struct ACCESS3resok {
1479 * post_op_attr obj_attributes;
1480 * uint32 access;
1481 * };
1482 *
1483 * struct ACCESS3resfail {
1484 * post_op_attr obj_attributes;
1485 * };
1486 *
1487 * union ACCESS3res switch (nfsstat3 status) {
1488 * case NFS3_OK:
1489 * ACCESS3resok resok;
1490 * default:
1491 * ACCESS3resfail resfail;
1492 * };
777 */ 1493 */
778static int 1494static int nfs3_xdr_dec_access3res(struct rpc_rqst *req,
779nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) 1495 struct xdr_stream *xdr,
1496 struct nfs3_accessres *result)
780{ 1497{
781 int status; 1498 enum nfs_stat status;
782 1499 int error;
783 if ((status = ntohl(*p++))) { 1500
784 status = nfs_stat_to_errno(status); 1501 error = decode_nfsstat3(xdr, &status);
785 } else { 1502 if (unlikely(error))
786 if (!(p = xdr_decode_fhandle(p, res->fh))) 1503 goto out;
787 return -errno_NFSERR_IO; 1504 error = decode_post_op_attr(xdr, result->fattr);
788 p = xdr_decode_post_op_attr(p, res->fattr); 1505 if (unlikely(error))
789 } 1506 goto out;
790 xdr_decode_post_op_attr(p, res->dir_attr); 1507 if (status != NFS3_OK)
791 return status; 1508 goto out_default;
1509 error = decode_uint32(xdr, &result->access);
1510out:
1511 return error;
1512out_default:
1513 return nfs_stat_to_errno(status);
792} 1514}
793 1515
794/* 1516/*
795 * Decode ACCESS reply 1517 * 3.3.5 READLINK3res
1518 *
1519 * struct READLINK3resok {
1520 * post_op_attr symlink_attributes;
1521 * nfspath3 data;
1522 * };
1523 *
1524 * struct READLINK3resfail {
1525 * post_op_attr symlink_attributes;
1526 * };
1527 *
1528 * union READLINK3res switch (nfsstat3 status) {
1529 * case NFS3_OK:
1530 * READLINK3resok resok;
1531 * default:
1532 * READLINK3resfail resfail;
1533 * };
796 */ 1534 */
797static int 1535static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req,
798nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) 1536 struct xdr_stream *xdr,
1537 struct nfs_fattr *result)
799{ 1538{
800 int status = ntohl(*p++); 1539 enum nfs_stat status;
801 1540 int error;
802 p = xdr_decode_post_op_attr(p, res->fattr); 1541
803 if (status) 1542 error = decode_nfsstat3(xdr, &status);
804 return nfs_stat_to_errno(status); 1543 if (unlikely(error))
805 res->access = ntohl(*p++); 1544 goto out;
806 return 0; 1545 error = decode_post_op_attr(xdr, result);
1546 if (unlikely(error))
1547 goto out;
1548 if (status != NFS3_OK)
1549 goto out_default;
1550 error = decode_nfspath3(xdr);
1551out:
1552 return error;
1553out_default:
1554 return nfs_stat_to_errno(status);
807} 1555}
808 1556
809static int 1557/*
810nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) 1558 * 3.3.6 READ3res
1559 *
1560 * struct READ3resok {
1561 * post_op_attr file_attributes;
1562 * count3 count;
1563 * bool eof;
1564 * opaque data<>;
1565 * };
1566 *
1567 * struct READ3resfail {
1568 * post_op_attr file_attributes;
1569 * };
1570 *
1571 * union READ3res switch (nfsstat3 status) {
1572 * case NFS3_OK:
1573 * READ3resok resok;
1574 * default:
1575 * READ3resfail resfail;
1576 * };
1577 */
1578static int decode_read3resok(struct xdr_stream *xdr,
1579 struct nfs_readres *result)
811{ 1580{
812 struct rpc_auth *auth = req->rq_cred->cr_auth; 1581 u32 eof, count, ocount, recvd;
813 unsigned int replen; 1582 size_t hdrlen;
1583 __be32 *p;
814 1584
815 p = xdr_encode_fhandle(p, args->fh); 1585 p = xdr_inline_decode(xdr, 4 + 4 + 4);
816 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 1586 if (unlikely(p == NULL))
1587 goto out_overflow;
1588 count = be32_to_cpup(p++);
1589 eof = be32_to_cpup(p++);
1590 ocount = be32_to_cpup(p++);
1591 if (unlikely(ocount != count))
1592 goto out_mismatch;
1593 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
1594 recvd = xdr->buf->len - hdrlen;
1595 if (unlikely(count > recvd))
1596 goto out_cheating;
1597
1598out:
1599 xdr_read_pages(xdr, count);
1600 result->eof = eof;
1601 result->count = count;
1602 return count;
1603out_mismatch:
1604 dprintk("NFS: READ count doesn't match length of opaque: "
1605 "count %u != ocount %u\n", count, ocount);
1606 return -EIO;
1607out_cheating:
1608 dprintk("NFS: server cheating in read result: "
1609 "count %u > recvd %u\n", count, recvd);
1610 count = recvd;
1611 eof = 0;
1612 goto out;
1613out_overflow:
1614 print_overflow_msg(__func__, xdr);
1615 return -EIO;
1616}
817 1617
818 /* Inline the page array */ 1618static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
819 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readlinkres_sz) << 2; 1619 struct nfs_readres *result)
820 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); 1620{
821 return 0; 1621 enum nfs_stat status;
1622 int error;
1623
1624 error = decode_nfsstat3(xdr, &status);
1625 if (unlikely(error))
1626 goto out;
1627 error = decode_post_op_attr(xdr, result->fattr);
1628 if (unlikely(error))
1629 goto out;
1630 if (status != NFS3_OK)
1631 goto out_status;
1632 error = decode_read3resok(xdr, result);
1633out:
1634 return error;
1635out_status:
1636 return nfs_stat_to_errno(status);
822} 1637}
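Annotation: decode_read3resok() cross-checks three lengths — the server's count, the opaque array length (ocount), and the bytes actually received. A mismatched opaque is fatal, while an over-long count is clamped to what arrived and the EOF flag discarded. A compact model of those checks; the function name is invented:

	#include <stdio.h>

	/* Mirrors decode_read3resok(): returns the usable byte count,
	 * or -1 for a malformed reply. 'eof' may be cleared. */
	static int sanitize_read_reply(unsigned int count, unsigned int ocount,
				       unsigned int recvd, unsigned int *eof)
	{
		if (ocount != count)
			return -1;	/* opaque length must match count */
		if (count > recvd) {
			count = recvd;	/* server claimed more than it sent */
			*eof = 0;	/* so its EOF flag cannot be trusted */
		}
		return (int)count;
	}

	int main(void)
	{
		unsigned int eof = 1;

		printf("%d eof=%u\n",
		       sanitize_read_reply(8192, 8192, 4096, &eof), eof);
		return 0;
	}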
823 1638
824/* 1639/*
825 * Decode READLINK reply 1640 * 3.3.7 WRITE3res
1641 *
1642 * enum stable_how {
1643 * UNSTABLE = 0,
1644 * DATA_SYNC = 1,
1645 * FILE_SYNC = 2
1646 * };
1647 *
1648 * struct WRITE3resok {
1649 * wcc_data file_wcc;
1650 * count3 count;
1651 * stable_how committed;
1652 * writeverf3 verf;
1653 * };
1654 *
1655 * struct WRITE3resfail {
1656 * wcc_data file_wcc;
1657 * };
1658 *
1659 * union WRITE3res switch (nfsstat3 status) {
1660 * case NFS3_OK:
1661 * WRITE3resok resok;
1662 * default:
1663 * WRITE3resfail resfail;
1664 * };
826 */ 1665 */
827static int 1666static int decode_write3resok(struct xdr_stream *xdr,
828nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1667 struct nfs_writeres *result)
829{ 1668{
830 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 1669 __be32 *p;
831 struct kvec *iov = rcvbuf->head;
832 size_t hdrlen;
833 u32 len, recvd;
834 int status;
835
836 status = ntohl(*p++);
837 p = xdr_decode_post_op_attr(p, fattr);
838
839 if (status != 0)
840 return nfs_stat_to_errno(status);
841
842 /* Convert length of symlink */
843 len = ntohl(*p++);
844 if (len >= rcvbuf->page_len) {
845 dprintk("nfs: server returned giant symlink!\n");
846 return -ENAMETOOLONG;
847 }
848 1670
849 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 1671 p = xdr_inline_decode(xdr, 4 + 4 + NFS3_WRITEVERFSIZE);
850 if (iov->iov_len < hdrlen) { 1672 if (unlikely(p == NULL))
851 dprintk("NFS: READLINK reply header overflowed:" 1673 goto out_overflow;
852 "length %Zu > %Zu\n", hdrlen, iov->iov_len); 1674 result->count = be32_to_cpup(p++);
853 return -errno_NFSERR_IO; 1675 result->verf->committed = be32_to_cpup(p++);
854 } else if (iov->iov_len != hdrlen) { 1676 if (unlikely(result->verf->committed > NFS_FILE_SYNC))
855 dprintk("NFS: READLINK header is short. " 1677 goto out_badvalue;
856 "iovec will be shifted.\n"); 1678 memcpy(result->verf->verifier, p, NFS3_WRITEVERFSIZE);
857 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); 1679 return result->count;
858 } 1680out_badvalue:
859 recvd = req->rq_rcv_buf.len - hdrlen; 1681 dprintk("NFS: bad stable_how value: %u\n", result->verf->committed);
860 if (recvd < len) { 1682 return -EIO;
861 dprintk("NFS: server cheating in readlink reply: " 1683out_overflow:
862 "count %u > recvd %u\n", len, recvd); 1684 print_overflow_msg(__func__, xdr);
863 return -EIO; 1685 return -EIO;
864 } 1686}
865 1687
866 xdr_terminate_string(rcvbuf, len); 1688static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
867 return 0; 1689 struct nfs_writeres *result)
1690{
1691 enum nfs_stat status;
1692 int error;
1693
1694 error = decode_nfsstat3(xdr, &status);
1695 if (unlikely(error))
1696 goto out;
1697 error = decode_wcc_data(xdr, result->fattr);
1698 if (unlikely(error))
1699 goto out;
1700 if (status != NFS3_OK)
1701 goto out_status;
1702 error = decode_write3resok(xdr, result);
1703out:
1704 return error;
1705out_status:
1706 return nfs_stat_to_errno(status);
868} 1707}
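Annotation: note the new guard on stable_how — where the old decoder blindly cast the word into the enum, decode_write3resok() rejects anything above FILE_SYNC, since the committed value later drives COMMIT and verifier handling. A sketch of the same range check:

	#include <stdio.h>

	enum stable_how { UNSTABLE = 0, DATA_SYNC = 1, FILE_SYNC = 2 };

	/* Same guard as decode_write3resok(): values above FILE_SYNC are
	 * a protocol violation, not something to propagate. */
	static int check_committed(unsigned int committed)
	{
		return committed <= FILE_SYNC ? 0 : -1;
	}

	int main(void)
	{
		printf("%d %d\n", check_committed(DATA_SYNC), check_committed(7));
		return 0;
	}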
869 1708
870/* 1709/*
871 * Decode READ reply 1710 * 3.3.8 CREATE3res
1711 *
1712 * struct CREATE3resok {
1713 * post_op_fh3 obj;
1714 * post_op_attr obj_attributes;
1715 * wcc_data dir_wcc;
1716 * };
1717 *
1718 * struct CREATE3resfail {
1719 * wcc_data dir_wcc;
1720 * };
1721 *
1722 * union CREATE3res switch (nfsstat3 status) {
1723 * case NFS3_OK:
1724 * CREATE3resok resok;
1725 * default:
1726 * CREATE3resfail resfail;
1727 * };
872 */ 1728 */
873static int 1729static int decode_create3resok(struct xdr_stream *xdr,
874nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) 1730 struct nfs3_diropres *result)
875{ 1731{
876 struct kvec *iov = req->rq_rcv_buf.head; 1732 int error;
877 size_t hdrlen; 1733
878 u32 count, ocount, recvd; 1734 error = decode_post_op_fh3(xdr, result->fh);
879 int status; 1735 if (unlikely(error))
1736 goto out;
1737 error = decode_post_op_attr(xdr, result->fattr);
1738 if (unlikely(error))
1739 goto out;
1740 /* The server isn't required to return a file handle.
1741 * If it didn't, force the client to perform a LOOKUP
1742 * to determine the correct file handle and attribute
1743 * values for the new object. */
1744 if (result->fh->size == 0)
1745 result->fattr->valid = 0;
1746 error = decode_wcc_data(xdr, result->dir_attr);
1747out:
1748 return error;
1749}
880 1750
881 status = ntohl(*p++); 1751static int nfs3_xdr_dec_create3res(struct rpc_rqst *req,
882 p = xdr_decode_post_op_attr(p, res->fattr); 1752 struct xdr_stream *xdr,
1753 struct nfs3_diropres *result)
1754{
1755 enum nfs_stat status;
1756 int error;
1757
1758 error = decode_nfsstat3(xdr, &status);
1759 if (unlikely(error))
1760 goto out;
1761 if (status != NFS3_OK)
1762 goto out_default;
1763 error = decode_create3resok(xdr, result);
1764out:
1765 return error;
1766out_default:
1767 error = decode_wcc_data(xdr, result->dir_attr);
1768 if (unlikely(error))
1769 goto out;
1770 return nfs_stat_to_errno(status);
1771}
883 1772
884 if (status != 0) 1773/*
885 return nfs_stat_to_errno(status); 1774 * 3.3.12 REMOVE3res
1775 *
1776 * struct REMOVE3resok {
1777 * wcc_data dir_wcc;
1778 * };
1779 *
1780 * struct REMOVE3resfail {
1781 * wcc_data dir_wcc;
1782 * };
1783 *
1784 * union REMOVE3res switch (nfsstat3 status) {
1785 * case NFS3_OK:
1786 * REMOVE3resok resok;
1787 * default:
1788 * REMOVE3resfail resfail;
1789 * };
1790 */
1791static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req,
1792 struct xdr_stream *xdr,
1793 struct nfs_removeres *result)
1794{
1795 enum nfs_stat status;
1796 int error;
1797
1798 error = decode_nfsstat3(xdr, &status);
1799 if (unlikely(error))
1800 goto out;
1801 error = decode_wcc_data(xdr, result->dir_attr);
1802 if (unlikely(error))
1803 goto out;
1804 if (status != NFS3_OK)
1805 goto out_status;
1806out:
1807 return error;
1808out_status:
1809 return nfs_stat_to_errno(status);
1810}
886 1811
887 /* Decode reply count and EOF flag. NFSv3 is somewhat redundant 1812/*
888 * in that it puts the count both in the res struct and in the 1813 * 3.3.14 RENAME3res
889 * opaque data count. */ 1814 *
890 count = ntohl(*p++); 1815 * struct RENAME3resok {
891 res->eof = ntohl(*p++); 1816 * wcc_data fromdir_wcc;
892 ocount = ntohl(*p++); 1817 * wcc_data todir_wcc;
1818 * };
1819 *
1820 * struct RENAME3resfail {
1821 * wcc_data fromdir_wcc;
1822 * wcc_data todir_wcc;
1823 * };
1824 *
1825 * union RENAME3res switch (nfsstat3 status) {
1826 * case NFS3_OK:
1827 * RENAME3resok resok;
1828 * default:
1829 * RENAME3resfail resfail;
1830 * };
1831 */
1832static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req,
1833 struct xdr_stream *xdr,
1834 struct nfs_renameres *result)
1835{
1836 enum nfs_stat status;
1837 int error;
1838
1839 error = decode_nfsstat3(xdr, &status);
1840 if (unlikely(error))
1841 goto out;
1842 error = decode_wcc_data(xdr, result->old_fattr);
1843 if (unlikely(error))
1844 goto out;
1845 error = decode_wcc_data(xdr, result->new_fattr);
1846 if (unlikely(error))
1847 goto out;
1848 if (status != NFS3_OK)
1849 goto out_status;
1850out:
1851 return error;
1852out_status:
1853 return nfs_stat_to_errno(status);
1854}
893 1855
894 if (ocount != count) { 1856/*
895 dprintk("NFS: READ count doesn't match RPC opaque count.\n"); 1857 * 3.3.15 LINK3res
896 return -errno_NFSERR_IO; 1858 *
897 } 1859 * struct LINK3resok {
1860 * post_op_attr file_attributes;
1861 * wcc_data linkdir_wcc;
1862 * };
1863 *
1864 * struct LINK3resfail {
1865 * post_op_attr file_attributes;
1866 * wcc_data linkdir_wcc;
1867 * };
1868 *
1869 * union LINK3res switch (nfsstat3 status) {
1870 * case NFS3_OK:
1871 * LINK3resok resok;
1872 * default:
1873 * LINK3resfail resfail;
1874 * };
1875 */
1876static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr,
1877 struct nfs3_linkres *result)
1878{
1879 enum nfs_stat status;
1880 int error;
1881
1882 error = decode_nfsstat3(xdr, &status);
1883 if (unlikely(error))
1884 goto out;
1885 error = decode_post_op_attr(xdr, result->fattr);
1886 if (unlikely(error))
1887 goto out;
1888 error = decode_wcc_data(xdr, result->dir_attr);
1889 if (unlikely(error))
1890 goto out;
1891 if (status != NFS3_OK)
1892 goto out_status;
1893out:
1894 return error;
1895out_status:
1896 return nfs_stat_to_errno(status);
1897}
898 1898
899 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 1899/**
900 if (iov->iov_len < hdrlen) { 1900 * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in
901 dprintk("NFS: READ reply header overflowed:" 1901 * the local page cache
902 "length %Zu > %Zu\n", hdrlen, iov->iov_len); 1902 * @xdr: XDR stream where entry resides
903 return -errno_NFSERR_IO; 1903 * @entry: buffer to fill in with entry data
904 } else if (iov->iov_len != hdrlen) { 1904 * @plus: boolean indicating whether this should be a readdirplus entry
905 dprintk("NFS: READ header is short. iovec will be shifted.\n"); 1905 *
906 xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen); 1906 * Returns zero if successful, otherwise a negative errno value is
907 } 1907 * returned.
1908 *
1909 * This function is not invoked during READDIR reply decoding, but
1910 * rather whenever an application invokes the getdents(2) system call
1911 * on a directory already in our cache.
1912 *
1913 * 3.3.16 entry3
1914 *
1915 * struct entry3 {
1916 * fileid3 fileid;
1917 * filename3 name;
1918 * cookie3 cookie;
1919 * fhandle3 filehandle;
1920 * post_op_attr3 attributes;
1921 * entry3 *nextentry;
1922 * };
1923 *
1924 * 3.3.17 entryplus3
1925 * struct entryplus3 {
1926 * fileid3 fileid;
1927 * filename3 name;
1928 * cookie3 cookie;
1929 * post_op_attr name_attributes;
1930 * post_op_fh3 name_handle;
1931 * entryplus3 *nextentry;
1932 * };
1933 */
1934int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
1935 int plus)
1936{
1937 struct nfs_entry old = *entry;
1938 __be32 *p;
1939 int error;
908 1940
909 recvd = req->rq_rcv_buf.len - hdrlen; 1941 p = xdr_inline_decode(xdr, 4);
910 if (count > recvd) { 1942 if (unlikely(p == NULL))
911 dprintk("NFS: server cheating in read reply: " 1943 goto out_overflow;
912 "count %u > recvd %u\n", count, recvd); 1944 if (*p == xdr_zero) {
913 count = recvd; 1945 p = xdr_inline_decode(xdr, 4);
914 res->eof = 0; 1946 if (unlikely(p == NULL))
1947 goto out_overflow;
1948 if (*p == xdr_zero)
1949 return -EAGAIN;
1950 entry->eof = 1;
1951 return -EBADCOOKIE;
915 } 1952 }
916 1953
917 if (count < res->count) 1954 error = decode_fileid3(xdr, &entry->ino);
918 res->count = count; 1955 if (unlikely(error))
1956 return error;
919 1957
920 return count; 1958 error = decode_inline_filename3(xdr, &entry->name, &entry->len);
921} 1959 if (unlikely(error))
1960 return error;
922 1961
923/* 1962 entry->prev_cookie = entry->cookie;
924 * Decode WRITE response 1963 error = decode_cookie3(xdr, &entry->cookie);
925 */ 1964 if (unlikely(error))
926static int 1965 return error;
927nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
928{
929 int status;
930 1966
931 status = ntohl(*p++); 1967 entry->d_type = DT_UNKNOWN;
932 p = xdr_decode_wcc_data(p, res->fattr);
933 1968
934 if (status != 0) 1969 if (plus) {
935 return nfs_stat_to_errno(status); 1970 entry->fattr->valid = 0;
1971 error = decode_post_op_attr(xdr, entry->fattr);
1972 if (unlikely(error))
1973 return error;
1974 if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
1975 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
936 1976
937 res->count = ntohl(*p++); 1977 /* In fact, a post_op_fh3: */
938 res->verf->committed = (enum nfs3_stable_how)ntohl(*p++); 1978 p = xdr_inline_decode(xdr, 4);
939 res->verf->verifier[0] = *p++; 1979 if (unlikely(p == NULL))
940 res->verf->verifier[1] = *p++; 1980 goto out_overflow;
1981 if (*p != xdr_zero) {
1982 error = decode_nfs_fh3(xdr, entry->fh);
1983 if (unlikely(error)) {
1984 if (error == -E2BIG)
1985 goto out_truncated;
1986 return error;
1987 }
1988 } else
1989 zero_nfs_fh3(entry->fh);
1990 }
941 1991
942 return res->count; 1992 return 0;
943}
944 1993
945/* 1994out_overflow:
946 * Decode a CREATE response 1995 print_overflow_msg(__func__, xdr);
947 */ 1996 return -EAGAIN;
948static int 1997out_truncated:
949nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) 1998 dprintk("NFS: directory entry contains invalid file handle\n");
950{ 1999 *entry = old;
951 int status; 2000 return -EAGAIN;
952
953 status = ntohl(*p++);
954 if (status == 0) {
955 if (*p++) {
956 if (!(p = xdr_decode_fhandle(p, res->fh)))
957 return -errno_NFSERR_IO;
958 p = xdr_decode_post_op_attr(p, res->fattr);
959 } else {
960 memset(res->fh, 0, sizeof(*res->fh));
961 /* Do decode post_op_attr but set it to NULL */
962 p = xdr_decode_post_op_attr(p, res->fattr);
963 res->fattr->valid = 0;
964 }
965 } else {
966 status = nfs_stat_to_errno(status);
967 }
968 p = xdr_decode_wcc_data(p, res->dir_attr);
969 return status;
970} 2001}
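Annotation: the entry stream that nfs3_decode_dirent() consumes is XDR's linked-list encoding — a "value follows" boolean before each entry, a zero to end the list, then the dirlist eof boolean. That is why the function returns -EAGAIN (list exhausted, not yet at EOF) or -EBADCOOKIE (EOF reached). A toy walk over such a stream, with entries reduced to a bare fileid for brevity:

	#include <stdio.h>

	/* A toy dirlist3: "value follows" flags interleaved with entries,
	 * then the trailing eof boolean. */
	static const unsigned int stream[] = { 1, 101, 1, 102, 0, 1 };

	int main(void)
	{
		unsigned int i = 0;

		while (stream[i++])			/* entry present? */
			printf("entry fileid=%u\n", stream[i++]);
		printf("eof=%u\n", stream[i]);		/* dirlist3.eof */
		return 0;
	}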
971 2002
972/* 2003/*
973 * Decode RENAME reply 2004 * 3.3.16 READDIR3res
2005 *
2006 * struct dirlist3 {
2007 * entry3 *entries;
2008 * bool eof;
2009 * };
2010 *
2011 * struct READDIR3resok {
2012 * post_op_attr dir_attributes;
2013 * cookieverf3 cookieverf;
2014 * dirlist3 reply;
2015 * };
2016 *
2017 * struct READDIR3resfail {
2018 * post_op_attr dir_attributes;
2019 * };
2020 *
2021 * union READDIR3res switch (nfsstat3 status) {
2022 * case NFS3_OK:
2023 * READDIR3resok resok;
2024 * default:
2025 * READDIR3resfail resfail;
2026 * };
2027 *
2028 * Read the directory contents into the page cache, but otherwise
2029 * don't touch them. The actual decoding is done by nfs3_decode_entry()
2030 * during subsequent nfs_readdir() calls.
974 */ 2031 */
975static int 2032static int decode_dirlist3(struct xdr_stream *xdr)
976nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res)
977{ 2033{
978 int status; 2034 u32 recvd, pglen;
2035 size_t hdrlen;
979 2036
980 if ((status = ntohl(*p++)) != 0) 2037 pglen = xdr->buf->page_len;
981 status = nfs_stat_to_errno(status); 2038 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
982 p = xdr_decode_wcc_data(p, res->old_fattr); 2039 recvd = xdr->buf->len - hdrlen;
983 p = xdr_decode_wcc_data(p, res->new_fattr); 2040 if (unlikely(pglen > recvd))
984 return status; 2041 goto out_cheating;
2042out:
2043 xdr_read_pages(xdr, pglen);
2044 return pglen;
2045out_cheating:
2046 dprintk("NFS: server cheating in readdir result: "
2047 "pglen %u > recvd %u\n", pglen, recvd);
2048 pglen = recvd;
2049 goto out;
985} 2050}
986 2051
987/* 2052static int decode_readdir3resok(struct xdr_stream *xdr,
988 * Decode LINK reply 2053 struct nfs3_readdirres *result)
989 */
990static int
991nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res)
992{ 2054{
993 int status; 2055 int error;
2056
2057 error = decode_post_op_attr(xdr, result->dir_attr);
2058 if (unlikely(error))
2059 goto out;
2060 /* XXX: do we need to check if result->verf != NULL ? */
2061 error = decode_cookieverf3(xdr, result->verf);
2062 if (unlikely(error))
2063 goto out;
2064 error = decode_dirlist3(xdr);
2065out:
2066 return error;
2067}
994 2068
995 if ((status = ntohl(*p++)) != 0) 2069static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req,
996 status = nfs_stat_to_errno(status); 2070 struct xdr_stream *xdr,
997 p = xdr_decode_post_op_attr(p, res->fattr); 2071 struct nfs3_readdirres *result)
998 p = xdr_decode_wcc_data(p, res->dir_attr); 2072{
999 return status; 2073 enum nfs_stat status;
2074 int error;
2075
2076 error = decode_nfsstat3(xdr, &status);
2077 if (unlikely(error))
2078 goto out;
2079 if (status != NFS3_OK)
2080 goto out_default;
2081 error = decode_readdir3resok(xdr, result);
2082out:
2083 return error;
2084out_default:
2085 error = decode_post_op_attr(xdr, result->dir_attr);
2086 if (unlikely(error))
2087 goto out;
2088 return nfs_stat_to_errno(status);
1000} 2089}
1001 2090
1002/* 2091/*
1003 * Decode FSSTAT reply 2092 * 3.3.18 FSSTAT3res
2093 *
2094 * struct FSSTAT3resok {
2095 * post_op_attr obj_attributes;
2096 * size3 tbytes;
2097 * size3 fbytes;
2098 * size3 abytes;
2099 * size3 tfiles;
2100 * size3 ffiles;
2101 * size3 afiles;
2102 * uint32 invarsec;
2103 * };
2104 *
2105 * struct FSSTAT3resfail {
2106 * post_op_attr obj_attributes;
2107 * };
2108 *
2109 * union FSSTAT3res switch (nfsstat3 status) {
2110 * case NFS3_OK:
2111 * FSSTAT3resok resok;
2112 * default:
2113 * FSSTAT3resfail resfail;
2114 * };
1004 */ 2115 */
1005static int 2116static int decode_fsstat3resok(struct xdr_stream *xdr,
1006nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res) 2117 struct nfs_fsstat *result)
1007{ 2118{
1008 int status; 2119 __be32 *p;
1009
1010 status = ntohl(*p++);
1011
1012 p = xdr_decode_post_op_attr(p, res->fattr);
1013 if (status != 0)
1014 return nfs_stat_to_errno(status);
1015
1016 p = xdr_decode_hyper(p, &res->tbytes);
1017 p = xdr_decode_hyper(p, &res->fbytes);
1018 p = xdr_decode_hyper(p, &res->abytes);
1019 p = xdr_decode_hyper(p, &res->tfiles);
1020 p = xdr_decode_hyper(p, &res->ffiles);
1021 p = xdr_decode_hyper(p, &res->afiles);
1022 2120
2121 p = xdr_inline_decode(xdr, 8 * 6 + 4);
2122 if (unlikely(p == NULL))
2123 goto out_overflow;
2124 p = xdr_decode_size3(p, &result->tbytes);
2125 p = xdr_decode_size3(p, &result->fbytes);
2126 p = xdr_decode_size3(p, &result->abytes);
2127 p = xdr_decode_size3(p, &result->tfiles);
2128 p = xdr_decode_size3(p, &result->ffiles);
2129 xdr_decode_size3(p, &result->afiles);
1023 /* ignore invarsec */ 2130 /* ignore invarsec */
1024 return 0; 2131 return 0;
2132out_overflow:
2133 print_overflow_msg(__func__, xdr);
2134 return -EIO;
2135}
2136
2137static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
2138 struct xdr_stream *xdr,
2139 struct nfs_fsstat *result)
2140{
2141 enum nfs_stat status;
2142 int error;
2143
2144 error = decode_nfsstat3(xdr, &status);
2145 if (unlikely(error))
2146 goto out;
2147 error = decode_post_op_attr(xdr, result->fattr);
2148 if (unlikely(error))
2149 goto out;
2150 if (status != NFS3_OK)
2151 goto out_status;
2152 error = decode_fsstat3resok(xdr, result);
2153out:
2154 return error;
2155out_status:
2156 return nfs_stat_to_errno(status);
1025} 2157}
1026 2158
1027/* 2159/*
1028 * Decode FSINFO reply 2160 * 3.3.19 FSINFO3res
2161 *
2162 * struct FSINFO3resok {
2163 * post_op_attr obj_attributes;
2164 * uint32 rtmax;
2165 * uint32 rtpref;
2166 * uint32 rtmult;
2167 * uint32 wtmax;
2168 * uint32 wtpref;
2169 * uint32 wtmult;
2170 * uint32 dtpref;
2171 * size3 maxfilesize;
2172 * nfstime3 time_delta;
2173 * uint32 properties;
2174 * };
2175 *
2176 * struct FSINFO3resfail {
2177 * post_op_attr obj_attributes;
2178 * };
2179 *
2180 * union FSINFO3res switch (nfsstat3 status) {
2181 * case NFS3_OK:
2182 * FSINFO3resok resok;
2183 * default:
2184 * FSINFO3resfail resfail;
2185 * };
1029 */ 2186 */
1030static int 2187static int decode_fsinfo3resok(struct xdr_stream *xdr,
1031nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res) 2188 struct nfs_fsinfo *result)
1032{ 2189{
1033 int status; 2190 __be32 *p;
1034
1035 status = ntohl(*p++);
1036
1037 p = xdr_decode_post_op_attr(p, res->fattr);
1038 if (status != 0)
1039 return nfs_stat_to_errno(status);
1040 2191
1041 res->rtmax = ntohl(*p++); 2192 p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4);
1042 res->rtpref = ntohl(*p++); 2193 if (unlikely(p == NULL))
1043 res->rtmult = ntohl(*p++); 2194 goto out_overflow;
1044 res->wtmax = ntohl(*p++); 2195 result->rtmax = be32_to_cpup(p++);
1045 res->wtpref = ntohl(*p++); 2196 result->rtpref = be32_to_cpup(p++);
1046 res->wtmult = ntohl(*p++); 2197 result->rtmult = be32_to_cpup(p++);
1047 res->dtpref = ntohl(*p++); 2198 result->wtmax = be32_to_cpup(p++);
1048 p = xdr_decode_hyper(p, &res->maxfilesize); 2199 result->wtpref = be32_to_cpup(p++);
1049 p = xdr_decode_time3(p, &res->time_delta); 2200 result->wtmult = be32_to_cpup(p++);
2201 result->dtpref = be32_to_cpup(p++);
2202 p = xdr_decode_size3(p, &result->maxfilesize);
2203 xdr_decode_nfstime3(p, &result->time_delta);
1050 2204
1051 /* ignore properties */ 2205 /* ignore properties */
1052 res->lease_time = 0; 2206 result->lease_time = 0;
1053 return 0; 2207 return 0;
2208out_overflow:
2209 print_overflow_msg(__func__, xdr);
2210 return -EIO;
2211}
2212
2213static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
2214 struct xdr_stream *xdr,
2215 struct nfs_fsinfo *result)
2216{
2217 enum nfs_stat status;
2218 int error;
2219
2220 error = decode_nfsstat3(xdr, &status);
2221 if (unlikely(error))
2222 goto out;
2223 error = decode_post_op_attr(xdr, result->fattr);
2224 if (unlikely(error))
2225 goto out;
2226 if (status != NFS3_OK)
2227 goto out_status;
2228 error = decode_fsinfo3resok(xdr, result);
2229out:
2230 return error;
2231out_status:
2232 return nfs_stat_to_errno(status);
1054} 2233}
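Annotation: the FSINFO3resok body after the post_op_attr is fixed-size, which is why decode_fsinfo3resok() can pull it off the wire in a single inline decode; the reserve length 4 * 7 + 8 + 8 + 4 is seven uint32s, one size3, one nfstime3, and the properties word:

	#include <stdio.h>

	int main(void)
	{
		/* rtmax..dtpref (7 words) + maxfilesize + time_delta
		 * + properties, matching the inline-decode length above. */
		printf("%d bytes\n", 4 * 7 + 8 + 8 + 4);	/* prints 48 */
		return 0;
	}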
1055 2234
1056/* 2235/*
1057 * Decode PATHCONF reply 2236 * 3.3.20 PATHCONF3res
2237 *
2238 * struct PATHCONF3resok {
2239 * post_op_attr obj_attributes;
2240 * uint32 linkmax;
2241 * uint32 name_max;
2242 * bool no_trunc;
2243 * bool chown_restricted;
2244 * bool case_insensitive;
2245 * bool case_preserving;
2246 * };
2247 *
2248 * struct PATHCONF3resfail {
2249 * post_op_attr obj_attributes;
2250 * };
2251 *
2252 * union PATHCONF3res switch (nfsstat3 status) {
2253 * case NFS3_OK:
2254 * PATHCONF3resok resok;
2255 * default:
2256 * PATHCONF3resfail resfail;
2257 * };
1058 */ 2258 */
1059static int 2259static int decode_pathconf3resok(struct xdr_stream *xdr,
1060nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res) 2260 struct nfs_pathconf *result)
1061{ 2261{
1062 int status; 2262 __be32 *p;
1063
1064 status = ntohl(*p++);
1065
1066 p = xdr_decode_post_op_attr(p, res->fattr);
1067 if (status != 0)
1068 return nfs_stat_to_errno(status);
1069 res->max_link = ntohl(*p++);
1070 res->max_namelen = ntohl(*p++);
1071 2263
2264 p = xdr_inline_decode(xdr, 4 * 6);
2265 if (unlikely(p == NULL))
2266 goto out_overflow;
2267 result->max_link = be32_to_cpup(p++);
2268 result->max_namelen = be32_to_cpup(p);
1072 /* ignore remaining fields */ 2269 /* ignore remaining fields */
1073 return 0; 2270 return 0;
2271out_overflow:
2272 print_overflow_msg(__func__, xdr);
2273 return -EIO;
2274}
2275
2276static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
2277 struct xdr_stream *xdr,
2278 struct nfs_pathconf *result)
2279{
2280 enum nfs_stat status;
2281 int error;
2282
2283 error = decode_nfsstat3(xdr, &status);
2284 if (unlikely(error))
2285 goto out;
2286 error = decode_post_op_attr(xdr, result->fattr);
2287 if (unlikely(error))
2288 goto out;
2289 if (status != NFS3_OK)
2290 goto out_status;
2291 error = decode_pathconf3resok(xdr, result);
2292out:
2293 return error;
2294out_status:
2295 return nfs_stat_to_errno(status);
1074} 2296}
1075 2297
1076/* 2298/*
1077 * Decode COMMIT reply 2299 * 3.3.21 COMMIT3res
2300 *
2301 * struct COMMIT3resok {
2302 * wcc_data file_wcc;
2303 * writeverf3 verf;
2304 * };
2305 *
2306 * struct COMMIT3resfail {
2307 * wcc_data file_wcc;
2308 * };
2309 *
2310 * union COMMIT3res switch (nfsstat3 status) {
2311 * case NFS3_OK:
2312 * COMMIT3resok resok;
2313 * default:
2314 * COMMIT3resfail resfail;
2315 * };
1078 */ 2316 */
1079static int 2317static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
1080nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) 2318 struct xdr_stream *xdr,
2319 struct nfs_writeres *result)
1081{ 2320{
1082 int status; 2321 enum nfs_stat status;
1083 2322 int error;
1084 status = ntohl(*p++); 2323
1085 p = xdr_decode_wcc_data(p, res->fattr); 2324 error = decode_nfsstat3(xdr, &status);
1086 if (status != 0) 2325 if (unlikely(error))
1087 return nfs_stat_to_errno(status); 2326 goto out;
1088 2327 error = decode_wcc_data(xdr, result->fattr);
1089 res->verf->verifier[0] = *p++; 2328 if (unlikely(error))
1090 res->verf->verifier[1] = *p++; 2329 goto out;
1091 return 0; 2330 if (status != NFS3_OK)
2331 goto out_status;
2332 error = decode_writeverf3(xdr, result->verf->verifier);
2333out:
2334 return error;
2335out_status:
2336 return nfs_stat_to_errno(status);
1092} 2337}
1093 2338
1094#ifdef CONFIG_NFS_V3_ACL 2339#ifdef CONFIG_NFS_V3_ACL
1095/* 2340
1096 * Decode GETACL reply 2341static inline int decode_getacl3resok(struct xdr_stream *xdr,
1097 */ 2342 struct nfs3_getaclres *result)
1098static int
1099nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p,
1100 struct nfs3_getaclres *res)
1101{ 2343{
1102 struct xdr_buf *buf = &req->rq_rcv_buf;
1103 int status = ntohl(*p++);
1104 struct posix_acl **acl; 2344 struct posix_acl **acl;
1105 unsigned int *aclcnt; 2345 unsigned int *aclcnt;
1106 int err, base; 2346 size_t hdrlen;
1107 2347 int error;
1108 if (status != 0) 2348
1109 return nfs_stat_to_errno(status); 2349 error = decode_post_op_attr(xdr, result->fattr);
1110 p = xdr_decode_post_op_attr(p, res->fattr); 2350 if (unlikely(error))
1111 res->mask = ntohl(*p++); 2351 goto out;
1112 if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) 2352 error = decode_uint32(xdr, &result->mask);
1113 return -EINVAL; 2353 if (unlikely(error))
1114 base = (char *)p - (char *)req->rq_rcv_buf.head->iov_base; 2354 goto out;
1115 2355 error = -EINVAL;
1116 acl = (res->mask & NFS_ACL) ? &res->acl_access : NULL; 2356 if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
1117 aclcnt = (res->mask & NFS_ACLCNT) ? &res->acl_access_count : NULL; 2357 goto out;
1118 err = nfsacl_decode(buf, base, aclcnt, acl); 2358
1119 2359 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
1120 acl = (res->mask & NFS_DFACL) ? &res->acl_default : NULL; 2360
1121 aclcnt = (res->mask & NFS_DFACLCNT) ? &res->acl_default_count : NULL; 2361 acl = NULL;
1122 if (err > 0) 2362 if (result->mask & NFS_ACL)
1123 err = nfsacl_decode(buf, base + err, aclcnt, acl); 2363 acl = &result->acl_access;
1124 return (err > 0) ? 0 : err; 2364 aclcnt = NULL;
2365 if (result->mask & NFS_ACLCNT)
2366 aclcnt = &result->acl_access_count;
2367 error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl);
2368 if (unlikely(error <= 0))
2369 goto out;
2370
2371 acl = NULL;
2372 if (result->mask & NFS_DFACL)
2373 acl = &result->acl_default;
2374 aclcnt = NULL;
2375 if (result->mask & NFS_DFACLCNT)
2376 aclcnt = &result->acl_default_count;
2377 error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl);
2378 if (unlikely(error <= 0))
2379 return error;
2380 error = 0;
2381out:
2382 return error;
1125} 2383}
1126 2384
1127/* 2385static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req,
1128 * Decode setacl reply. 2386 struct xdr_stream *xdr,
1129 */ 2387 struct nfs3_getaclres *result)
1130static int
1131nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
1132{ 2388{
1133 int status = ntohl(*p++); 2389 enum nfs_stat status;
2390 int error;
2391
2392 error = decode_nfsstat3(xdr, &status);
2393 if (unlikely(error))
2394 goto out;
2395 if (status != NFS3_OK)
2396 goto out_default;
2397 error = decode_getacl3resok(xdr, result);
2398out:
2399 return error;
2400out_default:
2401 return nfs_stat_to_errno(status);
2402}
1134 2403
1135 if (status) 2404static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
1136 return nfs_stat_to_errno(status); 2405 struct xdr_stream *xdr,
1137 xdr_decode_post_op_attr(p, fattr); 2406 struct nfs_fattr *result)
1138 return 0; 2407{
2408 enum nfs_stat status;
2409 int error;
2410
2411 error = decode_nfsstat3(xdr, &status);
2412 if (unlikely(error))
2413 goto out;
2414 if (status != NFS3_OK)
2415 goto out_default;
2416 error = decode_post_op_attr(xdr, result);
2417out:
2418 return error;
2419out_default:
2420 return nfs_stat_to_errno(status);
1139} 2421}
2422
1140#endif /* CONFIG_NFS_V3_ACL */ 2423#endif /* CONFIG_NFS_V3_ACL */
1141 2424
1142#define PROC(proc, argtype, restype, timer) \ 2425#define PROC(proc, argtype, restype, timer) \
1143[NFS3PROC_##proc] = { \ 2426[NFS3PROC_##proc] = { \
1144 .p_proc = NFS3PROC_##proc, \ 2427 .p_proc = NFS3PROC_##proc, \
1145 .p_encode = (kxdrproc_t) nfs3_xdr_##argtype, \ 2428 .p_encode = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args, \
1146 .p_decode = (kxdrproc_t) nfs3_xdr_##restype, \ 2429 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res, \
1147 .p_arglen = NFS3_##argtype##_sz, \ 2430 .p_arglen = NFS3_##argtype##args_sz, \
1148 .p_replen = NFS3_##restype##_sz, \ 2431 .p_replen = NFS3_##restype##res_sz, \
1149 .p_timer = timer, \ 2432 .p_timer = timer, \
1150 .p_statidx = NFS3PROC_##proc, \ 2433 .p_statidx = NFS3PROC_##proc, \
1151 .p_name = #proc, \ 2434 .p_name = #proc, \
1152 } 2435 }
1153 2436
1154struct rpc_procinfo nfs3_procedures[] = { 2437struct rpc_procinfo nfs3_procedures[] = {
1155 PROC(GETATTR, fhandle, attrstat, 1), 2438 PROC(GETATTR, getattr, getattr, 1),
1156 PROC(SETATTR, sattrargs, wccstat, 0), 2439 PROC(SETATTR, setattr, setattr, 0),
1157 PROC(LOOKUP, diropargs, lookupres, 2), 2440 PROC(LOOKUP, lookup, lookup, 2),
1158 PROC(ACCESS, accessargs, accessres, 1), 2441 PROC(ACCESS, access, access, 1),
1159 PROC(READLINK, readlinkargs, readlinkres, 3), 2442 PROC(READLINK, readlink, readlink, 3),
1160 PROC(READ, readargs, readres, 3), 2443 PROC(READ, read, read, 3),
1161 PROC(WRITE, writeargs, writeres, 4), 2444 PROC(WRITE, write, write, 4),
1162 PROC(CREATE, createargs, createres, 0), 2445 PROC(CREATE, create, create, 0),
1163 PROC(MKDIR, mkdirargs, createres, 0), 2446 PROC(MKDIR, mkdir, create, 0),
1164 PROC(SYMLINK, symlinkargs, createres, 0), 2447 PROC(SYMLINK, symlink, create, 0),
1165 PROC(MKNOD, mknodargs, createres, 0), 2448 PROC(MKNOD, mknod, create, 0),
1166 PROC(REMOVE, removeargs, removeres, 0), 2449 PROC(REMOVE, remove, remove, 0),
1167 PROC(RMDIR, diropargs, wccstat, 0), 2450 PROC(RMDIR, lookup, setattr, 0),
1168 PROC(RENAME, renameargs, renameres, 0), 2451 PROC(RENAME, rename, rename, 0),
1169 PROC(LINK, linkargs, linkres, 0), 2452 PROC(LINK, link, link, 0),
1170 PROC(READDIR, readdirargs, readdirres, 3), 2453 PROC(READDIR, readdir, readdir, 3),
1171 PROC(READDIRPLUS, readdirargs, readdirres, 3), 2454 PROC(READDIRPLUS, readdirplus, readdir, 3),
1172 PROC(FSSTAT, fhandle, fsstatres, 0), 2455 PROC(FSSTAT, getattr, fsstat, 0),
1173 PROC(FSINFO, fhandle, fsinfores, 0), 2456 PROC(FSINFO, getattr, fsinfo, 0),
1174 PROC(PATHCONF, fhandle, pathconfres, 0), 2457 PROC(PATHCONF, getattr, pathconf, 0),
1175 PROC(COMMIT, commitargs, commitres, 5), 2458 PROC(COMMIT, commit, commit, 5),
1176}; 2459};
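Annotation: each table row is generated by the PROC() macro defined above; token pasting and stringization produce the full initializer, so PROC(GETATTR, getattr, getattr, 1) expands to the designated initializer below (types are the kernel's — shown for reading, not as standalone code). Rows like RMDIR reuse the lookup encoder and the setattr decoder because RMDIR3args is just a diropargs3 and RMDIR3res is a status plus wcc_data:

	[NFS3PROC_GETATTR] = {
		.p_proc    = NFS3PROC_GETATTR,
		.p_encode  = (kxdreproc_t)nfs3_xdr_enc_getattr3args,
		.p_decode  = (kxdrdproc_t)nfs3_xdr_dec_getattr3res,
		.p_arglen  = NFS3_getattrargs_sz,
		.p_replen  = NFS3_getattrres_sz,
		.p_timer   = 1,
		.p_statidx = NFS3PROC_GETATTR,
		.p_name    = "GETATTR",
	},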
1177 2460
1178struct rpc_version nfs_version3 = { 2461struct rpc_version nfs_version3 = {
@@ -1185,8 +2468,8 @@ struct rpc_version nfs_version3 = {
1185static struct rpc_procinfo nfs3_acl_procedures[] = { 2468static struct rpc_procinfo nfs3_acl_procedures[] = {
1186 [ACLPROC3_GETACL] = { 2469 [ACLPROC3_GETACL] = {
1187 .p_proc = ACLPROC3_GETACL, 2470 .p_proc = ACLPROC3_GETACL,
1188 .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs, 2471 .p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args,
1189 .p_decode = (kxdrproc_t) nfs3_xdr_getaclres, 2472 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res,
1190 .p_arglen = ACL3_getaclargs_sz, 2473 .p_arglen = ACL3_getaclargs_sz,
1191 .p_replen = ACL3_getaclres_sz, 2474 .p_replen = ACL3_getaclres_sz,
1192 .p_timer = 1, 2475 .p_timer = 1,
@@ -1194,8 +2477,8 @@ static struct rpc_procinfo nfs3_acl_procedures[] = {
1194 }, 2477 },
1195 [ACLPROC3_SETACL] = { 2478 [ACLPROC3_SETACL] = {
1196 .p_proc = ACLPROC3_SETACL, 2479 .p_proc = ACLPROC3_SETACL,
1197 .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs, 2480 .p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args,
1198 .p_decode = (kxdrproc_t) nfs3_xdr_setaclres, 2481 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res,
1199 .p_arglen = ACL3_setaclargs_sz, 2482 .p_arglen = ACL3_setaclargs_sz,
1200 .p_replen = ACL3_setaclres_sz, 2483 .p_replen = ACL3_setaclres_sz,
1201 .p_timer = 0, 2484 .p_timer = 0,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 9fa496387fdf..7a7474073148 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
44 NFS4CLNT_RECLAIM_REBOOT, 44 NFS4CLNT_RECLAIM_REBOOT,
45 NFS4CLNT_RECLAIM_NOGRACE, 45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_LAYOUTRECALL,
47 NFS4CLNT_SESSION_RESET, 48 NFS4CLNT_SESSION_RESET,
48 NFS4CLNT_RECALL_SLOT, 49 NFS4CLNT_RECALL_SLOT,
49}; 50};
@@ -109,7 +110,7 @@ struct nfs_unique_id {
109struct nfs4_state_owner { 110struct nfs4_state_owner {
110 struct nfs_unique_id so_owner_id; 111 struct nfs_unique_id so_owner_id;
111 struct nfs_server *so_server; 112 struct nfs_server *so_server;
112 struct rb_node so_client_node; 113 struct rb_node so_server_node;
113 114
114 struct rpc_cred *so_cred; /* Associated cred */ 115 struct rpc_cred *so_cred; /* Associated cred */
115 116
@@ -227,12 +228,6 @@ struct nfs4_state_maintenance_ops {
227extern const struct dentry_operations nfs4_dentry_operations; 228extern const struct dentry_operations nfs4_dentry_operations;
228extern const struct inode_operations nfs4_dir_inode_operations; 229extern const struct inode_operations nfs4_dir_inode_operations;
229 230
230/* inode.c */
231extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t);
232extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int);
233extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
234
235
236/* nfs4proc.c */ 231/* nfs4proc.c */
237extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); 232extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
238extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); 233extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
@@ -241,11 +236,12 @@ extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
241extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); 236extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
242extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 237extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
243extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 238extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
244extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); 239extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
245extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 240extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
246extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 241extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
247 struct nfs4_fs_locations *fs_locations, struct page *page); 242 struct nfs4_fs_locations *fs_locations, struct page *page);
248extern void nfs4_release_lockowner(const struct nfs4_lock_state *); 243extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
244extern const struct xattr_handler *nfs4_xattr_handlers[];
249 245
250#if defined(CONFIG_NFS_V4_1) 246#if defined(CONFIG_NFS_V4_1)
251static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 247static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -331,7 +327,6 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
331extern const nfs4_stateid zero_stateid; 327extern const nfs4_stateid zero_stateid;
332 328
333/* nfs4xdr.c */ 329/* nfs4xdr.c */
334extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
335extern struct rpc_procinfo nfs4_procedures[]; 330extern struct rpc_procinfo nfs4_procedures[];
336 331
337struct nfs4_mount_data; 332struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 2e92f0d8d654..23f930caf1e2 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -82,7 +82,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
82{ 82{
83 struct nfs4_file_layout_dsaddr *dsaddr; 83 struct nfs4_file_layout_dsaddr *dsaddr;
84 int status = -EINVAL; 84 int status = -EINVAL;
85 struct nfs_server *nfss = NFS_SERVER(lo->inode); 85 struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
86 86
87 dprintk("--> %s\n", __func__); 87 dprintk("--> %s\n", __func__);
88 88
@@ -101,7 +101,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
101 /* find and reference the deviceid */ 101 /* find and reference the deviceid */
102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); 102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
103 if (dsaddr == NULL) { 103 if (dsaddr == NULL) {
104 dsaddr = get_device_info(lo->inode, id); 104 dsaddr = get_device_info(lo->plh_inode, id);
105 if (dsaddr == NULL) 105 if (dsaddr == NULL)
106 goto out; 106 goto out;
107 } 107 }
@@ -243,7 +243,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
243static void 243static void
244filelayout_free_lseg(struct pnfs_layout_segment *lseg) 244filelayout_free_lseg(struct pnfs_layout_segment *lseg)
245{ 245{
246 struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode); 246 struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
248 248
249 dprintk("--> %s\n", __func__); 249 dprintk("--> %s\n", __func__);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4435e5e1f904..9d992b0346e3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -49,6 +49,7 @@
49#include <linux/mount.h> 49#include <linux/mount.h>
50#include <linux/module.h> 50#include <linux/module.h>
51#include <linux/sunrpc/bc_xprt.h> 51#include <linux/sunrpc/bc_xprt.h>
52#include <linux/xattr.h>
52 53
53#include "nfs4_fs.h" 54#include "nfs4_fs.h"
54#include "delegation.h" 55#include "delegation.h"
@@ -355,9 +356,9 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
355} 356}
356 357
357/* 358/*
358 * Signal state manager thread if session is drained 359 * Signal state manager thread if session fore channel is drained
359 */ 360 */
360static void nfs41_check_drain_session_complete(struct nfs4_session *ses) 361static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
361{ 362{
362 struct rpc_task *task; 363 struct rpc_task *task;
363 364
@@ -371,8 +372,20 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
371 if (ses->fc_slot_table.highest_used_slotid != -1) 372 if (ses->fc_slot_table.highest_used_slotid != -1)
372 return; 373 return;
373 374
374 dprintk("%s COMPLETE: Session Drained\n", __func__); 375 dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
375 complete(&ses->complete); 376 complete(&ses->fc_slot_table.complete);
377}
378
379/*
380 * Signal state manager thread if session back channel is drained
381 */
382void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
383{
384 if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
385 ses->bc_slot_table.highest_used_slotid != -1)
386 return;
387 dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
388 complete(&ses->bc_slot_table.complete);
376} 389}
377 390
378static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) 391static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
@@ -389,7 +402,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
389 402
390 spin_lock(&tbl->slot_tbl_lock); 403 spin_lock(&tbl->slot_tbl_lock);
391 nfs4_free_slot(tbl, res->sr_slot); 404 nfs4_free_slot(tbl, res->sr_slot);
392 nfs41_check_drain_session_complete(res->sr_session); 405 nfs4_check_drain_fc_complete(res->sr_session);
393 spin_unlock(&tbl->slot_tbl_lock); 406 spin_unlock(&tbl->slot_tbl_lock);
394 res->sr_slot = NULL; 407 res->sr_slot = NULL;
395} 408}
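
The two hunks above split the session-wide drain signal into one completion per slot table: the slot-freeing path signals only when highest_used_slotid falls back to -1, and the fore and back channels can now drain independently. A rough userspace analogue using a condition variable per table (the kernel's completion API differs in detail; this is only a sketch):

/* Sketch: one "drained" signal per slot table, fired when the last
 * in-flight slot is released. Pthreads stand in for the kernel's
 * spinlock + completion.
 */
#include <pthread.h>
#include <stdio.h>

struct slot_table {
	pthread_mutex_t lock;
	pthread_cond_t drained;
	int highest_used_slotid;	/* -1 means no slots in use */
};

static void free_slot(struct slot_table *tbl)
{
	pthread_mutex_lock(&tbl->lock);
	tbl->highest_used_slotid--;	/* simplified bookkeeping */
	if (tbl->highest_used_slotid == -1) {
		printf("table drained, waking waiter\n");
		pthread_cond_broadcast(&tbl->drained);
	}
	pthread_mutex_unlock(&tbl->lock);
}

static void *io_thread(void *arg)
{
	struct slot_table *tbl = arg;
	free_slot(tbl);		/* pretend one RPC completed */
	return NULL;
}

int main(void)
{
	struct slot_table fc = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
	};
	pthread_t t;

	pthread_create(&t, NULL, io_thread, &fc);

	/* Waiter side, as in nfs4_begin_drain_session(): block until
	 * the table reports no slots in use. */
	pthread_mutex_lock(&fc.lock);
	while (fc.highest_used_slotid != -1)
		pthread_cond_wait(&fc.drained, &fc.lock);
	pthread_mutex_unlock(&fc.lock);

	pthread_join(t, NULL);
	puts("fore channel drained");
	return 0;
}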
@@ -1826,6 +1839,8 @@ struct nfs4_closedata {
1826 struct nfs_closeres res; 1839 struct nfs_closeres res;
1827 struct nfs_fattr fattr; 1840 struct nfs_fattr fattr;
1828 unsigned long timestamp; 1841 unsigned long timestamp;
1842 bool roc;
1843 u32 roc_barrier;
1829}; 1844};
1830 1845
1831static void nfs4_free_closedata(void *data) 1846static void nfs4_free_closedata(void *data)
@@ -1833,6 +1848,8 @@ static void nfs4_free_closedata(void *data)
1833 struct nfs4_closedata *calldata = data; 1848 struct nfs4_closedata *calldata = data;
1834 struct nfs4_state_owner *sp = calldata->state->owner; 1849 struct nfs4_state_owner *sp = calldata->state->owner;
1835 1850
1851 if (calldata->roc)
1852 pnfs_roc_release(calldata->state->inode);
1836 nfs4_put_open_state(calldata->state); 1853 nfs4_put_open_state(calldata->state);
1837 nfs_free_seqid(calldata->arg.seqid); 1854 nfs_free_seqid(calldata->arg.seqid);
1838 nfs4_put_state_owner(sp); 1855 nfs4_put_state_owner(sp);
@@ -1865,6 +1882,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1865 */ 1882 */
1866 switch (task->tk_status) { 1883 switch (task->tk_status) {
1867 case 0: 1884 case 0:
1885 if (calldata->roc)
1886 pnfs_roc_set_barrier(state->inode,
1887 calldata->roc_barrier);
1868 nfs_set_open_stateid(state, &calldata->res.stateid, 0); 1888 nfs_set_open_stateid(state, &calldata->res.stateid, 0);
1869 renew_lease(server, calldata->timestamp); 1889 renew_lease(server, calldata->timestamp);
1870 nfs4_close_clear_stateid_flags(state, 1890 nfs4_close_clear_stateid_flags(state,
@@ -1917,8 +1937,15 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1917 return; 1937 return;
1918 } 1938 }
1919 1939
1920 if (calldata->arg.fmode == 0) 1940 if (calldata->arg.fmode == 0) {
1921 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; 1941 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
1942 if (calldata->roc &&
1943 pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
1944 rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
1945 task, NULL);
1946 return;
1947 }
1948 }
1922 1949
1923 nfs_fattr_init(calldata->res.fattr); 1950 nfs_fattr_init(calldata->res.fattr);
1924 calldata->timestamp = jiffies; 1951 calldata->timestamp = jiffies;
@@ -1946,7 +1973,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
1946 * 1973 *
1947 * NOTE: Caller must be holding the sp->so_owner semaphore! 1974 * NOTE: Caller must be holding the sp->so_owner semaphore!
1948 */ 1975 */
1949int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait) 1976int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
1950{ 1977{
1951 struct nfs_server *server = NFS_SERVER(state->inode); 1978 struct nfs_server *server = NFS_SERVER(state->inode);
1952 struct nfs4_closedata *calldata; 1979 struct nfs4_closedata *calldata;
@@ -1981,11 +2008,12 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
1981 calldata->res.fattr = &calldata->fattr; 2008 calldata->res.fattr = &calldata->fattr;
1982 calldata->res.seqid = calldata->arg.seqid; 2009 calldata->res.seqid = calldata->arg.seqid;
1983 calldata->res.server = server; 2010 calldata->res.server = server;
2011 calldata->roc = roc;
1984 path_get(path); 2012 path_get(path);
1985 calldata->path = *path; 2013 calldata->path = *path;
1986 2014
1987 msg.rpc_argp = &calldata->arg, 2015 msg.rpc_argp = &calldata->arg;
1988 msg.rpc_resp = &calldata->res, 2016 msg.rpc_resp = &calldata->res;
1989 task_setup_data.callback_data = calldata; 2017 task_setup_data.callback_data = calldata;
1990 task = rpc_run_task(&task_setup_data); 2018 task = rpc_run_task(&task_setup_data);
1991 if (IS_ERR(task)) 2019 if (IS_ERR(task))
@@ -1998,6 +2026,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
1998out_free_calldata: 2026out_free_calldata:
1999 kfree(calldata); 2027 kfree(calldata);
2000out: 2028out:
2029 if (roc)
2030 pnfs_roc_release(state->inode);
2001 nfs4_put_open_state(state); 2031 nfs4_put_open_state(state);
2002 nfs4_put_state_owner(sp); 2032 nfs4_put_state_owner(sp);
2003 return status; 2033 return status;
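
Every exit from the extended nfs4_do_close() must balance the pnfs_roc() reference its caller took: on success the RPC release callback drops it via nfs4_free_closedata(), and the error path above drops it directly. A small sketch of that single-release discipline (all names invented for illustration):

/* Sketch: a caller-acquired reference that must be released on exactly
 * one path -- either by the completion callback or by the error path
 * of the submit function.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct closedata {
	bool roc;		/* did the caller take the layout ref? */
};

static void roc_release(void)
{
	puts("roc reference released");
}

static void free_closedata(struct closedata *d)
{
	if (d->roc)		/* success path: callback releases */
		roc_release();
	free(d);
}

static int do_close(bool roc, bool simulate_alloc_failure)
{
	struct closedata *d = NULL;

	if (!simulate_alloc_failure)
		d = malloc(sizeof(*d));
	if (d == NULL)
		goto out;	/* error path: release here instead */
	d->roc = roc;
	free_closedata(d);	/* stands in for the RPC completing */
	return 0;
out:
	if (roc)
		roc_release();
	return -1;
}

int main(void)
{
	do_close(true, false);	/* released by the "callback" */
	do_close(true, true);	/* released by the error path */
	return 0;
}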
@@ -2486,6 +2516,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2486 path = &ctx->path; 2516 path = &ctx->path;
2487 fmode = ctx->mode; 2517 fmode = ctx->mode;
2488 } 2518 }
2519 sattr->ia_mode &= ~current_umask();
2489 state = nfs4_do_open(dir, path, fmode, flags, sattr, cred); 2520 state = nfs4_do_open(dir, path, fmode, flags, sattr, cred);
2490 d_drop(dentry); 2521 d_drop(dentry);
2491 if (IS_ERR(state)) { 2522 if (IS_ERR(state)) {
@@ -2816,6 +2847,8 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
2816{ 2847{
2817 struct nfs4_exception exception = { }; 2848 struct nfs4_exception exception = { };
2818 int err; 2849 int err;
2850
2851 sattr->ia_mode &= ~current_umask();
2819 do { 2852 do {
2820 err = nfs4_handle_exception(NFS_SERVER(dir), 2853 err = nfs4_handle_exception(NFS_SERVER(dir),
2821 _nfs4_proc_mkdir(dir, dentry, sattr), 2854 _nfs4_proc_mkdir(dir, dentry, sattr),
@@ -2916,6 +2949,8 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
2916{ 2949{
2917 struct nfs4_exception exception = { }; 2950 struct nfs4_exception exception = { };
2918 int err; 2951 int err;
2952
2953 sattr->ia_mode &= ~current_umask();
2919 do { 2954 do {
2920 err = nfs4_handle_exception(NFS_SERVER(dir), 2955 err = nfs4_handle_exception(NFS_SERVER(dir),
2921 _nfs4_proc_mknod(dir, dentry, sattr, rdev), 2956 _nfs4_proc_mknod(dir, dentry, sattr, rdev),
@@ -3478,6 +3513,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3478 struct nfs4_setclientid setclientid = { 3513 struct nfs4_setclientid setclientid = {
3479 .sc_verifier = &sc_verifier, 3514 .sc_verifier = &sc_verifier,
3480 .sc_prog = program, 3515 .sc_prog = program,
3516 .sc_cb_ident = clp->cl_cb_ident,
3481 }; 3517 };
3482 struct rpc_message msg = { 3518 struct rpc_message msg = {
3483 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], 3519 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@@ -3517,7 +3553,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3517 if (signalled()) 3553 if (signalled())
3518 break; 3554 break;
3519 if (loop++ & 1) 3555 if (loop++ & 1)
3520 ssleep(clp->cl_lease_time + 1); 3556 ssleep(clp->cl_lease_time / HZ + 1);
3521 else 3557 else
3522 if (++clp->cl_id_uniquifier == 0) 3558 if (++clp->cl_id_uniquifier == 0)
3523 break; 3559 break;
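
The ssleep() change above is a unit fix: cl_lease_time is stored in jiffies while ssleep() takes seconds, so the old call could sleep for hours instead of one lease period. A worked example of the magnitude, with an assumed tick rate:

/* Worked example: cl_lease_time is in jiffies, ssleep() wants seconds.
 * With HZ = 250 and a 90-second lease, the old code asked for a
 * 22501-second nap instead of 91 seconds.
 */
#include <stdio.h>

#define HZ 250			/* assumed kernel tick rate */

int main(void)
{
	unsigned long lease_seconds = 90;
	unsigned long cl_lease_time = lease_seconds * HZ;	/* jiffies */

	printf("old: ssleep(%lu) -> ~%.2f hours\n",
	       cl_lease_time + 1, (cl_lease_time + 1) / 3600.0);
	printf("new: ssleep(%lu) -> %lu seconds\n",
	       cl_lease_time / HZ + 1, cl_lease_time / HZ + 1);
	return 0;
}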
@@ -3663,8 +3699,8 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3663 data->rpc_status = 0; 3699 data->rpc_status = 0;
3664 3700
3665 task_setup_data.callback_data = data; 3701 task_setup_data.callback_data = data;
3666 msg.rpc_argp = &data->args, 3702 msg.rpc_argp = &data->args;
3667 msg.rpc_resp = &data->res, 3703 msg.rpc_resp = &data->res;
3668 task = rpc_run_task(&task_setup_data); 3704 task = rpc_run_task(&task_setup_data);
3669 if (IS_ERR(task)) 3705 if (IS_ERR(task))
3670 return PTR_ERR(task); 3706 return PTR_ERR(task);
@@ -3743,6 +3779,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3743 goto out; 3779 goto out;
3744 lsp = request->fl_u.nfs4_fl.owner; 3780 lsp = request->fl_u.nfs4_fl.owner;
3745 arg.lock_owner.id = lsp->ls_id.id; 3781 arg.lock_owner.id = lsp->ls_id.id;
3782 arg.lock_owner.s_dev = server->s_dev;
3746 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 3783 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
3747 switch (status) { 3784 switch (status) {
3748 case 0: 3785 case 0:
@@ -3908,8 +3945,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
3908 return ERR_PTR(-ENOMEM); 3945 return ERR_PTR(-ENOMEM);
3909 } 3946 }
3910 3947
3911 msg.rpc_argp = &data->arg, 3948 msg.rpc_argp = &data->arg;
3912 msg.rpc_resp = &data->res, 3949 msg.rpc_resp = &data->res;
3913 task_setup_data.callback_data = data; 3950 task_setup_data.callback_data = data;
3914 return rpc_run_task(&task_setup_data); 3951 return rpc_run_task(&task_setup_data);
3915} 3952}
@@ -3988,6 +4025,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
3988 p->arg.lock_stateid = &lsp->ls_stateid; 4025 p->arg.lock_stateid = &lsp->ls_stateid;
3989 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 4026 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
3990 p->arg.lock_owner.id = lsp->ls_id.id; 4027 p->arg.lock_owner.id = lsp->ls_id.id;
4028 p->arg.lock_owner.s_dev = server->s_dev;
3991 p->res.lock_seqid = p->arg.lock_seqid; 4029 p->res.lock_seqid = p->arg.lock_seqid;
3992 p->lsp = lsp; 4030 p->lsp = lsp;
3993 p->server = server; 4031 p->server = server;
@@ -4145,8 +4183,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
4145 data->arg.reclaim = NFS_LOCK_RECLAIM; 4183 data->arg.reclaim = NFS_LOCK_RECLAIM;
4146 task_setup_data.callback_ops = &nfs4_recover_lock_ops; 4184 task_setup_data.callback_ops = &nfs4_recover_lock_ops;
4147 } 4185 }
4148 msg.rpc_argp = &data->arg, 4186 msg.rpc_argp = &data->arg;
4149 msg.rpc_resp = &data->res, 4187 msg.rpc_resp = &data->res;
4150 task_setup_data.callback_data = data; 4188 task_setup_data.callback_data = data;
4151 task = rpc_run_task(&task_setup_data); 4189 task = rpc_run_task(&task_setup_data);
4152 if (IS_ERR(task)) 4190 if (IS_ERR(task))
@@ -4392,48 +4430,43 @@ void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
4392 return; 4430 return;
4393 args->lock_owner.clientid = server->nfs_client->cl_clientid; 4431 args->lock_owner.clientid = server->nfs_client->cl_clientid;
4394 args->lock_owner.id = lsp->ls_id.id; 4432 args->lock_owner.id = lsp->ls_id.id;
4433 args->lock_owner.s_dev = server->s_dev;
4395 msg.rpc_argp = args; 4434 msg.rpc_argp = args;
4396 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); 4435 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
4397} 4436}
4398 4437
4399#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" 4438#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
4400 4439
4401int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, 4440static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
4402 size_t buflen, int flags) 4441 const void *buf, size_t buflen,
4442 int flags, int type)
4403{ 4443{
4404 struct inode *inode = dentry->d_inode; 4444 if (strcmp(key, "") != 0)
4405 4445 return -EINVAL;
4406 if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
4407 return -EOPNOTSUPP;
4408 4446
4409 return nfs4_proc_set_acl(inode, buf, buflen); 4447 return nfs4_proc_set_acl(dentry->d_inode, buf, buflen);
4410} 4448}
4411 4449
4412/* The getxattr man page suggests returning -ENODATA for unknown attributes, 4450static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
4413 * and that's what we'll do for e.g. user attributes that haven't been set. 4451 void *buf, size_t buflen, int type)
4414 * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported
4415 * attributes in kernel-managed attribute namespaces. */
4416ssize_t nfs4_getxattr(struct dentry *dentry, const char *key, void *buf,
4417 size_t buflen)
4418{ 4452{
4419 struct inode *inode = dentry->d_inode; 4453 if (strcmp(key, "") != 0)
4420 4454 return -EINVAL;
4421 if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
4422 return -EOPNOTSUPP;
4423 4455
4424 return nfs4_proc_get_acl(inode, buf, buflen); 4456 return nfs4_proc_get_acl(dentry->d_inode, buf, buflen);
4425} 4457}
4426 4458
4427ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) 4459static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
4460 size_t list_len, const char *name,
4461 size_t name_len, int type)
4428{ 4462{
4429 size_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1; 4463 size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
4430 4464
4431 if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode))) 4465 if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode)))
4432 return 0; 4466 return 0;
4433 if (buf && buflen < len) 4467
4434 return -ERANGE; 4468 if (list && len <= list_len)
4435 if (buf) 4469 memcpy(list, XATTR_NAME_NFSV4_ACL, len);
4436 memcpy(buf, XATTR_NAME_NFSV4_ACL, len);
4437 return len; 4470 return len;
4438} 4471}
4439 4472
@@ -4486,6 +4519,25 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
4486 4519
4487#ifdef CONFIG_NFS_V4_1 4520#ifdef CONFIG_NFS_V4_1
4488/* 4521/*
 4522 * Check the exchange flags returned by the server: reject unknown flags,
 4523 * the combination of both PNFS and NON_PNFS set, and replies that set none
 4524 * of the NON_PNFS, PNFS, or DS flags.
4525 */
4526static int nfs4_check_cl_exchange_flags(u32 flags)
4527{
4528 if (flags & ~EXCHGID4_FLAG_MASK_R)
4529 goto out_inval;
4530 if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) &&
4531 (flags & EXCHGID4_FLAG_USE_NON_PNFS))
4532 goto out_inval;
4533 if (!(flags & (EXCHGID4_FLAG_MASK_PNFS)))
4534 goto out_inval;
4535 return NFS_OK;
4536out_inval:
4537 return -NFS4ERR_INVAL;
4538}
4539
4540/*
4489 * nfs4_proc_exchange_id() 4541 * nfs4_proc_exchange_id()
4490 * 4542 *
4491 * Since the clientid has expired, all compounds using sessions 4543 * Since the clientid has expired, all compounds using sessions
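
nfs4_check_cl_exchange_flags() above enforces three invariants on the server's EXCHANGE_ID reply. The same logic as a standalone program, with the flag values made up for the demo (the real masks live in the NFSv4.1 headers):

/* Standalone rendering of the three EXCHANGE_ID flag checks. Only the
 * logic matches the kernel; the constants are invented.
 */
#include <stdio.h>

#define EXCHGID4_FLAG_USE_NON_PNFS	0x00010000
#define EXCHGID4_FLAG_USE_PNFS_MDS	0x00020000
#define EXCHGID4_FLAG_USE_PNFS_DS	0x00040000
#define EXCHGID4_FLAG_MASK_PNFS		0x00070000
#define EXCHGID4_FLAG_MASK_R		0x00070003	/* all valid reply bits */

static int check_exchange_flags(unsigned int flags)
{
	if (flags & ~EXCHGID4_FLAG_MASK_R)
		return -1;	/* unknown bit in the reply */
	if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) &&
	    (flags & EXCHGID4_FLAG_USE_NON_PNFS))
		return -1;	/* MDS and NON_PNFS are exclusive */
	if (!(flags & EXCHGID4_FLAG_MASK_PNFS))
		return -1;	/* server must claim one pNFS role */
	return 0;
}

int main(void)
{
	printf("%d\n", check_exchange_flags(EXCHGID4_FLAG_USE_PNFS_MDS)); /* 0 */
	printf("%d\n", check_exchange_flags(EXCHGID4_FLAG_USE_PNFS_MDS |
					    EXCHGID4_FLAG_USE_NON_PNFS)); /* -1 */
	printf("%d\n", check_exchange_flags(0));                          /* -1 */
	return 0;
}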
@@ -4498,7 +4550,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4498 nfs4_verifier verifier; 4550 nfs4_verifier verifier;
4499 struct nfs41_exchange_id_args args = { 4551 struct nfs41_exchange_id_args args = {
4500 .client = clp, 4552 .client = clp,
4501 .flags = clp->cl_exchange_flags, 4553 .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
4502 }; 4554 };
4503 struct nfs41_exchange_id_res res = { 4555 struct nfs41_exchange_id_res res = {
4504 .client = clp, 4556 .client = clp,
@@ -4515,9 +4567,6 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4515 dprintk("--> %s\n", __func__); 4567 dprintk("--> %s\n", __func__);
4516 BUG_ON(clp == NULL); 4568 BUG_ON(clp == NULL);
4517 4569
4518 /* Remove server-only flags */
4519 args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R;
4520
4521 p = (u32 *)verifier.data; 4570 p = (u32 *)verifier.data;
4522 *p++ = htonl((u32)clp->cl_boot_time.tv_sec); 4571 *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
4523 *p = htonl((u32)clp->cl_boot_time.tv_nsec); 4572 *p = htonl((u32)clp->cl_boot_time.tv_nsec);
@@ -4543,6 +4592,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4543 break; 4592 break;
4544 } 4593 }
4545 4594
4595 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
4546 dprintk("<-- %s status= %d\n", __func__, status); 4596 dprintk("<-- %s status= %d\n", __func__, status);
4547 return status; 4597 return status;
4548} 4598}
@@ -4776,17 +4826,17 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4776 if (!session) 4826 if (!session)
4777 return NULL; 4827 return NULL;
4778 4828
4779 init_completion(&session->complete);
4780
4781 tbl = &session->fc_slot_table; 4829 tbl = &session->fc_slot_table;
4782 tbl->highest_used_slotid = -1; 4830 tbl->highest_used_slotid = -1;
4783 spin_lock_init(&tbl->slot_tbl_lock); 4831 spin_lock_init(&tbl->slot_tbl_lock);
4784 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); 4832 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
4833 init_completion(&tbl->complete);
4785 4834
4786 tbl = &session->bc_slot_table; 4835 tbl = &session->bc_slot_table;
4787 tbl->highest_used_slotid = -1; 4836 tbl->highest_used_slotid = -1;
4788 spin_lock_init(&tbl->slot_tbl_lock); 4837 spin_lock_init(&tbl->slot_tbl_lock);
4789 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); 4838 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
4839 init_completion(&tbl->complete);
4790 4840
4791 session->session_state = 1<<NFS4_SESSION_INITING; 4841 session->session_state = 1<<NFS4_SESSION_INITING;
4792 4842
@@ -5280,13 +5330,23 @@ static void
5280nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) 5330nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
5281{ 5331{
5282 struct nfs4_layoutget *lgp = calldata; 5332 struct nfs4_layoutget *lgp = calldata;
5283 struct inode *ino = lgp->args.inode; 5333 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5284 struct nfs_server *server = NFS_SERVER(ino);
5285 5334
5286 dprintk("--> %s\n", __func__); 5335 dprintk("--> %s\n", __func__);
 5336 /* Note there is a race here, where a CB_LAYOUTRECALL can come in
5337 * right now covering the LAYOUTGET we are about to send.
5338 * However, that is not so catastrophic, and there seems
5339 * to be no way to prevent it completely.
5340 */
5287 if (nfs4_setup_sequence(server, &lgp->args.seq_args, 5341 if (nfs4_setup_sequence(server, &lgp->args.seq_args,
5288 &lgp->res.seq_res, 0, task)) 5342 &lgp->res.seq_res, 0, task))
5289 return; 5343 return;
5344 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
5345 NFS_I(lgp->args.inode)->layout,
5346 lgp->args.ctx->state)) {
5347 rpc_exit(task, NFS4_OK);
5348 return;
5349 }
5290 rpc_call_start(task); 5350 rpc_call_start(task);
5291} 5351}
5292 5352
@@ -5313,7 +5373,6 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
5313 return; 5373 return;
5314 } 5374 }
5315 } 5375 }
5316 lgp->status = task->tk_status;
5317 dprintk("<-- %s\n", __func__); 5376 dprintk("<-- %s\n", __func__);
5318} 5377}
5319 5378
@@ -5322,7 +5381,6 @@ static void nfs4_layoutget_release(void *calldata)
5322 struct nfs4_layoutget *lgp = calldata; 5381 struct nfs4_layoutget *lgp = calldata;
5323 5382
5324 dprintk("--> %s\n", __func__); 5383 dprintk("--> %s\n", __func__);
5325 put_layout_hdr(lgp->args.inode);
5326 if (lgp->res.layout.buf != NULL) 5384 if (lgp->res.layout.buf != NULL)
5327 free_page((unsigned long) lgp->res.layout.buf); 5385 free_page((unsigned long) lgp->res.layout.buf);
5328 put_nfs_open_context(lgp->args.ctx); 5386 put_nfs_open_context(lgp->args.ctx);
@@ -5367,13 +5425,10 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
5367 if (IS_ERR(task)) 5425 if (IS_ERR(task))
5368 return PTR_ERR(task); 5426 return PTR_ERR(task);
5369 status = nfs4_wait_for_completion_rpc_task(task); 5427 status = nfs4_wait_for_completion_rpc_task(task);
5370 if (status != 0) 5428 if (status == 0)
5371 goto out; 5429 status = task->tk_status;
5372 status = lgp->status; 5430 if (status == 0)
5373 if (status != 0) 5431 status = pnfs_layout_process(lgp);
5374 goto out;
5375 status = pnfs_layout_process(lgp);
5376out:
5377 rpc_put_task(task); 5432 rpc_put_task(task);
5378 dprintk("<-- %s status=%d\n", __func__, status); 5433 dprintk("<-- %s status=%d\n", __func__, status);
5379 return status; 5434 return status;
@@ -5504,9 +5559,10 @@ static const struct inode_operations nfs4_file_inode_operations = {
5504 .permission = nfs_permission, 5559 .permission = nfs_permission,
5505 .getattr = nfs_getattr, 5560 .getattr = nfs_getattr,
5506 .setattr = nfs_setattr, 5561 .setattr = nfs_setattr,
5507 .getxattr = nfs4_getxattr, 5562 .getxattr = generic_getxattr,
5508 .setxattr = nfs4_setxattr, 5563 .setxattr = generic_setxattr,
5509 .listxattr = nfs4_listxattr, 5564 .listxattr = generic_listxattr,
5565 .removexattr = generic_removexattr,
5510}; 5566};
5511 5567
5512const struct nfs_rpc_ops nfs_v4_clientops = { 5568const struct nfs_rpc_ops nfs_v4_clientops = {
@@ -5551,6 +5607,18 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5551 .open_context = nfs4_atomic_open, 5607 .open_context = nfs4_atomic_open,
5552}; 5608};
5553 5609
5610static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
5611 .prefix = XATTR_NAME_NFSV4_ACL,
5612 .list = nfs4_xattr_list_nfs4_acl,
5613 .get = nfs4_xattr_get_nfs4_acl,
5614 .set = nfs4_xattr_set_nfs4_acl,
5615};
5616
5617const struct xattr_handler *nfs4_xattr_handlers[] = {
5618 &nfs4_xattr_nfs4_acl_handler,
5619 NULL
5620};
5621
5554/* 5622/*
5555 * Local variables: 5623 * Local variables:
5556 * c-basic-offset: 8 5624 * c-basic-offset: 8
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 72b6c580af13..402143d75fc5 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -63,9 +63,14 @@ nfs4_renew_state(struct work_struct *work)
63 63
64 ops = clp->cl_mvops->state_renewal_ops; 64 ops = clp->cl_mvops->state_renewal_ops;
65 dprintk("%s: start\n", __func__); 65 dprintk("%s: start\n", __func__);
66 /* Are there any active superblocks? */ 66
67 if (list_empty(&clp->cl_superblocks)) 67 rcu_read_lock();
68 if (list_empty(&clp->cl_superblocks)) {
69 rcu_read_unlock();
68 goto out; 70 goto out;
71 }
72 rcu_read_unlock();
73
69 spin_lock(&clp->cl_lock); 74 spin_lock(&clp->cl_lock);
70 lease = clp->cl_lease_time; 75 lease = clp->cl_lease_time;
71 last = clp->cl_last_renewal; 76 last = clp->cl_last_renewal;
@@ -75,7 +80,7 @@ nfs4_renew_state(struct work_struct *work)
75 cred = ops->get_state_renewal_cred_locked(clp); 80 cred = ops->get_state_renewal_cred_locked(clp);
76 spin_unlock(&clp->cl_lock); 81 spin_unlock(&clp->cl_lock);
77 if (cred == NULL) { 82 if (cred == NULL) {
78 if (list_empty(&clp->cl_delegations)) { 83 if (!nfs_delegations_present(clp)) {
79 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 84 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
80 goto out; 85 goto out;
81 } 86 }
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f575a3126737..2336d532cf66 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -105,14 +105,17 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp)
105 put_rpccred(cred); 105 put_rpccred(cred);
106} 106}
107 107
108struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) 108static struct rpc_cred *
109nfs4_get_renew_cred_server_locked(struct nfs_server *server)
109{ 110{
111 struct rpc_cred *cred = NULL;
110 struct nfs4_state_owner *sp; 112 struct nfs4_state_owner *sp;
111 struct rb_node *pos; 113 struct rb_node *pos;
112 struct rpc_cred *cred = NULL;
113 114
114 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 115 for (pos = rb_first(&server->state_owners);
115 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 116 pos != NULL;
117 pos = rb_next(pos)) {
118 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
116 if (list_empty(&sp->so_states)) 119 if (list_empty(&sp->so_states))
117 continue; 120 continue;
118 cred = get_rpccred(sp->so_cred); 121 cred = get_rpccred(sp->so_cred);
@@ -121,6 +124,28 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
121 return cred; 124 return cred;
122} 125}
123 126
127/**
128 * nfs4_get_renew_cred_locked - Acquire credential for a renew operation
129 * @clp: client state handle
130 *
131 * Returns an rpc_cred with reference count bumped, or NULL.
132 * Caller must hold clp->cl_lock.
133 */
134struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
135{
136 struct rpc_cred *cred = NULL;
137 struct nfs_server *server;
138
139 rcu_read_lock();
140 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
141 cred = nfs4_get_renew_cred_server_locked(server);
142 if (cred != NULL)
143 break;
144 }
145 rcu_read_unlock();
146 return cred;
147}
148
124#if defined(CONFIG_NFS_V4_1) 149#if defined(CONFIG_NFS_V4_1)
125 150
126static int nfs41_setup_state_renewal(struct nfs_client *clp) 151static int nfs41_setup_state_renewal(struct nfs_client *clp)
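
This is the recurring shape of the patch: a single walk of clp->cl_state_owners becomes a per-server *_server_locked() helper plus an RCU iteration over clp->cl_superblocks. A compressed illustration of the split, with plain arrays standing in for the rbtree and the RCU list:

/* Shape of the refactoring: "search everything under the client" turns
 * into "search one server (caller holds the lock)" plus an outer walk
 * over the client's servers.
 */
#include <stdio.h>
#include <stddef.h>

struct state_owner { const char *cred; };

struct server {
	struct state_owner owners[2];
	size_t nowners;
};

/* per-server helper: the caller is assumed to hold the client lock */
static const char *get_cred_server_locked(const struct server *s)
{
	return s->nowners ? s->owners[0].cred : NULL;
}

static const char *get_cred(const struct server *servers, size_t n)
{
	const char *cred = NULL;
	size_t i;

	/* outer walk; in the kernel this is list_for_each_entry_rcu() */
	for (i = 0; i < n && cred == NULL; i++)
		cred = get_cred_server_locked(&servers[i]);
	return cred;
}

int main(void)
{
	struct server servers[2] = {
		{ .nowners = 0 },
		{ .owners = { { "krb5-machine-cred" } }, .nowners = 1 },
	};

	printf("found: %s\n", get_cred(servers, 2));
	return 0;
}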
@@ -142,6 +167,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
142 return status; 167 return status;
143} 168}
144 169
170/*
 171 * The back channel returns NFS4ERR_DELAY for new requests while
 172 * NFS4_SESSION_DRAINING is set, so there is no work to be done when
 173 * draining ends.
174 */
145static void nfs4_end_drain_session(struct nfs_client *clp) 175static void nfs4_end_drain_session(struct nfs_client *clp)
146{ 176{
147 struct nfs4_session *ses = clp->cl_session; 177 struct nfs4_session *ses = clp->cl_session;
@@ -165,22 +195,32 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
165 } 195 }
166} 196}
167 197
168static int nfs4_begin_drain_session(struct nfs_client *clp) 198static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
169{ 199{
170 struct nfs4_session *ses = clp->cl_session;
171 struct nfs4_slot_table *tbl = &ses->fc_slot_table;
172
173 spin_lock(&tbl->slot_tbl_lock); 200 spin_lock(&tbl->slot_tbl_lock);
174 set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
175 if (tbl->highest_used_slotid != -1) { 201 if (tbl->highest_used_slotid != -1) {
176 INIT_COMPLETION(ses->complete); 202 INIT_COMPLETION(tbl->complete);
177 spin_unlock(&tbl->slot_tbl_lock); 203 spin_unlock(&tbl->slot_tbl_lock);
178 return wait_for_completion_interruptible(&ses->complete); 204 return wait_for_completion_interruptible(&tbl->complete);
179 } 205 }
180 spin_unlock(&tbl->slot_tbl_lock); 206 spin_unlock(&tbl->slot_tbl_lock);
181 return 0; 207 return 0;
182} 208}
183 209
210static int nfs4_begin_drain_session(struct nfs_client *clp)
211{
212 struct nfs4_session *ses = clp->cl_session;
213 int ret = 0;
214
215 set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
216 /* back channel */
217 ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table);
218 if (ret)
219 return ret;
220 /* fore channel */
221 return nfs4_wait_on_slot_tbl(&ses->fc_slot_table);
222}
223
184int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) 224int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
185{ 225{
186 int status; 226 int status;
@@ -192,6 +232,12 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
192 status = nfs4_proc_create_session(clp); 232 status = nfs4_proc_create_session(clp);
193 if (status != 0) 233 if (status != 0)
194 goto out; 234 goto out;
235 status = nfs4_set_callback_sessionid(clp);
236 if (status != 0) {
237 printk(KERN_WARNING "Sessionid not set. No callback service\n");
238 nfs_callback_down(1);
239 status = 0;
240 }
195 nfs41_setup_state_renewal(clp); 241 nfs41_setup_state_renewal(clp);
196 nfs_mark_client_ready(clp, NFS_CS_READY); 242 nfs_mark_client_ready(clp, NFS_CS_READY);
197out: 243out:
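
nfs41_init_clientid() now treats a failed nfs4_set_callback_sessionid() as degraded service rather than a fatal error: warn, take the callback service down, and clear the status so session setup continues without a back channel. The control flow in isolation (every function here is an illustrative stub):

/* Control flow of the new fallback: an optional subsystem that fails
 * to initialize is torn down and the error swallowed, instead of
 * failing the whole setup.
 */
#include <stdio.h>

static int set_callback_sessionid(int fail)
{
	return fail ? -1 : 0;
}

static void callback_down(void)
{
	puts("callback service stopped");
}

static int init_clientid(int fail_callback)
{
	int status = set_callback_sessionid(fail_callback);

	if (status != 0) {
		fprintf(stderr, "Sessionid not set. No callback service\n");
		callback_down();
		status = 0;	/* degrade gracefully, don't fail init */
	}
	/* ...continue with state renewal setup... */
	return status;
}

int main(void)
{
	return init_clientid(1);	/* returns 0 despite the failure */
}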
@@ -210,28 +256,56 @@ struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
210 256
211#endif /* CONFIG_NFS_V4_1 */ 257#endif /* CONFIG_NFS_V4_1 */
212 258
213struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) 259static struct rpc_cred *
260nfs4_get_setclientid_cred_server(struct nfs_server *server)
214{ 261{
262 struct nfs_client *clp = server->nfs_client;
263 struct rpc_cred *cred = NULL;
215 struct nfs4_state_owner *sp; 264 struct nfs4_state_owner *sp;
216 struct rb_node *pos; 265 struct rb_node *pos;
266
267 spin_lock(&clp->cl_lock);
268 pos = rb_first(&server->state_owners);
269 if (pos != NULL) {
270 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
271 cred = get_rpccred(sp->so_cred);
272 }
273 spin_unlock(&clp->cl_lock);
274 return cred;
275}
276
277/**
278 * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation
279 * @clp: client state handle
280 *
281 * Returns an rpc_cred with reference count bumped, or NULL.
282 */
283struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
284{
285 struct nfs_server *server;
217 struct rpc_cred *cred; 286 struct rpc_cred *cred;
218 287
219 spin_lock(&clp->cl_lock); 288 spin_lock(&clp->cl_lock);
220 cred = nfs4_get_machine_cred_locked(clp); 289 cred = nfs4_get_machine_cred_locked(clp);
290 spin_unlock(&clp->cl_lock);
221 if (cred != NULL) 291 if (cred != NULL)
222 goto out; 292 goto out;
223 pos = rb_first(&clp->cl_state_owners); 293
224 if (pos != NULL) { 294 rcu_read_lock();
225 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 295 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
226 cred = get_rpccred(sp->so_cred); 296 cred = nfs4_get_setclientid_cred_server(server);
297 if (cred != NULL)
298 break;
227 } 299 }
300 rcu_read_unlock();
301
228out: 302out:
229 spin_unlock(&clp->cl_lock);
230 return cred; 303 return cred;
231} 304}
232 305
233static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new, 306static void nfs_alloc_unique_id_locked(struct rb_root *root,
234 __u64 minval, int maxbits) 307 struct nfs_unique_id *new,
308 __u64 minval, int maxbits)
235{ 309{
236 struct rb_node **p, *parent; 310 struct rb_node **p, *parent;
237 struct nfs_unique_id *pos; 311 struct nfs_unique_id *pos;
@@ -286,16 +360,15 @@ static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id)
286} 360}
287 361
288static struct nfs4_state_owner * 362static struct nfs4_state_owner *
289nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred) 363nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
290{ 364{
291 struct nfs_client *clp = server->nfs_client; 365 struct rb_node **p = &server->state_owners.rb_node,
292 struct rb_node **p = &clp->cl_state_owners.rb_node,
293 *parent = NULL; 366 *parent = NULL;
294 struct nfs4_state_owner *sp, *res = NULL; 367 struct nfs4_state_owner *sp, *res = NULL;
295 368
296 while (*p != NULL) { 369 while (*p != NULL) {
297 parent = *p; 370 parent = *p;
298 sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); 371 sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
299 372
300 if (server < sp->so_server) { 373 if (server < sp->so_server) {
301 p = &parent->rb_left; 374 p = &parent->rb_left;
@@ -319,24 +392,17 @@ nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred)
319} 392}
320 393
321static struct nfs4_state_owner * 394static struct nfs4_state_owner *
322nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new) 395nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
323{ 396{
324 struct rb_node **p = &clp->cl_state_owners.rb_node, 397 struct nfs_server *server = new->so_server;
398 struct rb_node **p = &server->state_owners.rb_node,
325 *parent = NULL; 399 *parent = NULL;
326 struct nfs4_state_owner *sp; 400 struct nfs4_state_owner *sp;
327 401
328 while (*p != NULL) { 402 while (*p != NULL) {
329 parent = *p; 403 parent = *p;
330 sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); 404 sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
331 405
332 if (new->so_server < sp->so_server) {
333 p = &parent->rb_left;
334 continue;
335 }
336 if (new->so_server > sp->so_server) {
337 p = &parent->rb_right;
338 continue;
339 }
340 if (new->so_cred < sp->so_cred) 406 if (new->so_cred < sp->so_cred)
341 p = &parent->rb_left; 407 p = &parent->rb_left;
342 else if (new->so_cred > sp->so_cred) 408 else if (new->so_cred > sp->so_cred)
@@ -346,18 +412,21 @@ nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new)
346 return sp; 412 return sp;
347 } 413 }
348 } 414 }
349 nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64); 415 nfs_alloc_unique_id_locked(&server->openowner_id,
350 rb_link_node(&new->so_client_node, parent, p); 416 &new->so_owner_id, 1, 64);
351 rb_insert_color(&new->so_client_node, &clp->cl_state_owners); 417 rb_link_node(&new->so_server_node, parent, p);
418 rb_insert_color(&new->so_server_node, &server->state_owners);
352 return new; 419 return new;
353} 420}
354 421
355static void 422static void
356nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp) 423nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
357{ 424{
358 if (!RB_EMPTY_NODE(&sp->so_client_node)) 425 struct nfs_server *server = sp->so_server;
359 rb_erase(&sp->so_client_node, &clp->cl_state_owners); 426
360 nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id); 427 if (!RB_EMPTY_NODE(&sp->so_server_node))
428 rb_erase(&sp->so_server_node, &server->state_owners);
429 nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id);
361} 430}
362 431
363/* 432/*
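
With each nfs_server owning its own state_owners tree, the insert comparator drops the so_server tiebreak and keys on the credential alone. A minimal unbalanced BST with the reduced key (the kernel uses an rbtree; only the keying is shown):

/* Minimal BST keyed only on the credential pointer, showing the
 * reduced comparator after the move to per-server trees.
 */
#include <stdio.h>
#include <stdint.h>

struct owner {
	const void *cred;	/* the sole key now */
	struct owner *left, *right;
};

static struct owner *insert(struct owner *root, struct owner *node)
{
	struct owner **p = &root;

	while (*p != NULL) {
		if ((uintptr_t)node->cred < (uintptr_t)(*p)->cred)
			p = &(*p)->left;
		else if ((uintptr_t)node->cred > (uintptr_t)(*p)->cred)
			p = &(*p)->right;
		else
			return *p;	/* existing owner wins */
	}
	*p = node;
	return node;
}

int main(void)
{
	static int cred_a, cred_b;
	struct owner a = { &cred_a, NULL, NULL };
	struct owner b = { &cred_b, NULL, NULL };
	struct owner dup = { &cred_a, NULL, NULL };
	struct owner *root = NULL;

	root = insert(root, &a);
	insert(root, &b);
	printf("dup found existing: %s\n",
	       insert(root, &dup) == &a ? "yes" : "no");
	return 0;
}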
@@ -386,23 +455,32 @@ nfs4_alloc_state_owner(void)
386static void 455static void
387nfs4_drop_state_owner(struct nfs4_state_owner *sp) 456nfs4_drop_state_owner(struct nfs4_state_owner *sp)
388{ 457{
389 if (!RB_EMPTY_NODE(&sp->so_client_node)) { 458 if (!RB_EMPTY_NODE(&sp->so_server_node)) {
390 struct nfs_client *clp = sp->so_server->nfs_client; 459 struct nfs_server *server = sp->so_server;
460 struct nfs_client *clp = server->nfs_client;
391 461
392 spin_lock(&clp->cl_lock); 462 spin_lock(&clp->cl_lock);
393 rb_erase(&sp->so_client_node, &clp->cl_state_owners); 463 rb_erase(&sp->so_server_node, &server->state_owners);
394 RB_CLEAR_NODE(&sp->so_client_node); 464 RB_CLEAR_NODE(&sp->so_server_node);
395 spin_unlock(&clp->cl_lock); 465 spin_unlock(&clp->cl_lock);
396 } 466 }
397} 467}
398 468
399struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) 469/**
470 * nfs4_get_state_owner - Look up a state owner given a credential
471 * @server: nfs_server to search
472 * @cred: RPC credential to match
473 *
474 * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
475 */
476struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
477 struct rpc_cred *cred)
400{ 478{
401 struct nfs_client *clp = server->nfs_client; 479 struct nfs_client *clp = server->nfs_client;
402 struct nfs4_state_owner *sp, *new; 480 struct nfs4_state_owner *sp, *new;
403 481
404 spin_lock(&clp->cl_lock); 482 spin_lock(&clp->cl_lock);
405 sp = nfs4_find_state_owner(server, cred); 483 sp = nfs4_find_state_owner_locked(server, cred);
406 spin_unlock(&clp->cl_lock); 484 spin_unlock(&clp->cl_lock);
407 if (sp != NULL) 485 if (sp != NULL)
408 return sp; 486 return sp;
@@ -412,7 +490,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
412 new->so_server = server; 490 new->so_server = server;
413 new->so_cred = cred; 491 new->so_cred = cred;
414 spin_lock(&clp->cl_lock); 492 spin_lock(&clp->cl_lock);
415 sp = nfs4_insert_state_owner(clp, new); 493 sp = nfs4_insert_state_owner_locked(new);
416 spin_unlock(&clp->cl_lock); 494 spin_unlock(&clp->cl_lock);
417 if (sp == new) 495 if (sp == new)
418 get_rpccred(cred); 496 get_rpccred(cred);
@@ -423,6 +501,11 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
423 return sp; 501 return sp;
424} 502}
425 503
504/**
505 * nfs4_put_state_owner - Release a nfs4_state_owner
506 * @sp: state owner data to release
507 *
508 */
426void nfs4_put_state_owner(struct nfs4_state_owner *sp) 509void nfs4_put_state_owner(struct nfs4_state_owner *sp)
427{ 510{
428 struct nfs_client *clp = sp->so_server->nfs_client; 511 struct nfs_client *clp = sp->so_server->nfs_client;
@@ -430,7 +513,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
430 513
431 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) 514 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
432 return; 515 return;
433 nfs4_remove_state_owner(clp, sp); 516 nfs4_remove_state_owner_locked(sp);
434 spin_unlock(&clp->cl_lock); 517 spin_unlock(&clp->cl_lock);
435 rpc_destroy_wait_queue(&sp->so_sequence.wait); 518 rpc_destroy_wait_queue(&sp->so_sequence.wait);
436 put_rpccred(cred); 519 put_rpccred(cred);
@@ -585,8 +668,11 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
585 if (!call_close) { 668 if (!call_close) {
586 nfs4_put_open_state(state); 669 nfs4_put_open_state(state);
587 nfs4_put_state_owner(owner); 670 nfs4_put_state_owner(owner);
588 } else 671 } else {
589 nfs4_do_close(path, state, gfp_mask, wait); 672 bool roc = pnfs_roc(state->inode);
673
674 nfs4_do_close(path, state, gfp_mask, wait, roc);
675 }
590} 676}
591 677
592void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) 678void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
@@ -633,7 +719,8 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p
633static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) 719static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
634{ 720{
635 struct nfs4_lock_state *lsp; 721 struct nfs4_lock_state *lsp;
636 struct nfs_client *clp = state->owner->so_server->nfs_client; 722 struct nfs_server *server = state->owner->so_server;
723 struct nfs_client *clp = server->nfs_client;
637 724
638 lsp = kzalloc(sizeof(*lsp), GFP_NOFS); 725 lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
639 if (lsp == NULL) 726 if (lsp == NULL)
@@ -657,7 +744,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
657 return NULL; 744 return NULL;
658 } 745 }
659 spin_lock(&clp->cl_lock); 746 spin_lock(&clp->cl_lock);
660 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); 747 nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64);
661 spin_unlock(&clp->cl_lock); 748 spin_unlock(&clp->cl_lock);
662 INIT_LIST_HEAD(&lsp->ls_locks); 749 INIT_LIST_HEAD(&lsp->ls_locks);
663 return lsp; 750 return lsp;
@@ -665,10 +752,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
665 752
666static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) 753static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
667{ 754{
668 struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; 755 struct nfs_server *server = lsp->ls_state->owner->so_server;
756 struct nfs_client *clp = server->nfs_client;
669 757
670 spin_lock(&clp->cl_lock); 758 spin_lock(&clp->cl_lock);
671 nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); 759 nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id);
672 spin_unlock(&clp->cl_lock); 760 spin_unlock(&clp->cl_lock);
673 rpc_destroy_wait_queue(&lsp->ls_sequence.wait); 761 rpc_destroy_wait_queue(&lsp->ls_sequence.wait);
674 kfree(lsp); 762 kfree(lsp);
@@ -1114,15 +1202,19 @@ static void nfs4_clear_open_state(struct nfs4_state *state)
1114 } 1202 }
1115} 1203}
1116 1204
1117static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) 1205static void nfs4_reset_seqids(struct nfs_server *server,
1206 int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
1118{ 1207{
1208 struct nfs_client *clp = server->nfs_client;
1119 struct nfs4_state_owner *sp; 1209 struct nfs4_state_owner *sp;
1120 struct rb_node *pos; 1210 struct rb_node *pos;
1121 struct nfs4_state *state; 1211 struct nfs4_state *state;
1122 1212
1123 /* Reset all sequence ids to zero */ 1213 spin_lock(&clp->cl_lock);
1124 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1214 for (pos = rb_first(&server->state_owners);
1125 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1215 pos != NULL;
1216 pos = rb_next(pos)) {
1217 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
1126 sp->so_seqid.flags = 0; 1218 sp->so_seqid.flags = 0;
1127 spin_lock(&sp->so_lock); 1219 spin_lock(&sp->so_lock);
1128 list_for_each_entry(state, &sp->so_states, open_states) { 1220 list_for_each_entry(state, &sp->so_states, open_states) {
@@ -1131,6 +1223,18 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_re
1131 } 1223 }
1132 spin_unlock(&sp->so_lock); 1224 spin_unlock(&sp->so_lock);
1133 } 1225 }
1226 spin_unlock(&clp->cl_lock);
1227}
1228
1229static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp,
1230 int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
1231{
1232 struct nfs_server *server;
1233
1234 rcu_read_lock();
1235 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
1236 nfs4_reset_seqids(server, mark_reclaim);
1237 rcu_read_unlock();
1134} 1238}
1135 1239
1136static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) 1240static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
@@ -1148,25 +1252,41 @@ static void nfs4_reclaim_complete(struct nfs_client *clp,
1148 (void)ops->reclaim_complete(clp); 1252 (void)ops->reclaim_complete(clp);
1149} 1253}
1150 1254
1151static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) 1255static void nfs4_clear_reclaim_server(struct nfs_server *server)
1152{ 1256{
1257 struct nfs_client *clp = server->nfs_client;
1153 struct nfs4_state_owner *sp; 1258 struct nfs4_state_owner *sp;
1154 struct rb_node *pos; 1259 struct rb_node *pos;
1155 struct nfs4_state *state; 1260 struct nfs4_state *state;
1156 1261
1157 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) 1262 spin_lock(&clp->cl_lock);
1158 return 0; 1263 for (pos = rb_first(&server->state_owners);
1159 1264 pos != NULL;
1160 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1265 pos = rb_next(pos)) {
1161 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1266 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
1162 spin_lock(&sp->so_lock); 1267 spin_lock(&sp->so_lock);
1163 list_for_each_entry(state, &sp->so_states, open_states) { 1268 list_for_each_entry(state, &sp->so_states, open_states) {
1164 if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags)) 1269 if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT,
1270 &state->flags))
1165 continue; 1271 continue;
1166 nfs4_state_mark_reclaim_nograce(clp, state); 1272 nfs4_state_mark_reclaim_nograce(clp, state);
1167 } 1273 }
1168 spin_unlock(&sp->so_lock); 1274 spin_unlock(&sp->so_lock);
1169 } 1275 }
1276 spin_unlock(&clp->cl_lock);
1277}
1278
1279static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
1280{
1281 struct nfs_server *server;
1282
1283 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
1284 return 0;
1285
1286 rcu_read_lock();
1287 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
1288 nfs4_clear_reclaim_server(server);
1289 rcu_read_unlock();
1170 1290
1171 nfs_delegation_reap_unclaimed(clp); 1291 nfs_delegation_reap_unclaimed(clp);
1172 return 1; 1292 return 1;
@@ -1238,27 +1358,40 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1238 1358
1239static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) 1359static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
1240{ 1360{
1361 struct nfs4_state_owner *sp;
1362 struct nfs_server *server;
1241 struct rb_node *pos; 1363 struct rb_node *pos;
1242 int status = 0; 1364 int status = 0;
1243 1365
1244restart: 1366restart:
1245 spin_lock(&clp->cl_lock); 1367 rcu_read_lock();
1246 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1368 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
1247 struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1369 spin_lock(&clp->cl_lock);
1248 if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags)) 1370 for (pos = rb_first(&server->state_owners);
1249 continue; 1371 pos != NULL;
1250 atomic_inc(&sp->so_count); 1372 pos = rb_next(pos)) {
1251 spin_unlock(&clp->cl_lock); 1373 sp = rb_entry(pos,
1252 status = nfs4_reclaim_open_state(sp, ops); 1374 struct nfs4_state_owner, so_server_node);
1253 if (status < 0) { 1375 if (!test_and_clear_bit(ops->owner_flag_bit,
1254 set_bit(ops->owner_flag_bit, &sp->so_flags); 1376 &sp->so_flags))
1377 continue;
1378 atomic_inc(&sp->so_count);
1379 spin_unlock(&clp->cl_lock);
1380 rcu_read_unlock();
1381
1382 status = nfs4_reclaim_open_state(sp, ops);
1383 if (status < 0) {
1384 set_bit(ops->owner_flag_bit, &sp->so_flags);
1385 nfs4_put_state_owner(sp);
1386 return nfs4_recovery_handle_error(clp, status);
1387 }
1388
1255 nfs4_put_state_owner(sp); 1389 nfs4_put_state_owner(sp);
1256 return nfs4_recovery_handle_error(clp, status); 1390 goto restart;
1257 } 1391 }
1258 nfs4_put_state_owner(sp); 1392 spin_unlock(&clp->cl_lock);
1259 goto restart;
1260 } 1393 }
1261 spin_unlock(&clp->cl_lock); 1394 rcu_read_unlock();
1262 return status; 1395 return status;
1263} 1396}
1264 1397
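
nfs4_do_reclaim() keeps the classic drop-lock-and-restart scan, now nested under an RCU walk of the servers: pin a flagged owner, leave the lock and the RCU section before the blocking reclaim, then rescan from the top because the trees may have changed. The skeleton of that idiom, reduced to one flat list and single-threaded for brevity:

/* Skeleton of the drop-lock-and-restart scan: never hold the lock
 * across the blocking work, and rescan from the start afterwards
 * because the structure may have changed.
 */
#include <stdbool.h>
#include <stdio.h>

#define NOWNERS 3

struct owner {
	int id;
	bool needs_reclaim;
};

static void reclaim(struct owner *sp)
{
	/* blocking work happens with no locks held */
	printf("reclaiming owner %d\n", sp->id);
	sp->needs_reclaim = false;
}

int main(void)
{
	struct owner owners[NOWNERS] = {
		{ 1, true }, { 2, false }, { 3, true }
	};
	int i;

restart:
	/* lock(); */
	for (i = 0; i < NOWNERS; i++) {
		if (!owners[i].needs_reclaim)
			continue;
		/* pin owners[i], then unlock() before blocking */
		reclaim(&owners[i]);
		goto restart;	/* list may have changed: rescan */
	}
	/* unlock(); */
	return 0;
}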
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 9f1826b012e6..2ab8e5cb8f59 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -71,8 +71,8 @@ static int nfs4_stat_to_errno(int);
71/* lock,open owner id: 71/* lock,open owner id:
72 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) 72 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
73 */ 73 */
74#define open_owner_id_maxsz (1 + 4) 74#define open_owner_id_maxsz (1 + 1 + 4)
75#define lock_owner_id_maxsz (1 + 4) 75#define lock_owner_id_maxsz (1 + 1 + 4)
76#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) 76#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
77#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) 77#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
78#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) 78#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
@@ -1088,10 +1088,11 @@ static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lo
1088{ 1088{
1089 __be32 *p; 1089 __be32 *p;
1090 1090
1091 p = reserve_space(xdr, 28); 1091 p = reserve_space(xdr, 32);
1092 p = xdr_encode_hyper(p, lowner->clientid); 1092 p = xdr_encode_hyper(p, lowner->clientid);
1093 *p++ = cpu_to_be32(16); 1093 *p++ = cpu_to_be32(20);
1094 p = xdr_encode_opaque_fixed(p, "lock id:", 8); 1094 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1095 *p++ = cpu_to_be32(lowner->s_dev);
1095 xdr_encode_hyper(p, lowner->id); 1096 xdr_encode_hyper(p, lowner->id);
1096} 1097}
1097 1098
@@ -1210,10 +1211,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1210 *p++ = cpu_to_be32(OP_OPEN); 1211 *p++ = cpu_to_be32(OP_OPEN);
1211 *p = cpu_to_be32(arg->seqid->sequence->counter); 1212 *p = cpu_to_be32(arg->seqid->sequence->counter);
1212 encode_share_access(xdr, arg->fmode); 1213 encode_share_access(xdr, arg->fmode);
1213 p = reserve_space(xdr, 28); 1214 p = reserve_space(xdr, 32);
1214 p = xdr_encode_hyper(p, arg->clientid); 1215 p = xdr_encode_hyper(p, arg->clientid);
1215 *p++ = cpu_to_be32(16); 1216 *p++ = cpu_to_be32(20);
1216 p = xdr_encode_opaque_fixed(p, "open id:", 8); 1217 p = xdr_encode_opaque_fixed(p, "open id:", 8);
1218 *p++ = cpu_to_be32(arg->server->s_dev);
1217 xdr_encode_hyper(p, arg->id); 1219 xdr_encode_hyper(p, arg->id);
1218} 1220}
1219 1221
@@ -1510,7 +1512,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1510 hdr->replen += decode_restorefh_maxsz; 1512 hdr->replen += decode_restorefh_maxsz;
1511} 1513}
1512 1514
1513static int 1515static void
1514encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr) 1516encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr)
1515{ 1517{
1516 __be32 *p; 1518 __be32 *p;
@@ -1521,14 +1523,12 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1521 p = reserve_space(xdr, 2*4); 1523 p = reserve_space(xdr, 2*4);
1522 *p++ = cpu_to_be32(1); 1524 *p++ = cpu_to_be32(1);
1523 *p = cpu_to_be32(FATTR4_WORD0_ACL); 1525 *p = cpu_to_be32(FATTR4_WORD0_ACL);
1524 if (arg->acl_len % 4) 1526 BUG_ON(arg->acl_len % 4);
1525 return -EINVAL;
1526 p = reserve_space(xdr, 4); 1527 p = reserve_space(xdr, 4);
1527 *p = cpu_to_be32(arg->acl_len); 1528 *p = cpu_to_be32(arg->acl_len);
1528 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1529 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1529 hdr->nops++; 1530 hdr->nops++;
1530 hdr->replen += decode_setacl_maxsz; 1531 hdr->replen += decode_setacl_maxsz;
1531 return 0;
1532} 1532}
1533 1533
1534static void 1534static void
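
encode_setacl() joins the other encoders in returning void: by the time encoding runs, a misaligned acl_len is a programming error rather than a condition callers can recover from, so the check becomes BUG_ON(). The same trade in miniature, with userspace assert() standing in:

/* Miniature of the encode_setacl() change: an alignment requirement
 * enforced by assertion rather than an error return, since encoders
 * now return void.
 */
#include <assert.h>
#include <stdio.h>
#include <stddef.h>

static void encode_acl(const unsigned char *acl, size_t acl_len)
{
	(void)acl;
	/* XDR opaques are padded to 4 bytes; callers must pre-align. */
	assert(acl_len % 4 == 0);
	printf("encoding %zu-byte ACL\n", acl_len);
}

int main(void)
{
	unsigned char acl[8] = { 0 };

	encode_acl(acl, sizeof(acl));	/* fine: 8 % 4 == 0 */
	return 0;
}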
@@ -1789,7 +1789,6 @@ encode_layoutget(struct xdr_stream *xdr,
1789 const struct nfs4_layoutget_args *args, 1789 const struct nfs4_layoutget_args *args,
1790 struct compound_hdr *hdr) 1790 struct compound_hdr *hdr)
1791{ 1791{
1792 nfs4_stateid stateid;
1793 __be32 *p; 1792 __be32 *p;
1794 1793
1795 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); 1794 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
@@ -1800,9 +1799,7 @@ encode_layoutget(struct xdr_stream *xdr,
1800 p = xdr_encode_hyper(p, args->range.offset); 1799 p = xdr_encode_hyper(p, args->range.offset);
1801 p = xdr_encode_hyper(p, args->range.length); 1800 p = xdr_encode_hyper(p, args->range.length);
1802 p = xdr_encode_hyper(p, args->minlength); 1801 p = xdr_encode_hyper(p, args->minlength);
1803 pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout, 1802 p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
1804 args->ctx->state);
1805 p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
1806 *p = cpu_to_be32(args->maxcount); 1803 *p = cpu_to_be32(args->maxcount);
1807 1804
1808 dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", 1805 dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
@@ -1833,393 +1830,362 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
 /*
  * Encode an ACCESS request
  */
-static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args)
+static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_accessargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_access(&xdr, args->access, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_access(xdr, args->access, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
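Every wrapper in this hunk gets the same mechanical treatment: under the new SUNRPC calling convention the transport initialises the xdr_stream before invoking the per-procedure encoder, so each wrapper loses its local stream, its xdr_init_encode() call, and its always-zero return value. A sketch of the shared shape (nfs4_xdr_enc_example and example_args are made-up names, not part of the patch):

	/* The shape every converted encoder now follows (illustrative only). */
	static void nfs4_xdr_enc_example(struct rpc_rqst *req, struct xdr_stream *xdr,
					 const struct example_args *args)
	{
		struct compound_hdr hdr = {
			.minorversion = nfs4_xdr_minorversion(&args->seq_args),
		};

		encode_compound_hdr(xdr, req, &hdr);		/* COMPOUND tag */
		encode_sequence(xdr, &args->seq_args, &hdr);	/* v4.1 session op */
		/* ... per-procedure operations go here ... */
		encode_nops(&hdr);	/* backpatch the operation count */
	}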
 
 /*
  * Encode LOOKUP request
  */
-static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args)
+static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_lookup_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->dir_fh, &hdr);
-	encode_lookup(&xdr, args->name, &hdr);
-	encode_getfh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_lookup(xdr, args->name, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode LOOKUP_ROOT request
  */
-static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args)
+static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_lookup_root_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putrootfh(&xdr, &hdr);
-	encode_getfh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putrootfh(xdr, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode REMOVE request
  */
-static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
+static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs_removeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_remove(&xdr, &args->name, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_remove(xdr, &args->name, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode RENAME request
  */
-static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args)
+static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs_renameargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->old_dir, &hdr);
-	encode_savefh(&xdr, &hdr);
-	encode_putfh(&xdr, args->new_dir, &hdr);
-	encode_rename(&xdr, args->old_name, args->new_name, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
-	encode_restorefh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->old_dir, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_putfh(xdr, args->new_dir, &hdr);
+	encode_rename(xdr, args->old_name, args->new_name, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_restorefh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode LINK request
  */
-static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args)
+static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      const struct nfs4_link_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_savefh(&xdr, &hdr);
-	encode_putfh(&xdr, args->dir_fh, &hdr);
-	encode_link(&xdr, args->name, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
-	encode_restorefh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_link(xdr, args->name, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_restorefh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode CREATE request
  */
-static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
+static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_create_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->dir_fh, &hdr);
-	encode_savefh(&xdr, &hdr);
-	encode_create(&xdr, args, &hdr);
-	encode_getfh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
-	encode_restorefh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_create(xdr, args, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_restorefh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode SYMLINK request
  */
-static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
+static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 const struct nfs4_create_arg *args)
 {
-	return nfs4_xdr_enc_create(req, p, args);
+	nfs4_xdr_enc_create(req, xdr, args);
 }
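RENAME, LINK and CREATE above all use the same filehandle choreography, sketched here for reference (not part of the patch): SAVEFH parks the first directory's filehandle so a later RESTOREFH can bring it back for a second GETATTR, letting one round trip refresh both directories' attributes.

	/*
	 * PUTFH(dir A) -> SAVEFH -> PUTFH(dir B) -> op -> GETATTR(dir B)
	 *              -> RESTOREFH -> GETATTR(dir A)
	 */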
 
 /*
  * Encode GETATTR request
  */
-static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args)
+static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 const struct nfs4_getattr_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a CLOSE request
  */
-static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
+static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_closeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_close(&xdr, args, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_close(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an OPEN request
  */
-static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
+static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      struct nfs_openargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_savefh(&xdr, &hdr);
-	encode_open(&xdr, args, &hdr);
-	encode_getfh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
-	encode_restorefh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_open(xdr, args, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_restorefh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an OPEN_CONFIRM request
  */
-static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args)
+static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs_open_confirmargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops   = 0,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_open_confirm(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open_confirm(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an OPEN request with no attributes.
  */
-static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
+static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs_openargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_open(&xdr, args, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an OPEN_DOWNGRADE request
  */
-static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
+static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs_closeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_open_downgrade(&xdr, args, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open_downgrade(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a LOCK request
  */
-static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args)
+static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      struct nfs_lock_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_lock(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_lock(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a LOCKT request
  */
-static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args)
+static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_lockt_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_lockt(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_lockt(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a LOCKU request
  */
-static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args)
+static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_locku_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_locku(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_locku(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
-static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args)
+static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req,
+					   struct xdr_stream *xdr,
+					   struct nfs_release_lockowner_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = 0,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_release_lockowner(xdr, &args->lock_owner, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a READLINK request
  */
-static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args)
+static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  const struct nfs4_readlink *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_readlink(&xdr, args, req, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_readlink(xdr, args, req, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
 			args->pgbase, args->pglen);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a READDIR request
  */
-static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args)
+static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 const struct nfs4_readdir_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_readdir(&xdr, args, req, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_readdir(xdr, args, req, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
 			 args->pgbase, args->count);
@@ -2227,428 +2193,387 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
 		__func__, hdr.replen << 2, args->pages,
 		args->pgbase, args->count);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a READ request
  */
-static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
+static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      struct nfs_readargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_read(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_read(xdr, args, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
 			 args->pages, args->pgbase, args->count);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
 	encode_nops(&hdr);
-	return 0;
 }
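For READ the reply buffer is split the same way, plus XDRBUF_READ is set on rq_rcv_buf; as I read it, this marks the buffer as carrying bulk READ payload so a transport that supports direct data placement (e.g. RPC/RDMA) can steer the bytes straight into args->pages. Sketch of the resulting layout (illustrative only):

	/*
	 * rq_rcv_buf for a READ reply (sketch):
	 *   head  : COMPOUND status, op results, READ eof + count words
	 *   pages : the READ payload itself (args->pages)
	 *   tail  : any bytes following the payload
	 */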
 
 /*
  * Encode an SETATTR request
  */
-static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args)
+static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_setattrargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_setattr(&xdr, args, args->server, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_setattr(xdr, args, args->server, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a GETACL request
  */
-static int
-nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
-		struct nfs_getaclargs *args)
+static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_getaclargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 	uint32_t replen;
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
 	replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
-	encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
+	encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
 		args->acl_pages, args->acl_pgbase, args->acl_len);
 	encode_nops(&hdr);
-	return 0;
 }
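GETACL snapshots replen before encoding the GETATTR so it knows where the attribute payload will start in the reply: the extra op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1 words cover the op header, the attribute bitmap, and the attribute-length word, after which the variable-length ACL opaque is received directly into args->acl_pages. In outline (values in XDR words; sketch, not part of the patch):

	/*
	 * replen = reply-so-far
	 *        + op header (op_decode_hdr_maxsz)
	 *        + attr bitmap (nfs4_fattr_bitmap_maxsz)
	 *        + 1 attribute-length word
	 * The ACL opaque bytes then land in args->acl_pages
	 * via xdr_inline_pages().
	 */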
 
 /*
  * Encode a WRITE request
  */
-static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_writeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_write(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_write(xdr, args, &hdr);
 	req->rq_snd_buf.flags |= XDRBUF_WRITE;
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a COMMIT request
  */
-static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_writeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_commit(&xdr, args, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_commit(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * FSINFO request
  */
-static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args)
+static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs4_fsinfo_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_fsinfo(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_fsinfo(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a PATHCONF request
  */
-static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args)
+static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  const struct nfs4_pathconf_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
 			   &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a STATFS request
  */
-static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args)
+static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_statfs_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
 			args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * GETATTR_BITMAP request
  */
-static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p,
-				    struct nfs4_server_caps_arg *args)
+static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs4_server_caps_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fhandle, &hdr);
-	encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fhandle, &hdr);
+	encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
 			   FATTR4_WORD0_LINK_SUPPORT|
 			   FATTR4_WORD0_SYMLINK_SUPPORT|
 			   FATTR4_WORD0_ACLSUPPORT, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a RENEW request
  */
-static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
+static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_client *clp)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops	= 0,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_renew(&xdr, clp, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_renew(xdr, clp, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a SETCLIENTID request
  */
-static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc)
+static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs4_setclientid *sc)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops	= 0,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_setclientid(&xdr, sc, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_setclientid(xdr, sc, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a SETCLIENTID_CONFIRM request
  */
-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg)
+static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
+					     struct xdr_stream *xdr,
+					     struct nfs4_setclientid_res *arg)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops	= 0,
 	};
 	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_setclientid_confirm(&xdr, arg, &hdr);
-	encode_putrootfh(&xdr, &hdr);
-	encode_fsinfo(&xdr, lease_bitmap, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_setclientid_confirm(xdr, arg, &hdr);
+	encode_putrootfh(xdr, &hdr);
+	encode_fsinfo(xdr, lease_bitmap, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * DELEGRETURN request
  */
-static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args)
+static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_delegreturnargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fhandle, &hdr);
-	encode_delegreturn(&xdr, args->stateid, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fhandle, &hdr);
+	encode_delegreturn(xdr, args->stateid, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode FS_LOCATIONS request
  */
-static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args)
+static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs4_fs_locations_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 	uint32_t replen;
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->dir_fh, &hdr);
-	encode_lookup(&xdr, args->name, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_lookup(xdr, args->name, &hdr);
 	replen = hdr.replen;	/* get the attribute into args->page */
-	encode_fs_locations(&xdr, args->bitmask, &hdr);
+	encode_fs_locations(xdr, args->bitmask, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
 			0, PAGE_SIZE);
 	encode_nops(&hdr);
-	return 0;
 }
 
 #if defined(CONFIG_NFS_V4_1)
 /*
  * EXCHANGE_ID request
  */
-static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
-				    struct nfs41_exchange_id_args *args)
+static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs41_exchange_id_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = args->client->cl_mvops->minor_version,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_exchange_id(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_exchange_id(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a CREATE_SESSION request
  */
-static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
-				       struct nfs41_create_session_args *args)
+static void nfs4_xdr_enc_create_session(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs41_create_session_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = args->client->cl_mvops->minor_version,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_create_session(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_create_session(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a DESTROY_SESSION request
  */
-static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p,
-					struct nfs4_session *session)
+static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req,
+					 struct xdr_stream *xdr,
+					 struct nfs4_session *session)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = session->clp->cl_mvops->minor_version,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_destroy_session(&xdr, session, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_destroy_session(xdr, session, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a SEQUENCE request
  */
-static int nfs4_xdr_enc_sequence(struct rpc_rqst *req, uint32_t *p,
-				 struct nfs4_sequence_args *args)
+static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  struct nfs4_sequence_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a GET_LEASE_TIME request
  */
-static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
-				       struct nfs4_get_lease_time_args *args)
+static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs4_get_lease_time_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
 	};
 	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->la_seq_args, &hdr);
-	encode_putrootfh(&xdr, &hdr);
-	encode_fsinfo(&xdr, lease_bitmap, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->la_seq_args, &hdr);
+	encode_putrootfh(xdr, &hdr);
+	encode_fsinfo(xdr, lease_bitmap, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a RECLAIM_COMPLETE request
  */
-static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
-				     struct nfs41_reclaim_complete_args *args)
+static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
+					  struct xdr_stream *xdr,
+					  struct nfs41_reclaim_complete_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args)
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_reclaim_complete(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_reclaim_complete(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode GETDEVICEINFO request
  */
-static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
-				      struct nfs4_getdeviceinfo_args *args)
+static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
+				       struct xdr_stream *xdr,
+				       struct nfs4_getdeviceinfo_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_getdeviceinfo(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_getdeviceinfo(xdr, args, &hdr);
 
 	/* set up reply kvec. Subtract notification bitmap max size (2)
 	 * so that notification bitmap is put in xdr_buf tail */
@@ -2657,27 +2582,24 @@ static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
 		 args->pdev->pglen);
 
 	encode_nops(&hdr);
-	return 0;
 }
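The comment in the hunk above explains the unusual reply setup for GETDEVICEINFO: the opaque device address body is received into args->pdev's pages, while the trailing notification bitmap (at most two XDR words, which is why two words are subtracted from the page offset) is deliberately left to spill into the xdr_buf tail. Sketched layout (illustrative only):

	/*
	 * rq_rcv_buf for GETDEVICEINFO (sketch):
	 *   head  : ops up to and including the fixed GETDEVICEINFO fields
	 *   pages : opaque device_addr4 body (args->pdev's pages)
	 *   tail  : trailing notification bitmap (<= 2 XDR words)
	 */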
 
 /*
  * Encode LAYOUTGET request
  */
-static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
-				  struct nfs4_layoutget_args *args)
+static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   struct nfs4_layoutget_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
-	encode_layoutget(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+	encode_layoutget(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 #endif /* CONFIG_NFS_V4_1 */
 
@@ -4475,7 +4397,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 		goto out_overflow;
 	eof = be32_to_cpup(p++);
 	count = be32_to_cpup(p);
-	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
+	hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base;
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
 		dprintk("NFS: server cheating in read reply: "
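One functional tweak in decode_read(): hdrlen is now computed from the stream cursor xdr->p rather than the local p. As I read it, xdr->p sits just past the last word pulled with xdr_inline_decode() (the count word), so it reliably measures how much of the reply head precedes the READ data, whereas the local p still points at the count word itself. A sketch of the computation (hypothetical helper, not part of the patch):

	/* Bytes of the reply head consumed so far. */
	static unsigned int head_bytes_consumed(const struct xdr_stream *xdr,
						const struct kvec *iov)
	{
		return (u8 *)xdr->p - (u8 *)iov->iov_base;
	}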
@@ -5000,7 +4922,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
 		goto out_overflow;
 	len = be32_to_cpup(p);
 	if (len) {
-		int i;
+		uint32_t i;
 
 		p = xdr_inline_decode(xdr, 4 * len);
 		if (unlikely(!p))
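len comes off the wire via be32_to_cpup() and is therefore a 32-bit unsigned quantity; looping over it with a signed int invites signed/unsigned comparison warnings and subtle misbehaviour for values above INT_MAX, hence the switch to uint32_t. A sketch of the pattern (helper name invented):

	/* Consume len 4-byte words with a counter that matches len's type. */
	static void skip_words(__be32 *p, uint32_t len)
	{
		uint32_t i;

		for (i = 0; i < len; i++)
			(void)be32_to_cpup(p++);
	}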
@@ -5090,26 +5012,26 @@ out_overflow:
 /*
  * Decode OPEN_DOWNGRADE response
  */
-static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       struct nfs_closeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_open_downgrade(&xdr, res);
+	status = decode_open_downgrade(xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
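The decoders follow the same conversion: SUNRPC now calls xdr_init_decode() before invoking the per-procedure ->p_decode routine, so each wrapper simply walks the COMPOUND results in request order and bails out at the first failure. The shared shape, as a sketch (nfs4_xdr_dec_example and example_res are made-up names, not part of the patch):

	static int nfs4_xdr_dec_example(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
					struct example_res *res)
	{
		struct compound_hdr hdr;
		int status;

		status = decode_compound_hdr(xdr, &hdr);
		if (status)
			goto out;
		status = decode_sequence(xdr, &res->seq_res, rqstp);
		if (status)
			goto out;
		/* ... one decode_*() per operation, in COMPOUND order ... */
	out:
		return status;
	}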
@@ -5118,26 +5040,25 @@ out:
 /*
  * Decode ACCESS response
  */
-static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
+static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_accessres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status != 0)
 		goto out;
-	status = decode_access(&xdr, res);
+	status = decode_access(xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5146,26 +5067,28 @@ out:
 /*
  * Decode LOOKUP response
  */
-static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_lookup_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_lookup(&xdr)) != 0)
+	status = decode_lookup(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) != 0)
+	status = decode_getfh(xdr, res->fh);
+	if (status)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server
+	status = decode_getfattr(xdr, res->fattr, res->server
 			,!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5174,23 +5097,25 @@ out:
 /*
  * Decode LOOKUP_ROOT response
  */
-static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs4_lookup_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putrootfh(&xdr)) != 0)
+	status = decode_putrootfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) == 0)
-		status = decode_getfattr(&xdr, res->fattr, res->server,
+	status = decode_getfh(xdr, res->fh);
+	if (status == 0)
+		status = decode_getfattr(xdr, res->fattr, res->server,
 				!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5199,24 +5124,25 @@ out:
 /*
  * Decode REMOVE response
  */
-static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_removeres *res)
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_removeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
+	status = decode_remove(xdr, &res->cinfo);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->dir_attr, res->server,
+	decode_getfattr(xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5225,34 +5151,38 @@ out:
 /*
  * Decode RENAME response
  */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_renameres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_savefh(&xdr)) != 0)
+	status = decode_savefh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
+	status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo);
+	if (status)
 		goto out;
 	/* Current FH is target directory */
-	if (decode_getfattr(&xdr, res->new_fattr, res->server,
+	if (decode_getfattr(xdr, res->new_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if ((status = decode_restorefh(&xdr)) != 0)
+	status = decode_restorefh(xdr);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->old_fattr, res->server,
+	decode_getfattr(xdr, res->old_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5261,37 +5191,41 @@ out:
5261/* 5191/*
5262 * Decode LINK response 5192 * Decode LINK response
5263 */ 5193 */
5264static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res) 5194static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5195 struct nfs4_link_res *res)
5265{ 5196{
5266 struct xdr_stream xdr;
5267 struct compound_hdr hdr; 5197 struct compound_hdr hdr;
5268 int status; 5198 int status;
5269 5199
5270 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5200 status = decode_compound_hdr(xdr, &hdr);
5271 status = decode_compound_hdr(&xdr, &hdr);
5272 if (status) 5201 if (status)
5273 goto out; 5202 goto out;
5274 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5203 status = decode_sequence(xdr, &res->seq_res, rqstp);
5275 if (status) 5204 if (status)
5276 goto out; 5205 goto out;
5277 if ((status = decode_putfh(&xdr)) != 0) 5206 status = decode_putfh(xdr);
5207 if (status)
5278 goto out; 5208 goto out;
5279 if ((status = decode_savefh(&xdr)) != 0) 5209 status = decode_savefh(xdr);
5210 if (status)
5280 goto out; 5211 goto out;
5281 if ((status = decode_putfh(&xdr)) != 0) 5212 status = decode_putfh(xdr);
5213 if (status)
5282 goto out; 5214 goto out;
5283 if ((status = decode_link(&xdr, &res->cinfo)) != 0) 5215 status = decode_link(xdr, &res->cinfo);
5216 if (status)
5284 goto out; 5217 goto out;
5285 /* 5218 /*
5286 * Note order: OP_LINK leaves the directory as the current 5219 * Note order: OP_LINK leaves the directory as the current
5287 * filehandle. 5220 * filehandle.
5288 */ 5221 */
5289 if (decode_getfattr(&xdr, res->dir_attr, res->server, 5222 if (decode_getfattr(xdr, res->dir_attr, res->server,
5290 !RPC_IS_ASYNC(rqstp->rq_task)) != 0) 5223 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5291 goto out; 5224 goto out;
5292 if ((status = decode_restorefh(&xdr)) != 0) 5225 status = decode_restorefh(xdr);
5226 if (status)
5293 goto out; 5227 goto out;
5294 decode_getfattr(&xdr, res->fattr, res->server, 5228 decode_getfattr(xdr, res->fattr, res->server,
5295 !RPC_IS_ASYNC(rqstp->rq_task)); 5229 !RPC_IS_ASYNC(rqstp->rq_task));
5296out: 5230out:
5297 return status; 5231 return status;
@@ -5300,33 +5234,37 @@ out:
5300/* 5234/*
5301 * Decode CREATE response 5235 * Decode CREATE response
5302 */ 5236 */
5303static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res) 5237static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5238 struct nfs4_create_res *res)
5304{ 5239{
5305 struct xdr_stream xdr;
5306 struct compound_hdr hdr; 5240 struct compound_hdr hdr;
5307 int status; 5241 int status;
5308 5242
5309 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5243 status = decode_compound_hdr(xdr, &hdr);
5310 status = decode_compound_hdr(&xdr, &hdr);
5311 if (status) 5244 if (status)
5312 goto out; 5245 goto out;
5313 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5246 status = decode_sequence(xdr, &res->seq_res, rqstp);
5314 if (status) 5247 if (status)
5315 goto out; 5248 goto out;
5316 if ((status = decode_putfh(&xdr)) != 0) 5249 status = decode_putfh(xdr);
5250 if (status)
5317 goto out; 5251 goto out;
5318 if ((status = decode_savefh(&xdr)) != 0) 5252 status = decode_savefh(xdr);
5253 if (status)
5319 goto out; 5254 goto out;
5320 if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0) 5255 status = decode_create(xdr, &res->dir_cinfo);
5256 if (status)
5321 goto out; 5257 goto out;
5322 if ((status = decode_getfh(&xdr, res->fh)) != 0) 5258 status = decode_getfh(xdr, res->fh);
5259 if (status)
5323 goto out; 5260 goto out;
5324 if (decode_getfattr(&xdr, res->fattr, res->server, 5261 if (decode_getfattr(xdr, res->fattr, res->server,
5325 !RPC_IS_ASYNC(rqstp->rq_task)) != 0) 5262 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5326 goto out; 5263 goto out;
5327 if ((status = decode_restorefh(&xdr)) != 0) 5264 status = decode_restorefh(xdr);
5265 if (status)
5328 goto out; 5266 goto out;
5329 decode_getfattr(&xdr, res->dir_fattr, res->server, 5267 decode_getfattr(xdr, res->dir_fattr, res->server,
5330 !RPC_IS_ASYNC(rqstp->rq_task)); 5268 !RPC_IS_ASYNC(rqstp->rq_task));
5331out: 5269out:
5332 return status; 5270 return status;
@@ -5335,31 +5273,31 @@ out:
5335/* 5273/*
5336 * Decode SYMLINK response 5274 * Decode SYMLINK response
5337 */ 5275 */
5338static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res) 5276static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5277 struct nfs4_create_res *res)
5339{ 5278{
5340 return nfs4_xdr_dec_create(rqstp, p, res); 5279 return nfs4_xdr_dec_create(rqstp, xdr, res);
5341} 5280}
5342 5281
5343/* 5282/*
5344 * Decode GETATTR response 5283 * Decode GETATTR response
5345 */ 5284 */
5346static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res) 5285static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5286 struct nfs4_getattr_res *res)
5347{ 5287{
5348 struct xdr_stream xdr;
5349 struct compound_hdr hdr; 5288 struct compound_hdr hdr;
5350 int status; 5289 int status;
5351 5290
5352 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5291 status = decode_compound_hdr(xdr, &hdr);
5353 status = decode_compound_hdr(&xdr, &hdr);
5354 if (status) 5292 if (status)
5355 goto out; 5293 goto out;
5356 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5294 status = decode_sequence(xdr, &res->seq_res, rqstp);
5357 if (status) 5295 if (status)
5358 goto out; 5296 goto out;
5359 status = decode_putfh(&xdr); 5297 status = decode_putfh(xdr);
5360 if (status) 5298 if (status)
5361 goto out; 5299 goto out;
5362 status = decode_getfattr(&xdr, res->fattr, res->server, 5300 status = decode_getfattr(xdr, res->fattr, res->server,
5363 !RPC_IS_ASYNC(rqstp->rq_task)); 5301 !RPC_IS_ASYNC(rqstp->rq_task));
5364out: 5302out:
5365 return status; 5303 return status;
@@ -5368,46 +5306,40 @@ out:
5368/* 5306/*
 5369 * Encode a SETACL request 5307 * Encode a SETACL request
5370 */ 5308 */
5371static int 5309static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr,
5372nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args) 5310 struct nfs_setaclargs *args)
5373{ 5311{
5374 struct xdr_stream xdr;
5375 struct compound_hdr hdr = { 5312 struct compound_hdr hdr = {
5376 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 5313 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
5377 }; 5314 };
5378 int status;
5379 5315
5380 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 5316 encode_compound_hdr(xdr, req, &hdr);
5381 encode_compound_hdr(&xdr, req, &hdr); 5317 encode_sequence(xdr, &args->seq_args, &hdr);
5382 encode_sequence(&xdr, &args->seq_args, &hdr); 5318 encode_putfh(xdr, args->fh, &hdr);
5383 encode_putfh(&xdr, args->fh, &hdr); 5319 encode_setacl(xdr, args, &hdr);
5384 status = encode_setacl(&xdr, args, &hdr);
5385 encode_nops(&hdr); 5320 encode_nops(&hdr);
5386 return status;
5387} 5321}
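
The encoder half of the conversion goes a step further: with buffer setup and error handling hoisted into the caller, nfs4_xdr_enc_setacl() loses its int return value entirely, and encode_setacl()'s status is no longer propagated. A hedged sketch of the matching caller (assumed shape, not taken from this diff):

/* Illustrative sketch only: encoders are now void, so the generic
 * client just prepares the send buffer and invokes them. */
static void rpc_encode_sketch(struct rpc_rqst *req, kxdreproc_t encode,
			      void *argp, __be32 *p)
{
	struct xdr_stream xdr;

	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
	encode(req, &xdr, argp);
}
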
5388 5322
5389/* 5323/*
5390 * Decode SETACL response 5324 * Decode SETACL response
5391 */ 5325 */
5392static int 5326static int
5393nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, 5327nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5394 struct nfs_setaclres *res) 5328 struct nfs_setaclres *res)
5395{ 5329{
5396 struct xdr_stream xdr;
5397 struct compound_hdr hdr; 5330 struct compound_hdr hdr;
5398 int status; 5331 int status;
5399 5332
5400 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5333 status = decode_compound_hdr(xdr, &hdr);
5401 status = decode_compound_hdr(&xdr, &hdr);
5402 if (status) 5334 if (status)
5403 goto out; 5335 goto out;
5404 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5336 status = decode_sequence(xdr, &res->seq_res, rqstp);
5405 if (status) 5337 if (status)
5406 goto out; 5338 goto out;
5407 status = decode_putfh(&xdr); 5339 status = decode_putfh(xdr);
5408 if (status) 5340 if (status)
5409 goto out; 5341 goto out;
5410 status = decode_setattr(&xdr); 5342 status = decode_setattr(xdr);
5411out: 5343out:
5412 return status; 5344 return status;
5413} 5345}
@@ -5416,24 +5348,22 @@ out:
5416 * Decode GETACL response 5348 * Decode GETACL response
5417 */ 5349 */
5418static int 5350static int
5419nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, 5351nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5420 struct nfs_getaclres *res) 5352 struct nfs_getaclres *res)
5421{ 5353{
5422 struct xdr_stream xdr;
5423 struct compound_hdr hdr; 5354 struct compound_hdr hdr;
5424 int status; 5355 int status;
5425 5356
5426 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5357 status = decode_compound_hdr(xdr, &hdr);
5427 status = decode_compound_hdr(&xdr, &hdr);
5428 if (status) 5358 if (status)
5429 goto out; 5359 goto out;
5430 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5360 status = decode_sequence(xdr, &res->seq_res, rqstp);
5431 if (status) 5361 if (status)
5432 goto out; 5362 goto out;
5433 status = decode_putfh(&xdr); 5363 status = decode_putfh(xdr);
5434 if (status) 5364 if (status)
5435 goto out; 5365 goto out;
5436 status = decode_getacl(&xdr, rqstp, &res->acl_len); 5366 status = decode_getacl(xdr, rqstp, &res->acl_len);
5437 5367
5438out: 5368out:
5439 return status; 5369 return status;
@@ -5442,23 +5372,22 @@ out:
5442/* 5372/*
5443 * Decode CLOSE response 5373 * Decode CLOSE response
5444 */ 5374 */
5445static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) 5375static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5376 struct nfs_closeres *res)
5446{ 5377{
5447 struct xdr_stream xdr;
5448 struct compound_hdr hdr; 5378 struct compound_hdr hdr;
5449 int status; 5379 int status;
5450 5380
5451 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5381 status = decode_compound_hdr(xdr, &hdr);
5452 status = decode_compound_hdr(&xdr, &hdr);
5453 if (status) 5382 if (status)
5454 goto out; 5383 goto out;
5455 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5384 status = decode_sequence(xdr, &res->seq_res, rqstp);
5456 if (status) 5385 if (status)
5457 goto out; 5386 goto out;
5458 status = decode_putfh(&xdr); 5387 status = decode_putfh(xdr);
5459 if (status) 5388 if (status)
5460 goto out; 5389 goto out;
5461 status = decode_close(&xdr, res); 5390 status = decode_close(xdr, res);
5462 if (status != 0) 5391 if (status != 0)
5463 goto out; 5392 goto out;
5464 /* 5393 /*
@@ -5467,7 +5396,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
5467 * an ESTALE error. Shouldn't be a problem, 5396 * an ESTALE error. Shouldn't be a problem,
5468 * though, since fattr->valid will remain unset. 5397 * though, since fattr->valid will remain unset.
5469 */ 5398 */
5470 decode_getfattr(&xdr, res->fattr, res->server, 5399 decode_getfattr(xdr, res->fattr, res->server,
5471 !RPC_IS_ASYNC(rqstp->rq_task)); 5400 !RPC_IS_ASYNC(rqstp->rq_task));
5472out: 5401out:
5473 return status; 5402 return status;
@@ -5476,36 +5405,35 @@ out:
5476/* 5405/*
5477 * Decode OPEN response 5406 * Decode OPEN response
5478 */ 5407 */
5479static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) 5408static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5409 struct nfs_openres *res)
5480{ 5410{
5481 struct xdr_stream xdr;
5482 struct compound_hdr hdr; 5411 struct compound_hdr hdr;
5483 int status; 5412 int status;
5484 5413
5485 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5414 status = decode_compound_hdr(xdr, &hdr);
5486 status = decode_compound_hdr(&xdr, &hdr);
5487 if (status) 5415 if (status)
5488 goto out; 5416 goto out;
5489 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5417 status = decode_sequence(xdr, &res->seq_res, rqstp);
5490 if (status) 5418 if (status)
5491 goto out; 5419 goto out;
5492 status = decode_putfh(&xdr); 5420 status = decode_putfh(xdr);
5493 if (status) 5421 if (status)
5494 goto out; 5422 goto out;
5495 status = decode_savefh(&xdr); 5423 status = decode_savefh(xdr);
5496 if (status) 5424 if (status)
5497 goto out; 5425 goto out;
5498 status = decode_open(&xdr, res); 5426 status = decode_open(xdr, res);
5499 if (status) 5427 if (status)
5500 goto out; 5428 goto out;
5501 if (decode_getfh(&xdr, &res->fh) != 0) 5429 if (decode_getfh(xdr, &res->fh) != 0)
5502 goto out; 5430 goto out;
5503 if (decode_getfattr(&xdr, res->f_attr, res->server, 5431 if (decode_getfattr(xdr, res->f_attr, res->server,
5504 !RPC_IS_ASYNC(rqstp->rq_task)) != 0) 5432 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5505 goto out; 5433 goto out;
5506 if (decode_restorefh(&xdr) != 0) 5434 if (decode_restorefh(xdr) != 0)
5507 goto out; 5435 goto out;
5508 decode_getfattr(&xdr, res->dir_attr, res->server, 5436 decode_getfattr(xdr, res->dir_attr, res->server,
5509 !RPC_IS_ASYNC(rqstp->rq_task)); 5437 !RPC_IS_ASYNC(rqstp->rq_task));
5510out: 5438out:
5511 return status; 5439 return status;
@@ -5514,20 +5442,20 @@ out:
5514/* 5442/*
5515 * Decode OPEN_CONFIRM response 5443 * Decode OPEN_CONFIRM response
5516 */ 5444 */
5517static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res) 5445static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp,
5446 struct xdr_stream *xdr,
5447 struct nfs_open_confirmres *res)
5518{ 5448{
5519 struct xdr_stream xdr;
5520 struct compound_hdr hdr; 5449 struct compound_hdr hdr;
5521 int status; 5450 int status;
5522 5451
5523 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5452 status = decode_compound_hdr(xdr, &hdr);
5524 status = decode_compound_hdr(&xdr, &hdr);
5525 if (status) 5453 if (status)
5526 goto out; 5454 goto out;
5527 status = decode_putfh(&xdr); 5455 status = decode_putfh(xdr);
5528 if (status) 5456 if (status)
5529 goto out; 5457 goto out;
5530 status = decode_open_confirm(&xdr, res); 5458 status = decode_open_confirm(xdr, res);
5531out: 5459out:
5532 return status; 5460 return status;
5533} 5461}
@@ -5535,26 +5463,26 @@ out:
5535/* 5463/*
 5536 * Decode OPEN_NOATTR response 5464 * Decode OPEN_NOATTR response
5537 */ 5465 */
5538static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) 5466static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
5467 struct xdr_stream *xdr,
5468 struct nfs_openres *res)
5539{ 5469{
5540 struct xdr_stream xdr;
5541 struct compound_hdr hdr; 5470 struct compound_hdr hdr;
5542 int status; 5471 int status;
5543 5472
5544 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5473 status = decode_compound_hdr(xdr, &hdr);
5545 status = decode_compound_hdr(&xdr, &hdr);
5546 if (status) 5474 if (status)
5547 goto out; 5475 goto out;
5548 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5476 status = decode_sequence(xdr, &res->seq_res, rqstp);
5549 if (status) 5477 if (status)
5550 goto out; 5478 goto out;
5551 status = decode_putfh(&xdr); 5479 status = decode_putfh(xdr);
5552 if (status) 5480 if (status)
5553 goto out; 5481 goto out;
5554 status = decode_open(&xdr, res); 5482 status = decode_open(xdr, res);
5555 if (status) 5483 if (status)
5556 goto out; 5484 goto out;
5557 decode_getfattr(&xdr, res->f_attr, res->server, 5485 decode_getfattr(xdr, res->f_attr, res->server,
5558 !RPC_IS_ASYNC(rqstp->rq_task)); 5486 !RPC_IS_ASYNC(rqstp->rq_task));
5559out: 5487out:
5560 return status; 5488 return status;
@@ -5563,26 +5491,26 @@ out:
5563/* 5491/*
5564 * Decode SETATTR response 5492 * Decode SETATTR response
5565 */ 5493 */
5566static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res) 5494static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
5495 struct xdr_stream *xdr,
5496 struct nfs_setattrres *res)
5567{ 5497{
5568 struct xdr_stream xdr;
5569 struct compound_hdr hdr; 5498 struct compound_hdr hdr;
5570 int status; 5499 int status;
5571 5500
5572 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5501 status = decode_compound_hdr(xdr, &hdr);
5573 status = decode_compound_hdr(&xdr, &hdr);
5574 if (status) 5502 if (status)
5575 goto out; 5503 goto out;
5576 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5504 status = decode_sequence(xdr, &res->seq_res, rqstp);
5577 if (status) 5505 if (status)
5578 goto out; 5506 goto out;
5579 status = decode_putfh(&xdr); 5507 status = decode_putfh(xdr);
5580 if (status) 5508 if (status)
5581 goto out; 5509 goto out;
5582 status = decode_setattr(&xdr); 5510 status = decode_setattr(xdr);
5583 if (status) 5511 if (status)
5584 goto out; 5512 goto out;
5585 decode_getfattr(&xdr, res->fattr, res->server, 5513 decode_getfattr(xdr, res->fattr, res->server,
5586 !RPC_IS_ASYNC(rqstp->rq_task)); 5514 !RPC_IS_ASYNC(rqstp->rq_task));
5587out: 5515out:
5588 return status; 5516 return status;
@@ -5591,23 +5519,22 @@ out:
5591/* 5519/*
5592 * Decode LOCK response 5520 * Decode LOCK response
5593 */ 5521 */
5594static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res) 5522static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5523 struct nfs_lock_res *res)
5595{ 5524{
5596 struct xdr_stream xdr;
5597 struct compound_hdr hdr; 5525 struct compound_hdr hdr;
5598 int status; 5526 int status;
5599 5527
5600 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5528 status = decode_compound_hdr(xdr, &hdr);
5601 status = decode_compound_hdr(&xdr, &hdr);
5602 if (status) 5529 if (status)
5603 goto out; 5530 goto out;
5604 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5531 status = decode_sequence(xdr, &res->seq_res, rqstp);
5605 if (status) 5532 if (status)
5606 goto out; 5533 goto out;
5607 status = decode_putfh(&xdr); 5534 status = decode_putfh(xdr);
5608 if (status) 5535 if (status)
5609 goto out; 5536 goto out;
5610 status = decode_lock(&xdr, res); 5537 status = decode_lock(xdr, res);
5611out: 5538out:
5612 return status; 5539 return status;
5613} 5540}
@@ -5615,23 +5542,22 @@ out:
5615/* 5542/*
5616 * Decode LOCKT response 5543 * Decode LOCKT response
5617 */ 5544 */
5618static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res) 5545static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5546 struct nfs_lockt_res *res)
5619{ 5547{
5620 struct xdr_stream xdr;
5621 struct compound_hdr hdr; 5548 struct compound_hdr hdr;
5622 int status; 5549 int status;
5623 5550
5624 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5551 status = decode_compound_hdr(xdr, &hdr);
5625 status = decode_compound_hdr(&xdr, &hdr);
5626 if (status) 5552 if (status)
5627 goto out; 5553 goto out;
5628 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5554 status = decode_sequence(xdr, &res->seq_res, rqstp);
5629 if (status) 5555 if (status)
5630 goto out; 5556 goto out;
5631 status = decode_putfh(&xdr); 5557 status = decode_putfh(xdr);
5632 if (status) 5558 if (status)
5633 goto out; 5559 goto out;
5634 status = decode_lockt(&xdr, res); 5560 status = decode_lockt(xdr, res);
5635out: 5561out:
5636 return status; 5562 return status;
5637} 5563}
@@ -5639,61 +5565,58 @@ out:
5639/* 5565/*
5640 * Decode LOCKU response 5566 * Decode LOCKU response
5641 */ 5567 */
5642static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res) 5568static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5569 struct nfs_locku_res *res)
5643{ 5570{
5644 struct xdr_stream xdr;
5645 struct compound_hdr hdr; 5571 struct compound_hdr hdr;
5646 int status; 5572 int status;
5647 5573
5648 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5574 status = decode_compound_hdr(xdr, &hdr);
5649 status = decode_compound_hdr(&xdr, &hdr);
5650 if (status) 5575 if (status)
5651 goto out; 5576 goto out;
5652 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5577 status = decode_sequence(xdr, &res->seq_res, rqstp);
5653 if (status) 5578 if (status)
5654 goto out; 5579 goto out;
5655 status = decode_putfh(&xdr); 5580 status = decode_putfh(xdr);
5656 if (status) 5581 if (status)
5657 goto out; 5582 goto out;
5658 status = decode_locku(&xdr, res); 5583 status = decode_locku(xdr, res);
5659out: 5584out:
5660 return status; 5585 return status;
5661} 5586}
5662 5587
5663static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) 5588static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp,
5589 struct xdr_stream *xdr, void *dummy)
5664{ 5590{
5665 struct xdr_stream xdr;
5666 struct compound_hdr hdr; 5591 struct compound_hdr hdr;
5667 int status; 5592 int status;
5668 5593
5669 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5594 status = decode_compound_hdr(xdr, &hdr);
5670 status = decode_compound_hdr(&xdr, &hdr);
5671 if (!status) 5595 if (!status)
5672 status = decode_release_lockowner(&xdr); 5596 status = decode_release_lockowner(xdr);
5673 return status; 5597 return status;
5674} 5598}
5675 5599
5676/* 5600/*
5677 * Decode READLINK response 5601 * Decode READLINK response
5678 */ 5602 */
5679static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, 5603static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp,
5604 struct xdr_stream *xdr,
5680 struct nfs4_readlink_res *res) 5605 struct nfs4_readlink_res *res)
5681{ 5606{
5682 struct xdr_stream xdr;
5683 struct compound_hdr hdr; 5607 struct compound_hdr hdr;
5684 int status; 5608 int status;
5685 5609
5686 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5610 status = decode_compound_hdr(xdr, &hdr);
5687 status = decode_compound_hdr(&xdr, &hdr);
5688 if (status) 5611 if (status)
5689 goto out; 5612 goto out;
5690 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5613 status = decode_sequence(xdr, &res->seq_res, rqstp);
5691 if (status) 5614 if (status)
5692 goto out; 5615 goto out;
5693 status = decode_putfh(&xdr); 5616 status = decode_putfh(xdr);
5694 if (status) 5617 if (status)
5695 goto out; 5618 goto out;
5696 status = decode_readlink(&xdr, rqstp); 5619 status = decode_readlink(xdr, rqstp);
5697out: 5620out:
5698 return status; 5621 return status;
5699} 5622}
@@ -5701,23 +5624,22 @@ out:
5701/* 5624/*
5702 * Decode READDIR response 5625 * Decode READDIR response
5703 */ 5626 */
5704static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res) 5627static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5628 struct nfs4_readdir_res *res)
5705{ 5629{
5706 struct xdr_stream xdr;
5707 struct compound_hdr hdr; 5630 struct compound_hdr hdr;
5708 int status; 5631 int status;
5709 5632
5710 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5633 status = decode_compound_hdr(xdr, &hdr);
5711 status = decode_compound_hdr(&xdr, &hdr);
5712 if (status) 5634 if (status)
5713 goto out; 5635 goto out;
5714 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5636 status = decode_sequence(xdr, &res->seq_res, rqstp);
5715 if (status) 5637 if (status)
5716 goto out; 5638 goto out;
5717 status = decode_putfh(&xdr); 5639 status = decode_putfh(xdr);
5718 if (status) 5640 if (status)
5719 goto out; 5641 goto out;
5720 status = decode_readdir(&xdr, rqstp, res); 5642 status = decode_readdir(xdr, rqstp, res);
5721out: 5643out:
5722 return status; 5644 return status;
5723} 5645}
@@ -5725,23 +5647,22 @@ out:
5725/* 5647/*
5726 * Decode Read response 5648 * Decode Read response
5727 */ 5649 */
5728static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res) 5650static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5651 struct nfs_readres *res)
5729{ 5652{
5730 struct xdr_stream xdr;
5731 struct compound_hdr hdr; 5653 struct compound_hdr hdr;
5732 int status; 5654 int status;
5733 5655
5734 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5656 status = decode_compound_hdr(xdr, &hdr);
5735 status = decode_compound_hdr(&xdr, &hdr);
5736 if (status) 5657 if (status)
5737 goto out; 5658 goto out;
5738 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5659 status = decode_sequence(xdr, &res->seq_res, rqstp);
5739 if (status) 5660 if (status)
5740 goto out; 5661 goto out;
5741 status = decode_putfh(&xdr); 5662 status = decode_putfh(xdr);
5742 if (status) 5663 if (status)
5743 goto out; 5664 goto out;
5744 status = decode_read(&xdr, rqstp, res); 5665 status = decode_read(xdr, rqstp, res);
5745 if (!status) 5666 if (!status)
5746 status = res->count; 5667 status = res->count;
5747out: 5668out:
@@ -5751,26 +5672,25 @@ out:
5751/* 5672/*
5752 * Decode WRITE response 5673 * Decode WRITE response
5753 */ 5674 */
5754static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) 5675static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5676 struct nfs_writeres *res)
5755{ 5677{
5756 struct xdr_stream xdr;
5757 struct compound_hdr hdr; 5678 struct compound_hdr hdr;
5758 int status; 5679 int status;
5759 5680
5760 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5681 status = decode_compound_hdr(xdr, &hdr);
5761 status = decode_compound_hdr(&xdr, &hdr);
5762 if (status) 5682 if (status)
5763 goto out; 5683 goto out;
5764 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5684 status = decode_sequence(xdr, &res->seq_res, rqstp);
5765 if (status) 5685 if (status)
5766 goto out; 5686 goto out;
5767 status = decode_putfh(&xdr); 5687 status = decode_putfh(xdr);
5768 if (status) 5688 if (status)
5769 goto out; 5689 goto out;
5770 status = decode_write(&xdr, res); 5690 status = decode_write(xdr, res);
5771 if (status) 5691 if (status)
5772 goto out; 5692 goto out;
5773 decode_getfattr(&xdr, res->fattr, res->server, 5693 decode_getfattr(xdr, res->fattr, res->server,
5774 !RPC_IS_ASYNC(rqstp->rq_task)); 5694 !RPC_IS_ASYNC(rqstp->rq_task));
5775 if (!status) 5695 if (!status)
5776 status = res->count; 5696 status = res->count;
@@ -5781,26 +5701,25 @@ out:
5781/* 5701/*
5782 * Decode COMMIT response 5702 * Decode COMMIT response
5783 */ 5703 */
5784static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) 5704static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5705 struct nfs_writeres *res)
5785{ 5706{
5786 struct xdr_stream xdr;
5787 struct compound_hdr hdr; 5707 struct compound_hdr hdr;
5788 int status; 5708 int status;
5789 5709
5790 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5710 status = decode_compound_hdr(xdr, &hdr);
5791 status = decode_compound_hdr(&xdr, &hdr);
5792 if (status) 5711 if (status)
5793 goto out; 5712 goto out;
5794 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5713 status = decode_sequence(xdr, &res->seq_res, rqstp);
5795 if (status) 5714 if (status)
5796 goto out; 5715 goto out;
5797 status = decode_putfh(&xdr); 5716 status = decode_putfh(xdr);
5798 if (status) 5717 if (status)
5799 goto out; 5718 goto out;
5800 status = decode_commit(&xdr, res); 5719 status = decode_commit(xdr, res);
5801 if (status) 5720 if (status)
5802 goto out; 5721 goto out;
5803 decode_getfattr(&xdr, res->fattr, res->server, 5722 decode_getfattr(xdr, res->fattr, res->server,
5804 !RPC_IS_ASYNC(rqstp->rq_task)); 5723 !RPC_IS_ASYNC(rqstp->rq_task));
5805out: 5724out:
5806 return status; 5725 return status;
@@ -5809,85 +5728,80 @@ out:
5809/* 5728/*
5810 * Decode FSINFO response 5729 * Decode FSINFO response
5811 */ 5730 */
5812static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, 5731static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
5813 struct nfs4_fsinfo_res *res) 5732 struct nfs4_fsinfo_res *res)
5814{ 5733{
5815 struct xdr_stream xdr;
5816 struct compound_hdr hdr; 5734 struct compound_hdr hdr;
5817 int status; 5735 int status;
5818 5736
5819 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5737 status = decode_compound_hdr(xdr, &hdr);
5820 status = decode_compound_hdr(&xdr, &hdr);
5821 if (!status) 5738 if (!status)
5822 status = decode_sequence(&xdr, &res->seq_res, req); 5739 status = decode_sequence(xdr, &res->seq_res, req);
5823 if (!status) 5740 if (!status)
5824 status = decode_putfh(&xdr); 5741 status = decode_putfh(xdr);
5825 if (!status) 5742 if (!status)
5826 status = decode_fsinfo(&xdr, res->fsinfo); 5743 status = decode_fsinfo(xdr, res->fsinfo);
5827 return status; 5744 return status;
5828} 5745}
5829 5746
5830/* 5747/*
5831 * Decode PATHCONF response 5748 * Decode PATHCONF response
5832 */ 5749 */
5833static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, 5750static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
5834 struct nfs4_pathconf_res *res) 5751 struct nfs4_pathconf_res *res)
5835{ 5752{
5836 struct xdr_stream xdr;
5837 struct compound_hdr hdr; 5753 struct compound_hdr hdr;
5838 int status; 5754 int status;
5839 5755
5840 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5756 status = decode_compound_hdr(xdr, &hdr);
5841 status = decode_compound_hdr(&xdr, &hdr);
5842 if (!status) 5757 if (!status)
5843 status = decode_sequence(&xdr, &res->seq_res, req); 5758 status = decode_sequence(xdr, &res->seq_res, req);
5844 if (!status) 5759 if (!status)
5845 status = decode_putfh(&xdr); 5760 status = decode_putfh(xdr);
5846 if (!status) 5761 if (!status)
5847 status = decode_pathconf(&xdr, res->pathconf); 5762 status = decode_pathconf(xdr, res->pathconf);
5848 return status; 5763 return status;
5849} 5764}
5850 5765
5851/* 5766/*
5852 * Decode STATFS response 5767 * Decode STATFS response
5853 */ 5768 */
5854static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, 5769static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
5855 struct nfs4_statfs_res *res) 5770 struct nfs4_statfs_res *res)
5856{ 5771{
5857 struct xdr_stream xdr;
5858 struct compound_hdr hdr; 5772 struct compound_hdr hdr;
5859 int status; 5773 int status;
5860 5774
5861 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5775 status = decode_compound_hdr(xdr, &hdr);
5862 status = decode_compound_hdr(&xdr, &hdr);
5863 if (!status) 5776 if (!status)
5864 status = decode_sequence(&xdr, &res->seq_res, req); 5777 status = decode_sequence(xdr, &res->seq_res, req);
5865 if (!status) 5778 if (!status)
5866 status = decode_putfh(&xdr); 5779 status = decode_putfh(xdr);
5867 if (!status) 5780 if (!status)
5868 status = decode_statfs(&xdr, res->fsstat); 5781 status = decode_statfs(xdr, res->fsstat);
5869 return status; 5782 return status;
5870} 5783}
5871 5784
5872/* 5785/*
5873 * Decode GETATTR_BITMAP response 5786 * Decode GETATTR_BITMAP response
5874 */ 5787 */
5875static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res) 5788static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req,
5789 struct xdr_stream *xdr,
5790 struct nfs4_server_caps_res *res)
5876{ 5791{
5877 struct xdr_stream xdr;
5878 struct compound_hdr hdr; 5792 struct compound_hdr hdr;
5879 int status; 5793 int status;
5880 5794
5881 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5795 status = decode_compound_hdr(xdr, &hdr);
5882 status = decode_compound_hdr(&xdr, &hdr);
5883 if (status) 5796 if (status)
5884 goto out; 5797 goto out;
5885 status = decode_sequence(&xdr, &res->seq_res, req); 5798 status = decode_sequence(xdr, &res->seq_res, req);
5886 if (status) 5799 if (status)
5887 goto out; 5800 goto out;
5888 if ((status = decode_putfh(&xdr)) != 0) 5801 status = decode_putfh(xdr);
5802 if (status)
5889 goto out; 5803 goto out;
5890 status = decode_server_caps(&xdr, res); 5804 status = decode_server_caps(xdr, res);
5891out: 5805out:
5892 return status; 5806 return status;
5893} 5807}
@@ -5895,79 +5809,77 @@ out:
5895/* 5809/*
5896 * Decode RENEW response 5810 * Decode RENEW response
5897 */ 5811 */
5898static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy) 5812static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5813 void *__unused)
5899{ 5814{
5900 struct xdr_stream xdr;
5901 struct compound_hdr hdr; 5815 struct compound_hdr hdr;
5902 int status; 5816 int status;
5903 5817
5904 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5818 status = decode_compound_hdr(xdr, &hdr);
5905 status = decode_compound_hdr(&xdr, &hdr);
5906 if (!status) 5819 if (!status)
5907 status = decode_renew(&xdr); 5820 status = decode_renew(xdr);
5908 return status; 5821 return status;
5909} 5822}
5910 5823
5911/* 5824/*
5912 * Decode SETCLIENTID response 5825 * Decode SETCLIENTID response
5913 */ 5826 */
5914static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, 5827static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req,
5915 struct nfs4_setclientid_res *res) 5828 struct xdr_stream *xdr,
5829 struct nfs4_setclientid_res *res)
5916{ 5830{
5917 struct xdr_stream xdr;
5918 struct compound_hdr hdr; 5831 struct compound_hdr hdr;
5919 int status; 5832 int status;
5920 5833
5921 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5834 status = decode_compound_hdr(xdr, &hdr);
5922 status = decode_compound_hdr(&xdr, &hdr);
5923 if (!status) 5835 if (!status)
5924 status = decode_setclientid(&xdr, res); 5836 status = decode_setclientid(xdr, res);
5925 return status; 5837 return status;
5926} 5838}
5927 5839
5928/* 5840/*
5929 * Decode SETCLIENTID_CONFIRM response 5841 * Decode SETCLIENTID_CONFIRM response
5930 */ 5842 */
5931static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo) 5843static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
5844 struct xdr_stream *xdr,
5845 struct nfs_fsinfo *fsinfo)
5932{ 5846{
5933 struct xdr_stream xdr;
5934 struct compound_hdr hdr; 5847 struct compound_hdr hdr;
5935 int status; 5848 int status;
5936 5849
5937 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5850 status = decode_compound_hdr(xdr, &hdr);
5938 status = decode_compound_hdr(&xdr, &hdr);
5939 if (!status) 5851 if (!status)
5940 status = decode_setclientid_confirm(&xdr); 5852 status = decode_setclientid_confirm(xdr);
5941 if (!status) 5853 if (!status)
5942 status = decode_putrootfh(&xdr); 5854 status = decode_putrootfh(xdr);
5943 if (!status) 5855 if (!status)
5944 status = decode_fsinfo(&xdr, fsinfo); 5856 status = decode_fsinfo(xdr, fsinfo);
5945 return status; 5857 return status;
5946} 5858}
5947 5859
5948/* 5860/*
5949 * Decode DELEGRETURN response 5861 * Decode DELEGRETURN response
5950 */ 5862 */
5951static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res) 5863static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
5864 struct xdr_stream *xdr,
5865 struct nfs4_delegreturnres *res)
5952{ 5866{
5953 struct xdr_stream xdr;
5954 struct compound_hdr hdr; 5867 struct compound_hdr hdr;
5955 int status; 5868 int status;
5956 5869
5957 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5870 status = decode_compound_hdr(xdr, &hdr);
5958 status = decode_compound_hdr(&xdr, &hdr);
5959 if (status) 5871 if (status)
5960 goto out; 5872 goto out;
5961 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5873 status = decode_sequence(xdr, &res->seq_res, rqstp);
5962 if (status) 5874 if (status)
5963 goto out; 5875 goto out;
5964 status = decode_putfh(&xdr); 5876 status = decode_putfh(xdr);
5965 if (status != 0) 5877 if (status != 0)
5966 goto out; 5878 goto out;
5967 status = decode_delegreturn(&xdr); 5879 status = decode_delegreturn(xdr);
5968 if (status != 0) 5880 if (status != 0)
5969 goto out; 5881 goto out;
5970 decode_getfattr(&xdr, res->fattr, res->server, 5882 decode_getfattr(xdr, res->fattr, res->server,
5971 !RPC_IS_ASYNC(rqstp->rq_task)); 5883 !RPC_IS_ASYNC(rqstp->rq_task));
5972out: 5884out:
5973 return status; 5885 return status;
@@ -5976,26 +5888,27 @@ out:
5976/* 5888/*
5977 * Decode FS_LOCATIONS response 5889 * Decode FS_LOCATIONS response
5978 */ 5890 */
5979static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, 5891static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
5892 struct xdr_stream *xdr,
5980 struct nfs4_fs_locations_res *res) 5893 struct nfs4_fs_locations_res *res)
5981{ 5894{
5982 struct xdr_stream xdr;
5983 struct compound_hdr hdr; 5895 struct compound_hdr hdr;
5984 int status; 5896 int status;
5985 5897
5986 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5898 status = decode_compound_hdr(xdr, &hdr);
5987 status = decode_compound_hdr(&xdr, &hdr);
5988 if (status) 5899 if (status)
5989 goto out; 5900 goto out;
5990 status = decode_sequence(&xdr, &res->seq_res, req); 5901 status = decode_sequence(xdr, &res->seq_res, req);
5991 if (status) 5902 if (status)
5992 goto out; 5903 goto out;
5993 if ((status = decode_putfh(&xdr)) != 0) 5904 status = decode_putfh(xdr);
5905 if (status)
5994 goto out; 5906 goto out;
5995 if ((status = decode_lookup(&xdr)) != 0) 5907 status = decode_lookup(xdr);
5908 if (status)
5996 goto out; 5909 goto out;
5997 xdr_enter_page(&xdr, PAGE_SIZE); 5910 xdr_enter_page(xdr, PAGE_SIZE);
5998 status = decode_getfattr(&xdr, &res->fs_locations->fattr, 5911 status = decode_getfattr(xdr, &res->fs_locations->fattr,
5999 res->fs_locations->server, 5912 res->fs_locations->server,
6000 !RPC_IS_ASYNC(req->rq_task)); 5913 !RPC_IS_ASYNC(req->rq_task));
6001out: 5914out:
@@ -6006,129 +5919,122 @@ out:
6006/* 5919/*
6007 * Decode EXCHANGE_ID response 5920 * Decode EXCHANGE_ID response
6008 */ 5921 */
6009static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, 5922static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp,
5923 struct xdr_stream *xdr,
6010 void *res) 5924 void *res)
6011{ 5925{
6012 struct xdr_stream xdr;
6013 struct compound_hdr hdr; 5926 struct compound_hdr hdr;
6014 int status; 5927 int status;
6015 5928
6016 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5929 status = decode_compound_hdr(xdr, &hdr);
6017 status = decode_compound_hdr(&xdr, &hdr);
6018 if (!status) 5930 if (!status)
6019 status = decode_exchange_id(&xdr, res); 5931 status = decode_exchange_id(xdr, res);
6020 return status; 5932 return status;
6021} 5933}
6022 5934
6023/* 5935/*
6024 * Decode CREATE_SESSION response 5936 * Decode CREATE_SESSION response
6025 */ 5937 */
6026static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p, 5938static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp,
5939 struct xdr_stream *xdr,
6027 struct nfs41_create_session_res *res) 5940 struct nfs41_create_session_res *res)
6028{ 5941{
6029 struct xdr_stream xdr;
6030 struct compound_hdr hdr; 5942 struct compound_hdr hdr;
6031 int status; 5943 int status;
6032 5944
6033 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5945 status = decode_compound_hdr(xdr, &hdr);
6034 status = decode_compound_hdr(&xdr, &hdr);
6035 if (!status) 5946 if (!status)
6036 status = decode_create_session(&xdr, res); 5947 status = decode_create_session(xdr, res);
6037 return status; 5948 return status;
6038} 5949}
6039 5950
6040/* 5951/*
6041 * Decode DESTROY_SESSION response 5952 * Decode DESTROY_SESSION response
6042 */ 5953 */
6043static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p, 5954static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp,
6044 void *dummy) 5955 struct xdr_stream *xdr,
5956 void *res)
6045{ 5957{
6046 struct xdr_stream xdr;
6047 struct compound_hdr hdr; 5958 struct compound_hdr hdr;
6048 int status; 5959 int status;
6049 5960
6050 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5961 status = decode_compound_hdr(xdr, &hdr);
6051 status = decode_compound_hdr(&xdr, &hdr);
6052 if (!status) 5962 if (!status)
6053 status = decode_destroy_session(&xdr, dummy); 5963 status = decode_destroy_session(xdr, res);
6054 return status; 5964 return status;
6055} 5965}
6056 5966
6057/* 5967/*
6058 * Decode SEQUENCE response 5968 * Decode SEQUENCE response
6059 */ 5969 */
6060static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p, 5970static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp,
5971 struct xdr_stream *xdr,
6061 struct nfs4_sequence_res *res) 5972 struct nfs4_sequence_res *res)
6062{ 5973{
6063 struct xdr_stream xdr;
6064 struct compound_hdr hdr; 5974 struct compound_hdr hdr;
6065 int status; 5975 int status;
6066 5976
6067 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5977 status = decode_compound_hdr(xdr, &hdr);
6068 status = decode_compound_hdr(&xdr, &hdr);
6069 if (!status) 5978 if (!status)
6070 status = decode_sequence(&xdr, res, rqstp); 5979 status = decode_sequence(xdr, res, rqstp);
6071 return status; 5980 return status;
6072} 5981}
6073 5982
6074/* 5983/*
6075 * Decode GET_LEASE_TIME response 5984 * Decode GET_LEASE_TIME response
6076 */ 5985 */
6077static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p, 5986static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp,
5987 struct xdr_stream *xdr,
6078 struct nfs4_get_lease_time_res *res) 5988 struct nfs4_get_lease_time_res *res)
6079{ 5989{
6080 struct xdr_stream xdr;
6081 struct compound_hdr hdr; 5990 struct compound_hdr hdr;
6082 int status; 5991 int status;
6083 5992
6084 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5993 status = decode_compound_hdr(xdr, &hdr);
6085 status = decode_compound_hdr(&xdr, &hdr);
6086 if (!status) 5994 if (!status)
6087 status = decode_sequence(&xdr, &res->lr_seq_res, rqstp); 5995 status = decode_sequence(xdr, &res->lr_seq_res, rqstp);
6088 if (!status) 5996 if (!status)
6089 status = decode_putrootfh(&xdr); 5997 status = decode_putrootfh(xdr);
6090 if (!status) 5998 if (!status)
6091 status = decode_fsinfo(&xdr, res->lr_fsinfo); 5999 status = decode_fsinfo(xdr, res->lr_fsinfo);
6092 return status; 6000 return status;
6093} 6001}
6094 6002
6095/* 6003/*
6096 * Decode RECLAIM_COMPLETE response 6004 * Decode RECLAIM_COMPLETE response
6097 */ 6005 */
6098static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, 6006static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
6007 struct xdr_stream *xdr,
6099 struct nfs41_reclaim_complete_res *res) 6008 struct nfs41_reclaim_complete_res *res)
6100{ 6009{
6101 struct xdr_stream xdr;
6102 struct compound_hdr hdr; 6010 struct compound_hdr hdr;
6103 int status; 6011 int status;
6104 6012
6105 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 6013 status = decode_compound_hdr(xdr, &hdr);
6106 status = decode_compound_hdr(&xdr, &hdr);
6107 if (!status) 6014 if (!status)
6108 status = decode_sequence(&xdr, &res->seq_res, rqstp); 6015 status = decode_sequence(xdr, &res->seq_res, rqstp);
6109 if (!status) 6016 if (!status)
6110 status = decode_reclaim_complete(&xdr, (void *)NULL); 6017 status = decode_reclaim_complete(xdr, (void *)NULL);
6111 return status; 6018 return status;
6112} 6019}
6113 6020
6114/* 6021/*
6115 * Decode GETDEVINFO response 6022 * Decode GETDEVINFO response
6116 */ 6023 */
6117static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, 6024static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
6025 struct xdr_stream *xdr,
6118 struct nfs4_getdeviceinfo_res *res) 6026 struct nfs4_getdeviceinfo_res *res)
6119{ 6027{
6120 struct xdr_stream xdr;
6121 struct compound_hdr hdr; 6028 struct compound_hdr hdr;
6122 int status; 6029 int status;
6123 6030
6124 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 6031 status = decode_compound_hdr(xdr, &hdr);
6125 status = decode_compound_hdr(&xdr, &hdr);
6126 if (status != 0) 6032 if (status != 0)
6127 goto out; 6033 goto out;
6128 status = decode_sequence(&xdr, &res->seq_res, rqstp); 6034 status = decode_sequence(xdr, &res->seq_res, rqstp);
6129 if (status != 0) 6035 if (status != 0)
6130 goto out; 6036 goto out;
6131 status = decode_getdeviceinfo(&xdr, res->pdev); 6037 status = decode_getdeviceinfo(xdr, res->pdev);
6132out: 6038out:
6133 return status; 6039 return status;
6134} 6040}
@@ -6136,31 +6042,44 @@ out:
6136/* 6042/*
6137 * Decode LAYOUTGET response 6043 * Decode LAYOUTGET response
6138 */ 6044 */
6139static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, 6045static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
6046 struct xdr_stream *xdr,
6140 struct nfs4_layoutget_res *res) 6047 struct nfs4_layoutget_res *res)
6141{ 6048{
6142 struct xdr_stream xdr;
6143 struct compound_hdr hdr; 6049 struct compound_hdr hdr;
6144 int status; 6050 int status;
6145 6051
6146 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 6052 status = decode_compound_hdr(xdr, &hdr);
6147 status = decode_compound_hdr(&xdr, &hdr);
6148 if (status) 6053 if (status)
6149 goto out; 6054 goto out;
6150 status = decode_sequence(&xdr, &res->seq_res, rqstp); 6055 status = decode_sequence(xdr, &res->seq_res, rqstp);
6151 if (status) 6056 if (status)
6152 goto out; 6057 goto out;
6153 status = decode_putfh(&xdr); 6058 status = decode_putfh(xdr);
6154 if (status) 6059 if (status)
6155 goto out; 6060 goto out;
6156 status = decode_layoutget(&xdr, rqstp, res); 6061 status = decode_layoutget(xdr, rqstp, res);
6157out: 6062out:
6158 return status; 6063 return status;
6159} 6064}
6160#endif /* CONFIG_NFS_V4_1 */ 6065#endif /* CONFIG_NFS_V4_1 */
6161 6066
6162__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, 6067/**
6163 struct nfs_server *server, int plus) 6068 * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in
6069 * the local page cache.
6070 * @xdr: XDR stream where entry resides
6071 * @entry: buffer to fill in with entry data
6072 * @plus: boolean indicating whether this should be a readdirplus entry
6073 *
 6074 * Returns zero if successful, otherwise a negative
 6075 * errno value.
6076 *
6077 * This function is not invoked during READDIR reply decoding, but
6078 * rather whenever an application invokes the getdents(2) system call
6079 * on a directory already in our cache.
6080 */
6081int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6082 int plus)
6164{ 6083{
6165 uint32_t bitmap[2] = {0}; 6084 uint32_t bitmap[2] = {0};
6166 uint32_t len; 6085 uint32_t len;
@@ -6172,9 +6091,9 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6172 if (unlikely(!p)) 6091 if (unlikely(!p))
6173 goto out_overflow; 6092 goto out_overflow;
6174 if (!ntohl(*p++)) 6093 if (!ntohl(*p++))
6175 return ERR_PTR(-EAGAIN); 6094 return -EAGAIN;
6176 entry->eof = 1; 6095 entry->eof = 1;
6177 return ERR_PTR(-EBADCOOKIE); 6096 return -EBADCOOKIE;
6178 } 6097 }
6179 6098
6180 p = xdr_inline_decode(xdr, 12); 6099 p = xdr_inline_decode(xdr, 12);
@@ -6203,7 +6122,8 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6203 if (decode_attr_length(xdr, &len, &p) < 0) 6122 if (decode_attr_length(xdr, &len, &p) < 0)
6204 goto out_overflow; 6123 goto out_overflow;
6205 6124
6206 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0) 6125 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
6126 entry->server, 1) < 0)
6207 goto out_overflow; 6127 goto out_overflow;
6208 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) 6128 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
6209 entry->ino = entry->fattr->fileid; 6129 entry->ino = entry->fattr->fileid;
@@ -6215,17 +6135,11 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6215 if (verify_attr_len(xdr, p, len) < 0) 6135 if (verify_attr_len(xdr, p, len) < 0)
6216 goto out_overflow; 6136 goto out_overflow;
6217 6137
6218 p = xdr_inline_peek(xdr, 8); 6138 return 0;
6219 if (p != NULL)
6220 entry->eof = !p[0] && p[1];
6221 else
6222 entry->eof = 0;
6223
6224 return p;
6225 6139
6226out_overflow: 6140out_overflow:
6227 print_overflow_msg(__func__, xdr); 6141 print_overflow_msg(__func__, xdr);
6228 return ERR_PTR(-EAGAIN); 6142 return -EAGAIN;
6229} 6143}
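
Since nfs4_decode_dirent() now returns an int instead of an ERR_PTR-encoded __be32 *, its callers switch from IS_ERR()/PTR_ERR() checks to plain negative-errno handling. A minimal sketch of a consuming loop (hypothetical; the real caller lives in fs/nfs/dir.c and differs in detail):

/* Hypothetical consumer, for illustration only. Conventions as visible
 * above: 0 = one entry decoded, -EBADCOOKIE with entry->eof set = clean
 * end of directory, -EAGAIN = no further entry decodable here. */
static int fill_entries_sketch(struct xdr_stream *xdr,
			       struct nfs_entry *entry, int plus)
{
	int status;

	while ((status = nfs4_decode_dirent(xdr, entry, plus)) == 0) {
		/* ... copy *entry into the getdents(2) buffer ... */
	}
	if (status == -EBADCOOKIE && entry->eof)
		status = 0;
	return status;
}
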
6230 6144
6231/* 6145/*
@@ -6301,8 +6215,8 @@ nfs4_stat_to_errno(int stat)
6301#define PROC(proc, argtype, restype) \ 6215#define PROC(proc, argtype, restype) \
6302[NFSPROC4_CLNT_##proc] = { \ 6216[NFSPROC4_CLNT_##proc] = { \
6303 .p_proc = NFSPROC4_COMPOUND, \ 6217 .p_proc = NFSPROC4_COMPOUND, \
6304 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 6218 .p_encode = (kxdreproc_t)nfs4_xdr_##argtype, \
6305 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 6219 .p_decode = (kxdrdproc_t)nfs4_xdr_##restype, \
6306 .p_arglen = NFS4_##argtype##_sz, \ 6220 .p_arglen = NFS4_##argtype##_sz, \
6307 .p_replen = NFS4_##restype##_sz, \ 6221 .p_replen = NFS4_##restype##_sz, \
6308 .p_statidx = NFSPROC4_CLNT_##proc, \ 6222 .p_statidx = NFSPROC4_CLNT_##proc, \
@@ -6310,50 +6224,50 @@ nfs4_stat_to_errno(int stat)
6310} 6224}
6311 6225
6312struct rpc_procinfo nfs4_procedures[] = { 6226struct rpc_procinfo nfs4_procedures[] = {
6313 PROC(READ, enc_read, dec_read), 6227 PROC(READ, enc_read, dec_read),
6314 PROC(WRITE, enc_write, dec_write), 6228 PROC(WRITE, enc_write, dec_write),
6315 PROC(COMMIT, enc_commit, dec_commit), 6229 PROC(COMMIT, enc_commit, dec_commit),
6316 PROC(OPEN, enc_open, dec_open), 6230 PROC(OPEN, enc_open, dec_open),
6317 PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), 6231 PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm),
6318 PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), 6232 PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr),
6319 PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), 6233 PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade),
6320 PROC(CLOSE, enc_close, dec_close), 6234 PROC(CLOSE, enc_close, dec_close),
6321 PROC(SETATTR, enc_setattr, dec_setattr), 6235 PROC(SETATTR, enc_setattr, dec_setattr),
6322 PROC(FSINFO, enc_fsinfo, dec_fsinfo), 6236 PROC(FSINFO, enc_fsinfo, dec_fsinfo),
6323 PROC(RENEW, enc_renew, dec_renew), 6237 PROC(RENEW, enc_renew, dec_renew),
6324 PROC(SETCLIENTID, enc_setclientid, dec_setclientid), 6238 PROC(SETCLIENTID, enc_setclientid, dec_setclientid),
6325 PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), 6239 PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm),
6326 PROC(LOCK, enc_lock, dec_lock), 6240 PROC(LOCK, enc_lock, dec_lock),
6327 PROC(LOCKT, enc_lockt, dec_lockt), 6241 PROC(LOCKT, enc_lockt, dec_lockt),
6328 PROC(LOCKU, enc_locku, dec_locku), 6242 PROC(LOCKU, enc_locku, dec_locku),
6329 PROC(ACCESS, enc_access, dec_access), 6243 PROC(ACCESS, enc_access, dec_access),
6330 PROC(GETATTR, enc_getattr, dec_getattr), 6244 PROC(GETATTR, enc_getattr, dec_getattr),
6331 PROC(LOOKUP, enc_lookup, dec_lookup), 6245 PROC(LOOKUP, enc_lookup, dec_lookup),
6332 PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), 6246 PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root),
6333 PROC(REMOVE, enc_remove, dec_remove), 6247 PROC(REMOVE, enc_remove, dec_remove),
6334 PROC(RENAME, enc_rename, dec_rename), 6248 PROC(RENAME, enc_rename, dec_rename),
6335 PROC(LINK, enc_link, dec_link), 6249 PROC(LINK, enc_link, dec_link),
6336 PROC(SYMLINK, enc_symlink, dec_symlink), 6250 PROC(SYMLINK, enc_symlink, dec_symlink),
6337 PROC(CREATE, enc_create, dec_create), 6251 PROC(CREATE, enc_create, dec_create),
6338 PROC(PATHCONF, enc_pathconf, dec_pathconf), 6252 PROC(PATHCONF, enc_pathconf, dec_pathconf),
6339 PROC(STATFS, enc_statfs, dec_statfs), 6253 PROC(STATFS, enc_statfs, dec_statfs),
6340 PROC(READLINK, enc_readlink, dec_readlink), 6254 PROC(READLINK, enc_readlink, dec_readlink),
6341 PROC(READDIR, enc_readdir, dec_readdir), 6255 PROC(READDIR, enc_readdir, dec_readdir),
6342 PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), 6256 PROC(SERVER_CAPS, enc_server_caps, dec_server_caps),
6343 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), 6257 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn),
6344 PROC(GETACL, enc_getacl, dec_getacl), 6258 PROC(GETACL, enc_getacl, dec_getacl),
6345 PROC(SETACL, enc_setacl, dec_setacl), 6259 PROC(SETACL, enc_setacl, dec_setacl),
6346 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), 6260 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
6347 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), 6261 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
6348#if defined(CONFIG_NFS_V4_1) 6262#if defined(CONFIG_NFS_V4_1)
6349 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), 6263 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
6350 PROC(CREATE_SESSION, enc_create_session, dec_create_session), 6264 PROC(CREATE_SESSION, enc_create_session, dec_create_session),
6351 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), 6265 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session),
6352 PROC(SEQUENCE, enc_sequence, dec_sequence), 6266 PROC(SEQUENCE, enc_sequence, dec_sequence),
6353 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), 6267 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
6354 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), 6268 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
6355 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), 6269 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
6356 PROC(LAYOUTGET, enc_layoutget, dec_layoutget), 6270 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
6357#endif /* CONFIG_NFS_V4_1 */ 6271#endif /* CONFIG_NFS_V4_1 */
6358}; 6272};
6359 6273
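The cast change in the PROC() macro above is the visible edge of a type split in SUNRPC: the old catch-all kxdrproc_t is replaced by separate encoder and decoder function-pointer types, matching the void-returning encoders and int-returning decoders produced by this conversion. Roughly (declarations paraphrased from the same kernel generation; treat the exact parameter types as assumptions):

/* Paraphrased, not quoted: the function-pointer types the PROC() casts
 * now target. Encoders return void; decoders return an errno status. */
typedef void (*kxdreproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
			    void *obj);
typedef int (*kxdrdproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
			   void *obj);
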
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index b68536cc9046..e1164e3f9e69 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -26,12 +26,9 @@ static struct kmem_cache *nfs_page_cachep;
26static inline struct nfs_page * 26static inline struct nfs_page *
27nfs_page_alloc(void) 27nfs_page_alloc(void)
28{ 28{
29 struct nfs_page *p; 29 struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
30 p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL); 30 if (p)
31 if (p) {
32 memset(p, 0, sizeof(*p));
33 INIT_LIST_HEAD(&p->wb_list); 31 INIT_LIST_HEAD(&p->wb_list);
34 }
35 return p; 32 return p;
36} 33}
37 34
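The pagelist.c hunk is a pure simplification: kmem_cache_zalloc() allocates and zeroes in one call, so the open-coded memset() goes away. The helper is equivalent to requesting zero-filled memory from the allocator, roughly:

/* Sketch of the equivalence relied on above: kmem_cache_zalloc() is
 * kmem_cache_alloc() with zero-fill requested. */
static inline void *kmem_cache_zalloc_sketch(struct kmem_cache *c, gfp_t flags)
{
	return kmem_cache_alloc(c, flags | __GFP_ZERO);
}
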
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index db773428f95f..bc4089769735 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,105 +177,149 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
177 * pNFS client layout cache 177 * pNFS client layout cache
178 */ 178 */
179 179
180/* Need to hold i_lock if caller does not already hold reference */
181void
182get_layout_hdr(struct pnfs_layout_hdr *lo)
183{
184 atomic_inc(&lo->plh_refcount);
185}
186
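The comment on the new get_layout_hdr() is terse; the rule it encodes is the usual one for lock-protected caches: a bare plh_refcount increment is safe only if you already hold a reference, and otherwise you must hold i_lock so that a concurrent final put cannot free the header between lookup and get. Sketch of the safe lookup pattern (assumed usage, not quoted from this patch):

/* Illustrative only: find the layout header via the inode and pin it.
 * Holding i_lock across lookup + get is what makes this safe. */
static struct pnfs_layout_hdr *pin_layout_hdr(struct inode *inode)
{
	struct pnfs_layout_hdr *lo;

	spin_lock(&inode->i_lock);
	lo = NFS_I(inode)->layout;
	if (lo)
		get_layout_hdr(lo);
	spin_unlock(&inode->i_lock);
	return lo;
}
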
180static void 187static void
181get_layout_hdr_locked(struct pnfs_layout_hdr *lo) 188destroy_layout_hdr(struct pnfs_layout_hdr *lo)
182{ 189{
183 assert_spin_locked(&lo->inode->i_lock); 190 dprintk("%s: freeing layout cache %p\n", __func__, lo);
184 lo->refcount++; 191 BUG_ON(!list_empty(&lo->plh_layouts));
192 NFS_I(lo->plh_inode)->layout = NULL;
193 kfree(lo);
185} 194}
186 195
187static void 196static void
188put_layout_hdr_locked(struct pnfs_layout_hdr *lo) 197put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
189{ 198{
190 assert_spin_locked(&lo->inode->i_lock); 199 if (atomic_dec_and_test(&lo->plh_refcount))
191 BUG_ON(lo->refcount == 0); 200 destroy_layout_hdr(lo);
192
193 lo->refcount--;
194 if (!lo->refcount) {
195 dprintk("%s: freeing layout cache %p\n", __func__, lo);
196 BUG_ON(!list_empty(&lo->layouts));
197 NFS_I(lo->inode)->layout = NULL;
198 kfree(lo);
199 }
200} 201}
201 202
202void 203void
203put_layout_hdr(struct inode *inode) 204put_layout_hdr(struct pnfs_layout_hdr *lo)
204{ 205{
205 spin_lock(&inode->i_lock); 206 struct inode *inode = lo->plh_inode;
206 put_layout_hdr_locked(NFS_I(inode)->layout); 207
207 spin_unlock(&inode->i_lock); 208 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
209 destroy_layout_hdr(lo);
210 spin_unlock(&inode->i_lock);
211 }
208} 212}
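
put_layout_hdr() now uses the classic atomic_dec_and_lock() idiom: the common-case decrement takes no lock at all, and i_lock is acquired atomically with the final decrement only when the count is about to reach zero, so teardown is serialized against lookups done under that lock. A runnable user-space analog of the idiom (a sketch of the pattern, with a pthread mutex standing in for the spinlock; this is not kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int refcount;
	pthread_mutex_t *table_lock;	/* stands in for inode->i_lock */
};

/* Analog of atomic_dec_and_lock(): returns 1 with *lock held if and
 * only if the counter dropped to zero. */
static int dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
	int old = atomic_load(cnt);

	/* Fast path: not the last reference, so avoid the lock. */
	while (old > 1) {
		if (atomic_compare_exchange_weak(cnt, &old, old - 1))
			return 0;
	}
	/* Slow path: take the lock, then perform the final decrement. */
	pthread_mutex_lock(lock);
	if (atomic_fetch_sub(cnt, 1) == 1)
		return 1;	/* caller unlocks after teardown */
	pthread_mutex_unlock(lock);
	return 0;
}

static void put_obj(struct obj *o)
{
	pthread_mutex_t *lock = o->table_lock;

	if (dec_and_lock(&o->refcount, lock)) {
		/* unhashing/teardown happens here, under the lock */
		free(o);
		pthread_mutex_unlock(lock);
	}
}

int main(void)
{
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	struct obj *o = malloc(sizeof(*o));

	atomic_init(&o->refcount, 2);
	o->table_lock = &lock;
	put_obj(o);	/* fast path: lock never taken */
	put_obj(o);	/* last reference: frees under the lock */
	puts("done");
	return 0;
}
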
209 213
210static void 214static void
211init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) 215init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
212{ 216{
213 INIT_LIST_HEAD(&lseg->fi_list); 217 INIT_LIST_HEAD(&lseg->pls_list);
214 kref_init(&lseg->kref); 218 atomic_set(&lseg->pls_refcount, 1);
215 lseg->layout = lo; 219 smp_mb();
220 set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
221 lseg->pls_layout = lo;
216} 222}
217 223
218/* Called without i_lock held, as the free_lseg call may sleep */ 224static void free_lseg(struct pnfs_layout_segment *lseg)
219static void
220destroy_lseg(struct kref *kref)
221{ 225{
222 struct pnfs_layout_segment *lseg = 226 struct inode *ino = lseg->pls_layout->plh_inode;
223 container_of(kref, struct pnfs_layout_segment, kref);
224 struct inode *ino = lseg->layout->inode;
225 227
226 dprintk("--> %s\n", __func__);
227 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); 228 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
228 /* Matched by get_layout_hdr_locked in pnfs_insert_layout */ 229 /* Matched by get_layout_hdr in pnfs_insert_layout */
229 put_layout_hdr(ino); 230 put_layout_hdr(NFS_I(ino)->layout);
230} 231}
231 232
232static void 233/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
233put_lseg(struct pnfs_layout_segment *lseg) 234 * could sleep, so must be called outside of the lock.
235 * Returns 1 if object was removed, otherwise return 0.
236 */
237static int
238put_lseg_locked(struct pnfs_layout_segment *lseg,
239 struct list_head *tmp_list)
240{
241 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
242 atomic_read(&lseg->pls_refcount),
243 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
244 if (atomic_dec_and_test(&lseg->pls_refcount)) {
245 struct inode *ino = lseg->pls_layout->plh_inode;
246
247 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
248 list_del(&lseg->pls_list);
249 if (list_empty(&lseg->pls_layout->plh_segs)) {
250 struct nfs_client *clp;
251
252 clp = NFS_SERVER(ino)->nfs_client;
253 spin_lock(&clp->cl_lock);
254 /* List does not take a reference, so no need for put here */
255 list_del_init(&lseg->pls_layout->plh_layouts);
256 spin_unlock(&clp->cl_lock);
257 clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
258 }
259 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
260 list_add(&lseg->pls_list, tmp_list);
261 return 1;
262 }
263 return 0;
264}
265
266static bool
267should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
234{ 268{
235 if (!lseg) 269 return (recall_iomode == IOMODE_ANY ||
236 return; 270 lseg_iomode == recall_iomode);
271}
237 272
238 dprintk("%s: lseg %p ref %d\n", __func__, lseg, 273/* Returns 1 if lseg is removed from list, 0 otherwise */
239 atomic_read(&lseg->kref.refcount)); 274static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
240 kref_put(&lseg->kref, destroy_lseg); 275 struct list_head *tmp_list)
276{
277 int rv = 0;
278
279 if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
280 /* Remove the reference keeping the lseg in the
281 * list. It will now be removed when all
282 * outstanding io is finished.
283 */
284 rv = put_lseg_locked(lseg, tmp_list);
285 }
286 return rv;
241} 287}
242 288
243static void 289/* Returns count of number of matching invalid lsegs remaining in list
244pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list) 290 * after call.
291 */
292int
293mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
294 struct list_head *tmp_list,
295 u32 iomode)
245{ 296{
246 struct pnfs_layout_segment *lseg, *next; 297 struct pnfs_layout_segment *lseg, *next;
247 struct nfs_client *clp; 298 int invalid = 0, removed = 0;
248 299
249 dprintk("%s:Begin lo %p\n", __func__, lo); 300 dprintk("%s:Begin lo %p\n", __func__, lo);
250 301
251 assert_spin_locked(&lo->inode->i_lock); 302 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
252 list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) { 303 if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
253 dprintk("%s: freeing lseg %p\n", __func__, lseg); 304 dprintk("%s: freeing lseg %p iomode %d "
254 list_move(&lseg->fi_list, tmp_list); 305 "offset %llu length %llu\n", __func__,
255 } 306 lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
256 clp = NFS_SERVER(lo->inode)->nfs_client; 307 lseg->pls_range.length);
257 spin_lock(&clp->cl_lock); 308 invalid++;
258 /* List does not take a reference, so no need for put here */ 309 removed += mark_lseg_invalid(lseg, tmp_list);
259 list_del_init(&lo->layouts); 310 }
260 spin_unlock(&clp->cl_lock); 311 dprintk("%s:Return %i\n", __func__, invalid - removed);
261 write_seqlock(&lo->seqlock); 312 return invalid - removed;
262 clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
263 write_sequnlock(&lo->seqlock);
264
265 dprintk("%s:Return\n", __func__);
266} 313}
267 314
268static void 315void
269pnfs_free_lseg_list(struct list_head *tmp_list) 316pnfs_free_lseg_list(struct list_head *free_me)
270{ 317{
271 struct pnfs_layout_segment *lseg; 318 struct pnfs_layout_segment *lseg, *tmp;
272 319
273 while (!list_empty(tmp_list)) { 320 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
274 lseg = list_entry(tmp_list->next, struct pnfs_layout_segment, 321 list_del(&lseg->pls_list);
275 fi_list); 322 free_lseg(lseg);
276 dprintk("%s calling put_lseg on %p\n", __func__, lseg);
277 list_del(&lseg->fi_list);
278 put_lseg(lseg);
279 } 323 }
280} 324}
281 325
@@ -288,7 +332,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
288 spin_lock(&nfsi->vfs_inode.i_lock); 332 spin_lock(&nfsi->vfs_inode.i_lock);
289 lo = nfsi->layout; 333 lo = nfsi->layout;
290 if (lo) { 334 if (lo) {
291 pnfs_clear_lseg_list(lo, &tmp_list); 335 set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
336 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
292 /* Matched by refcount set to 1 in alloc_init_layout_hdr */ 337 /* Matched by refcount set to 1 in alloc_init_layout_hdr */
293 put_layout_hdr_locked(lo); 338 put_layout_hdr_locked(lo);
294 } 339 }
@@ -312,76 +357,80 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
312 357
313 while (!list_empty(&tmp_list)) { 358 while (!list_empty(&tmp_list)) {
314 lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, 359 lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
315 layouts); 360 plh_layouts);
316 dprintk("%s freeing layout for inode %lu\n", __func__, 361 dprintk("%s freeing layout for inode %lu\n", __func__,
317 lo->inode->i_ino); 362 lo->plh_inode->i_ino);
318 pnfs_destroy_layout(NFS_I(lo->inode)); 363 pnfs_destroy_layout(NFS_I(lo->plh_inode));
319 } 364 }
320} 365}
321 366
322/* update lo->stateid with new if is more recent 367/* update lo->plh_stateid with new if is more recent */
323 * 368void
324 * lo->stateid could be the open stateid, in which case we just use what given. 369pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
325 */ 370 bool update_barrier)
326static void 371{
327pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, 372 u32 oldseq, newseq;
328 const nfs4_stateid *new) 373
329{ 374 oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
330 nfs4_stateid *old = &lo->stateid; 375 newseq = be32_to_cpu(new->stateid.seqid);
331 bool overwrite = false; 376 if ((int)(newseq - oldseq) > 0) {
332 377 memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
333 write_seqlock(&lo->seqlock); 378 if (update_barrier) {
334 if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) || 379 u32 new_barrier = be32_to_cpu(new->stateid.seqid);
335 memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other))) 380
336 overwrite = true; 381 if ((int)(new_barrier - lo->plh_barrier))
337 else { 382 lo->plh_barrier = new_barrier;
338 u32 oldseq, newseq; 383 } else {
339 384 /* Because of wraparound, we want to keep the barrier
340 oldseq = be32_to_cpu(old->stateid.seqid); 385 * "close" to the current seqids. It needs to be
341 newseq = be32_to_cpu(new->stateid.seqid); 386 * within 2**31 to count as "behind", so if it
342 if ((int)(newseq - oldseq) > 0) 387 * gets too near that limit, give us a litle leeway
343 overwrite = true; 388 * and bring it to within 2**30.
389 * NOTE - and yes, this is all unsigned arithmetic.
390 */
391 if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
392 lo->plh_barrier = newseq - (1 << 30);
393 }
344 } 394 }
345 if (overwrite)
346 memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
347 write_sequnlock(&lo->seqlock);
348} 395}
349 396
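The barrier logic above leans on wraparound-safe sequence comparison: two u32 seqids are ordered by casting their difference to a signed int, so the ordering survives the counter wrapping back through zero. A small self-checking example of the idiom:

#include <assert.h>
#include <stdint.h>

/* (int)(a - b) > 0 means "a is newer than b" as long as the two ids
 * are within 2**31 of each other - hence the hunk's care to keep the
 * barrier within 2**30 of the current seqid. */
static int seqid_newer(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;
}

int main(void)
{
	assert(seqid_newer(2, 1));                /* ordinary case */
	assert(seqid_newer(5, 0xfffffffeu));      /* newer across the wrap */
	assert(!seqid_newer(0xfffffffeu, 5));
	return 0;
}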
350static void 397/* lget is set to 1 if called from inside send_layoutget call chain */
351pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, 398static bool
352 struct nfs4_state *state) 399pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
400 int lget)
353{ 401{
354 int seq; 402 if ((stateid) &&
355 403 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
356 dprintk("--> %s\n", __func__); 404 return true;
357 write_seqlock(&lo->seqlock); 405 return lo->plh_block_lgets ||
358 do { 406 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
359 seq = read_seqbegin(&state->seqlock); 407 (list_empty(&lo->plh_segs) &&
360 memcpy(lo->stateid.data, state->stateid.data, 408 (atomic_read(&lo->plh_outstanding) > lget));
361 sizeof(state->stateid.data));
362 } while (read_seqretry(&state->seqlock, seq));
363 set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
364 write_sequnlock(&lo->seqlock);
365 dprintk("<-- %s\n", __func__);
366} 409}
367 410
368void 411int
369pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 412pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
370 struct nfs4_state *open_state) 413 struct nfs4_state *open_state)
371{ 414{
372 int seq; 415 int status = 0;
373 416
374 dprintk("--> %s\n", __func__); 417 dprintk("--> %s\n", __func__);
375 do { 418 spin_lock(&lo->plh_inode->i_lock);
376 seq = read_seqbegin(&lo->seqlock); 419 if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
377 if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) { 420 status = -EAGAIN;
378 /* This will trigger retry of the read */ 421 } else if (list_empty(&lo->plh_segs)) {
379 pnfs_layout_from_open_stateid(lo, open_state); 422 int seq;
380 } else 423
381 memcpy(dst->data, lo->stateid.data, 424 do {
382 sizeof(lo->stateid.data)); 425 seq = read_seqbegin(&open_state->seqlock);
383 } while (read_seqretry(&lo->seqlock, seq)); 426 memcpy(dst->data, open_state->stateid.data,
427 sizeof(open_state->stateid.data));
428 } while (read_seqretry(&open_state->seqlock, seq));
429 } else
430 memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
431 spin_unlock(&lo->plh_inode->i_lock);
384 dprintk("<-- %s\n", __func__); 432 dprintk("<-- %s\n", __func__);
433 return status;
385} 434}
386 435
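pnfs_choose_layoutget_stateid() above copies the open stateid inside a read_seqbegin()/read_seqretry() loop, retrying whenever a writer touched the state mid-copy. A userspace sketch of that reader pattern (real seqlocks additionally insert memory barriers, omitted here):

#include <stdatomic.h>
#include <string.h>

struct seq_stateid {
	atomic_uint seq;          /* odd while a writer is active */
	unsigned char data[16];
};

static void read_stateid(struct seq_stateid *s, unsigned char out[16])
{
	unsigned int start;

	do {
		do {              /* wait out an active writer */
			start = atomic_load(&s->seq);
		} while (start & 1);
		memcpy(out, s->data, sizeof(s->data));
	} while (atomic_load(&s->seq) != start);  /* raced: retry */
}

Readers never block the writer, which is why the pattern suits a hot, rarely-updated field like a stateid.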
387/* 436/*
@@ -395,7 +444,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
395 struct nfs_open_context *ctx, 444 struct nfs_open_context *ctx,
396 u32 iomode) 445 u32 iomode)
397{ 446{
398 struct inode *ino = lo->inode; 447 struct inode *ino = lo->plh_inode;
399 struct nfs_server *server = NFS_SERVER(ino); 448 struct nfs_server *server = NFS_SERVER(ino);
400 struct nfs4_layoutget *lgp; 449 struct nfs4_layoutget *lgp;
401 struct pnfs_layout_segment *lseg = NULL; 450 struct pnfs_layout_segment *lseg = NULL;
@@ -404,10 +453,8 @@ send_layoutget(struct pnfs_layout_hdr *lo,
404 453
405 BUG_ON(ctx == NULL); 454 BUG_ON(ctx == NULL);
406 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); 455 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
407 if (lgp == NULL) { 456 if (lgp == NULL)
408 put_layout_hdr(lo->inode);
409 return NULL; 457 return NULL;
410 }
411 lgp->args.minlength = NFS4_MAX_UINT64; 458 lgp->args.minlength = NFS4_MAX_UINT64;
412 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; 459 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
413 lgp->args.range.iomode = iomode; 460 lgp->args.range.iomode = iomode;
@@ -424,11 +471,88 @@ send_layoutget(struct pnfs_layout_hdr *lo,
424 nfs4_proc_layoutget(lgp); 471 nfs4_proc_layoutget(lgp);
425 if (!lseg) { 472 if (!lseg) {
426 /* remember that LAYOUTGET failed and suspend trying */ 473 /* remember that LAYOUTGET failed and suspend trying */
427 set_bit(lo_fail_bit(iomode), &lo->state); 474 set_bit(lo_fail_bit(iomode), &lo->plh_flags);
428 } 475 }
429 return lseg; 476 return lseg;
430} 477}
431 478
479bool pnfs_roc(struct inode *ino)
480{
481 struct pnfs_layout_hdr *lo;
482 struct pnfs_layout_segment *lseg, *tmp;
483 LIST_HEAD(tmp_list);
484 bool found = false;
485
486 spin_lock(&ino->i_lock);
487 lo = NFS_I(ino)->layout;
488 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
489 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
490 goto out_nolayout;
491 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
492 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
493 mark_lseg_invalid(lseg, &tmp_list);
494 found = true;
495 }
496 if (!found)
497 goto out_nolayout;
498 lo->plh_block_lgets++;
499 get_layout_hdr(lo); /* matched in pnfs_roc_release */
500 spin_unlock(&ino->i_lock);
501 pnfs_free_lseg_list(&tmp_list);
502 return true;
503
504out_nolayout:
505 spin_unlock(&ino->i_lock);
506 return false;
507}
508
509void pnfs_roc_release(struct inode *ino)
510{
511 struct pnfs_layout_hdr *lo;
512
513 spin_lock(&ino->i_lock);
514 lo = NFS_I(ino)->layout;
515 lo->plh_block_lgets--;
516 put_layout_hdr_locked(lo);
517 spin_unlock(&ino->i_lock);
518}
519
520void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
521{
522 struct pnfs_layout_hdr *lo;
523
524 spin_lock(&ino->i_lock);
525 lo = NFS_I(ino)->layout;
526 if ((int)(barrier - lo->plh_barrier) > 0)
527 lo->plh_barrier = barrier;
528 spin_unlock(&ino->i_lock);
529}
530
531bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
532{
533 struct nfs_inode *nfsi = NFS_I(ino);
534 struct pnfs_layout_segment *lseg;
535 bool found = false;
536
537 spin_lock(&ino->i_lock);
538 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
539 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
540 found = true;
541 break;
542 }
543 if (!found) {
544 struct pnfs_layout_hdr *lo = nfsi->layout;
545 u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);
546
547 /* Since close does not return a layout stateid for use as
548 * a barrier, we choose the worst-case barrier.
549 */
550 *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
551 }
552 spin_unlock(&ino->i_lock);
553 return found;
554}
555
432/* 556/*
433 * Compare two layout segments for sorting into layout cache. 557 * Compare two layout segments for sorting into layout cache.
434 * We want to preferentially return RW over RO layouts, so ensure those 558 * We want to preferentially return RW over RO layouts, so ensure those
@@ -450,37 +574,29 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
450 574
451 dprintk("%s:Begin\n", __func__); 575 dprintk("%s:Begin\n", __func__);
452 576
453 assert_spin_locked(&lo->inode->i_lock); 577 assert_spin_locked(&lo->plh_inode->i_lock);
454 if (list_empty(&lo->segs)) { 578 list_for_each_entry(lp, &lo->plh_segs, pls_list) {
455 struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client; 579 if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
456
457 spin_lock(&clp->cl_lock);
458 BUG_ON(!list_empty(&lo->layouts));
459 list_add_tail(&lo->layouts, &clp->cl_layouts);
460 spin_unlock(&clp->cl_lock);
461 }
462 list_for_each_entry(lp, &lo->segs, fi_list) {
463 if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
464 continue; 580 continue;
465 list_add_tail(&lseg->fi_list, &lp->fi_list); 581 list_add_tail(&lseg->pls_list, &lp->pls_list);
466 dprintk("%s: inserted lseg %p " 582 dprintk("%s: inserted lseg %p "
467 "iomode %d offset %llu length %llu before " 583 "iomode %d offset %llu length %llu before "
468 "lp %p iomode %d offset %llu length %llu\n", 584 "lp %p iomode %d offset %llu length %llu\n",
469 __func__, lseg, lseg->range.iomode, 585 __func__, lseg, lseg->pls_range.iomode,
470 lseg->range.offset, lseg->range.length, 586 lseg->pls_range.offset, lseg->pls_range.length,
471 lp, lp->range.iomode, lp->range.offset, 587 lp, lp->pls_range.iomode, lp->pls_range.offset,
472 lp->range.length); 588 lp->pls_range.length);
473 found = 1; 589 found = 1;
474 break; 590 break;
475 } 591 }
476 if (!found) { 592 if (!found) {
477 list_add_tail(&lseg->fi_list, &lo->segs); 593 list_add_tail(&lseg->pls_list, &lo->plh_segs);
478 dprintk("%s: inserted lseg %p " 594 dprintk("%s: inserted lseg %p "
479 "iomode %d offset %llu length %llu at tail\n", 595 "iomode %d offset %llu length %llu at tail\n",
480 __func__, lseg, lseg->range.iomode, 596 __func__, lseg, lseg->pls_range.iomode,
481 lseg->range.offset, lseg->range.length); 597 lseg->pls_range.offset, lseg->pls_range.length);
482 } 598 }
483 get_layout_hdr_locked(lo); 599 get_layout_hdr(lo);
484 600
485 dprintk("%s:Return\n", __func__); 601 dprintk("%s:Return\n", __func__);
486} 602}
@@ -493,11 +609,11 @@ alloc_init_layout_hdr(struct inode *ino)
493 lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); 609 lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
494 if (!lo) 610 if (!lo)
495 return NULL; 611 return NULL;
496 lo->refcount = 1; 612 atomic_set(&lo->plh_refcount, 1);
497 INIT_LIST_HEAD(&lo->layouts); 613 INIT_LIST_HEAD(&lo->plh_layouts);
498 INIT_LIST_HEAD(&lo->segs); 614 INIT_LIST_HEAD(&lo->plh_segs);
499 seqlock_init(&lo->seqlock); 615 INIT_LIST_HEAD(&lo->plh_bulk_recall);
500 lo->inode = ino; 616 lo->plh_inode = ino;
501 return lo; 617 return lo;
502} 618}
503 619
@@ -510,9 +626,12 @@ pnfs_find_alloc_layout(struct inode *ino)
510 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); 626 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
511 627
512 assert_spin_locked(&ino->i_lock); 628 assert_spin_locked(&ino->i_lock);
513 if (nfsi->layout) 629 if (nfsi->layout) {
514 return nfsi->layout; 630 if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
515 631 return NULL;
632 else
633 return nfsi->layout;
634 }
516 spin_unlock(&ino->i_lock); 635 spin_unlock(&ino->i_lock);
517 new = alloc_init_layout_hdr(ino); 636 new = alloc_init_layout_hdr(ino);
518 spin_lock(&ino->i_lock); 637 spin_lock(&ino->i_lock);
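pnfs_find_alloc_layout() drops i_lock before alloc_init_layout_hdr() (a GFP_KERNEL allocation may sleep) and retakes it afterwards, which forces a re-check in case another task installed a layout meanwhile. A pthread sketch of that drop-allocate-recheck dance:

#include <pthread.h>
#include <stdlib.h>

struct cache {
	pthread_mutex_t lock;
	void *obj;
};

static void *find_alloc(struct cache *c)
{
	void *obj, *new;

	pthread_mutex_lock(&c->lock);
	obj = c->obj;
	if (obj)
		goto out;
	pthread_mutex_unlock(&c->lock);

	new = malloc(64);       /* may "sleep", so done unlocked */

	pthread_mutex_lock(&c->lock);
	if (c->obj)
		free(new);      /* lost the race: keep the winner */
	else
		c->obj = new;
	obj = c->obj;
out:
	pthread_mutex_unlock(&c->lock);
	return obj;
}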
@@ -538,31 +657,32 @@ pnfs_find_alloc_layout(struct inode *ino)
538static int 657static int
539is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) 658is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
540{ 659{
541 return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW); 660 return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
542} 661}
543 662
544/* 663/*
545 * lookup range in layout 664 * lookup range in layout
546 */ 665 */
547static struct pnfs_layout_segment * 666static struct pnfs_layout_segment *
548pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode) 667pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
549{ 668{
550 struct pnfs_layout_segment *lseg, *ret = NULL; 669 struct pnfs_layout_segment *lseg, *ret = NULL;
551 670
552 dprintk("%s:Begin\n", __func__); 671 dprintk("%s:Begin\n", __func__);
553 672
554 assert_spin_locked(&lo->inode->i_lock); 673 assert_spin_locked(&lo->plh_inode->i_lock);
555 list_for_each_entry(lseg, &lo->segs, fi_list) { 674 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
556 if (is_matching_lseg(lseg, iomode)) { 675 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
676 is_matching_lseg(lseg, iomode)) {
557 ret = lseg; 677 ret = lseg;
558 break; 678 break;
559 } 679 }
560 if (cmp_layout(iomode, lseg->range.iomode) > 0) 680 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
561 break; 681 break;
562 } 682 }
563 683
564 dprintk("%s:Return lseg %p ref %d\n", 684 dprintk("%s:Return lseg %p ref %d\n",
565 __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0); 685 __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
566 return ret; 686 return ret;
567} 687}
568 688
@@ -576,6 +696,7 @@ pnfs_update_layout(struct inode *ino,
576 enum pnfs_iomode iomode) 696 enum pnfs_iomode iomode)
577{ 697{
578 struct nfs_inode *nfsi = NFS_I(ino); 698 struct nfs_inode *nfsi = NFS_I(ino);
699 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
579 struct pnfs_layout_hdr *lo; 700 struct pnfs_layout_hdr *lo;
580 struct pnfs_layout_segment *lseg = NULL; 701 struct pnfs_layout_segment *lseg = NULL;
581 702
@@ -588,25 +709,53 @@ pnfs_update_layout(struct inode *ino,
588 goto out_unlock; 709 goto out_unlock;
589 } 710 }
590 711
591 /* Check to see if the layout for the given range already exists */ 712 /* Do we even need to bother with this? */
592 lseg = pnfs_has_layout(lo, iomode); 713 if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
593 if (lseg) { 714 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
594 dprintk("%s: Using cached lseg %p for iomode %d)\n", 715 dprintk("%s matches recall, use MDS\n", __func__);
595 __func__, lseg, iomode);
596 goto out_unlock; 716 goto out_unlock;
597 } 717 }
718 /* Check to see if the layout for the given range already exists */
719 lseg = pnfs_find_lseg(lo, iomode);
720 if (lseg)
721 goto out_unlock;
598 722
599 /* if LAYOUTGET already failed once we don't try again */ 723 /* if LAYOUTGET already failed once we don't try again */
600 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) 724 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
725 goto out_unlock;
726
727 if (pnfs_layoutgets_blocked(lo, NULL, 0))
601 goto out_unlock; 728 goto out_unlock;
729 atomic_inc(&lo->plh_outstanding);
602 730
603 get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */ 731 get_layout_hdr(lo);
732 if (list_empty(&lo->plh_segs)) {
733 /* The lo must be on the clp list if there is any
734 * chance of a CB_LAYOUTRECALL(FILE) coming in.
735 */
736 spin_lock(&clp->cl_lock);
737 BUG_ON(!list_empty(&lo->plh_layouts));
738 list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
739 spin_unlock(&clp->cl_lock);
740 }
604 spin_unlock(&ino->i_lock); 741 spin_unlock(&ino->i_lock);
605 742
606 lseg = send_layoutget(lo, ctx, iomode); 743 lseg = send_layoutget(lo, ctx, iomode);
744 if (!lseg) {
745 spin_lock(&ino->i_lock);
746 if (list_empty(&lo->plh_segs)) {
747 spin_lock(&clp->cl_lock);
748 list_del_init(&lo->plh_layouts);
749 spin_unlock(&clp->cl_lock);
750 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
751 }
752 spin_unlock(&ino->i_lock);
753 }
754 atomic_dec(&lo->plh_outstanding);
755 put_layout_hdr(lo);
607out: 756out:
608 dprintk("%s end, state 0x%lx lseg %p\n", __func__, 757 dprintk("%s end, state 0x%lx lseg %p\n", __func__,
609 nfsi->layout->state, lseg); 758 nfsi->layout->plh_flags, lseg);
610 return lseg; 759 return lseg;
611out_unlock: 760out_unlock:
612 spin_unlock(&ino->i_lock); 761 spin_unlock(&ino->i_lock);
@@ -619,9 +768,21 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
619 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; 768 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
620 struct nfs4_layoutget_res *res = &lgp->res; 769 struct nfs4_layoutget_res *res = &lgp->res;
621 struct pnfs_layout_segment *lseg; 770 struct pnfs_layout_segment *lseg;
622 struct inode *ino = lo->inode; 771 struct inode *ino = lo->plh_inode;
772 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
623 int status = 0; 773 int status = 0;
624 774
775 /* Verify we got what we asked for.
776 * Note that because the xdr parsing only accepts a single
777 * element array, this can fail even if the server is behaving
778 * correctly.
779 */
780 if (lgp->args.range.iomode > res->range.iomode ||
781 res->range.offset != 0 ||
782 res->range.length != NFS4_MAX_UINT64) {
783 status = -EINVAL;
784 goto out;
785 }
625 /* Inject layout blob into I/O device driver */ 786 /* Inject layout blob into I/O device driver */
626 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res); 787 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
627 if (!lseg || IS_ERR(lseg)) { 788 if (!lseg || IS_ERR(lseg)) {
@@ -635,16 +796,37 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
635 } 796 }
636 797
637 spin_lock(&ino->i_lock); 798 spin_lock(&ino->i_lock);
799 if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
800 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
801 dprintk("%s forget reply due to recall\n", __func__);
802 goto out_forget_reply;
803 }
804
805 if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
806 dprintk("%s forget reply due to state\n", __func__);
807 goto out_forget_reply;
808 }
638 init_lseg(lo, lseg); 809 init_lseg(lo, lseg);
639 lseg->range = res->range; 810 lseg->pls_range = res->range;
640 *lgp->lsegpp = lseg; 811 *lgp->lsegpp = lseg;
641 pnfs_insert_layout(lo, lseg); 812 pnfs_insert_layout(lo, lseg);
642 813
814 if (res->return_on_close) {
815 set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
816 set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
817 }
818
643 /* Done processing layoutget. Set the layout stateid */ 819 /* Done processing layoutget. Set the layout stateid */
644 pnfs_set_layout_stateid(lo, &res->stateid); 820 pnfs_set_layout_stateid(lo, &res->stateid, false);
645 spin_unlock(&ino->i_lock); 821 spin_unlock(&ino->i_lock);
646out: 822out:
647 return status; 823 return status;
824
825out_forget_reply:
826 spin_unlock(&ino->i_lock);
827 lseg->pls_layout = lo;
828 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
829 goto out;
648} 830}
649 831
650/* 832/*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e12367d50489..e2612ea0cbed 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,11 +30,17 @@
30#ifndef FS_NFS_PNFS_H 30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H 31#define FS_NFS_PNFS_H
32 32
33enum {
34 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
35 NFS_LSEG_ROC, /* roc bit received from server */
36};
37
33struct pnfs_layout_segment { 38struct pnfs_layout_segment {
34 struct list_head fi_list; 39 struct list_head pls_list;
35 struct pnfs_layout_range range; 40 struct pnfs_layout_range pls_range;
36 struct kref kref; 41 atomic_t pls_refcount;
37 struct pnfs_layout_hdr *layout; 42 unsigned long pls_flags;
43 struct pnfs_layout_hdr *pls_layout;
38}; 44};
39 45
40#ifdef CONFIG_NFS_V4_1 46#ifdef CONFIG_NFS_V4_1
@@ -44,7 +50,9 @@ struct pnfs_layout_segment {
44enum { 50enum {
45 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ 51 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
46 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ 52 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
47 NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */ 53 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
54 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
55 NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
48}; 56};
49 57
50/* Per-layout driver specific registration structure */ 58/* Per-layout driver specific registration structure */
@@ -60,13 +68,16 @@ struct pnfs_layoutdriver_type {
60}; 68};
61 69
62struct pnfs_layout_hdr { 70struct pnfs_layout_hdr {
63 unsigned long refcount; 71 atomic_t plh_refcount;
64 struct list_head layouts; /* other client layouts */ 72 struct list_head plh_layouts; /* other client layouts */
65 struct list_head segs; /* layout segments list */ 73 struct list_head plh_bulk_recall; /* clnt list of bulk recalls */
66 seqlock_t seqlock; /* Protects the stateid */ 74 struct list_head plh_segs; /* layout segments list */
67 nfs4_stateid stateid; 75 nfs4_stateid plh_stateid;
68 unsigned long state; 76 atomic_t plh_outstanding; /* number of RPCs out */
69 struct inode *inode; 77 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
78 u32 plh_barrier; /* ignore lower seqids */
79 unsigned long plh_flags;
80 struct inode *plh_inode;
70}; 81};
71 82
72struct pnfs_device { 83struct pnfs_device {
@@ -134,17 +145,30 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
134extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); 145extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
135 146
136/* pnfs.c */ 147/* pnfs.c */
148void get_layout_hdr(struct pnfs_layout_hdr *lo);
137struct pnfs_layout_segment * 149struct pnfs_layout_segment *
138pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 150pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
139 enum pnfs_iomode access_type); 151 enum pnfs_iomode access_type);
140void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 152void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
141void unset_pnfs_layoutdriver(struct nfs_server *); 153void unset_pnfs_layoutdriver(struct nfs_server *);
142int pnfs_layout_process(struct nfs4_layoutget *lgp); 154int pnfs_layout_process(struct nfs4_layoutget *lgp);
155void pnfs_free_lseg_list(struct list_head *tmp_list);
143void pnfs_destroy_layout(struct nfs_inode *); 156void pnfs_destroy_layout(struct nfs_inode *);
144void pnfs_destroy_all_layouts(struct nfs_client *); 157void pnfs_destroy_all_layouts(struct nfs_client *);
145void put_layout_hdr(struct inode *inode); 158void put_layout_hdr(struct pnfs_layout_hdr *lo);
146void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 159void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
147 struct nfs4_state *open_state); 160 const nfs4_stateid *new,
161 bool update_barrier);
162int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
163 struct pnfs_layout_hdr *lo,
164 struct nfs4_state *open_state);
165int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
166 struct list_head *tmp_list,
167 u32 iomode);
168bool pnfs_roc(struct inode *ino);
169void pnfs_roc_release(struct inode *ino);
170void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
171bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
148 172
149 173
150static inline int lo_fail_bit(u32 iomode) 174static inline int lo_fail_bit(u32 iomode)
@@ -176,6 +200,28 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
176 return NULL; 200 return NULL;
177} 201}
178 202
203static inline bool
204pnfs_roc(struct inode *ino)
205{
206 return false;
207}
208
209static inline void
210pnfs_roc_release(struct inode *ino)
211{
212}
213
214static inline void
215pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
216{
217}
218
219static inline bool
220pnfs_roc_drain(struct inode *ino, u32 *barrier)
221{
222 return false;
223}
224
179static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) 225static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
180{ 226{
181} 227}
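The pnfs.h hunks pair each new entry point (pnfs_roc() and friends) with a no-op static inline stub in the !CONFIG_NFS_V4_1 branch, so callers never need their own ifdefs. A generic sketch of the pattern, with CONFIG_FEATURE_X and the feature_x_* names purely hypothetical:

#include <stdbool.h>

struct inode;   /* opaque to this sketch */

#ifdef CONFIG_FEATURE_X
bool feature_x_begin(struct inode *ino);
void feature_x_end(struct inode *ino);
#else
static inline bool feature_x_begin(struct inode *ino)
{
	return false;   /* compiled out: caller takes the fallback path */
}
static inline void feature_x_end(struct inode *ino)
{
}
#endif

Because the stubs are static inline and trivially constant, the compiler deletes the disabled code paths entirely.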
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 58e7f84fc1fd..77d5e21c4ad6 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -458,7 +458,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
458 fattr = nfs_alloc_fattr(); 458 fattr = nfs_alloc_fattr();
459 status = -ENOMEM; 459 status = -ENOMEM;
460 if (fh == NULL || fattr == NULL) 460 if (fh == NULL || fattr == NULL)
461 goto out; 461 goto out_free;
462 462
463 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 463 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
464 nfs_mark_for_revalidate(dir); 464 nfs_mark_for_revalidate(dir);
@@ -471,6 +471,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
471 if (status == 0) 471 if (status == 0)
472 status = nfs_instantiate(dentry, fh, fattr); 472 status = nfs_instantiate(dentry, fh, fattr);
473 473
474out_free:
474 nfs_free_fattr(fattr); 475 nfs_free_fattr(fattr);
475 nfs_free_fhandle(fh); 476 nfs_free_fhandle(fh);
476out: 477out:
@@ -731,7 +732,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
731 .statfs = nfs_proc_statfs, 732 .statfs = nfs_proc_statfs,
732 .fsinfo = nfs_proc_fsinfo, 733 .fsinfo = nfs_proc_fsinfo,
733 .pathconf = nfs_proc_pathconf, 734 .pathconf = nfs_proc_pathconf,
734 .decode_dirent = nfs_decode_dirent, 735 .decode_dirent = nfs2_decode_dirent,
735 .read_setup = nfs_proc_read_setup, 736 .read_setup = nfs_proc_read_setup,
736 .read_done = nfs_read_done, 737 .read_done = nfs_read_done,
737 .write_setup = nfs_proc_write_setup, 738 .write_setup = nfs_proc_write_setup,
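The nfs_proc_symlink() fix above retargets the failure jump from out to a new out_free label, so the fattr/fh allocations are released on the error path too. The shape is the standard kernel goto-unwind idiom; a userspace sketch:

#include <stdlib.h>

static int do_op(void)
{
	int status = -1;
	char *fh = malloc(64);
	char *fattr = malloc(64);

	if (fh == NULL || fattr == NULL)
		goto out_free;  /* free(NULL) is a no-op, so this is safe */

	/* ... issue the request here ... */
	status = 0;
out_free:
	free(fattr);
	free(fh);
	return status;
}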
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 4100630c9a5b..b68c8607770f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -598,7 +598,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
598 598
599 if (nfss->mountd_version || showdefaults) 599 if (nfss->mountd_version || showdefaults)
600 seq_printf(m, ",mountvers=%u", nfss->mountd_version); 600 seq_printf(m, ",mountvers=%u", nfss->mountd_version);
601 if (nfss->mountd_port || showdefaults) 601 if ((nfss->mountd_port &&
602 nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) ||
603 showdefaults)
602 seq_printf(m, ",mountport=%u", nfss->mountd_port); 604 seq_printf(m, ",mountport=%u", nfss->mountd_port);
603 605
604 nfs_show_mountd_netid(m, nfss, showdefaults); 606 nfs_show_mountd_netid(m, nfss, showdefaults);
@@ -2200,6 +2202,7 @@ static int nfs_set_super(struct super_block *s, void *data)
2200 2202
2201 s->s_flags = sb_mntdata->mntflags; 2203 s->s_flags = sb_mntdata->mntflags;
2202 s->s_fs_info = server; 2204 s->s_fs_info = server;
2205 s->s_d_op = server->nfs_client->rpc_ops->dentry_ops;
2203 ret = set_anon_super(s, server); 2206 ret = set_anon_super(s, server);
2204 if (ret == 0) 2207 if (ret == 0)
2205 server->s_dev = s->s_dev; 2208 server->s_dev = s->s_dev;
@@ -2494,7 +2497,13 @@ static void nfs4_clone_super(struct super_block *sb,
2494 sb->s_maxbytes = old_sb->s_maxbytes; 2497 sb->s_maxbytes = old_sb->s_maxbytes;
2495 sb->s_time_gran = 1; 2498 sb->s_time_gran = 1;
2496 sb->s_op = old_sb->s_op; 2499 sb->s_op = old_sb->s_op;
2497 nfs_initialise_sb(sb); 2500 /*
2501 * The VFS shouldn't apply the umask to mode bits. We will do
2502 * so ourselves when necessary.
2503 */
2504 sb->s_flags |= MS_POSIXACL;
2505 sb->s_xattr = old_sb->s_xattr;
2506 nfs_initialise_sb(sb);
2498} 2507}
2499 2508
2500/* 2509/*
@@ -2504,6 +2513,12 @@ static void nfs4_fill_super(struct super_block *sb)
2504{ 2513{
2505 sb->s_time_gran = 1; 2514 sb->s_time_gran = 1;
2506 sb->s_op = &nfs4_sops; 2515 sb->s_op = &nfs4_sops;
2516 /*
2517 * The VFS shouldn't apply the umask to mode bits. We will do
2518 * so ourselves when necessary.
2519 */
2520 sb->s_flags |= MS_POSIXACL;
2521 sb->s_xattr = nfs4_xattr_handlers;
2507 nfs_initialise_sb(sb); 2522 nfs_initialise_sb(sb);
2508} 2523}
2509 2524
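Setting MS_POSIXACL above is how a filesystem opts out of VFS-side umask handling: with the flag clear, the VFS strips the umask bits from every create mode itself; with it set, the filesystem applies the umask only when no ACL says otherwise. The masking the VFS would otherwise do is one expression:

#include <sys/stat.h>

/* What the VFS does on create when MS_POSIXACL is clear:
 * strip the process umask bits from the requested mode.
 * e.g. apply_umask(0666, 022) == 0644 */
static mode_t apply_umask(mode_t requested, mode_t umask_bits)
{
	return requested & ~umask_bits;
}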
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 8fe9eb47a97f..e313a51acdd1 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -429,7 +429,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
429 data = kzalloc(sizeof(*data), GFP_KERNEL); 429 data = kzalloc(sizeof(*data), GFP_KERNEL);
430 if (data == NULL) 430 if (data == NULL)
431 return ERR_PTR(-ENOMEM); 431 return ERR_PTR(-ENOMEM);
432 task_setup_data.callback_data = data, 432 task_setup_data.callback_data = data;
433 433
434 data->cred = rpc_lookup_cred(); 434 data->cred = rpc_lookup_cred();
435 if (IS_ERR(data->cred)) { 435 if (IS_ERR(data->cred)) {
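The unlink.c one-liner swaps a stray comma for a semicolon. The original statement was still legal C, because the comma operator evaluates both operands and yields only the right-hand one - which is exactly why such typos compile silently. A self-checking illustration:

#include <assert.h>

int main(void)
{
	int a = 0, b = 1, c = 2;

	a = b, c;       /* parses as (a = b), c; c is computed and dropped */
	assert(a == 1);

	a = (b, c);     /* parenthesized, the comma yields its right side */
	assert(a == 2);
	return 0;
}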
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
new file mode 100644
index 000000000000..34e5c40af5ef
--- /dev/null
+++ b/fs/nfsd/acl.h
@@ -0,0 +1,59 @@
1/*
2 * Common NFSv4 ACL handling definitions.
3 *
4 * Copyright (c) 2002 The Regents of the University of Michigan.
5 * All rights reserved.
6 *
7 * Marius Aamodt Eriksen <marius@umich.edu>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35#ifndef LINUX_NFS4_ACL_H
36#define LINUX_NFS4_ACL_H
37
38#include <linux/posix_acl.h>
39
40/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
41 * fit in a page: */
42#define NFS4_ACL_MAX 170
43
44struct nfs4_acl *nfs4_acl_new(int);
45int nfs4_acl_get_whotype(char *, u32);
46int nfs4_acl_write_who(int who, char *p);
47int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
48 uid_t who, u32 mask);
49
50#define NFS4_ACL_TYPE_DEFAULT 0x01
51#define NFS4_ACL_DIR 0x02
52#define NFS4_ACL_OWNER 0x04
53
54struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *,
55 struct posix_acl *, unsigned int flags);
56int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **,
57 struct posix_acl **, unsigned int flags);
58
59#endif /* LINUX_NFS4_ACL_H */
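The NFS4_ACL_MAX comment says the cap was chosen to fit in a page. Assuming roughly 24 bytes of per-ACE state (type, flag, access mask, and a who identifier - an assumption of this sketch, not a figure from the header), the arithmetic lands just under a 4 KiB page:

/* 170 ACEs * ~24 bytes/ACE = 4080 bytes <= 4096-byte page */
#define DEMO_PAGE_SIZE  4096
#define DEMO_ACE_BYTES  24      /* assumed per-ACE footprint */
#define DEMO_ACL_MAX    170

_Static_assert(DEMO_ACL_MAX * DEMO_ACE_BYTES <= DEMO_PAGE_SIZE,
	       "ACL cap must fit in one page");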
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c0fcb7ab7f6d..8b31e5f8795d 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1,4 +1,3 @@
1#define MSNFS /* HACK HACK */
2/* 1/*
3 * NFS exporting and validation. 2 * NFS exporting and validation.
4 * 3 *
@@ -1444,9 +1443,6 @@ static struct flags {
1444 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, 1443 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
1445 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, 1444 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
1446 { NFSEXP_V4ROOT, {"v4root", ""}}, 1445 { NFSEXP_V4ROOT, {"v4root", ""}},
1447#ifdef MSNFS
1448 { NFSEXP_MSNFS, {"msnfs", ""}},
1449#endif
1450 { 0, {"", ""}} 1446 { 0, {"", ""}}
1451}; 1447};
1452 1448
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
new file mode 100644
index 000000000000..2f3be1321534
--- /dev/null
+++ b/fs/nfsd/idmap.h
@@ -0,0 +1,62 @@
1/*
2 * Mapping of UID to name and vice versa.
3 *
4 * Copyright (c) 2002, 2003 The Regents of the University of
5 * Michigan. All rights reserved.
 6 *
7 * Marius Aamodt Eriksen <marius@umich.edu>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35#ifndef LINUX_NFSD_IDMAP_H
36#define LINUX_NFSD_IDMAP_H
37
38#include <linux/in.h>
39#include <linux/sunrpc/svc.h>
40
41/* XXX from linux/nfs_idmap.h */
42#define IDMAP_NAMESZ 128
43
44#ifdef CONFIG_NFSD_V4
45int nfsd_idmap_init(void);
46void nfsd_idmap_shutdown(void);
47#else
48static inline int nfsd_idmap_init(void)
49{
50 return 0;
51}
52static inline void nfsd_idmap_shutdown(void)
53{
54}
55#endif
56
57__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
58__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
59int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *);
60int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *);
61
62#endif /* LINUX_NFSD_IDMAP_H */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 5b7e3021e06b..2247fc91d5e9 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -151,10 +151,10 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
151 __be32 nfserr; 151 __be32 nfserr;
152 u32 max_blocksize = svc_max_payload(rqstp); 152 u32 max_blocksize = svc_max_payload(rqstp);
153 153
154 dprintk("nfsd: READ(3) %s %lu bytes at %lu\n", 154 dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
155 SVCFH_fmt(&argp->fh), 155 SVCFH_fmt(&argp->fh),
156 (unsigned long) argp->count, 156 (unsigned long) argp->count,
157 (unsigned long) argp->offset); 157 (unsigned long long) argp->offset);
158 158
159 /* Obtain buffer pointer for payload. 159 /* Obtain buffer pointer for payload.
160 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) 160 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
@@ -191,10 +191,10 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
191 __be32 nfserr; 191 __be32 nfserr;
192 unsigned long cnt = argp->len; 192 unsigned long cnt = argp->len;
193 193
194 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", 194 dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n",
195 SVCFH_fmt(&argp->fh), 195 SVCFH_fmt(&argp->fh),
196 argp->len, 196 argp->len,
197 (unsigned long) argp->offset, 197 (unsigned long long) argp->offset,
198 argp->stable? " stable" : ""); 198 argp->stable? " stable" : "");
199 199
200 fh_copy(&resp->fh, &argp->fh); 200 fh_copy(&resp->fh, &argp->fh);
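The two dprintk fixes above stop pushing a 64-bit offset through a %lu/(unsigned long) pair, which truncates on 32-bit builds where unsigned long is only 32 bits wide. Casting to unsigned long long and using %llu (or %Lu, which the kernel's vsprintf also accepts, as these hunks do) is width-correct on every architecture:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t offset = 5ULL << 32;   /* deliberately wider than 32 bits */

	printf("offset=%llu\n", (unsigned long long)offset);
	return 0;
}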
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index e48052615159..ad88f1c0a4c3 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,7 +36,7 @@
36 36
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
39#include <linux/nfs4_acl.h> 39#include "acl.h"
40 40
41 41
42/* mode bit translations: */ 42/* mode bit translations: */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 143da2eecd7b..3be975e18919 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -50,11 +50,6 @@ enum {
50 NFSPROC4_CLNT_CB_SEQUENCE, 50 NFSPROC4_CLNT_CB_SEQUENCE,
51}; 51};
52 52
53enum nfs_cb_opnum4 {
54 OP_CB_RECALL = 4,
55 OP_CB_SEQUENCE = 11,
56};
57
58#define NFS4_MAXTAGLEN 20 53#define NFS4_MAXTAGLEN 20
59 54
60#define NFS4_enc_cb_null_sz 0 55#define NFS4_enc_cb_null_sz 0
@@ -79,61 +74,6 @@ enum nfs_cb_opnum4 {
79 cb_sequence_dec_sz + \ 74 cb_sequence_dec_sz + \
80 op_dec_sz) 75 op_dec_sz)
81 76
82/*
83* Generic encode routines from fs/nfs/nfs4xdr.c
84*/
85static inline __be32 *
86xdr_writemem(__be32 *p, const void *ptr, int nbytes)
87{
88 int tmp = XDR_QUADLEN(nbytes);
89 if (!tmp)
90 return p;
91 p[tmp-1] = 0;
92 memcpy(p, ptr, nbytes);
93 return p + tmp;
94}
95
96#define WRITE32(n) *p++ = htonl(n)
97#define WRITEMEM(ptr,nbytes) do { \
98 p = xdr_writemem(p, ptr, nbytes); \
99} while (0)
100#define RESERVE_SPACE(nbytes) do { \
101 p = xdr_reserve_space(xdr, nbytes); \
102 if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __func__); \
103 BUG_ON(!p); \
104} while (0)
105
106/*
107 * Generic decode routines from fs/nfs/nfs4xdr.c
108 */
109#define DECODE_TAIL \
110 status = 0; \
111out: \
112 return status; \
113xdr_error: \
114 dprintk("NFSD: xdr error! (%s:%d)\n", __FILE__, __LINE__); \
115 status = -EIO; \
116 goto out
117
118#define READ32(x) (x) = ntohl(*p++)
119#define READ64(x) do { \
120 (x) = (u64)ntohl(*p++) << 32; \
121 (x) |= ntohl(*p++); \
122} while (0)
123#define READTIME(x) do { \
124 p++; \
125 (x.tv_sec) = ntohl(*p++); \
126 (x.tv_nsec) = ntohl(*p++); \
127} while (0)
128#define READ_BUF(nbytes) do { \
129 p = xdr_inline_decode(xdr, nbytes); \
130 if (!p) { \
131 dprintk("NFSD: %s: reply buffer overflowed in line %d.\n", \
132 __func__, __LINE__); \
133 return -EIO; \
134 } \
135} while (0)
136
137struct nfs4_cb_compound_hdr { 77struct nfs4_cb_compound_hdr {
138 /* args */ 78 /* args */
139 u32 ident; /* minorversion 0 only */ 79 u32 ident; /* minorversion 0 only */
@@ -144,295 +84,513 @@ struct nfs4_cb_compound_hdr {
144 int status; 84 int status;
145}; 85};
146 86
147static struct { 87/*
148int stat; 88 * Handle decode buffer overflows out-of-line.
149int errno; 89 */
150} nfs_cb_errtbl[] = { 90static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
151 { NFS4_OK, 0 }, 91{
152 { NFS4ERR_PERM, EPERM }, 92 dprintk("NFS: %s prematurely hit the end of our receive buffer. "
153 { NFS4ERR_NOENT, ENOENT }, 93 "Remaining buffer length is %tu words.\n",
154 { NFS4ERR_IO, EIO }, 94 func, xdr->end - xdr->p);
155 { NFS4ERR_NXIO, ENXIO }, 95}
156 { NFS4ERR_ACCESS, EACCES },
157 { NFS4ERR_EXIST, EEXIST },
158 { NFS4ERR_XDEV, EXDEV },
159 { NFS4ERR_NOTDIR, ENOTDIR },
160 { NFS4ERR_ISDIR, EISDIR },
161 { NFS4ERR_INVAL, EINVAL },
162 { NFS4ERR_FBIG, EFBIG },
163 { NFS4ERR_NOSPC, ENOSPC },
164 { NFS4ERR_ROFS, EROFS },
165 { NFS4ERR_MLINK, EMLINK },
166 { NFS4ERR_NAMETOOLONG, ENAMETOOLONG },
167 { NFS4ERR_NOTEMPTY, ENOTEMPTY },
168 { NFS4ERR_DQUOT, EDQUOT },
169 { NFS4ERR_STALE, ESTALE },
170 { NFS4ERR_BADHANDLE, EBADHANDLE },
171 { NFS4ERR_BAD_COOKIE, EBADCOOKIE },
172 { NFS4ERR_NOTSUPP, ENOTSUPP },
173 { NFS4ERR_TOOSMALL, ETOOSMALL },
174 { NFS4ERR_SERVERFAULT, ESERVERFAULT },
175 { NFS4ERR_BADTYPE, EBADTYPE },
176 { NFS4ERR_LOCKED, EAGAIN },
177 { NFS4ERR_RESOURCE, EREMOTEIO },
178 { NFS4ERR_SYMLINK, ELOOP },
179 { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP },
180 { NFS4ERR_DEADLOCK, EDEADLK },
181 { -1, EIO }
182};
183 96
184static int 97static __be32 *xdr_encode_empty_array(__be32 *p)
185nfs_cb_stat_to_errno(int stat)
186{ 98{
187 int i; 99 *p++ = xdr_zero;
188 for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { 100 return p;
189 if (nfs_cb_errtbl[i].stat == stat)
190 return nfs_cb_errtbl[i].errno;
191 }
192 /* If we cannot translate the error, the recovery routines should
193 * handle it.
194 * Note: remaining NFSv4 error codes have values > 10000, so should
195 * not conflict with native Linux error codes.
196 */
197 return stat;
198} 101}
199 102
200/* 103/*
201 * XDR encode 104 * Encode/decode NFSv4 CB basic data types
105 *
106 * Basic NFSv4 callback data types are defined in section 15 of RFC
107 * 3530: "Network File System (NFS) version 4 Protocol" and section
108 * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version
109 * 1 Protocol"
202 */ 110 */
203 111
204static void 112/*
205encode_stateid(struct xdr_stream *xdr, stateid_t *sid) 113 * nfs_cb_opnum4
114 *
115 * enum nfs_cb_opnum4 {
116 * OP_CB_GETATTR = 3,
117 * ...
118 * };
119 */
120enum nfs_cb_opnum4 {
121 OP_CB_GETATTR = 3,
122 OP_CB_RECALL = 4,
123 OP_CB_LAYOUTRECALL = 5,
124 OP_CB_NOTIFY = 6,
125 OP_CB_PUSH_DELEG = 7,
126 OP_CB_RECALL_ANY = 8,
127 OP_CB_RECALLABLE_OBJ_AVAIL = 9,
128 OP_CB_RECALL_SLOT = 10,
129 OP_CB_SEQUENCE = 11,
130 OP_CB_WANTS_CANCELLED = 12,
131 OP_CB_NOTIFY_LOCK = 13,
132 OP_CB_NOTIFY_DEVICEID = 14,
133 OP_CB_ILLEGAL = 10044
134};
135
136static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op)
206{ 137{
207 __be32 *p; 138 __be32 *p;
208 139
209 RESERVE_SPACE(sizeof(stateid_t)); 140 p = xdr_reserve_space(xdr, 4);
210 WRITE32(sid->si_generation); 141 *p = cpu_to_be32(op);
211 WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
212} 142}
213 143
214static void 144/*
215encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) 145 * nfs_fh4
146 *
147 * typedef opaque nfs_fh4<NFS4_FHSIZE>;
148 */
149static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)
216{ 150{
217 __be32 * p; 151 u32 length = fh->fh_size;
152 __be32 *p;
218 153
219 RESERVE_SPACE(16); 154 BUG_ON(length > NFS4_FHSIZE);
220 WRITE32(0); /* tag length is always 0 */ 155 p = xdr_reserve_space(xdr, 4 + length);
221 WRITE32(hdr->minorversion); 156 xdr_encode_opaque(p, &fh->fh_base, length);
222 WRITE32(hdr->ident);
223 hdr->nops_p = p;
224 WRITE32(hdr->nops);
225} 157}
226 158
227static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr) 159/*
160 * stateid4
161 *
162 * struct stateid4 {
163 * uint32_t seqid;
164 * opaque other[12];
165 * };
166 */
167static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid)
228{ 168{
229 *hdr->nops_p = htonl(hdr->nops); 169 __be32 *p;
170
171 p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE);
172 *p++ = cpu_to_be32(sid->si_generation);
173 xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE);
230} 174}
231 175
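encode_stateid4() and its siblings follow sunrpc's reserve-then-fill discipline: xdr_reserve_space() claims a 4-byte-aligned slot in the send buffer, and the caller writes big-endian words into it. A toy userspace stream showing the shape (the real xdr_stream also spans page vectors and handles overflow differently):

#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct toy_xdr {
	unsigned char *p, *end;
};

/* claim nbytes (padded to the XDR 4-byte boundary) or fail */
static unsigned char *toy_reserve(struct toy_xdr *x, size_t nbytes)
{
	size_t padded = (nbytes + 3) & ~(size_t)3;
	unsigned char *p = x->p;

	if ((size_t)(x->end - x->p) < padded)
		return NULL;
	x->p += padded;
	return p;
}

static int toy_encode_u32(struct toy_xdr *x, uint32_t val)
{
	unsigned char *p = toy_reserve(x, 4);
	uint32_t be = htonl(val);

	if (!p)
		return -1;
	memcpy(p, &be, 4);
	return 0;
}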
232static void 176/*
233encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp, 177 * sessionid4
234 struct nfs4_cb_compound_hdr *hdr) 178 *
179 * typedef opaque sessionid4[NFS4_SESSIONID_SIZE];
180 */
181static void encode_sessionid4(struct xdr_stream *xdr,
182 const struct nfsd4_session *session)
235{ 183{
236 __be32 *p; 184 __be32 *p;
237 int len = dp->dl_fh.fh_size; 185
238 186 p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
239 RESERVE_SPACE(4); 187 xdr_encode_opaque_fixed(p, session->se_sessionid.data,
240 WRITE32(OP_CB_RECALL); 188 NFS4_MAX_SESSIONID_LEN);
241 encode_stateid(xdr, &dp->dl_stateid);
242 RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2));
243 WRITE32(0); /* truncate optimization not implemented */
244 WRITE32(len);
245 WRITEMEM(&dp->dl_fh.fh_base, len);
246 hdr->nops++;
247} 189}
248 190
249static void 191/*
250encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb, 192 * nfsstat4
251 struct nfs4_cb_compound_hdr *hdr) 193 */
252{ 194static const struct {
253 __be32 *p; 195 int stat;
254 struct nfsd4_session *ses = cb->cb_clp->cl_cb_session; 196 int errno;
197} nfs_cb_errtbl[] = {
198 { NFS4_OK, 0 },
199 { NFS4ERR_PERM, -EPERM },
200 { NFS4ERR_NOENT, -ENOENT },
201 { NFS4ERR_IO, -EIO },
202 { NFS4ERR_NXIO, -ENXIO },
203 { NFS4ERR_ACCESS, -EACCES },
204 { NFS4ERR_EXIST, -EEXIST },
205 { NFS4ERR_XDEV, -EXDEV },
206 { NFS4ERR_NOTDIR, -ENOTDIR },
207 { NFS4ERR_ISDIR, -EISDIR },
208 { NFS4ERR_INVAL, -EINVAL },
209 { NFS4ERR_FBIG, -EFBIG },
210 { NFS4ERR_NOSPC, -ENOSPC },
211 { NFS4ERR_ROFS, -EROFS },
212 { NFS4ERR_MLINK, -EMLINK },
213 { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG },
214 { NFS4ERR_NOTEMPTY, -ENOTEMPTY },
215 { NFS4ERR_DQUOT, -EDQUOT },
216 { NFS4ERR_STALE, -ESTALE },
217 { NFS4ERR_BADHANDLE, -EBADHANDLE },
218 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
219 { NFS4ERR_NOTSUPP, -ENOTSUPP },
220 { NFS4ERR_TOOSMALL, -ETOOSMALL },
221 { NFS4ERR_SERVERFAULT, -ESERVERFAULT },
222 { NFS4ERR_BADTYPE, -EBADTYPE },
223 { NFS4ERR_LOCKED, -EAGAIN },
224 { NFS4ERR_RESOURCE, -EREMOTEIO },
225 { NFS4ERR_SYMLINK, -ELOOP },
226 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
227 { NFS4ERR_DEADLOCK, -EDEADLK },
228 { -1, -EIO }
229};
255 230
256 if (hdr->minorversion == 0) 231/*
257 return; 232 * If we cannot translate the error, the recovery routines should
233 * handle it.
234 *
235 * Note: remaining NFSv4 error codes have values > 10000, so should
236 * not conflict with native Linux error codes.
237 */
238static int nfs_cb_stat_to_errno(int status)
239{
240 int i;
258 241
259 RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20); 242 for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) {
243 if (nfs_cb_errtbl[i].stat == status)
244 return nfs_cb_errtbl[i].errno;
245 }
260 246
261 WRITE32(OP_CB_SEQUENCE); 247 dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status);
262 WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); 248 return -status;
263 WRITE32(ses->se_cb_seq_nr);
264 WRITE32(0); /* slotid, always 0 */
265 WRITE32(0); /* highest slotid always 0 */
266 WRITE32(0); /* cachethis always 0 */
267 WRITE32(0); /* FIXME: support referring_call_lists */
268 hdr->nops++;
269} 249}
270 250
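nfs_cb_stat_to_errno() is a sentinel-terminated table scan: walk nfs_cb_errtbl until the { -1, ... } row, and treat anything unrecognized as a fallback case. The same shape in miniature, with illustrative values (this demo reuses the sentinel row as the default, whereas the real helper logs and negates the unknown status instead):

static const struct {
	int stat;
	int err;
} demo_errtbl[] = {
	{  0,  0 },
	{  1, -1 },
	{ -1, -5 },     /* sentinel row doubles as the fallback here */
};

static int demo_stat_to_errno(int stat)
{
	int i;

	for (i = 0; demo_errtbl[i].stat != -1; i++)
		if (demo_errtbl[i].stat == stat)
			return demo_errtbl[i].err;
	return demo_errtbl[i].err;
}

Note the rewritten kernel table stores negative errnos directly, so callers no longer have to negate the return value themselves.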
271static int 251static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected,
272nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) 252 enum nfsstat4 *status)
273{ 253{
274 struct xdr_stream xdrs, *xdr = &xdrs; 254 __be32 *p;
255 u32 op;
275 256
276 xdr_init_encode(&xdrs, &req->rq_snd_buf, p); 257 p = xdr_inline_decode(xdr, 4 + 4);
277 RESERVE_SPACE(0); 258 if (unlikely(p == NULL))
259 goto out_overflow;
260 op = be32_to_cpup(p++);
261 if (unlikely(op != expected))
262 goto out_unexpected;
263 *status = be32_to_cpup(p);
278 return 0; 264 return 0;
265out_overflow:
266 print_overflow_msg(__func__, xdr);
267 return -EIO;
268out_unexpected:
269 dprintk("NFSD: Callback server returned operation %d but "
270 "we issued a request for %d\n", op, expected);
271 return -EIO;
279} 272}
280 273
281static int 274/*
282nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, 275 * CB_COMPOUND4args
283 struct nfsd4_callback *cb) 276 *
277 * struct CB_COMPOUND4args {
278 * utf8str_cs tag;
279 * uint32_t minorversion;
280 * uint32_t callback_ident;
281 * nfs_cb_argop4 argarray<>;
282 * };
283*/
284static void encode_cb_compound4args(struct xdr_stream *xdr,
285 struct nfs4_cb_compound_hdr *hdr)
284{ 286{
285 struct xdr_stream xdr; 287 __be32 * p;
286 struct nfs4_delegation *args = cb->cb_op;
287 struct nfs4_cb_compound_hdr hdr = {
288 .ident = cb->cb_clp->cl_cb_ident,
289 .minorversion = cb->cb_minorversion,
290 };
291 288
292 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 289 p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
293 encode_cb_compound_hdr(&xdr, &hdr); 290 p = xdr_encode_empty_array(p); /* empty tag */
294 encode_cb_sequence(&xdr, cb, &hdr); 291 *p++ = cpu_to_be32(hdr->minorversion);
295 encode_cb_recall(&xdr, args, &hdr); 292 *p++ = cpu_to_be32(hdr->ident);
296 encode_cb_nops(&hdr); 293
294 hdr->nops_p = p;
295 *p = cpu_to_be32(hdr->nops); /* argarray element count */
296}
297
298/*
299 * Update argarray element count
300 */
301static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
302{
303 BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS);
304 *hdr->nops_p = cpu_to_be32(hdr->nops);
305}
306
307/*
308 * CB_COMPOUND4res
309 *
310 * struct CB_COMPOUND4res {
311 * nfsstat4 status;
312 * utf8str_cs tag;
313 * nfs_cb_resop4 resarray<>;
314 * };
315 */
316static int decode_cb_compound4res(struct xdr_stream *xdr,
317 struct nfs4_cb_compound_hdr *hdr)
318{
319 u32 length;
320 __be32 *p;
321
322 p = xdr_inline_decode(xdr, 4 + 4);
323 if (unlikely(p == NULL))
324 goto out_overflow;
325 hdr->status = be32_to_cpup(p++);
326 /* Ignore the tag */
327 length = be32_to_cpup(p++);
328 p = xdr_inline_decode(xdr, length + 4);
329 if (unlikely(p == NULL))
330 goto out_overflow;
331 hdr->nops = be32_to_cpup(p);
297 return 0; 332 return 0;
333out_overflow:
334 print_overflow_msg(__func__, xdr);
335 return -EIO;
298} 336}
299 337
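On the decode side, the new helpers never trust the wire: every read goes through xdr_inline_decode(), which returns NULL when the receive buffer runs short, and the caller turns that into print_overflow_msg() plus -EIO. A decode-side counterpart to the toy stream sketched earlier:

#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct toy_xdr {
	unsigned char *p, *end;
};

/* hand out nbytes (XDR-padded) only if they are actually present */
static unsigned char *toy_inline_decode(struct toy_xdr *x, size_t nbytes)
{
	size_t padded = (nbytes + 3) & ~(size_t)3;
	unsigned char *p = x->p;

	if ((size_t)(x->end - x->p) < padded)
		return NULL;    /* caller logs the overflow, returns -EIO */
	x->p += padded;
	return p;
}

static int toy_decode_u32(struct toy_xdr *x, uint32_t *val)
{
	unsigned char *p = toy_inline_decode(x, 4);
	uint32_t be;

	if (!p)
		return -1;
	memcpy(&be, p, 4);
	*val = ntohl(be);
	return 0;
}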
338/*
339 * CB_RECALL4args
340 *
341 * struct CB_RECALL4args {
342 * stateid4 stateid;
343 * bool truncate;
344 * nfs_fh4 fh;
345 * };
346 */
347static void encode_cb_recall4args(struct xdr_stream *xdr,
348 const struct nfs4_delegation *dp,
349 struct nfs4_cb_compound_hdr *hdr)
350{
351 __be32 *p;
352
353 encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
354 encode_stateid4(xdr, &dp->dl_stateid);
355
356 p = xdr_reserve_space(xdr, 4);
357 *p++ = xdr_zero; /* truncate */
300 358
301static int 359 encode_nfs_fh4(xdr, &dp->dl_fh);
302decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
303 __be32 *p;
304 u32 taglen;
305 360
306 READ_BUF(8); 361 hdr->nops++;
307 READ32(hdr->status);
308 /* We've got no use for the tag; ignore it: */
309 READ32(taglen);
310 READ_BUF(taglen + 4);
311 p += XDR_QUADLEN(taglen);
312 READ32(hdr->nops);
313 return 0;
314} 362}
315 363
316static int 364/*
317decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) 365 * CB_SEQUENCE4args
366 *
367 * struct CB_SEQUENCE4args {
368 * sessionid4 csa_sessionid;
369 * sequenceid4 csa_sequenceid;
370 * slotid4 csa_slotid;
371 * slotid4 csa_highest_slotid;
372 * bool csa_cachethis;
373 * referring_call_list4 csa_referring_call_lists<>;
374 * };
375 */
376static void encode_cb_sequence4args(struct xdr_stream *xdr,
377 const struct nfsd4_callback *cb,
378 struct nfs4_cb_compound_hdr *hdr)
318{ 379{
380 struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
319 __be32 *p; 381 __be32 *p;
320 u32 op; 382
321 int32_t nfserr; 383 if (hdr->minorversion == 0)
322 384 return;
323 READ_BUF(8); 385
324 READ32(op); 386 encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE);
325 if (op != expected) { 387 encode_sessionid4(xdr, session);
326 dprintk("NFSD: decode_cb_op_hdr: Callback server returned " 388
327 " operation %d but we issued a request for %d\n", 389 p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
328 op, expected); 390 *p++ = cpu_to_be32(session->se_cb_seq_nr); /* csa_sequenceid */
329 return -EIO; 391 *p++ = xdr_zero; /* csa_slotid */
330 } 392 *p++ = xdr_zero; /* csa_highest_slotid */
331 READ32(nfserr); 393 *p++ = xdr_zero; /* csa_cachethis */
332 if (nfserr != NFS_OK) 394 xdr_encode_empty_array(p); /* csa_referring_call_lists */
333 return -nfs_cb_stat_to_errno(nfserr); 395
334 return 0; 396 hdr->nops++;
335} 397}
336 398
337/* 399/*
400 * CB_SEQUENCE4resok
401 *
402 * struct CB_SEQUENCE4resok {
403 * sessionid4 csr_sessionid;
404 * sequenceid4 csr_sequenceid;
405 * slotid4 csr_slotid;
406 * slotid4 csr_highest_slotid;
407 * slotid4 csr_target_highest_slotid;
408 * };
409 *
410 * union CB_SEQUENCE4res switch (nfsstat4 csr_status) {
411 * case NFS4_OK:
412 * CB_SEQUENCE4resok csr_resok4;
413 * default:
414 * void;
415 * };
416 *
 338 * Our current back channel implementation supports a single backchannel 417
339 * with a single slot. 418 * with a single slot.
340 */ 419 */
420static int decode_cb_sequence4resok(struct xdr_stream *xdr,
421				    struct nfsd4_callback *cb)
422{
423	struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
424	struct nfs4_sessionid id;
425	int status;
426	__be32 *p;
427	u32 dummy;
428
429	status = -ESERVERFAULT;
430
431	/*
432	 * If the server returns different values for sessionID, slotID or
433	 * sequence number, the server is looney tunes.
434	 */
435	p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4);
436	if (unlikely(p == NULL))
437		goto out_overflow;
438	memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
439	if (memcmp(id.data, session->se_sessionid.data,
440					NFS4_MAX_SESSIONID_LEN) != 0) {
441		dprintk("NFS: %s Invalid session id\n", __func__);
442		goto out;
443	}
444	p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
445
446	dummy = be32_to_cpup(p++);
447	if (dummy != session->se_cb_seq_nr) {
448		dprintk("NFS: %s Invalid sequence number\n", __func__);
449		goto out;
450	}
451
452	dummy = be32_to_cpup(p++);
453	if (dummy != 0) {
454		dprintk("NFS: %s Invalid slotid\n", __func__);
455		goto out;
456	}
457
458	/*
459	 * FIXME: process highest slotid and target highest slotid
460	 */
461	status = 0;
462out:
463	return status;
464out_overflow:
465	print_overflow_msg(__func__, xdr);
466	return -EIO;
467}

341static int
342decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
343		struct rpc_rqst *rqstp)
344{
345	struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
346	struct nfs4_sessionid id;
347	int status;
348	u32 dummy;
349	__be32 *p;
350
351	if (cb->cb_minorversion == 0)
352		return 0;
353
354	status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
355	if (status)
356		return status;
357
358	/*
359	 * If the server returns different values for sessionID, slotID or
360	 * sequence number, the server is looney tunes.
361	 */
362	status = -ESERVERFAULT;
363
364	READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
365	memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
366	p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
367	if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) {
368		dprintk("%s Invalid session id\n", __func__);
369		goto out;
370	}
371	READ32(dummy);
372	if (dummy != ses->se_cb_seq_nr) {
373		dprintk("%s Invalid sequence number\n", __func__);
374		goto out;
375	}
376	READ32(dummy); /* slotid must be 0 */
377	if (dummy != 0) {
378		dprintk("%s Invalid slotid\n", __func__);
379		goto out;
380	}
381	/* FIXME: process highest slotid and target highest slotid */
382	status = 0;
383out:
384	return status;
385}
386 468
469static int decode_cb_sequence4res(struct xdr_stream *xdr,
470 struct nfsd4_callback *cb)
471{
472 enum nfsstat4 nfserr;
473 int status;
474
475 if (cb->cb_minorversion == 0)
476 return 0;
387 477
478	status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr);
479	if (unlikely(status))
480		goto out;
481	if (unlikely(nfserr != NFS4_OK))
482		goto out_default;
483	status = decode_cb_sequence4resok(xdr, cb);
484out:
485	return status;
486out_default:
487	return nfs_cb_stat_to_errno(nfserr);
488}
489
490/*
491 * NFSv4.0 and NFSv4.1 XDR encode functions
492 *
493 * NFSv4.0 callback argument types are defined in section 15 of RFC
494 * 3530: "Network File System (NFS) version 4 Protocol" and section 20
495 * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1
496 * Protocol".
497 */
498
499/*
500 * NB: Without this zero space reservation, callbacks over krb5p fail
501 */
502static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
503				 void *__unused)
504{
505	xdr_reserve_space(xdr, 0);
506}
507
508/*
509 * 20.2. Operation 4: CB_RECALL - Recall a Delegation
510 */
511static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
512				   const struct nfsd4_callback *cb)
513{
514	const struct nfs4_delegation *args = cb->cb_op;
515	struct nfs4_cb_compound_hdr hdr = {
516		.ident = cb->cb_clp->cl_cb_ident,
517		.minorversion = cb->cb_minorversion,
518	};
519
520	encode_cb_compound4args(xdr, &hdr);
521	encode_cb_sequence4args(xdr, cb, &hdr);
522	encode_cb_recall4args(xdr, args, &hdr);
523	encode_cb_nops(&hdr);
524}
525
526
527/*
528 * NFSv4.0 and NFSv4.1 XDR decode functions
529 *
530 * NFSv4.0 callback result types are defined in section 15 of RFC
531 * 3530: "Network File System (NFS) version 4 Protocol" and section 20
532 * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1
533 * Protocol".
534 */
535
536static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
537				void *__unused)

388static int
389nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
390{ 538{
391 return 0; 539 return 0;
392} 540}
393 541
394static int
395nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
396		struct nfsd4_callback *cb)
397{
398	struct xdr_stream xdr;
399	struct nfs4_cb_compound_hdr hdr;
400	int status;
401
402	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
403	status = decode_cb_compound_hdr(&xdr, &hdr);
404	if (status)
405		goto out;
406	if (cb) {
407		status = decode_cb_sequence(&xdr, cb, rqstp);
408		if (status)
409			goto out;
410	}
411	status = decode_cb_op_hdr(&xdr, OP_CB_RECALL);
412out:
413	return status;
414}

542/*
543 * 20.2. Operation 4: CB_RECALL - Recall a Delegation
544 */
545static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
546				  struct xdr_stream *xdr,
547				  struct nfsd4_callback *cb)
548{
549	struct nfs4_cb_compound_hdr hdr;
550	enum nfsstat4 nfserr;
551	int status;
552
553	status = decode_cb_compound4res(xdr, &hdr);
554	if (unlikely(status))
555		goto out;
556
557	if (cb != NULL) {
558		status = decode_cb_sequence4res(xdr, cb);
559		if (unlikely(status))
560			goto out;
561	}
562
563	status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr);
564	if (unlikely(status))
565		goto out;
566	if (unlikely(nfserr != NFS4_OK))
567		goto out_default;
568out:
569	return status;
570out_default:
571	return nfs_cb_stat_to_errno(nfserr);
572}
415 573
416/* 574/*
417 * RPC procedure tables 575 * RPC procedure tables
418 */ 576 */
419#define PROC(proc, call, argtype, restype) \ 577#define PROC(proc, call, argtype, restype) \
420[NFSPROC4_CLNT_##proc] = { \ 578[NFSPROC4_CLNT_##proc] = { \
421 .p_proc = NFSPROC4_CB_##call, \ 579 .p_proc = NFSPROC4_CB_##call, \
422 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 580 .p_encode = (kxdreproc_t)nfs4_xdr_enc_##argtype, \
423 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 581 .p_decode = (kxdrdproc_t)nfs4_xdr_dec_##restype, \
424 .p_arglen = NFS4_##argtype##_sz, \ 582 .p_arglen = NFS4_enc_##argtype##_sz, \
425 .p_replen = NFS4_##restype##_sz, \ 583 .p_replen = NFS4_dec_##restype##_sz, \
426 .p_statidx = NFSPROC4_CB_##call, \ 584 .p_statidx = NFSPROC4_CB_##call, \
427 .p_name = #proc, \ 585 .p_name = #proc, \
428} 586}
429 587
430static struct rpc_procinfo nfs4_cb_procedures[] = { 588static struct rpc_procinfo nfs4_cb_procedures[] = {
431 PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), 589 PROC(CB_NULL, NULL, cb_null, cb_null),
432 PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), 590 PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall),
433}; 591};
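Each table entry binds a wire procedure number to its encode/decode pair. The nfsd4_cb_recall() hunk later in this file consumes an entry roughly like this (field names taken from that hunk):

	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
	cb->cb_msg.rpc_argp = cb;
	cb->cb_msg.rpc_resp = cb;

so the RPC client reaches nfs4_xdr_enc_cb_recall()/nfs4_xdr_dec_cb_recall() through .p_encode/.p_decode without the caller naming them directly.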
434 592
435static struct rpc_version nfs_cb_version4 = { 593static struct rpc_version nfs_cb_version4 = {
436/* 594/*
437 * Note on the callback rpc program version number: despite language in rfc 595 * Note on the callback rpc program version number: despite language in rfc
438 * 5661 section 18.36.3 requiring servers to use 4 in this field, the 596 * 5661 section 18.36.3 requiring servers to use 4 in this field, the
@@ -440,29 +598,29 @@ static struct rpc_version nfs_cb_version4 = {
440 * in practice that appears to be what implementations use. The section 598 * in practice that appears to be what implementations use. The section
441 * 18.36.3 language is expected to be fixed in an erratum. 599 * 18.36.3 language is expected to be fixed in an erratum.
442 */ 600 */
443 .number = 1, 601 .number = 1,
444 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), 602 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures),
445 .procs = nfs4_cb_procedures 603 .procs = nfs4_cb_procedures
446}; 604};
447 605
448static struct rpc_version * nfs_cb_version[] = { 606static struct rpc_version *nfs_cb_version[] = {
449 &nfs_cb_version4, 607 &nfs_cb_version4,
450}; 608};
451 609
452static struct rpc_program cb_program; 610static struct rpc_program cb_program;
453 611
454static struct rpc_stat cb_stats = { 612static struct rpc_stat cb_stats = {
455 .program = &cb_program 613 .program = &cb_program
456}; 614};
457 615
458#define NFS4_CALLBACK 0x40000000 616#define NFS4_CALLBACK 0x40000000
459static struct rpc_program cb_program = { 617static struct rpc_program cb_program = {
460 .name = "nfs4_cb", 618 .name = "nfs4_cb",
461 .number = NFS4_CALLBACK, 619 .number = NFS4_CALLBACK,
462 .nrvers = ARRAY_SIZE(nfs_cb_version), 620 .nrvers = ARRAY_SIZE(nfs_cb_version),
463 .version = nfs_cb_version, 621 .version = nfs_cb_version,
464 .stats = &cb_stats, 622 .stats = &cb_stats,
465 .pipe_dir_name = "/nfsd4_cb", 623 .pipe_dir_name = "/nfsd4_cb",
466}; 624};
467 625
468static int max_cb_time(void) 626static int max_cb_time(void)
@@ -470,10 +628,8 @@ static int max_cb_time(void)
470 return max(nfsd4_lease/10, (time_t)1) * HZ; 628 return max(nfsd4_lease/10, (time_t)1) * HZ;
471} 629}
472 630
473/* Reference counting, callback cleanup, etc., all look racy as heck.
474 * And why is cl_cb_set an atomic? */
475 631
476int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn) 632static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
477{ 633{
478 struct rpc_timeout timeparms = { 634 struct rpc_timeout timeparms = {
479 .to_initval = max_cb_time(), 635 .to_initval = max_cb_time(),
@@ -483,6 +639,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
483 .net = &init_net, 639 .net = &init_net,
484 .address = (struct sockaddr *) &conn->cb_addr, 640 .address = (struct sockaddr *) &conn->cb_addr,
485 .addrsize = conn->cb_addrlen, 641 .addrsize = conn->cb_addrlen,
642 .saddress = (struct sockaddr *) &conn->cb_saddr,
486 .timeout = &timeparms, 643 .timeout = &timeparms,
487 .program = &cb_program, 644 .program = &cb_program,
488 .version = 0, 645 .version = 0,
@@ -499,6 +656,10 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
499 args.protocol = XPRT_TRANSPORT_TCP; 656 args.protocol = XPRT_TRANSPORT_TCP;
500 clp->cl_cb_ident = conn->cb_ident; 657 clp->cl_cb_ident = conn->cb_ident;
501 } else { 658 } else {
659 if (!conn->cb_xprt)
660 return -EINVAL;
661 clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
662 clp->cl_cb_session = ses;
502 args.bc_xprt = conn->cb_xprt; 663 args.bc_xprt = conn->cb_xprt;
503 args.prognumber = clp->cl_cb_session->se_cb_prog; 664 args.prognumber = clp->cl_cb_session->se_cb_prog;
504 args.protocol = XPRT_TRANSPORT_BC_TCP; 665 args.protocol = XPRT_TRANSPORT_BC_TCP;
@@ -521,14 +682,20 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
521 (int)clp->cl_name.len, clp->cl_name.data, reason); 682 (int)clp->cl_name.len, clp->cl_name.data, reason);
522} 683}
523 684
685static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
686{
687 clp->cl_cb_state = NFSD4_CB_DOWN;
688 warn_no_callback_path(clp, reason);
689}
690
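nfsd4_mark_cb_down() and the probe/recall completion paths below drive a three-state channel flag that replaces the old atomic cl_cb_set. The enum lives in the state header, which is not part of this fs/ diff; its assumed shape is:

enum nfsd4_cb_state {	/* sketch; declaration assumed, not shown in this diff */
	NFSD4_CB_UP,
	NFSD4_CB_UNKNOWN,
	NFSD4_CB_DOWN,
};

NFSD4_CB_UNKNOWN is the optimistic initial state (see create_client() and nfsd4_probe_callback() below); a successful NULL probe moves the client to UP, and any failure to DOWN.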
524static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) 691static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
525{ 692{
526 struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); 693 struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
527 694
528 if (task->tk_status) 695 if (task->tk_status)
529 warn_no_callback_path(clp, task->tk_status); 696 nfsd4_mark_cb_down(clp, task->tk_status);
530 else 697 else
531 atomic_set(&clp->cl_cb_set, 1); 698 clp->cl_cb_state = NFSD4_CB_UP;
532} 699}
533 700
534static const struct rpc_call_ops nfsd4_cb_probe_ops = { 701static const struct rpc_call_ops nfsd4_cb_probe_ops = {
@@ -551,6 +718,11 @@ int set_callback_cred(void)
551 718
552static struct workqueue_struct *callback_wq; 719static struct workqueue_struct *callback_wq;
553 720
721static void run_nfsd4_cb(struct nfsd4_callback *cb)
722{
723 queue_work(callback_wq, &cb->cb_work);
724}
725
554static void do_probe_callback(struct nfs4_client *clp) 726static void do_probe_callback(struct nfs4_client *clp)
555{ 727{
556 struct nfsd4_callback *cb = &clp->cl_cb_null; 728 struct nfsd4_callback *cb = &clp->cl_cb_null;
@@ -565,7 +737,7 @@ static void do_probe_callback(struct nfs4_client *clp)
565 737
566 cb->cb_ops = &nfsd4_cb_probe_ops; 738 cb->cb_ops = &nfsd4_cb_probe_ops;
567 739
568 queue_work(callback_wq, &cb->cb_work); 740 run_nfsd4_cb(cb);
569} 741}
570 742
571/* 743/*
@@ -574,14 +746,21 @@ static void do_probe_callback(struct nfs4_client *clp)
574 */ 746 */
575void nfsd4_probe_callback(struct nfs4_client *clp) 747void nfsd4_probe_callback(struct nfs4_client *clp)
576{ 748{
749 /* XXX: atomicity? Also, should we be using cl_cb_flags? */
750 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
577 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); 751 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
578 do_probe_callback(clp); 752 do_probe_callback(clp);
579} 753}
580 754
581void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
582{
583	BUG_ON(atomic_read(&clp->cl_cb_set));
584

755void nfsd4_probe_callback_sync(struct nfs4_client *clp)
756{
757	nfsd4_probe_callback(clp);
758	flush_workqueue(callback_wq);
759}
760
761void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
762{
763	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
585 spin_lock(&clp->cl_lock); 764 spin_lock(&clp->cl_lock);
586 memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); 765 memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
587 spin_unlock(&clp->cl_lock); 766 spin_unlock(&clp->cl_lock);
@@ -592,24 +771,14 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
592 * If the slot is available, then mark it busy. Otherwise, set the 771 * If the slot is available, then mark it busy. Otherwise, set the
593 * thread for sleeping on the callback RPC wait queue. 772 * thread for sleeping on the callback RPC wait queue.
594 */ 773 */
595static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
596		struct rpc_task *task)
597{
598	u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data;
599	int status = 0;
600
601	dprintk("%s: %u:%u:%u:%u\n", __func__,
602		ptr[0], ptr[1], ptr[2], ptr[3]);
603
604	if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
605		rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
606		dprintk("%s slot is busy\n", __func__);
607		status = -EAGAIN;
608		goto out;
609	}
610out:
611	dprintk("%s status=%d\n", __func__, status);
612	return status;
613}

774static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
775{
776	if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
777		rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
778		dprintk("%s slot is busy\n", __func__);
779		return false;
780	}
781	return true;
782}
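nfsd41_cb_get_slot() is a try-lock over the single backchannel slot: bit 0 of cl_cb_slot_busy stands in for a slot table of size one. A caller that finds the slot busy parks its task on cl_cb_waitq before reporting failure, so the prepare path can simply bail out:

	if (!nfsd41_cb_get_slot(clp, task))
		return;	/* task sleeps; the holder's completion path
			 * wakes it and ->rpc_call_prepare runs again */

The matching release (a clear_bit() plus rpc_wake_up_next() in the done path, elsewhere in this file) is unchanged by this patch.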
614 783
615/* 784/*
@@ -622,20 +791,19 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
622 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); 791 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
623 struct nfs4_client *clp = dp->dl_client; 792 struct nfs4_client *clp = dp->dl_client;
624 u32 minorversion = clp->cl_minorversion; 793 u32 minorversion = clp->cl_minorversion;
625 int status = 0;
626 794
627 cb->cb_minorversion = minorversion; 795 cb->cb_minorversion = minorversion;
628 if (minorversion) { 796 if (minorversion) {
629 status = nfsd41_cb_setup_sequence(clp, task); 797 if (!nfsd41_cb_get_slot(clp, task))
630 if (status) {
631 if (status != -EAGAIN) {
632 /* terminate rpc task */
633 task->tk_status = status;
634 task->tk_action = NULL;
635 }
636 return; 798 return;
637 }
638 } 799 }
800 spin_lock(&clp->cl_lock);
801 if (list_empty(&cb->cb_per_client)) {
802 /* This is the first call, not a restart */
803 cb->cb_done = false;
804 list_add(&cb->cb_per_client, &clp->cl_callbacks);
805 }
806 spin_unlock(&clp->cl_lock);
639 rpc_call_start(task); 807 rpc_call_start(task);
640} 808}
641 809
@@ -671,15 +839,18 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
671 839
672 nfsd4_cb_done(task, calldata); 840 nfsd4_cb_done(task, calldata);
673 841
674 if (current_rpc_client == NULL) { 842 if (current_rpc_client != task->tk_client) {
675 /* We're shutting down; give up. */ 843 /* We're shutting down or changing cl_cb_client; leave
676 /* XXX: err, or is it ok just to fall through 844 * it to nfsd4_process_cb_update to restart the call if
677 * and rpc_restart_call? */ 845 * necessary. */
678 return; 846 return;
679 } 847 }
680 848
849 if (cb->cb_done)
850 return;
681 switch (task->tk_status) { 851 switch (task->tk_status) {
682 case 0: 852 case 0:
853 cb->cb_done = true;
683 return; 854 return;
684 case -EBADHANDLE: 855 case -EBADHANDLE:
685 case -NFS4ERR_BAD_STATEID: 856 case -NFS4ERR_BAD_STATEID:
@@ -688,32 +859,30 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
688 break; 859 break;
689 default: 860 default:
690 /* Network partition? */ 861 /* Network partition? */
691 atomic_set(&clp->cl_cb_set, 0); 862 nfsd4_mark_cb_down(clp, task->tk_status);
692 warn_no_callback_path(clp, task->tk_status);
693 if (current_rpc_client != task->tk_client) {
694 /* queue a callback on the new connection: */
695 atomic_inc(&dp->dl_count);
696 nfsd4_cb_recall(dp);
697 return;
698 }
699 } 863 }
700 if (dp->dl_retries--) { 864 if (dp->dl_retries--) {
701 rpc_delay(task, 2*HZ); 865 rpc_delay(task, 2*HZ);
702 task->tk_status = 0; 866 task->tk_status = 0;
703 rpc_restart_call_prepare(task); 867 rpc_restart_call_prepare(task);
704 return; 868 return;
705 } else {
706 atomic_set(&clp->cl_cb_set, 0);
707 warn_no_callback_path(clp, task->tk_status);
708 } 869 }
870 nfsd4_mark_cb_down(clp, task->tk_status);
871 cb->cb_done = true;
709} 872}
710 873
711static void nfsd4_cb_recall_release(void *calldata) 874static void nfsd4_cb_recall_release(void *calldata)
712{ 875{
713 struct nfsd4_callback *cb = calldata; 876 struct nfsd4_callback *cb = calldata;
877 struct nfs4_client *clp = cb->cb_clp;
714 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); 878 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
715 879
716 nfs4_put_delegation(dp); 880 if (cb->cb_done) {
881 spin_lock(&clp->cl_lock);
882 list_del(&cb->cb_per_client);
883 spin_unlock(&clp->cl_lock);
884 nfs4_put_delegation(dp);
885 }
717} 886}
718 887
719static const struct rpc_call_ops nfsd4_cb_recall_ops = { 888static const struct rpc_call_ops nfsd4_cb_recall_ops = {
@@ -748,16 +917,33 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
748 flush_workqueue(callback_wq); 917 flush_workqueue(callback_wq);
749} 918}
750 919
751void nfsd4_release_cb(struct nfsd4_callback *cb) 920static void nfsd4_release_cb(struct nfsd4_callback *cb)
752{ 921{
753 if (cb->cb_ops->rpc_release) 922 if (cb->cb_ops->rpc_release)
754 cb->cb_ops->rpc_release(cb); 923 cb->cb_ops->rpc_release(cb);
755} 924}
756 925
757void nfsd4_process_cb_update(struct nfsd4_callback *cb) 926/* requires cl_lock: */
927static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
928{
929 struct nfsd4_session *s;
930 struct nfsd4_conn *c;
931
932 list_for_each_entry(s, &clp->cl_sessions, se_perclnt) {
933 list_for_each_entry(c, &s->se_conns, cn_persession) {
934 if (c->cn_flags & NFS4_CDFC4_BACK)
935 return c;
936 }
937 }
938 return NULL;
939}
940
941static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
758{ 942{
759 struct nfs4_cb_conn conn; 943 struct nfs4_cb_conn conn;
760 struct nfs4_client *clp = cb->cb_clp; 944 struct nfs4_client *clp = cb->cb_clp;
945 struct nfsd4_session *ses = NULL;
946 struct nfsd4_conn *c;
761 int err; 947 int err;
762 948
763 /* 949 /*
@@ -768,6 +954,10 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb)
768 rpc_shutdown_client(clp->cl_cb_client); 954 rpc_shutdown_client(clp->cl_cb_client);
769 clp->cl_cb_client = NULL; 955 clp->cl_cb_client = NULL;
770 } 956 }
957 if (clp->cl_cb_conn.cb_xprt) {
958 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
959 clp->cl_cb_conn.cb_xprt = NULL;
960 }
771 if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags)) 961 if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
772 return; 962 return;
773 spin_lock(&clp->cl_lock); 963 spin_lock(&clp->cl_lock);
@@ -778,11 +968,22 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb)
778 BUG_ON(!clp->cl_cb_flags); 968 BUG_ON(!clp->cl_cb_flags);
779 clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); 969 clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
780 memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); 970 memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
971 c = __nfsd4_find_backchannel(clp);
972 if (c) {
973 svc_xprt_get(c->cn_xprt);
974 conn.cb_xprt = c->cn_xprt;
975 ses = c->cn_session;
976 }
781 spin_unlock(&clp->cl_lock); 977 spin_unlock(&clp->cl_lock);
782 978
783 err = setup_callback_client(clp, &conn); 979 err = setup_callback_client(clp, &conn, ses);
784 if (err) 980 if (err) {
785 warn_no_callback_path(clp, err); 981 warn_no_callback_path(clp, err);
982 return;
983 }
984 /* Yay, the callback channel's back! Restart any callbacks: */
985 list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
986 run_nfsd4_cb(cb);
786} 987}
787 988
788void nfsd4_do_callback_rpc(struct work_struct *w) 989void nfsd4_do_callback_rpc(struct work_struct *w)
@@ -807,10 +1008,11 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
807void nfsd4_cb_recall(struct nfs4_delegation *dp) 1008void nfsd4_cb_recall(struct nfs4_delegation *dp)
808{ 1009{
809 struct nfsd4_callback *cb = &dp->dl_recall; 1010 struct nfsd4_callback *cb = &dp->dl_recall;
1011 struct nfs4_client *clp = dp->dl_client;
810 1012
811 dp->dl_retries = 1; 1013 dp->dl_retries = 1;
812 cb->cb_op = dp; 1014 cb->cb_op = dp;
813 cb->cb_clp = dp->dl_client; 1015 cb->cb_clp = clp;
814 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; 1016 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
815 cb->cb_msg.rpc_argp = cb; 1017 cb->cb_msg.rpc_argp = cb;
816 cb->cb_msg.rpc_resp = cb; 1018 cb->cb_msg.rpc_resp = cb;
@@ -819,5 +1021,8 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
819 cb->cb_ops = &nfsd4_cb_recall_ops; 1021 cb->cb_ops = &nfsd4_cb_recall_ops;
820 dp->dl_retries = 1; 1022 dp->dl_retries = 1;
821 1023
822 queue_work(callback_wq, &dp->dl_recall.cb_work); 1024 INIT_LIST_HEAD(&cb->cb_per_client);
1025 cb->cb_done = true;
1026
1027 run_nfsd4_cb(&dp->dl_recall);
823} 1028}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index f0695e815f0e..6d2c397d458b 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -33,10 +33,11 @@
33 */ 33 */
34 34
35#include <linux/module.h> 35#include <linux/module.h>
36#include <linux/nfsd_idmap.h>
37#include <linux/seq_file.h> 36#include <linux/seq_file.h>
38#include <linux/sched.h> 37#include <linux/sched.h>
39#include <linux/slab.h> 38#include <linux/slab.h>
39#include "idmap.h"
40#include "nfsd.h"
40 41
41/* 42/*
42 * Cache entry 43 * Cache entry
@@ -514,7 +515,7 @@ rqst_authname(struct svc_rqst *rqstp)
514 return clp->name; 515 return clp->name;
515} 516}
516 517
517static int 518static __be32
518idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, 519idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
519 uid_t *id) 520 uid_t *id)
520{ 521{
@@ -524,15 +525,15 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
524 int ret; 525 int ret;
525 526
526 if (namelen + 1 > sizeof(key.name)) 527 if (namelen + 1 > sizeof(key.name))
527 return -EINVAL; 528 return nfserr_badowner;
528 memcpy(key.name, name, namelen); 529 memcpy(key.name, name, namelen);
529 key.name[namelen] = '\0'; 530 key.name[namelen] = '\0';
530 strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); 531 strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
531 ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item); 532 ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
532 if (ret == -ENOENT) 533 if (ret == -ENOENT)
533 ret = -ESRCH; /* nfserr_badname */ 534 return nfserr_badowner;
534 if (ret) 535 if (ret)
535 return ret; 536 return nfserrno(ret);
536 *id = item->id; 537 *id = item->id;
537 cache_put(&item->h, &nametoid_cache); 538 cache_put(&item->h, &nametoid_cache);
538 return 0; 539 return 0;
@@ -560,14 +561,14 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
560 return ret; 561 return ret;
561} 562}
562 563
563int 564__be32
564nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, 565nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
565 __u32 *id) 566 __u32 *id)
566{ 567{
567 return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); 568 return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id);
568} 569}
569 570
570int 571__be32
571nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, 572nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
572 __u32 *id) 573 __u32 *id)
573{ 574{
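The signature changes above move the idmap entry points from host-order errno returns (int) to wire-order NFS status returns (__be32), so the XDR code can propagate the value directly. The conversion helper used in the hunk behaves like this sketch:

	__be32 status = nfserrno(-ENOENT);
	/* status now holds the big-endian on-the-wire status word */

nfserrno() is the existing nfsd helper mapping a negative errno to the nearest NFS status code; errnos it does not recognize fall back to nfserr_io with a warning.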
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 0cdfd022bb7b..db52546143d1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -604,9 +604,7 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
604 return status; 604 return status;
605} 605}
606 606
607static __be32 607static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh)
608nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
609 void *arg)
610{ 608{
611 struct svc_fh tmp_fh; 609 struct svc_fh tmp_fh;
612 __be32 ret; 610 __be32 ret;
@@ -615,13 +613,19 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
615 ret = exp_pseudoroot(rqstp, &tmp_fh); 613 ret = exp_pseudoroot(rqstp, &tmp_fh);
616 if (ret) 614 if (ret)
617 return ret; 615 return ret;
618 if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) { 616 if (tmp_fh.fh_dentry == fh->fh_dentry) {
619 fh_put(&tmp_fh); 617 fh_put(&tmp_fh);
620 return nfserr_noent; 618 return nfserr_noent;
621 } 619 }
622 fh_put(&tmp_fh); 620 fh_put(&tmp_fh);
623 return nfsd_lookup(rqstp, &cstate->current_fh, 621 return nfsd_lookup(rqstp, fh, "..", 2, fh);
624 "..", 2, &cstate->current_fh); 622}
623
624static __be32
625nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
626 void *arg)
627{
628 return nfsd4_do_lookupp(rqstp, &cstate->current_fh);
625} 629}
626 630
627static __be32 631static __be32
@@ -769,10 +773,36 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
769 } else 773 } else
770 secinfo->si_exp = exp; 774 secinfo->si_exp = exp;
771 dput(dentry); 775 dput(dentry);
776 if (cstate->minorversion)
777 /* See rfc 5661 section 2.6.3.1.1.8 */
778 fh_put(&cstate->current_fh);
772 return err; 779 return err;
773} 780}
774 781
775static __be32 782static __be32
783nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
784 struct nfsd4_secinfo_no_name *sin)
785{
786 __be32 err;
787
788 switch (sin->sin_style) {
789 case NFS4_SECINFO_STYLE4_CURRENT_FH:
790 break;
791 case NFS4_SECINFO_STYLE4_PARENT:
792 err = nfsd4_do_lookupp(rqstp, &cstate->current_fh);
793 if (err)
794 return err;
795 break;
796 default:
797 return nfserr_inval;
798 }
799 exp_get(cstate->current_fh.fh_export);
800 sin->sin_exp = cstate->current_fh.fh_export;
801 fh_put(&cstate->current_fh);
802 return nfs_ok;
803}
804
805static __be32
776nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 806nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
777 struct nfsd4_setattr *setattr) 807 struct nfsd4_setattr *setattr)
778{ 808{
@@ -974,8 +1004,8 @@ static const char *nfsd4_op_name(unsigned opnum);
974 * Also note, enforced elsewhere: 1004 * Also note, enforced elsewhere:
975 * - SEQUENCE other than as first op results in 1005 * - SEQUENCE other than as first op results in
976 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) 1006 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
977 * - BIND_CONN_TO_SESSION must be the only op in its compound 1007 * - BIND_CONN_TO_SESSION must be the only op in its compound.
978 * (Will be enforced in nfsd4_bind_conn_to_session().) 1008 * (Enforced in nfsd4_bind_conn_to_session().)
979 * - DESTROY_SESSION must be the final operation in a compound, if 1009 * - DESTROY_SESSION must be the final operation in a compound, if
980 * sessionid's in SEQUENCE and DESTROY_SESSION are the same. 1010 * sessionid's in SEQUENCE and DESTROY_SESSION are the same.
981 * (Enforced in nfsd4_destroy_session().) 1011 * (Enforced in nfsd4_destroy_session().)
@@ -1126,10 +1156,6 @@ encode_op:
1126 1156
1127 nfsd4_increment_op_stats(op->opnum); 1157 nfsd4_increment_op_stats(op->opnum);
1128 } 1158 }
1129 if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
1130 dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
1131 status = nfserr_jukebox;
1132 }
1133 1159
1134 resp->cstate.status = status; 1160 resp->cstate.status = status;
1135 fh_put(&resp->cstate.current_fh); 1161 fh_put(&resp->cstate.current_fh);
@@ -1300,6 +1326,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
1300 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1326 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1301 .op_name = "OP_EXCHANGE_ID", 1327 .op_name = "OP_EXCHANGE_ID",
1302 }, 1328 },
1329 [OP_BIND_CONN_TO_SESSION] = {
1330 .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
1331 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1332 .op_name = "OP_BIND_CONN_TO_SESSION",
1333 },
1303 [OP_CREATE_SESSION] = { 1334 [OP_CREATE_SESSION] = {
1304 .op_func = (nfsd4op_func)nfsd4_create_session, 1335 .op_func = (nfsd4op_func)nfsd4_create_session,
1305 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1336 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
@@ -1320,6 +1351,10 @@ static struct nfsd4_operation nfsd4_ops[] = {
1320 .op_flags = ALLOWED_WITHOUT_FH, 1351 .op_flags = ALLOWED_WITHOUT_FH,
1321 .op_name = "OP_RECLAIM_COMPLETE", 1352 .op_name = "OP_RECLAIM_COMPLETE",
1322 }, 1353 },
1354 [OP_SECINFO_NO_NAME] = {
1355 .op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
1356 .op_name = "OP_SECINFO_NO_NAME",
1357 },
1323}; 1358};
1324 1359
1325static const char *nfsd4_op_name(unsigned opnum) 1360static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7e26caab2a26..ffb59ef6f82f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -302,7 +302,6 @@ purge_old(struct dentry *parent, struct dentry *child)
302{ 302{
303 int status; 303 int status;
304 304
305 /* note: we currently use this path only for minorversion 0 */
306 if (nfs4_has_reclaimed_state(child->d_name.name, false)) 305 if (nfs4_has_reclaimed_state(child->d_name.name, false))
307 return 0; 306 return 0;
308 307
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fbd18c3074bb..d98d0213285d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -230,7 +230,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
230 dp->dl_client = clp; 230 dp->dl_client = clp;
231 get_nfs4_file(fp); 231 get_nfs4_file(fp);
232 dp->dl_file = fp; 232 dp->dl_file = fp;
233 nfs4_file_get_access(fp, O_RDONLY); 233 dp->dl_vfs_file = find_readable_file(fp);
234 get_file(dp->dl_vfs_file);
234 dp->dl_flock = NULL; 235 dp->dl_flock = NULL;
235 dp->dl_type = type; 236 dp->dl_type = type;
236 dp->dl_stateid.si_boot = boot_time; 237 dp->dl_stateid.si_boot = boot_time;
@@ -252,6 +253,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
252 if (atomic_dec_and_test(&dp->dl_count)) { 253 if (atomic_dec_and_test(&dp->dl_count)) {
253 dprintk("NFSD: freeing dp %p\n",dp); 254 dprintk("NFSD: freeing dp %p\n",dp);
254 put_nfs4_file(dp->dl_file); 255 put_nfs4_file(dp->dl_file);
256 fput(dp->dl_vfs_file);
255 kmem_cache_free(deleg_slab, dp); 257 kmem_cache_free(deleg_slab, dp);
256 num_delegations--; 258 num_delegations--;
257 } 259 }
@@ -265,12 +267,10 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
265static void 267static void
266nfs4_close_delegation(struct nfs4_delegation *dp) 268nfs4_close_delegation(struct nfs4_delegation *dp)
267{ 269{
268 struct file *filp = find_readable_file(dp->dl_file);
269
270 dprintk("NFSD: close_delegation dp %p\n",dp); 270 dprintk("NFSD: close_delegation dp %p\n",dp);
271 /* XXX: do we even need this check?: */
271 if (dp->dl_flock) 272 if (dp->dl_flock)
272 vfs_setlease(filp, F_UNLCK, &dp->dl_flock); 273 vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock);
273 nfs4_file_put_access(dp->dl_file, O_RDONLY);
274} 274}
275 275
276/* Called under the state lock. */ 276/* Called under the state lock. */
@@ -642,6 +642,7 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u)
642 free_conn(c); 642 free_conn(c);
643 } 643 }
644 spin_unlock(&clp->cl_lock); 644 spin_unlock(&clp->cl_lock);
645 nfsd4_probe_callback(clp);
645} 646}
646 647
647static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) 648static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
@@ -679,15 +680,12 @@ static int nfsd4_register_conn(struct nfsd4_conn *conn)
679 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); 680 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
680} 681}
681 682
682static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) 683static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir)
683{ 684{
684 struct nfsd4_conn *conn; 685 struct nfsd4_conn *conn;
685 u32 flags = NFS4_CDFC4_FORE;
686 int ret; 686 int ret;
687 687
688 if (ses->se_flags & SESSION4_BACK_CHAN) 688 conn = alloc_conn(rqstp, dir);
689 flags |= NFS4_CDFC4_BACK;
690 conn = alloc_conn(rqstp, flags);
691 if (!conn) 689 if (!conn)
692 return nfserr_jukebox; 690 return nfserr_jukebox;
693 nfsd4_hash_conn(conn, ses); 691 nfsd4_hash_conn(conn, ses);
@@ -698,6 +696,17 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
698 return nfs_ok; 696 return nfs_ok;
699} 697}
700 698
699static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses)
700{
701 u32 dir = NFS4_CDFC4_FORE;
702
703 if (ses->se_flags & SESSION4_BACK_CHAN)
704 dir |= NFS4_CDFC4_BACK;
705
706 return nfsd4_new_conn(rqstp, ses, dir);
707}
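nfsd4_new_conn_from_crses() exists because CREATE_SESSION expresses the backchannel as a session flag, while BIND_CONN_TO_SESSION (added later in this patch) passes an explicit direction word. Assuming the usual nfs4.h flag values (NFS4_CDFC4_FORE = 0x1, NFS4_CDFC4_BACK = 0x2), a session created with SESSION4_BACK_CHAN registers its connection for both directions:

	/* equivalent to the helper above for a back-channel session */
	nfsd4_new_conn(rqstp, ses, NFS4_CDFC4_FORE | NFS4_CDFC4_BACK);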
708
709/* must be called under client_lock */
701static void nfsd4_del_conns(struct nfsd4_session *s) 710static void nfsd4_del_conns(struct nfsd4_session *s)
702{ 711{
703 struct nfs4_client *clp = s->se_client; 712 struct nfs4_client *clp = s->se_client;
@@ -749,6 +758,8 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
749 */ 758 */
750 slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached); 759 slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
751 numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs); 760 numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
761 if (numslots < 1)
762 return NULL;
752 763
753 new = alloc_session(slotsize, numslots); 764 new = alloc_session(slotsize, numslots);
754 if (!new) { 765 if (!new) {
@@ -769,25 +780,30 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
769 idx = hash_sessionid(&new->se_sessionid); 780 idx = hash_sessionid(&new->se_sessionid);
770 spin_lock(&client_lock); 781 spin_lock(&client_lock);
771 list_add(&new->se_hash, &sessionid_hashtbl[idx]); 782 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
783 spin_lock(&clp->cl_lock);
772 list_add(&new->se_perclnt, &clp->cl_sessions); 784 list_add(&new->se_perclnt, &clp->cl_sessions);
785 spin_unlock(&clp->cl_lock);
773 spin_unlock(&client_lock); 786 spin_unlock(&client_lock);
774 787
775 status = nfsd4_new_conn(rqstp, new); 788 status = nfsd4_new_conn_from_crses(rqstp, new);
776 /* whoops: benny points out, status is ignored! (err, or bogus) */ 789 /* whoops: benny points out, status is ignored! (err, or bogus) */
777 if (status) { 790 if (status) {
778 free_session(&new->se_ref); 791 free_session(&new->se_ref);
779 return NULL; 792 return NULL;
780 } 793 }
781 if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) { 794 if (cses->flags & SESSION4_BACK_CHAN) {
782 struct sockaddr *sa = svc_addr(rqstp); 795 struct sockaddr *sa = svc_addr(rqstp);
783 796 /*
784 clp->cl_cb_session = new; 797 * This is a little silly; with sessions there's no real
785 clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt; 798 * use for the callback address. Use the peer address
786 svc_xprt_get(rqstp->rq_xprt); 799 * as a reasonable default for now, but consider fixing
800 * the rpc client not to require an address in the
801 * future:
802 */
787 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); 803 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
788 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); 804 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
789 nfsd4_probe_callback(clp);
790 } 805 }
806 nfsd4_probe_callback(clp);
791 return new; 807 return new;
792} 808}
793 809
@@ -817,7 +833,9 @@ static void
817unhash_session(struct nfsd4_session *ses) 833unhash_session(struct nfsd4_session *ses)
818{ 834{
819 list_del(&ses->se_hash); 835 list_del(&ses->se_hash);
836 spin_lock(&ses->se_client->cl_lock);
820 list_del(&ses->se_perclnt); 837 list_del(&ses->se_perclnt);
838 spin_unlock(&ses->se_client->cl_lock);
821} 839}
822 840
823/* must be called under the client_lock */ 841/* must be called under the client_lock */
@@ -923,8 +941,10 @@ unhash_client_locked(struct nfs4_client *clp)
923 941
924 mark_client_expired(clp); 942 mark_client_expired(clp);
925 list_del(&clp->cl_lru); 943 list_del(&clp->cl_lru);
944 spin_lock(&clp->cl_lock);
926 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) 945 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
927 list_del_init(&ses->se_hash); 946 list_del_init(&ses->se_hash);
947 spin_unlock(&clp->cl_lock);
928} 948}
929 949
930static void 950static void
@@ -1051,12 +1071,13 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
1051 1071
1052 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); 1072 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
1053 atomic_set(&clp->cl_refcount, 0); 1073 atomic_set(&clp->cl_refcount, 0);
1054 atomic_set(&clp->cl_cb_set, 0); 1074 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
1055 INIT_LIST_HEAD(&clp->cl_idhash); 1075 INIT_LIST_HEAD(&clp->cl_idhash);
1056 INIT_LIST_HEAD(&clp->cl_strhash); 1076 INIT_LIST_HEAD(&clp->cl_strhash);
1057 INIT_LIST_HEAD(&clp->cl_openowners); 1077 INIT_LIST_HEAD(&clp->cl_openowners);
1058 INIT_LIST_HEAD(&clp->cl_delegations); 1078 INIT_LIST_HEAD(&clp->cl_delegations);
1059 INIT_LIST_HEAD(&clp->cl_lru); 1079 INIT_LIST_HEAD(&clp->cl_lru);
1080 INIT_LIST_HEAD(&clp->cl_callbacks);
1060 spin_lock_init(&clp->cl_lock); 1081 spin_lock_init(&clp->cl_lock);
1061 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc); 1082 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
1062 clp->cl_time = get_seconds(); 1083 clp->cl_time = get_seconds();
@@ -1132,54 +1153,55 @@ find_unconfirmed_client(clientid_t *clid)
1132 return NULL; 1153 return NULL;
1133} 1154}
1134 1155
1135/*
1136 * Return 1 iff clp's clientid establishment method matches the use_exchange_id
1137 * parameter. Matching is based on the fact that at least one of the
1138 * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
1139 *
1140 * FIXME: we need to unify the clientid namespaces for nfsv4.x
1141 * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
1142 * and SET_CLIENTID{,_CONFIRM}
1143 */
1144static inline int
1145match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
1146{
1147	bool has_exchange_flags = (clp->cl_exchange_flags != 0);
1148	return use_exchange_id == has_exchange_flags;
1149}

1156static bool clp_used_exchangeid(struct nfs4_client *clp)
1157{
1158	return clp->cl_exchange_flags != 0;
1159}
1150 1160
1151static struct nfs4_client * 1161static struct nfs4_client *
1152find_confirmed_client_by_str(const char *dname, unsigned int hashval, 1162find_confirmed_client_by_str(const char *dname, unsigned int hashval)
1153 bool use_exchange_id)
1154{ 1163{
1155 struct nfs4_client *clp; 1164 struct nfs4_client *clp;
1156 1165
1157 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { 1166 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
1158 if (same_name(clp->cl_recdir, dname) && 1167 if (same_name(clp->cl_recdir, dname))
1159 match_clientid_establishment(clp, use_exchange_id))
1160 return clp; 1168 return clp;
1161 } 1169 }
1162 return NULL; 1170 return NULL;
1163} 1171}
1164 1172
1165static struct nfs4_client * 1173static struct nfs4_client *
1166find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, 1174find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
1167 bool use_exchange_id)
1168{ 1175{
1169 struct nfs4_client *clp; 1176 struct nfs4_client *clp;
1170 1177
1171 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { 1178 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
1172 if (same_name(clp->cl_recdir, dname) && 1179 if (same_name(clp->cl_recdir, dname))
1173 match_clientid_establishment(clp, use_exchange_id))
1174 return clp; 1180 return clp;
1175 } 1181 }
1176 return NULL; 1182 return NULL;
1177} 1183}
1178 1184
1185static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
1186{
1187 switch (family) {
1188 case AF_INET:
1189 ((struct sockaddr_in *)sa)->sin_family = AF_INET;
1190 ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr;
1191 return;
1192 case AF_INET6:
1193 ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6;
1194 ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6;
1195 return;
1196 }
1197}
1198
1179static void 1199static void
1180gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) 1200gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
1181{ 1201{
1182 struct nfs4_cb_conn *conn = &clp->cl_cb_conn; 1202 struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
1203 struct sockaddr *sa = svc_addr(rqstp);
1204 u32 scopeid = rpc_get_scope_id(sa);
1183 unsigned short expected_family; 1205 unsigned short expected_family;
1184 1206
1185 /* Currently, we only support tcp and tcp6 for the callback channel */ 1207 /* Currently, we only support tcp and tcp6 for the callback channel */
@@ -1205,6 +1227,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
1205 1227
1206 conn->cb_prog = se->se_callback_prog; 1228 conn->cb_prog = se->se_callback_prog;
1207 conn->cb_ident = se->se_callback_ident; 1229 conn->cb_ident = se->se_callback_ident;
1230 rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr);
1208 return; 1231 return;
1209out_err: 1232out_err:
1210 conn->cb_addr.ss_family = AF_UNSPEC; 1233 conn->cb_addr.ss_family = AF_UNSPEC;
@@ -1344,7 +1367,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1344 case SP4_NONE: 1367 case SP4_NONE:
1345 break; 1368 break;
1346 case SP4_SSV: 1369 case SP4_SSV:
1347 return nfserr_encr_alg_unsupp; 1370 return nfserr_serverfault;
1348 default: 1371 default:
1349 BUG(); /* checked by xdr code */ 1372 BUG(); /* checked by xdr code */
1350 case SP4_MACH_CRED: 1373 case SP4_MACH_CRED:
@@ -1361,8 +1384,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1361 nfs4_lock_state(); 1384 nfs4_lock_state();
1362 status = nfs_ok; 1385 status = nfs_ok;
1363 1386
1364 conf = find_confirmed_client_by_str(dname, strhashval, true); 1387 conf = find_confirmed_client_by_str(dname, strhashval);
1365 if (conf) { 1388 if (conf) {
1389 if (!clp_used_exchangeid(conf)) {
1390 status = nfserr_clid_inuse; /* XXX: ? */
1391 goto out;
1392 }
1366 if (!same_verf(&verf, &conf->cl_verifier)) { 1393 if (!same_verf(&verf, &conf->cl_verifier)) {
1367 /* 18.35.4 case 8 */ 1394 /* 18.35.4 case 8 */
1368 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { 1395 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
@@ -1403,7 +1430,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1403 goto out; 1430 goto out;
1404 } 1431 }
1405 1432
1406 unconf = find_unconfirmed_client_by_str(dname, strhashval, true); 1433 unconf = find_unconfirmed_client_by_str(dname, strhashval);
1407 if (unconf) { 1434 if (unconf) {
1408 /* 1435 /*
1409 * Possible retry or client restart. Per 18.35.4 case 4, 1436 * Possible retry or client restart. Per 18.35.4 case 4,
@@ -1560,6 +1587,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1560 status = nfs_ok; 1587 status = nfs_ok;
1561 memcpy(cr_ses->sessionid.data, new->se_sessionid.data, 1588 memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
1562 NFS4_MAX_SESSIONID_LEN); 1589 NFS4_MAX_SESSIONID_LEN);
1590 memcpy(&cr_ses->fore_channel, &new->se_fchannel,
1591 sizeof(struct nfsd4_channel_attrs));
1563 cs_slot->sl_seqid++; 1592 cs_slot->sl_seqid++;
1564 cr_ses->seqid = cs_slot->sl_seqid; 1593 cr_ses->seqid = cs_slot->sl_seqid;
1565 1594
@@ -1581,6 +1610,45 @@ static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
1581 return argp->opcnt == resp->opcnt; 1610 return argp->opcnt == resp->opcnt;
1582} 1611}
1583 1612
1613static __be32 nfsd4_map_bcts_dir(u32 *dir)
1614{
1615 switch (*dir) {
1616 case NFS4_CDFC4_FORE:
1617 case NFS4_CDFC4_BACK:
1618 return nfs_ok;
1619 case NFS4_CDFC4_FORE_OR_BOTH:
1620 case NFS4_CDFC4_BACK_OR_BOTH:
1621 *dir = NFS4_CDFC4_BOTH;
1622 return nfs_ok;
1623 };
1624 return nfserr_inval;
1625}
1626
1627__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1628 struct nfsd4_compound_state *cstate,
1629 struct nfsd4_bind_conn_to_session *bcts)
1630{
1631 __be32 status;
1632
1633 if (!nfsd4_last_compound_op(rqstp))
1634 return nfserr_not_only_op;
1635 spin_lock(&client_lock);
1636 cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid);
1637 /* Sorta weird: we only need the refcnt'ing because new_conn acquires
1638 * client_lock itself: */
1639 if (cstate->session) {
1640 nfsd4_get_session(cstate->session);
1641 atomic_inc(&cstate->session->se_client->cl_refcount);
1642 }
1643 spin_unlock(&client_lock);
1644 if (!cstate->session)
1645 return nfserr_badsession;
1646
1647 status = nfsd4_map_bcts_dir(&bcts->dir);
1648 nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
1649 return nfs_ok;
1650}
1651
1584static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) 1652static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
1585{ 1653{
1586 if (!session) 1654 if (!session)
@@ -1619,8 +1687,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
1619 spin_unlock(&client_lock); 1687 spin_unlock(&client_lock);
1620 1688
1621 nfs4_lock_state(); 1689 nfs4_lock_state();
1622 /* wait for callbacks */ 1690 nfsd4_probe_callback_sync(ses->se_client);
1623 nfsd4_shutdown_callback(ses->se_client);
1624 nfs4_unlock_state(); 1691 nfs4_unlock_state();
1625 1692
1626 nfsd4_del_conns(ses); 1693 nfsd4_del_conns(ses);
@@ -1733,8 +1800,12 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1733out: 1800out:
1734 /* Hold a session reference until done processing the compound. */ 1801 /* Hold a session reference until done processing the compound. */
1735 if (cstate->session) { 1802 if (cstate->session) {
1803 struct nfs4_client *clp = session->se_client;
1804
1736 nfsd4_get_session(cstate->session); 1805 nfsd4_get_session(cstate->session);
1737 atomic_inc(&session->se_client->cl_refcount); 1806 atomic_inc(&clp->cl_refcount);
1807 if (clp->cl_cb_state == NFSD4_CB_DOWN)
1808 seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN;
1738 } 1809 }
1739 kfree(conn); 1810 kfree(conn);
1740 spin_unlock(&client_lock); 1811 spin_unlock(&client_lock);
@@ -1775,7 +1846,6 @@ __be32
1775nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1846nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1776 struct nfsd4_setclientid *setclid) 1847 struct nfsd4_setclientid *setclid)
1777{ 1848{
1778 struct sockaddr *sa = svc_addr(rqstp);
1779 struct xdr_netobj clname = { 1849 struct xdr_netobj clname = {
1780 .len = setclid->se_namelen, 1850 .len = setclid->se_namelen,
1781 .data = setclid->se_name, 1851 .data = setclid->se_name,
@@ -1801,10 +1871,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1801 strhashval = clientstr_hashval(dname); 1871 strhashval = clientstr_hashval(dname);
1802 1872
1803 nfs4_lock_state(); 1873 nfs4_lock_state();
1804 conf = find_confirmed_client_by_str(dname, strhashval, false); 1874 conf = find_confirmed_client_by_str(dname, strhashval);
1805 if (conf) { 1875 if (conf) {
1806 /* RFC 3530 14.2.33 CASE 0: */ 1876 /* RFC 3530 14.2.33 CASE 0: */
1807 status = nfserr_clid_inuse; 1877 status = nfserr_clid_inuse;
1878 if (clp_used_exchangeid(conf))
1879 goto out;
1808 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { 1880 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
1809 char addr_str[INET6_ADDRSTRLEN]; 1881 char addr_str[INET6_ADDRSTRLEN];
1810 rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, 1882 rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
@@ -1819,7 +1891,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1819 * has a description of SETCLIENTID request processing consisting 1891 * has a description of SETCLIENTID request processing consisting
1820 * of 5 bullet points, labeled as CASE0 - CASE4 below. 1892 * of 5 bullet points, labeled as CASE0 - CASE4 below.
1821 */ 1893 */
1822 unconf = find_unconfirmed_client_by_str(dname, strhashval, false); 1894 unconf = find_unconfirmed_client_by_str(dname, strhashval);
1823 status = nfserr_resource; 1895 status = nfserr_resource;
1824 if (!conf) { 1896 if (!conf) {
1825 /* 1897 /*
@@ -1876,7 +1948,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1876 * for consistent minorversion use throughout: 1948 * for consistent minorversion use throughout:
1877 */ 1949 */
1878 new->cl_minorversion = 0; 1950 new->cl_minorversion = 0;
1879 gen_callback(new, setclid, rpc_get_scope_id(sa)); 1951 gen_callback(new, setclid, rqstp);
1880 add_to_unconfirmed(new, strhashval); 1952 add_to_unconfirmed(new, strhashval);
1881 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; 1953 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
1882 setclid->se_clientid.cl_id = new->cl_clientid.cl_id; 1954 setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
@@ -1935,7 +2007,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1935 if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) 2007 if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
1936 status = nfserr_clid_inuse; 2008 status = nfserr_clid_inuse;
1937 else { 2009 else {
1938 atomic_set(&conf->cl_cb_set, 0);
1939 nfsd4_change_callback(conf, &unconf->cl_cb_conn); 2010 nfsd4_change_callback(conf, &unconf->cl_cb_conn);
1940 nfsd4_probe_callback(conf); 2011 nfsd4_probe_callback(conf);
1941 expire_client(unconf); 2012 expire_client(unconf);
@@ -1964,7 +2035,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1964 unsigned int hash = 2035 unsigned int hash =
1965 clientstr_hashval(unconf->cl_recdir); 2036 clientstr_hashval(unconf->cl_recdir);
1966 conf = find_confirmed_client_by_str(unconf->cl_recdir, 2037 conf = find_confirmed_client_by_str(unconf->cl_recdir,
1967 hash, false); 2038 hash);
1968 if (conf) { 2039 if (conf) {
1969 nfsd4_remove_clid_dir(conf); 2040 nfsd4_remove_clid_dir(conf);
1970 expire_client(conf); 2041 expire_client(conf);
@@ -2300,41 +2371,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2300 nfsd4_cb_recall(dp); 2371 nfsd4_cb_recall(dp);
2301} 2372}
2302 2373
2303/*
2304 * The file_lock is being reaped.
2305 *
2306 * Called by locks_free_lock() with lock_flocks() held.
2307 */
2308static
2309void nfsd_release_deleg_cb(struct file_lock *fl)
2310{
2311 struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
2312
2313 dprintk("NFSD nfsd_release_deleg_cb: fl %p dp %p dl_count %d\n", fl,dp, atomic_read(&dp->dl_count));
2314
2315 if (!(fl->fl_flags & FL_LEASE) || !dp)
2316 return;
2317 dp->dl_flock = NULL;
2318}
2319
2320/*
2321 * Called from setlease() with lock_flocks() held
2322 */
2323static
2324int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try)
2325{
2326 struct nfs4_delegation *onlistd =
2327 (struct nfs4_delegation *)onlist->fl_owner;
2328 struct nfs4_delegation *tryd =
2329 (struct nfs4_delegation *)try->fl_owner;
2330
2331 if (onlist->fl_lmops != try->fl_lmops)
2332 return 0;
2333
2334 return onlistd->dl_client == tryd->dl_client;
2335}
2336
2337
2338static 2374static
2339int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) 2375int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2340{ 2376{
@@ -2346,8 +2382,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2346 2382
2347static const struct lock_manager_operations nfsd_lease_mng_ops = { 2383static const struct lock_manager_operations nfsd_lease_mng_ops = {
2348 .fl_break = nfsd_break_deleg_cb, 2384 .fl_break = nfsd_break_deleg_cb,
2349 .fl_release_private = nfsd_release_deleg_cb,
2350 .fl_mylease = nfsd_same_client_deleg_cb,
2351 .fl_change = nfsd_change_deleg_cb, 2385 .fl_change = nfsd_change_deleg_cb,
2352}; 2386};
2353 2387
@@ -2514,8 +2548,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file
2514 if (!fp->fi_fds[oflag]) { 2548 if (!fp->fi_fds[oflag]) {
2515 status = nfsd_open(rqstp, cur_fh, S_IFREG, access, 2549 status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
2516 &fp->fi_fds[oflag]); 2550 &fp->fi_fds[oflag]);
2517 if (status == nfserr_dropit)
2518 status = nfserr_jukebox;
2519 if (status) 2551 if (status)
2520 return status; 2552 return status;
2521 } 2553 }
@@ -2596,6 +2628,19 @@ nfs4_set_claim_prev(struct nfsd4_open *open)
2596 open->op_stateowner->so_client->cl_firststate = 1; 2628 open->op_stateowner->so_client->cl_firststate = 1;
2597} 2629}
2598 2630
2631/* Should we give out recallable state?: */
2632static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
2633{
2634 if (clp->cl_cb_state == NFSD4_CB_UP)
2635 return true;
2636 /*
2637 * In the sessions case, since we don't have to establish a
2638 * separate connection for callbacks, we assume it's OK
2639 * until we hear otherwise:
2640 */
2641 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
2642}
2643
2599/* 2644/*
2600 * Attempt to hand out a delegation. 2645 * Attempt to hand out a delegation.
2601 */ 2646 */
@@ -2604,10 +2649,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2604{ 2649{
2605 struct nfs4_delegation *dp; 2650 struct nfs4_delegation *dp;
2606 struct nfs4_stateowner *sop = stp->st_stateowner; 2651 struct nfs4_stateowner *sop = stp->st_stateowner;
2607 int cb_up = atomic_read(&sop->so_client->cl_cb_set); 2652 int cb_up;
2608 struct file_lock *fl; 2653 struct file_lock *fl;
2609 int status, flag = 0; 2654 int status, flag = 0;
2610 2655
2656 cb_up = nfsd4_cb_channel_good(sop->so_client);
2611 flag = NFS4_OPEN_DELEGATE_NONE; 2657 flag = NFS4_OPEN_DELEGATE_NONE;
2612 open->op_recall = 0; 2658 open->op_recall = 0;
2613 switch (open->op_claim_type) { 2659 switch (open->op_claim_type) {
@@ -2655,7 +2701,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2655 dp->dl_flock = fl; 2701 dp->dl_flock = fl;
2656 2702
2657 /* vfs_setlease checks to see if delegation should be handed out. 2703 /* vfs_setlease checks to see if delegation should be handed out.
2658 * the lock_manager callbacks fl_mylease and fl_change are used 2704 * the lock_manager callback fl_change is used
2659 */ 2705 */
2660 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) { 2706 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
2661 dprintk("NFSD: setlease failed [%d], no delegation\n", status); 2707 dprintk("NFSD: setlease failed [%d], no delegation\n", status);
@@ -2794,7 +2840,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2794 renew_client(clp); 2840 renew_client(clp);
2795 status = nfserr_cb_path_down; 2841 status = nfserr_cb_path_down;
2796 if (!list_empty(&clp->cl_delegations) 2842 if (!list_empty(&clp->cl_delegations)
2797 && !atomic_read(&clp->cl_cb_set)) 2843 && clp->cl_cb_state != NFSD4_CB_UP)
2798 goto out; 2844 goto out;
2799 status = nfs_ok; 2845 status = nfs_ok;
2800out: 2846out:
@@ -3081,9 +3127,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
3081 if (status) 3127 if (status)
3082 goto out; 3128 goto out;
3083 renew_client(dp->dl_client); 3129 renew_client(dp->dl_client);
3084 if (filpp) 3130 if (filpp) {
3085 *filpp = find_readable_file(dp->dl_file); 3131 *filpp = find_readable_file(dp->dl_file);
3086 BUG_ON(!*filpp); 3132 BUG_ON(!*filpp);
3133 }
3087 } else { /* open or lock stateid */ 3134 } else { /* open or lock stateid */
3088 stp = find_stateid(stateid, flags); 3135 stp = find_stateid(stateid, flags);
3089 if (!stp) 3136 if (!stp)
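The added filpp check lets validation-only callers skip the file lookup entirely. A hypothetical call-site sketch (do_io is a placeholder, not a real kernel variable):

        struct file *filp = NULL;
        __be32 status;

        /* Only ask for the struct file when the operation will do I/O;
         * a caller that merely validates the stateid passes NULL. */
        status = nfs4_preprocess_stateid_op(cstate, stateid, flags,
                                            do_io ? &filp : NULL);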
@@ -4107,7 +4154,7 @@ nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
4107 unsigned int strhashval = clientstr_hashval(name); 4154 unsigned int strhashval = clientstr_hashval(name);
4108 struct nfs4_client *clp; 4155 struct nfs4_client *clp;
4109 4156
4110 clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id); 4157 clp = find_confirmed_client_by_str(name, strhashval);
4111 return clp ? 1 : 0; 4158 return clp ? 1 : 0;
4112} 4159}
4113 4160
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f35a94a04026..956629b9cdc9 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -44,13 +44,14 @@
44#include <linux/namei.h> 44#include <linux/namei.h>
45#include <linux/statfs.h> 45#include <linux/statfs.h>
46#include <linux/utsname.h> 46#include <linux/utsname.h>
47#include <linux/nfsd_idmap.h>
48#include <linux/nfs4_acl.h>
49#include <linux/sunrpc/svcauth_gss.h> 47#include <linux/sunrpc/svcauth_gss.h>
50 48
49#include "idmap.h"
50#include "acl.h"
51#include "xdr4.h" 51#include "xdr4.h"
52#include "vfs.h" 52#include "vfs.h"
53 53
54
54#define NFSDDBG_FACILITY NFSDDBG_XDR 55#define NFSDDBG_FACILITY NFSDDBG_XDR
55 56
56/* 57/*
@@ -288,17 +289,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
288 len += XDR_QUADLEN(dummy32) << 2; 289 len += XDR_QUADLEN(dummy32) << 2;
289 READMEM(buf, dummy32); 290 READMEM(buf, dummy32);
290 ace->whotype = nfs4_acl_get_whotype(buf, dummy32); 291 ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
291 host_err = 0; 292 status = nfs_ok;
292 if (ace->whotype != NFS4_ACL_WHO_NAMED) 293 if (ace->whotype != NFS4_ACL_WHO_NAMED)
293 ace->who = 0; 294 ace->who = 0;
294 else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) 295 else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
295 host_err = nfsd_map_name_to_gid(argp->rqstp, 296 status = nfsd_map_name_to_gid(argp->rqstp,
296 buf, dummy32, &ace->who); 297 buf, dummy32, &ace->who);
297 else 298 else
298 host_err = nfsd_map_name_to_uid(argp->rqstp, 299 status = nfsd_map_name_to_uid(argp->rqstp,
299 buf, dummy32, &ace->who); 300 buf, dummy32, &ace->who);
300 if (host_err) 301 if (status)
301 goto out_nfserr; 302 return status;
302 } 303 }
303 } else 304 } else
304 *acl = NULL; 305 *acl = NULL;
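With this hunk the idmap helpers return an NFS status (__be32) rather than a host errno, so a failed name lookup no longer detours through out_nfserr. A sketch of the new contract, relying on the nfserr_badowner code added to fs/nfsd/nfsd.h below:

        /* Sketch: a failed owner lookup now yields an NFS error directly. */
        status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &ace->who);
        if (status)
                return status;  /* already an XDR-ready __be32 NFS error */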
@@ -420,6 +421,21 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
420 DECODE_TAIL; 421 DECODE_TAIL;
421} 422}
422 423
424static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
425{
426 DECODE_HEAD;
427 u32 dummy;
428
429 READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
430 COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
431 READ32(bcts->dir);
432 /* XXX: Perhaps Tom Tucker could help us figure out how we
433 * should be using ctsa_use_conn_in_rdma_mode: */
434 READ32(dummy);
435
436 DECODE_TAIL;
437}
438
423static __be32 439static __be32
424nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) 440nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
425{ 441{
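For reference, the fixed-size argument block consumed by the decoder above, per the BIND_CONN_TO_SESSION definition in RFC 5661 (a sketch of the layout, which is why READ_BUF reserves NFS4_MAX_SESSIONID_LEN + 8 bytes in one go):

        /*
         * opaque   bctsa_sessid[NFS4_MAX_SESSIONID_LEN];   16 bytes
         * uint32_t bctsa_dir;                  requested channel direction
         * bool     bctsa_use_conn_in_rdma_mode;  read into 'dummy' and
         *                                        discarded for now (see XXX)
         */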
@@ -847,6 +863,17 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
847} 863}
848 864
849static __be32 865static __be32
866nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
867 struct nfsd4_secinfo_no_name *sin)
868{
869 DECODE_HEAD;
870
871 READ_BUF(4);
872 READ32(sin->sin_style);
873 DECODE_TAIL;
874}
875
876static __be32
850nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) 877nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
851{ 878{
852 __be32 status; 879 __be32 status;
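SECINFO_NO_NAME carries a single enum argument; the two styles defined by RFC 5661 are (sketch, values as specified there):

        /*
         * SECINFO_STYLE4_CURRENT_FH = 0    negotiate for the current fh
         * SECINFO_STYLE4_PARENT     = 1    negotiate for its parent dir
         */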
@@ -1005,7 +1032,7 @@ static __be32
1005nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, 1032nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1006 struct nfsd4_exchange_id *exid) 1033 struct nfsd4_exchange_id *exid)
1007{ 1034{
1008 int dummy; 1035 int dummy, tmp;
1009 DECODE_HEAD; 1036 DECODE_HEAD;
1010 1037
1011 READ_BUF(NFS4_VERIFIER_SIZE); 1038 READ_BUF(NFS4_VERIFIER_SIZE);
@@ -1053,15 +1080,23 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1053 1080
1054 /* ssp_hash_algs<> */ 1081 /* ssp_hash_algs<> */
1055 READ_BUF(4); 1082 READ_BUF(4);
1056 READ32(dummy); 1083 READ32(tmp);
1057 READ_BUF(dummy); 1084 while (tmp--) {
1058 p += XDR_QUADLEN(dummy); 1085 READ_BUF(4);
1086 READ32(dummy);
1087 READ_BUF(dummy);
1088 p += XDR_QUADLEN(dummy);
1089 }
1059 1090
1060 /* ssp_encr_algs<> */ 1091 /* ssp_encr_algs<> */
1061 READ_BUF(4); 1092 READ_BUF(4);
1062 READ32(dummy); 1093 READ32(tmp);
1063 READ_BUF(dummy); 1094 while (tmp--) {
1064 p += XDR_QUADLEN(dummy); 1095 READ_BUF(4);
1096 READ32(dummy);
1097 READ_BUF(dummy);
1098 p += XDR_QUADLEN(dummy);
1099 }
1065 1100
1066 /* ssp_window and ssp_num_gss_handles */ 1101 /* ssp_window and ssp_num_gss_handles */
1067 READ_BUF(8); 1102 READ_BUF(8);
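The old code read one length per algorithm list and skipped a single opaque, which mis-parses a list with more than one entry; the loops above apply the standard pattern for a counted XDR array of variable-length opaques. Written out once with this file's macros (sketch; count and len are illustrative names):

        READ_BUF(4);
        READ32(count);                  /* number of array entries */
        while (count--) {
                READ_BUF(4);
                READ32(len);            /* length of this opaque<> */
                READ_BUF(len);
                p += XDR_QUADLEN(len);  /* skip the padded payload */
        }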
@@ -1339,7 +1374,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1339 1374
1340 /* new operations for NFSv4.1 */ 1375 /* new operations for NFSv4.1 */
1341 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp, 1376 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp,
1342 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp, 1377 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
1343 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, 1378 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
1344 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, 1379 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
1345 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, 1380 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
@@ -1350,7 +1385,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1350 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, 1385 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
1351 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, 1386 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
1352 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, 1387 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
1353 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, 1388 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
1354 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, 1389 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
1355 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, 1390 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
1356 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, 1391 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2309,8 +2344,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2309 case nfserr_resource: 2344 case nfserr_resource:
2310 nfserr = nfserr_toosmall; 2345 nfserr = nfserr_toosmall;
2311 goto fail; 2346 goto fail;
2312 case nfserr_dropit:
2313 goto fail;
2314 case nfserr_noent: 2347 case nfserr_noent:
2315 goto skip_entry; 2348 goto skip_entry;
2316 default: 2349 default:
@@ -2365,6 +2398,21 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2365 return nfserr; 2398 return nfserr;
2366} 2399}
2367 2400
2401static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
2402{
2403 __be32 *p;
2404
2405 if (!nfserr) {
2406 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8);
2407 WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
2408 WRITE32(bcts->dir);
2409 /* XXX: ? */
2410 WRITE32(0);
2411 ADJUST_ARGS();
2412 }
2413 return nfserr;
2414}
2415
2368static __be32 2416static __be32
2369nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) 2417nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
2370{ 2418{
@@ -2826,11 +2874,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2826} 2874}
2827 2875
2828static __be32 2876static __be32
2829nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, 2877nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
2830 struct nfsd4_secinfo *secinfo) 2878 __be32 nfserr, struct svc_export *exp)
2831{ 2879{
2832 int i = 0; 2880 int i = 0;
2833 struct svc_export *exp = secinfo->si_exp;
2834 u32 nflavs; 2881 u32 nflavs;
2835 struct exp_flavor_info *flavs; 2882 struct exp_flavor_info *flavs;
2836 struct exp_flavor_info def_flavs[2]; 2883 struct exp_flavor_info def_flavs[2];
@@ -2892,6 +2939,20 @@ out:
2892 return nfserr; 2939 return nfserr;
2893} 2940}
2894 2941
2942static __be32
2943nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
2944 struct nfsd4_secinfo *secinfo)
2945{
2946 return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp);
2947}
2948
2949static __be32
2950nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
2951 struct nfsd4_secinfo_no_name *secinfo)
2952{
2953 return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp);
2954}
2955
2895/* 2956/*
2896 * The SETATTR encode routine is special -- it always encodes a bitmap, 2957 * The SETATTR encode routine is special -- it always encodes a bitmap,
2897 * regardless of the error status. 2958 * regardless of the error status.
@@ -3076,13 +3137,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3076 WRITE32(seq->seqid); 3137 WRITE32(seq->seqid);
3077 WRITE32(seq->slotid); 3138 WRITE32(seq->slotid);
3078 WRITE32(seq->maxslots); 3139 WRITE32(seq->maxslots);
3079 /* 3140 /* For now: target_maxslots = maxslots */
3080 * FIXME: for now:
3081 * target_maxslots = maxslots
3082 * status_flags = 0
3083 */
3084 WRITE32(seq->maxslots); 3141 WRITE32(seq->maxslots);
3085 WRITE32(0); 3142 WRITE32(seq->status_flags);
3086 3143
3087 ADJUST_ARGS(); 3144 ADJUST_ARGS();
3088 resp->cstate.datap = p; /* DRC cache data pointer */ 3145 resp->cstate.datap = p; /* DRC cache data pointer */
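Wiring status_flags through lets SEQUENCE report backchannel health to NFSv4.1 clients instead of always encoding zero. A sketch, assuming the SEQ4_STATUS_CB_PATH_DOWN flag defined by RFC 5661:

        /* Sketch: flag a dead backchannel in the next SEQUENCE reply. */
        if (clp->cl_cb_state == NFSD4_CB_DOWN)
                seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;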
@@ -3143,7 +3200,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3143 3200
3144 /* NFSv4.1 operations */ 3201 /* NFSv4.1 operations */
3145 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, 3202 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
3146 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop, 3203 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
3147 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, 3204 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3148 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, 3205 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3149 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, 3206 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
@@ -3154,7 +3211,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3154 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, 3211 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3155 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, 3212 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3156 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, 3213 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
3157 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, 3214 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
3158 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, 3215 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3159 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, 3216 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
3160 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, 3217 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 4514ebbee4d6..33b3e2b06779 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -8,12 +8,12 @@
8#include <linux/namei.h> 8#include <linux/namei.h>
9#include <linux/ctype.h> 9#include <linux/ctype.h>
10 10
11#include <linux/nfsd_idmap.h>
12#include <linux/sunrpc/svcsock.h> 11#include <linux/sunrpc/svcsock.h>
13#include <linux/nfsd/syscall.h> 12#include <linux/nfsd/syscall.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
16 15
16#include "idmap.h"
17#include "nfsd.h" 17#include "nfsd.h"
18#include "cache.h" 18#include "cache.h"
19 19
@@ -127,6 +127,7 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
127 127
128static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) 128static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
129{ 129{
130#ifdef CONFIG_NFSD_DEPRECATED
130 static int warned; 131 static int warned;
131 if (file->f_dentry->d_name.name[0] == '.' && !warned) { 132 if (file->f_dentry->d_name.name[0] == '.' && !warned) {
132 printk(KERN_INFO 133 printk(KERN_INFO
@@ -135,6 +136,7 @@ static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size
135 current->comm, file->f_dentry->d_name.name); 136 current->comm, file->f_dentry->d_name.name);
136 warned = 1; 137 warned = 1;
137 } 138 }
139#endif
138 if (! file->private_data) { 140 if (! file->private_data) {
139 /* An attempt to read a transaction file without writing 141 /* An attempt to read a transaction file without writing
140 * causes a 0-byte write so that the file can return 142 * causes a 0-byte write so that the file can return
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 6b641cf2c19a..7ecfa2420307 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -158,6 +158,7 @@ void nfsd_lockd_shutdown(void);
158#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) 158#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP)
159#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) 159#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR)
160#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) 160#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE)
161#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER)
161#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) 162#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD)
162#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) 163#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL)
163#define nfserr_grace cpu_to_be32(NFSERR_GRACE) 164#define nfserr_grace cpu_to_be32(NFSERR_GRACE)
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 08e17264784b..e15dc45fc5ec 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -735,9 +735,9 @@ nfserrno (int errno)
735 { nfserr_stale, -ESTALE }, 735 { nfserr_stale, -ESTALE },
736 { nfserr_jukebox, -ETIMEDOUT }, 736 { nfserr_jukebox, -ETIMEDOUT },
737 { nfserr_jukebox, -ERESTARTSYS }, 737 { nfserr_jukebox, -ERESTARTSYS },
738 { nfserr_dropit, -EAGAIN }, 738 { nfserr_jukebox, -EAGAIN },
739 { nfserr_dropit, -ENOMEM }, 739 { nfserr_jukebox, -EWOULDBLOCK },
740 { nfserr_badname, -ESRCH }, 740 { nfserr_jukebox, -ENOMEM },
741 { nfserr_io, -ETXTBSY }, 741 { nfserr_io, -ETXTBSY },
742 { nfserr_notsupp, -EOPNOTSUPP }, 742 { nfserr_notsupp, -EOPNOTSUPP },
743 { nfserr_toosmall, -ETOOSMALL }, 743 { nfserr_toosmall, -ETOOSMALL },
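Mapping -EAGAIN, -EWOULDBLOCK, and -ENOMEM to nfserr_jukebox (NFS4ERR_DELAY on v4) tells the client to retry later rather than dropping the request. The effect at a lease-break call site (sketch):

        host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
        if (host_err)   /* e.g. -EWOULDBLOCK while the lease is recalled */
                return nfserrno(host_err);      /* -> nfserr_jukebox */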
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 2bae1d86f5f2..18743c4d8bca 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -608,7 +608,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
608 /* Now call the procedure handler, and encode NFS status. */ 608 /* Now call the procedure handler, and encode NFS status. */
609 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 609 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
610 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 610 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
611 if (nfserr == nfserr_dropit) { 611 if (nfserr == nfserr_dropit || rqstp->rq_dropme) {
612 dprintk("nfsd: Dropping request; may be revisited later\n"); 612 dprintk("nfsd: Dropping request; may be revisited later\n");
613 nfsd_cache_update(rqstp, RC_NOCACHE, NULL); 613 nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
614 return 0; 614 return 0;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 39adc27b0685..3074656ba7bf 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -68,10 +68,12 @@ typedef struct {
68struct nfsd4_callback { 68struct nfsd4_callback {
69 void *cb_op; 69 void *cb_op;
70 struct nfs4_client *cb_clp; 70 struct nfs4_client *cb_clp;
71 struct list_head cb_per_client;
71 u32 cb_minorversion; 72 u32 cb_minorversion;
72 struct rpc_message cb_msg; 73 struct rpc_message cb_msg;
73 const struct rpc_call_ops *cb_ops; 74 const struct rpc_call_ops *cb_ops;
74 struct work_struct cb_work; 75 struct work_struct cb_work;
76 bool cb_done;
75}; 77};
76 78
77struct nfs4_delegation { 79struct nfs4_delegation {
@@ -81,6 +83,7 @@ struct nfs4_delegation {
81 atomic_t dl_count; /* ref count */ 83 atomic_t dl_count; /* ref count */
82 struct nfs4_client *dl_client; 84 struct nfs4_client *dl_client;
83 struct nfs4_file *dl_file; 85 struct nfs4_file *dl_file;
86 struct file *dl_vfs_file;
84 struct file_lock *dl_flock; 87 struct file_lock *dl_flock;
85 u32 dl_type; 88 u32 dl_type;
86 time_t dl_time; 89 time_t dl_time;
@@ -95,6 +98,7 @@ struct nfs4_delegation {
95struct nfs4_cb_conn { 98struct nfs4_cb_conn {
96 /* SETCLIENTID info */ 99 /* SETCLIENTID info */
97 struct sockaddr_storage cb_addr; 100 struct sockaddr_storage cb_addr;
101 struct sockaddr_storage cb_saddr;
98 size_t cb_addrlen; 102 size_t cb_addrlen;
99 u32 cb_prog; /* used only in 4.0 case; 103 u32 cb_prog; /* used only in 4.0 case;
100 per-session otherwise */ 104 per-session otherwise */
@@ -146,6 +150,11 @@ struct nfsd4_create_session {
146 u32 gid; 150 u32 gid;
147}; 151};
148 152
153struct nfsd4_bind_conn_to_session {
154 struct nfs4_sessionid sessionid;
155 u32 dir;
156};
157
149/* The single slot clientid cache structure */ 158/* The single slot clientid cache structure */
150struct nfsd4_clid_slot { 159struct nfsd4_clid_slot {
151 u32 sl_seqid; 160 u32 sl_seqid;
@@ -235,9 +244,13 @@ struct nfs4_client {
235 unsigned long cl_cb_flags; 244 unsigned long cl_cb_flags;
236 struct rpc_clnt *cl_cb_client; 245 struct rpc_clnt *cl_cb_client;
237 u32 cl_cb_ident; 246 u32 cl_cb_ident;
238 atomic_t cl_cb_set; 247#define NFSD4_CB_UP 0
248#define NFSD4_CB_UNKNOWN 1
249#define NFSD4_CB_DOWN 2
250 int cl_cb_state;
239 struct nfsd4_callback cl_cb_null; 251 struct nfsd4_callback cl_cb_null;
240 struct nfsd4_session *cl_cb_session; 252 struct nfsd4_session *cl_cb_session;
253 struct list_head cl_callbacks; /* list of in-progress callbacks */
241 254
242 /* for all client information that callback code might need: */ 255 /* for all client information that callback code might need: */
243 spinlock_t cl_lock; 256 spinlock_t cl_lock;
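The tri-state cl_cb_state replaces the old boolean cl_cb_set, so "not yet probed" is now distinguishable from "known dead". The intended transitions (sketch):

        /*
         * NFSD4_CB_UNKNOWN -- probe succeeds ------> NFSD4_CB_UP
         * NFSD4_CB_UNKNOWN -- probe fails ---------> NFSD4_CB_DOWN
         * NFSD4_CB_UP ------- callback RPC error --> NFSD4_CB_DOWN
         *
         * nfsd4_cb_channel_good() treats UNKNOWN as usable only for 4.1
         * clients, whose backchannel shares the fore connection.
         */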
@@ -454,6 +467,7 @@ extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
454extern void nfs4_free_stateowner(struct kref *kref); 467extern void nfs4_free_stateowner(struct kref *kref);
455extern int set_callback_cred(void); 468extern int set_callback_cred(void);
456extern void nfsd4_probe_callback(struct nfs4_client *clp); 469extern void nfsd4_probe_callback(struct nfs4_client *clp);
470extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
457extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); 471extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
458extern void nfsd4_do_callback_rpc(struct work_struct *); 472extern void nfsd4_do_callback_rpc(struct work_struct *);
459extern void nfsd4_cb_recall(struct nfs4_delegation *dp); 473extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 3a359023c9f7..641117f2188d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1,4 +1,3 @@
1#define MSNFS /* HACK HACK */
2/* 1/*
3 * File operations used by nfsd. Some of these have been ripped from 2 * File operations used by nfsd. Some of these have been ripped from
4 * other parts of the kernel because they weren't exported, others 3 * other parts of the kernel because they weren't exported, others
@@ -35,8 +34,8 @@
35#endif /* CONFIG_NFSD_V3 */ 34#endif /* CONFIG_NFSD_V3 */
36 35
37#ifdef CONFIG_NFSD_V4 36#ifdef CONFIG_NFSD_V4
38#include <linux/nfs4_acl.h> 37#include "acl.h"
39#include <linux/nfsd_idmap.h> 38#include "idmap.h"
40#endif /* CONFIG_NFSD_V4 */ 39#endif /* CONFIG_NFSD_V4 */
41 40
42#include "nfsd.h" 41#include "nfsd.h"
@@ -88,8 +87,9 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
88 .dentry = dget(dentry)}; 87 .dentry = dget(dentry)};
89 int err = 0; 88 int err = 0;
90 89
91 while (d_mountpoint(path.dentry) && follow_down(&path)) 90 err = follow_down(&path, false);
92 ; 91 if (err < 0)
92 goto out;
93 93
94 exp2 = rqst_exp_get_by_name(rqstp, &path); 94 exp2 = rqst_exp_get_by_name(rqstp, &path);
95 if (IS_ERR(exp2)) { 95 if (IS_ERR(exp2)) {
@@ -273,6 +273,13 @@ out:
273 return err; 273 return err;
274} 274}
275 275
276static int nfsd_break_lease(struct inode *inode)
277{
278 if (!S_ISREG(inode->i_mode))
279 return 0;
280 return break_lease(inode, O_WRONLY | O_NONBLOCK);
281}
282
276/* 283/*
277 * Commit metadata changes to stable storage. 284 * Commit metadata changes to stable storage.
278 */ 285 */
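nfsd_break_lease() centralizes recalling outstanding delegations before metadata-changing operations; the S_ISREG check matters because only regular files carry leases. The typical call-site pattern used by the setattr, link, rename, and unlink hunks below (sketch):

        host_err = nfsd_break_lease(dentry->d_inode);
        if (host_err)           /* -EWOULDBLOCK -> nfserr_jukebox */
                goto out_nfserr;
        host_err = vfs_unlink(dirp, dentry);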
@@ -375,16 +382,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
375 goto out; 382 goto out;
376 } 383 }
377 384
378 /*
379 * If we are changing the size of the file, then
380 * we need to break all leases.
381 */
382 host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
383 if (host_err == -EWOULDBLOCK)
384 host_err = -ETIMEDOUT;
385 if (host_err) /* ENOMEM or EWOULDBLOCK */
386 goto out_nfserr;
387
388 host_err = get_write_access(inode); 385 host_err = get_write_access(inode);
389 if (host_err) 386 if (host_err)
390 goto out_nfserr; 387 goto out_nfserr;
@@ -425,7 +422,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
425 422
426 err = nfserr_notsync; 423 err = nfserr_notsync;
427 if (!check_guard || guardtime == inode->i_ctime.tv_sec) { 424 if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
425 host_err = nfsd_break_lease(inode);
426 if (host_err)
427 goto out_nfserr;
428 fh_lock(fhp); 428 fh_lock(fhp);
429
429 host_err = notify_change(dentry, iap); 430 host_err = notify_change(dentry, iap);
430 err = nfserrno(host_err); 431 err = nfserrno(host_err);
431 fh_unlock(fhp); 432 fh_unlock(fhp);
@@ -752,8 +753,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
752 */ 753 */
753 if (!(access & NFSD_MAY_NOT_BREAK_LEASE)) 754 if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
754 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); 755 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
755 if (host_err == -EWOULDBLOCK)
756 host_err = -ETIMEDOUT;
757 if (host_err) /* NOMEM or WOULDBLOCK */ 756 if (host_err) /* NOMEM or WOULDBLOCK */
758 goto out_nfserr; 757 goto out_nfserr;
759 758
@@ -845,11 +844,6 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
845 struct page **pp = rqstp->rq_respages + rqstp->rq_resused; 844 struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
846 struct page *page = buf->page; 845 struct page *page = buf->page;
847 size_t size; 846 size_t size;
848 int ret;
849
850 ret = buf->ops->confirm(pipe, buf);
851 if (unlikely(ret))
852 return ret;
853 847
854 size = sd->len; 848 size = sd->len;
855 849
@@ -879,15 +873,6 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
879 return __splice_from_pipe(pipe, sd, nfsd_splice_actor); 873 return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
880} 874}
881 875
882static inline int svc_msnfs(struct svc_fh *ffhp)
883{
884#ifdef MSNFS
885 return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS);
886#else
887 return 0;
888#endif
889}
890
891static __be32 876static __be32
892nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 877nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
893 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 878 loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
@@ -900,9 +885,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
900 err = nfserr_perm; 885 err = nfserr_perm;
901 inode = file->f_path.dentry->d_inode; 886 inode = file->f_path.dentry->d_inode;
902 887
903 if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
904 goto out;
905
906 if (file->f_op->splice_read && rqstp->rq_splice_ok) { 888 if (file->f_op->splice_read && rqstp->rq_splice_ok) {
907 struct splice_desc sd = { 889 struct splice_desc sd = {
908 .len = 0, 890 .len = 0,
@@ -927,7 +909,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
927 fsnotify_access(file); 909 fsnotify_access(file);
928 } else 910 } else
929 err = nfserrno(host_err); 911 err = nfserrno(host_err);
930out:
931 return err; 912 return err;
932} 913}
933 914
@@ -992,14 +973,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
992 int stable = *stablep; 973 int stable = *stablep;
993 int use_wgather; 974 int use_wgather;
994 975
995#ifdef MSNFS
996 err = nfserr_perm;
997
998 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
999 (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
1000 goto out;
1001#endif
1002
1003 dentry = file->f_path.dentry; 976 dentry = file->f_path.dentry;
1004 inode = dentry->d_inode; 977 inode = dentry->d_inode;
1005 exp = fhp->fh_export; 978 exp = fhp->fh_export;
@@ -1050,7 +1023,6 @@ out_nfserr:
1050 err = 0; 1023 err = 0;
1051 else 1024 else
1052 err = nfserrno(host_err); 1025 err = nfserrno(host_err);
1053out:
1054 return err; 1026 return err;
1055} 1027}
1056 1028
@@ -1670,6 +1642,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1670 err = nfserrno(host_err); 1642 err = nfserrno(host_err);
1671 goto out_dput; 1643 goto out_dput;
1672 } 1644 }
1645 err = nfserr_noent;
1646 if (!dold->d_inode)
1647 goto out_drop_write;
1648 host_err = nfsd_break_lease(dold->d_inode);
1649 if (host_err)
1650 goto out_drop_write;
1673 host_err = vfs_link(dold, dirp, dnew); 1651 host_err = vfs_link(dold, dirp, dnew);
1674 if (!host_err) { 1652 if (!host_err) {
1675 err = nfserrno(commit_metadata(ffhp)); 1653 err = nfserrno(commit_metadata(ffhp));
@@ -1681,6 +1659,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1681 else 1659 else
1682 err = nfserrno(host_err); 1660 err = nfserrno(host_err);
1683 } 1661 }
1662out_drop_write:
1684 mnt_drop_write(tfhp->fh_export->ex_path.mnt); 1663 mnt_drop_write(tfhp->fh_export->ex_path.mnt);
1685out_dput: 1664out_dput:
1686 dput(dnew); 1665 dput(dnew);
@@ -1755,12 +1734,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1755 if (ndentry == trap) 1734 if (ndentry == trap)
1756 goto out_dput_new; 1735 goto out_dput_new;
1757 1736
1758 if (svc_msnfs(ffhp) &&
1759 ((odentry->d_count > 1) || (ndentry->d_count > 1))) {
1760 host_err = -EPERM;
1761 goto out_dput_new;
1762 }
1763
1764 host_err = -EXDEV; 1737 host_err = -EXDEV;
1765 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) 1738 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
1766 goto out_dput_new; 1739 goto out_dput_new;
@@ -1768,15 +1741,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1768 if (host_err) 1741 if (host_err)
1769 goto out_dput_new; 1742 goto out_dput_new;
1770 1743
1744 host_err = nfsd_break_lease(odentry->d_inode);
1745 if (host_err)
1746 goto out_drop_write;
1771 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1747 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1772 if (!host_err) { 1748 if (!host_err) {
1773 host_err = commit_metadata(tfhp); 1749 host_err = commit_metadata(tfhp);
1774 if (!host_err) 1750 if (!host_err)
1775 host_err = commit_metadata(ffhp); 1751 host_err = commit_metadata(ffhp);
1776 } 1752 }
1777 1753out_drop_write:
1778 mnt_drop_write(ffhp->fh_export->ex_path.mnt); 1754 mnt_drop_write(ffhp->fh_export->ex_path.mnt);
1779
1780 out_dput_new: 1755 out_dput_new:
1781 dput(ndentry); 1756 dput(ndentry);
1782 out_dput_old: 1757 out_dput_old:
@@ -1839,18 +1814,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1839 if (host_err) 1814 if (host_err)
1840 goto out_nfserr; 1815 goto out_nfserr;
1841 1816
1842 if (type != S_IFDIR) { /* It's UNLINK */ 1817 host_err = nfsd_break_lease(rdentry->d_inode);
1843#ifdef MSNFS 1818 if (host_err)
1844 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1819 goto out_put;
1845 (rdentry->d_count > 1)) { 1820 if (type != S_IFDIR)
1846 host_err = -EPERM;
1847 } else
1848#endif
1849 host_err = vfs_unlink(dirp, rdentry); 1821 host_err = vfs_unlink(dirp, rdentry);
1850 } else { /* It's RMDIR */ 1822 else
1851 host_err = vfs_rmdir(dirp, rdentry); 1823 host_err = vfs_rmdir(dirp, rdentry);
1852 } 1824out_put:
1853
1854 dput(rdentry); 1825 dput(rdentry);
1855 1826
1856 if (!host_err) 1827 if (!host_err)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 60fce3dc5cb5..366401e1a536 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -311,6 +311,11 @@ struct nfsd4_secinfo {
311 struct svc_export *si_exp; /* response */ 311 struct svc_export *si_exp; /* response */
312}; 312};
313 313
314struct nfsd4_secinfo_no_name {
315 u32 sin_style; /* request */
316 struct svc_export *sin_exp; /* response */
317};
318
314struct nfsd4_setattr { 319struct nfsd4_setattr {
315 stateid_t sa_stateid; /* request */ 320 stateid_t sa_stateid; /* request */
316 u32 sa_bmval[3]; /* request */ 321 u32 sa_bmval[3]; /* request */
@@ -373,8 +378,8 @@ struct nfsd4_sequence {
373 u32 cachethis; /* request */ 378 u32 cachethis; /* request */
374#if 0 379#if 0
375 u32 target_maxslots; /* response */ 380 u32 target_maxslots; /* response */
376 u32 status_flags; /* response */
377#endif /* not yet */ 381#endif /* not yet */
382 u32 status_flags; /* response */
378}; 383};
379 384
380struct nfsd4_destroy_session { 385struct nfsd4_destroy_session {
@@ -422,6 +427,7 @@ struct nfsd4_op {
422 427
423 /* NFSv4.1 */ 428 /* NFSv4.1 */
424 struct nfsd4_exchange_id exchange_id; 429 struct nfsd4_exchange_id exchange_id;
430 struct nfsd4_bind_conn_to_session bind_conn_to_session;
425 struct nfsd4_create_session create_session; 431 struct nfsd4_create_session create_session;
426 struct nfsd4_destroy_session destroy_session; 432 struct nfsd4_destroy_session destroy_session;
427 struct nfsd4_sequence sequence; 433 struct nfsd4_sequence sequence;
@@ -518,6 +524,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
518 struct nfsd4_sequence *seq); 524 struct nfsd4_sequence *seq);
519extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, 525extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
520 struct nfsd4_compound_state *, struct nfsd4_exchange_id *); 526 struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
527extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);
521extern __be32 nfsd4_create_session(struct svc_rqst *, 528extern __be32 nfsd4_create_session(struct svc_rqst *,
522 struct nfsd4_compound_state *, 529 struct nfsd4_compound_state *,
523 struct nfsd4_create_session *); 530 struct nfsd4_create_session *);
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 8b782b062baa..3ee67c67cc52 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -35,7 +35,20 @@
35 35
36struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) 36struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
37{ 37{
38 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); 38 return NILFS_I_NILFS(bmap->b_inode)->ns_dat;
39}
40
41static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
42 const char *fname, int err)
43{
44 struct inode *inode = bmap->b_inode;
45
46 if (err == -EINVAL) {
47 nilfs_error(inode->i_sb, fname,
48 "broken bmap (inode number=%lu)\n", inode->i_ino);
49 err = -EIO;
50 }
51 return err;
39} 52}
40 53
41/** 54/**
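nilfs_bmap_convert_error() funnels the -EINVAL "broken bmap" case through nilfs_error() in exactly one place, so the per-caller boilerplate removed elsewhere in this patch collapses to a single pattern (sketch; bop_foo stands for any bmap operation):

        down_write(&bmap->b_sem);
        ret = bmap->b_ops->bop_foo(bmap, ...);
        up_write(&bmap->b_sem);

        return nilfs_bmap_convert_error(bmap, __func__, ret);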
@@ -66,8 +79,10 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
66 79
67 down_read(&bmap->b_sem); 80 down_read(&bmap->b_sem);
68 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); 81 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
69 if (ret < 0) 82 if (ret < 0) {
83 ret = nilfs_bmap_convert_error(bmap, __func__, ret);
70 goto out; 84 goto out;
85 }
71 if (NILFS_BMAP_USE_VBN(bmap)) { 86 if (NILFS_BMAP_USE_VBN(bmap)) {
72 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp, 87 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp,
73 &blocknr); 88 &blocknr);
@@ -88,7 +103,8 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
88 down_read(&bmap->b_sem); 103 down_read(&bmap->b_sem);
89 ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks); 104 ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks);
90 up_read(&bmap->b_sem); 105 up_read(&bmap->b_sem);
91 return ret; 106
107 return nilfs_bmap_convert_error(bmap, __func__, ret);
92} 108}
93 109
94static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 110static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
@@ -144,7 +160,8 @@ int nilfs_bmap_insert(struct nilfs_bmap *bmap,
144 down_write(&bmap->b_sem); 160 down_write(&bmap->b_sem);
145 ret = nilfs_bmap_do_insert(bmap, key, rec); 161 ret = nilfs_bmap_do_insert(bmap, key, rec);
146 up_write(&bmap->b_sem); 162 up_write(&bmap->b_sem);
147 return ret; 163
164 return nilfs_bmap_convert_error(bmap, __func__, ret);
148} 165}
149 166
150static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) 167static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
@@ -180,9 +197,12 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
180 197
181 down_read(&bmap->b_sem); 198 down_read(&bmap->b_sem);
182 ret = bmap->b_ops->bop_last_key(bmap, &lastkey); 199 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
183 if (!ret)
184 *key = lastkey;
185 up_read(&bmap->b_sem); 200 up_read(&bmap->b_sem);
201
202 if (ret < 0)
203 ret = nilfs_bmap_convert_error(bmap, __func__, ret);
204 else
205 *key = lastkey;
186 return ret; 206 return ret;
187} 207}
188 208
@@ -210,7 +230,8 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
210 down_write(&bmap->b_sem); 230 down_write(&bmap->b_sem);
211 ret = nilfs_bmap_do_delete(bmap, key); 231 ret = nilfs_bmap_do_delete(bmap, key);
212 up_write(&bmap->b_sem); 232 up_write(&bmap->b_sem);
213 return ret; 233
234 return nilfs_bmap_convert_error(bmap, __func__, ret);
214} 235}
215 236
216static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) 237static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
@@ -261,7 +282,8 @@ int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
261 down_write(&bmap->b_sem); 282 down_write(&bmap->b_sem);
262 ret = nilfs_bmap_do_truncate(bmap, key); 283 ret = nilfs_bmap_do_truncate(bmap, key);
263 up_write(&bmap->b_sem); 284 up_write(&bmap->b_sem);
264 return ret; 285
286 return nilfs_bmap_convert_error(bmap, __func__, ret);
265} 287}
266 288
267/** 289/**
@@ -300,7 +322,8 @@ int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
300 down_write(&bmap->b_sem); 322 down_write(&bmap->b_sem);
301 ret = bmap->b_ops->bop_propagate(bmap, bh); 323 ret = bmap->b_ops->bop_propagate(bmap, bh);
302 up_write(&bmap->b_sem); 324 up_write(&bmap->b_sem);
303 return ret; 325
326 return nilfs_bmap_convert_error(bmap, __func__, ret);
304} 327}
305 328
306/** 329/**
@@ -344,7 +367,8 @@ int nilfs_bmap_assign(struct nilfs_bmap *bmap,
344 down_write(&bmap->b_sem); 367 down_write(&bmap->b_sem);
345 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); 368 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
346 up_write(&bmap->b_sem); 369 up_write(&bmap->b_sem);
347 return ret; 370
371 return nilfs_bmap_convert_error(bmap, __func__, ret);
348} 372}
349 373
350/** 374/**
@@ -373,7 +397,8 @@ int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
373 down_write(&bmap->b_sem); 397 down_write(&bmap->b_sem);
374 ret = bmap->b_ops->bop_mark(bmap, key, level); 398 ret = bmap->b_ops->bop_mark(bmap, key, level);
375 up_write(&bmap->b_sem); 399 up_write(&bmap->b_sem);
376 return ret; 400
401 return nilfs_bmap_convert_error(bmap, __func__, ret);
377} 402}
378 403
379/** 404/**
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 5115814cb745..388e9e8f5286 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -104,8 +104,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
104 if (pblocknr == 0) { 104 if (pblocknr == 0) {
105 pblocknr = blocknr; 105 pblocknr = blocknr;
106 if (inode->i_ino != NILFS_DAT_INO) { 106 if (inode->i_ino != NILFS_DAT_INO) {
107 struct inode *dat = 107 struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
108 nilfs_dat_inode(NILFS_I_NILFS(inode));
109 108
110 /* blocknr is a virtual block number */ 109 /* blocknr is a virtual block number */
111 err = nilfs_dat_translate(dat, blocknr, &pblocknr); 110 err = nilfs_dat_translate(dat, blocknr, &pblocknr);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index cb003c8ee1f6..9d45773b79e6 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -91,7 +91,6 @@ static void nilfs_commit_chunk(struct page *page,
91 unsigned from, unsigned to) 91 unsigned from, unsigned to)
92{ 92{
93 struct inode *dir = mapping->host; 93 struct inode *dir = mapping->host;
94 struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
95 loff_t pos = page_offset(page) + from; 94 loff_t pos = page_offset(page) + from;
96 unsigned len = to - from; 95 unsigned len = to - from;
97 unsigned nr_dirty, copied; 96 unsigned nr_dirty, copied;
@@ -103,7 +102,7 @@ static void nilfs_commit_chunk(struct page *page,
103 i_size_write(dir, pos + copied); 102 i_size_write(dir, pos + copied);
104 if (IS_DIRSYNC(dir)) 103 if (IS_DIRSYNC(dir))
105 nilfs_set_transaction_flag(NILFS_TI_SYNC); 104 nilfs_set_transaction_flag(NILFS_TI_SYNC);
106 err = nilfs_set_file_dirty(sbi, dir, nr_dirty); 105 err = nilfs_set_file_dirty(dir, nr_dirty);
107 WARN_ON(err); /* do not happen */ 106 WARN_ON(err); /* do not happen */
108 unlock_page(page); 107 unlock_page(page);
109} 108}
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index c9a30d7ff6fc..2f560c9fb808 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -155,6 +155,7 @@ const struct inode_operations nilfs_file_inode_operations = {
155 .truncate = nilfs_truncate, 155 .truncate = nilfs_truncate,
156 .setattr = nilfs_setattr, 156 .setattr = nilfs_setattr,
157 .permission = nilfs_permission, 157 .permission = nilfs_permission,
158 .fiemap = nilfs_fiemap,
158}; 159};
159 160
160/* end of file */ 161/* end of file */
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 9f8a2da67f90..bfc73d3a30ed 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -149,14 +149,9 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
149 } 149 }
150 150
151 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); 151 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
152 if (unlikely(err)) { 152 if (unlikely(err))
153 if (err == -EINVAL) 153 nilfs_warning(sb, __func__, "unable to read inode: %lu",
154 nilfs_error(sb, __func__, "ifile is broken"); 154 (unsigned long) ino);
155 else
156 nilfs_warning(sb, __func__,
157 "unable to read inode: %lu",
158 (unsigned long) ino);
159 }
160 return err; 155 return err;
161} 156}
162 157
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 77b48c8fab17..2fd440d8d6b8 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -58,7 +58,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
58 struct nilfs_inode_info *ii = NILFS_I(inode); 58 struct nilfs_inode_info *ii = NILFS_I(inode);
59 __u64 blknum = 0; 59 __u64 blknum = 0;
60 int err = 0, ret; 60 int err = 0, ret;
61 struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode)); 61 struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
62 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; 62 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
63 63
64 down_read(&NILFS_MDT(dat)->mi_sem); 64 down_read(&NILFS_MDT(dat)->mi_sem);
@@ -96,11 +96,6 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
96 inode->i_ino, 96 inode->i_ino,
97 (unsigned long long)blkoff); 97 (unsigned long long)blkoff);
98 err = 0; 98 err = 0;
99 } else if (err == -EINVAL) {
100 nilfs_error(inode->i_sb, __func__,
101 "broken bmap (inode=%lu)\n",
102 inode->i_ino);
103 err = -EIO;
104 } 99 }
105 nilfs_transaction_abort(inode->i_sb); 100 nilfs_transaction_abort(inode->i_sb);
106 goto out; 101 goto out;
@@ -109,6 +104,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
109 nilfs_transaction_commit(inode->i_sb); /* never fails */ 104 nilfs_transaction_commit(inode->i_sb); /* never fails */
110 /* Error handling should be detailed */ 105 /* Error handling should be detailed */
111 set_buffer_new(bh_result); 106 set_buffer_new(bh_result);
107 set_buffer_delay(bh_result);
112 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed 108 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
113 to proper value */ 109 to proper value */
114 } else if (ret == -ENOENT) { 110 } else if (ret == -ENOENT) {
@@ -185,10 +181,9 @@ static int nilfs_set_page_dirty(struct page *page)
185 181
186 if (ret) { 182 if (ret) {
187 struct inode *inode = page->mapping->host; 183 struct inode *inode = page->mapping->host;
188 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
189 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); 184 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
190 185
191 nilfs_set_file_dirty(sbi, inode, nr_dirty); 186 nilfs_set_file_dirty(inode, nr_dirty);
192 } 187 }
193 return ret; 188 return ret;
194} 189}
@@ -229,7 +224,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
229 start + copied); 224 start + copied);
230 copied = generic_write_end(file, mapping, pos, len, copied, page, 225 copied = generic_write_end(file, mapping, pos, len, copied, page,
231 fsdata); 226 fsdata);
232 nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty); 227 nilfs_set_file_dirty(inode, nr_dirty);
233 err = nilfs_transaction_commit(inode->i_sb); 228 err = nilfs_transaction_commit(inode->i_sb);
234 return err ? : copied; 229 return err ? : copied;
235} 230}
@@ -425,13 +420,12 @@ static int __nilfs_read_inode(struct super_block *sb,
425 struct nilfs_root *root, unsigned long ino, 420 struct nilfs_root *root, unsigned long ino,
426 struct inode *inode) 421 struct inode *inode)
427{ 422{
428 struct nilfs_sb_info *sbi = NILFS_SB(sb); 423 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
429 struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
430 struct buffer_head *bh; 424 struct buffer_head *bh;
431 struct nilfs_inode *raw_inode; 425 struct nilfs_inode *raw_inode;
432 int err; 426 int err;
433 427
434 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 428 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
435 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); 429 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
436 if (unlikely(err)) 430 if (unlikely(err))
437 goto bad_inode; 431 goto bad_inode;
@@ -461,7 +455,7 @@ static int __nilfs_read_inode(struct super_block *sb,
461 } 455 }
462 nilfs_ifile_unmap_inode(root->ifile, ino, bh); 456 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
463 brelse(bh); 457 brelse(bh);
464 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 458 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
465 nilfs_set_inode_flags(inode); 459 nilfs_set_inode_flags(inode);
466 return 0; 460 return 0;
467 461
@@ -470,7 +464,7 @@ static int __nilfs_read_inode(struct super_block *sb,
470 brelse(bh); 464 brelse(bh);
471 465
472 bad_inode: 466 bad_inode:
473 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 467 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
474 return err; 468 return err;
475} 469}
476 470
@@ -629,7 +623,7 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
629 623
630 if (!test_bit(NILFS_I_BMAP, &ii->i_state)) 624 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
631 return; 625 return;
632 repeat: 626repeat:
633 ret = nilfs_bmap_last_key(ii->i_bmap, &b); 627 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
634 if (ret == -ENOENT) 628 if (ret == -ENOENT)
635 return; 629 return;
@@ -646,14 +640,10 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
646 nilfs_bmap_truncate(ii->i_bmap, b) == 0)) 640 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
647 goto repeat; 641 goto repeat;
648 642
649 failed: 643failed:
650 if (ret == -EINVAL) 644 nilfs_warning(ii->vfs_inode.i_sb, __func__,
651 nilfs_error(ii->vfs_inode.i_sb, __func__, 645 "failed to truncate bmap (ino=%lu, err=%d)",
652 "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino); 646 ii->vfs_inode.i_ino, ret);
653 else
654 nilfs_warning(ii->vfs_inode.i_sb, __func__,
655 "failed to truncate bmap (ino=%lu, err=%d)",
656 ii->vfs_inode.i_ino, ret);
657} 647}
658 648
659void nilfs_truncate(struct inode *inode) 649void nilfs_truncate(struct inode *inode)
@@ -682,7 +672,7 @@ void nilfs_truncate(struct inode *inode)
682 nilfs_set_transaction_flag(NILFS_TI_SYNC); 672 nilfs_set_transaction_flag(NILFS_TI_SYNC);
683 673
684 nilfs_mark_inode_dirty(inode); 674 nilfs_mark_inode_dirty(inode);
685 nilfs_set_file_dirty(NILFS_SB(sb), inode, 0); 675 nilfs_set_file_dirty(inode, 0);
686 nilfs_transaction_commit(sb); 676 nilfs_transaction_commit(sb);
687 /* May construct a logical segment and may fail in sync mode. 677 /* May construct a logical segment and may fail in sync mode.
688 But truncate has no return value. */ 678 But truncate has no return value. */
@@ -800,9 +790,9 @@ int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
800 return generic_permission(inode, mask, flags, NULL); 790 return generic_permission(inode, mask, flags, NULL);
801} 791}
802 792
803int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, 793int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
804 struct buffer_head **pbh)
805{ 794{
795 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
806 struct nilfs_inode_info *ii = NILFS_I(inode); 796 struct nilfs_inode_info *ii = NILFS_I(inode);
807 int err; 797 int err;
808 798
@@ -843,9 +833,9 @@ int nilfs_inode_dirty(struct inode *inode)
843 return ret; 833 return ret;
844} 834}
845 835
846int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, 836int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
847 unsigned nr_dirty)
848{ 837{
838 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
849 struct nilfs_inode_info *ii = NILFS_I(inode); 839 struct nilfs_inode_info *ii = NILFS_I(inode);
850 840
851 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); 841 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
@@ -878,11 +868,10 @@ int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
878 868
879int nilfs_mark_inode_dirty(struct inode *inode) 869int nilfs_mark_inode_dirty(struct inode *inode)
880{ 870{
881 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
882 struct buffer_head *ibh; 871 struct buffer_head *ibh;
883 int err; 872 int err;
884 873
885 err = nilfs_load_inode_block(sbi, inode, &ibh); 874 err = nilfs_load_inode_block(inode, &ibh);
886 if (unlikely(err)) { 875 if (unlikely(err)) {
887 nilfs_warning(inode->i_sb, __func__, 876 nilfs_warning(inode->i_sb, __func__,
888 "failed to reget inode block.\n"); 877 "failed to reget inode block.\n");
@@ -924,3 +913,134 @@ void nilfs_dirty_inode(struct inode *inode)
924 nilfs_mark_inode_dirty(inode); 913 nilfs_mark_inode_dirty(inode);
925 nilfs_transaction_commit(inode->i_sb); /* never fails */ 914 nilfs_transaction_commit(inode->i_sb); /* never fails */
926} 915}
916
917int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
918 __u64 start, __u64 len)
919{
920 struct the_nilfs *nilfs = NILFS_I_NILFS(inode);
921 __u64 logical = 0, phys = 0, size = 0;
922 __u32 flags = 0;
923 loff_t isize;
924 sector_t blkoff, end_blkoff;
925 sector_t delalloc_blkoff;
926 unsigned long delalloc_blklen;
927 unsigned int blkbits = inode->i_blkbits;
928 int ret, n;
929
930 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
931 if (ret)
932 return ret;
933
934 mutex_lock(&inode->i_mutex);
935
936 isize = i_size_read(inode);
937
938 blkoff = start >> blkbits;
939 end_blkoff = (start + len - 1) >> blkbits;
940
941 delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
942 &delalloc_blkoff);
943
944 do {
945 __u64 blkphy;
946 unsigned int maxblocks;
947
948 if (delalloc_blklen && blkoff == delalloc_blkoff) {
949 if (size) {
950 /* End of the current extent */
951 ret = fiemap_fill_next_extent(
952 fieinfo, logical, phys, size, flags);
953 if (ret)
954 break;
955 }
956 if (blkoff > end_blkoff)
957 break;
958
959 flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
960 logical = blkoff << blkbits;
961 phys = 0;
962 size = delalloc_blklen << blkbits;
963
964 blkoff = delalloc_blkoff + delalloc_blklen;
965 delalloc_blklen = nilfs_find_uncommitted_extent(
966 inode, blkoff, &delalloc_blkoff);
967 continue;
968 }
969
970 /*
971 * Limit the number of blocks that we look up so as
972 * not to get into the next delayed allocation extent.
973 */
974 maxblocks = INT_MAX;
975 if (delalloc_blklen)
976 maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
977 maxblocks);
978 blkphy = 0;
979
980 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
981 n = nilfs_bmap_lookup_contig(
982 NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
983 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
984
985 if (n < 0) {
986 int past_eof;
987
988 if (unlikely(n != -ENOENT))
989 break; /* error */
990
991 /* HOLE */
992 blkoff++;
993 past_eof = ((blkoff << blkbits) >= isize);
994
995 if (size) {
996 /* End of the current extent */
997
998 if (past_eof)
999 flags |= FIEMAP_EXTENT_LAST;
1000
1001 ret = fiemap_fill_next_extent(
1002 fieinfo, logical, phys, size, flags);
1003 if (ret)
1004 break;
1005 size = 0;
1006 }
1007 if (blkoff > end_blkoff || past_eof)
1008 break;
1009 } else {
1010 if (size) {
1011 if (phys && blkphy << blkbits == phys + size) {
1012 /* The current extent goes on */
1013 size += n << blkbits;
1014 } else {
1015 /* Terminate the current extent */
1016 ret = fiemap_fill_next_extent(
1017 fieinfo, logical, phys, size,
1018 flags);
1019 if (ret || blkoff > end_blkoff)
1020 break;
1021
1022 /* Start another extent */
1023 flags = FIEMAP_EXTENT_MERGED;
1024 logical = blkoff << blkbits;
1025 phys = blkphy << blkbits;
1026 size = n << blkbits;
1027 }
1028 } else {
1029 /* Start a new extent */
1030 flags = FIEMAP_EXTENT_MERGED;
1031 logical = blkoff << blkbits;
1032 phys = blkphy << blkbits;
1033 size = n << blkbits;
1034 }
1035 blkoff += n;
1036 }
1037 cond_resched();
1038 } while (true);
1039
1040 /* If ret is 1 then we just hit the end of the extent array */
1041 if (ret == 1)
1042 ret = 0;
1043
1044 mutex_unlock(&inode->i_mutex);
1045 return ret;
1046}
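With ->fiemap wired into nilfs_file_inode_operations above and nilfs_dir_inode_operations below, userspace can query nilfs2 extents through the generic ioctl. A minimal caller sketch (error handling trimmed; FS_IOC_FIEMAP comes from <linux/fs.h>, struct fiemap from <linux/fiemap.h>):

        #include <linux/fiemap.h>
        #include <linux/fs.h>
        #include <sys/ioctl.h>
        #include <stdlib.h>

        static int dump_first_extent(int fd)
        {
                struct fiemap *fm;

                /* room for the header plus one returned extent */
                fm = calloc(1, sizeof(*fm) + sizeof(struct fiemap_extent));
                if (!fm)
                        return -1;
                fm->fm_length = FIEMAP_MAX_OFFSET;      /* whole file */
                fm->fm_flags = FIEMAP_FLAG_SYNC;        /* the only flag checked above */
                fm->fm_extent_count = 1;
                if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
                        free(fm);
                        return -1;
                }
                /* fm->fm_mapped_extents / fm->fm_extents[0] now describe the file */
                free(fm);
                return 0;
        }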
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index b185e937a335..496738963fdb 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -233,7 +233,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
233 int ret; 233 int ret;
234 234
235 down_read(&nilfs->ns_segctor_sem); 235 down_read(&nilfs->ns_segctor_sem);
236 ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs); 236 ret = nilfs_dat_get_vinfo(nilfs->ns_dat, buf, size, nmembs);
237 up_read(&nilfs->ns_segctor_sem); 237 up_read(&nilfs->ns_segctor_sem);
238 return ret; 238 return ret;
239} 239}
@@ -242,8 +242,7 @@ static ssize_t
242nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, 242nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
243 void *buf, size_t size, size_t nmembs) 243 void *buf, size_t size, size_t nmembs)
244{ 244{
245 struct inode *dat = nilfs_dat_inode(nilfs); 245 struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
246 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
247 struct nilfs_bdesc *bdescs = buf; 246 struct nilfs_bdesc *bdescs = buf;
248 int ret, i; 247 int ret, i;
249 248
@@ -421,7 +420,7 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
421 size_t nmembs = argv->v_nmembs; 420 size_t nmembs = argv->v_nmembs;
422 int ret; 421 int ret;
423 422
424 ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs); 423 ret = nilfs_dat_freev(nilfs->ns_dat, buf, nmembs);
425 424
426 return (ret < 0) ? ret : nmembs; 425 return (ret < 0) ? ret : nmembs;
427} 426}
@@ -430,8 +429,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
430 struct nilfs_argv *argv, void *buf) 429 struct nilfs_argv *argv, void *buf)
431{ 430{
432 size_t nmembs = argv->v_nmembs; 431 size_t nmembs = argv->v_nmembs;
433 struct inode *dat = nilfs_dat_inode(nilfs); 432 struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
434 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
435 struct nilfs_bdesc *bdescs = buf; 433 struct nilfs_bdesc *bdescs = buf;
436 int ret, i; 434 int ret, i;
437 435
@@ -450,7 +448,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
450 /* skip dead block */ 448 /* skip dead block */
451 continue; 449 continue;
452 if (bdescs[i].bd_level == 0) { 450 if (bdescs[i].bd_level == 0) {
453 ret = nilfs_mdt_mark_block_dirty(dat, 451 ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat,
454 bdescs[i].bd_offset); 452 bdescs[i].bd_offset);
455 if (ret < 0) { 453 if (ret < 0) {
456 WARN_ON(ret == -ENOENT); 454 WARN_ON(ret == -ENOENT);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 39a5b84e2c9f..6a0e2a189f60 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -237,8 +237,6 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
237 * 237 *
238 * %-ENOENT - the specified block does not exist (hole block) 238 * %-ENOENT - the specified block does not exist (hole block)
239 * 239 *
240 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
241 *
242 * %-EROFS - Read only filesystem (for create mode) 240 * %-EROFS - Read only filesystem (for create mode)
243 */ 241 */
244int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, 242int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
@@ -273,8 +271,6 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
273 * %-ENOMEM - Insufficient memory available. 271 * %-ENOMEM - Insufficient memory available.
274 * 272 *
275 * %-EIO - I/O error 273 * %-EIO - I/O error
276 *
277 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
278 */ 274 */
279int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) 275int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
280{ 276{
@@ -350,8 +346,6 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
350 * %-EIO - I/O error 346 * %-EIO - I/O error
351 * 347 *
352 * %-ENOENT - the specified block does not exist (hole block) 348 * %-ENOENT - the specified block does not exist (hole block)
353 *
354 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
355 */ 349 */
356int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) 350int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
357{ 351{
@@ -499,31 +493,29 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
499 struct buffer_head *bh_frozen; 493 struct buffer_head *bh_frozen;
500 struct page *page; 494 struct page *page;
501 int blkbits = inode->i_blkbits; 495 int blkbits = inode->i_blkbits;
502 int ret = -ENOMEM;
503 496
504 page = grab_cache_page(&shadow->frozen_data, bh->b_page->index); 497 page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
505 if (!page) 498 if (!page)
506 return ret; 499 return -ENOMEM;
507 500
508 if (!page_has_buffers(page)) 501 if (!page_has_buffers(page))
509 create_empty_buffers(page, 1 << blkbits, 0); 502 create_empty_buffers(page, 1 << blkbits, 0);
510 503
511 bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits); 504 bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
512 if (bh_frozen) { 505
513 if (!buffer_uptodate(bh_frozen)) 506 if (!buffer_uptodate(bh_frozen))
514 nilfs_copy_buffer(bh_frozen, bh); 507 nilfs_copy_buffer(bh_frozen, bh);
515 if (list_empty(&bh_frozen->b_assoc_buffers)) { 508 if (list_empty(&bh_frozen->b_assoc_buffers)) {
516 list_add_tail(&bh_frozen->b_assoc_buffers, 509 list_add_tail(&bh_frozen->b_assoc_buffers,
517 &shadow->frozen_buffers); 510 &shadow->frozen_buffers);
518 set_buffer_nilfs_redirected(bh); 511 set_buffer_nilfs_redirected(bh);
519 } else { 512 } else {
520 brelse(bh_frozen); /* already frozen */ 513 brelse(bh_frozen); /* already frozen */
521 }
522 ret = 0;
523 } 514 }
515
524 unlock_page(page); 516 unlock_page(page);
525 page_cache_release(page); 517 page_cache_release(page);
526 return ret; 518 return 0;
527} 519}
528 520
529struct buffer_head * 521struct buffer_head *
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 6e9557ecf161..98034271cd02 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -577,6 +577,7 @@ const struct inode_operations nilfs_dir_inode_operations = {
577 .rename = nilfs_rename, 577 .rename = nilfs_rename,
578 .setattr = nilfs_setattr, 578 .setattr = nilfs_setattr,
579 .permission = nilfs_permission, 579 .permission = nilfs_permission,
580 .fiemap = nilfs_fiemap,
580}; 581};
581 582
582const struct inode_operations nilfs_special_inode_operations = { 583const struct inode_operations nilfs_special_inode_operations = {
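The .fiemap hook added above is reached through the standard FS_IOC_FIEMAP ioctl, so the new nilfs2 handler needs no filesystem-specific userspace. A minimal caller, as a sketch (the mount path is hypothetical and error handling is trimmed):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(void)
{
	struct fiemap *fm;
	int fd = open("/mnt/nilfs2/file", O_RDONLY);	/* hypothetical path */

	if (fd < 0)
		return 1;
	/* header plus room for 32 extent records */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;		/* map the whole file */
	fm->fm_extent_count = 32;
	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
		printf("%u extents mapped\n", fm->fm_mapped_extents);
	free(fm);
	return 0;
}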
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 0ca98823db59..777e8fd04304 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -190,11 +190,6 @@ static inline int nilfs_doing_construction(void)
190 return nilfs_test_transaction_flag(NILFS_TI_WRITER); 190 return nilfs_test_transaction_flag(NILFS_TI_WRITER);
191} 191}
192 192
193static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
194{
195 return nilfs->ns_dat;
196}
197
198/* 193/*
199 * function prototype 194 * function prototype
200 */ 195 */
@@ -257,13 +252,13 @@ extern void nilfs_truncate(struct inode *);
257extern void nilfs_evict_inode(struct inode *); 252extern void nilfs_evict_inode(struct inode *);
258extern int nilfs_setattr(struct dentry *, struct iattr *); 253extern int nilfs_setattr(struct dentry *, struct iattr *);
259int nilfs_permission(struct inode *inode, int mask, unsigned int flags); 254int nilfs_permission(struct inode *inode, int mask, unsigned int flags);
260extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *, 255int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
261 struct buffer_head **);
262extern int nilfs_inode_dirty(struct inode *); 256extern int nilfs_inode_dirty(struct inode *);
263extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *, 257int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
264 unsigned);
265extern int nilfs_mark_inode_dirty(struct inode *); 258extern int nilfs_mark_inode_dirty(struct inode *);
266extern void nilfs_dirty_inode(struct inode *); 259extern void nilfs_dirty_inode(struct inode *);
260int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
261 __u64 start, __u64 len);
267 262
268/* super.c */ 263/* super.c */
269extern struct inode *nilfs_alloc_inode(struct super_block *); 264extern struct inode *nilfs_alloc_inode(struct super_block *);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a6c3c2e817f8..0c432416cfef 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -491,7 +491,7 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
491 } 491 }
492 return nc; 492 return nc;
493} 493}
494 494
495void nilfs_mapping_init_once(struct address_space *mapping) 495void nilfs_mapping_init_once(struct address_space *mapping)
496{ 496{
497 memset(mapping, 0, sizeof(*mapping)); 497 memset(mapping, 0, sizeof(*mapping));
@@ -546,3 +546,87 @@ int __nilfs_clear_page_dirty(struct page *page)
546 } 546 }
547 return TestClearPageDirty(page); 547 return TestClearPageDirty(page);
548} 548}
549
550/**
551 * nilfs_find_uncommitted_extent - find extent of uncommitted data
552 * @inode: inode
553 * @start_blk: start block offset (in)
554 * @blkoff: start offset of the found extent (out)
555 *
556 * This function searches an extent of buffers marked "delayed" which
557 * starts from a block offset equal to or larger than @start_blk. If
558 * such an extent was found, this will store the start offset in
559 * @blkoff and return its length in blocks. Otherwise, zero is
560 * returned.
561 */
562unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
563 sector_t start_blk,
564 sector_t *blkoff)
565{
566 unsigned int i;
567 pgoff_t index;
568 unsigned int nblocks_in_page;
569 unsigned long length = 0;
570 sector_t b;
571 struct pagevec pvec;
572 struct page *page;
573
574 if (inode->i_mapping->nrpages == 0)
575 return 0;
576
577 index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
578 nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits);
579
580 pagevec_init(&pvec, 0);
581
582repeat:
583 pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
584 pvec.pages);
585 if (pvec.nr == 0)
586 return length;
587
588 if (length > 0 && pvec.pages[0]->index > index)
589 goto out;
590
591 b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
592 i = 0;
593 do {
594 page = pvec.pages[i];
595
596 lock_page(page);
597 if (page_has_buffers(page)) {
598 struct buffer_head *bh, *head;
599
600 bh = head = page_buffers(page);
601 do {
602 if (b < start_blk)
603 continue;
604 if (buffer_delay(bh)) {
605 if (length == 0)
606 *blkoff = b;
607 length++;
608 } else if (length > 0) {
609 goto out_locked;
610 }
611 } while (++b, bh = bh->b_this_page, bh != head);
612 } else {
613 if (length > 0)
614 goto out_locked;
615
616 b += nblocks_in_page;
617 }
618 unlock_page(page);
619
620 } while (++i < pagevec_count(&pvec));
621
622 index = page->index + 1;
623 pagevec_release(&pvec);
624 cond_resched();
625 goto repeat;
626
627out_locked:
628 unlock_page(page);
629out:
630 pagevec_release(&pvec);
631 return length;
632}
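nilfs_find_uncommitted_extent() above exists so the new nilfs_fiemap() can report delayed-allocated ranges that have no disk mapping yet. A sketch of the calling convention (report_extent() and the loop bounds are illustrative, not the actual fiemap code):

	sector_t blkoff, b = start_blk;
	unsigned long n;

	while (b < end_blk) {
		n = nilfs_find_uncommitted_extent(inode, b, &blkoff);
		if (n == 0)
			break;		/* no further delayed extents */
		/* blocks [blkoff, blkoff + n) are uncommitted */
		report_extent(blkoff, n);	/* hypothetical consumer */
		b = blkoff + n;
	}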
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index fb9e8a8a2038..622df27cd891 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -66,6 +66,9 @@ void nilfs_mapping_init(struct address_space *mapping,
66 struct backing_dev_info *bdi, 66 struct backing_dev_info *bdi,
67 const struct address_space_operations *aops); 67 const struct address_space_operations *aops);
68unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 68unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
69unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
70 sector_t start_blk,
71 sector_t *blkoff);
69 72
70#define NILFS_PAGE_BUG(page, m, a...) \ 73#define NILFS_PAGE_BUG(page, m, a...) \
71 do { nilfs_page_bug(page); BUG(); } while (0) 74 do { nilfs_page_bug(page); BUG(); } while (0)
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 5d2711c28da7..3dfcd3b7d389 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -535,7 +535,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
535 if (unlikely(err)) 535 if (unlikely(err))
536 goto failed_page; 536 goto failed_page;
537 537
538 err = nilfs_set_file_dirty(sbi, inode, 1); 538 err = nilfs_set_file_dirty(inode, 1);
539 if (unlikely(err)) 539 if (unlikely(err))
540 goto failed_page; 540 goto failed_page;
541 541
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 35a07157b980..7a17715f215f 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -27,14 +27,6 @@
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29 29
30/*
31 * Mount options
32 */
33struct nilfs_mount_options {
34 unsigned long mount_opt;
35 __u64 snapshot_cno;
36};
37
38struct the_nilfs; 30struct the_nilfs;
39struct nilfs_sc_info; 31struct nilfs_sc_info;
40 32
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 687d090cea34..55ebae5c7f39 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -504,17 +504,6 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
504 return err; 504 return err;
505} 505}
506 506
507static int nilfs_handle_bmap_error(int err, const char *fname,
508 struct inode *inode, struct super_block *sb)
509{
510 if (err == -EINVAL) {
511 nilfs_error(sb, fname, "broken bmap (inode=%lu)\n",
512 inode->i_ino);
513 err = -EIO;
514 }
515 return err;
516}
517
518/* 507/*
519 * Callback functions that enumerate, mark, and collect dirty blocks 508 * Callback functions that enumerate, mark, and collect dirty blocks
520 */ 509 */
@@ -524,9 +513,8 @@ static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
524 int err; 513 int err;
525 514
526 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); 515 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
527 if (unlikely(err < 0)) 516 if (err < 0)
528 return nilfs_handle_bmap_error(err, __func__, inode, 517 return err;
529 sci->sc_super);
530 518
531 err = nilfs_segctor_add_file_block(sci, bh, inode, 519 err = nilfs_segctor_add_file_block(sci, bh, inode,
532 sizeof(struct nilfs_binfo_v)); 520 sizeof(struct nilfs_binfo_v));
@@ -539,13 +527,7 @@ static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
539 struct buffer_head *bh, 527 struct buffer_head *bh,
540 struct inode *inode) 528 struct inode *inode)
541{ 529{
542 int err; 530 return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
543
544 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
545 if (unlikely(err < 0))
546 return nilfs_handle_bmap_error(err, __func__, inode,
547 sci->sc_super);
548 return 0;
549} 531}
550 532
551static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, 533static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
@@ -588,9 +570,8 @@ static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
588 int err; 570 int err;
589 571
590 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); 572 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
591 if (unlikely(err < 0)) 573 if (err < 0)
592 return nilfs_handle_bmap_error(err, __func__, inode, 574 return err;
593 sci->sc_super);
594 575
595 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); 576 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
596 if (!err) 577 if (!err)
@@ -776,9 +757,8 @@ static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
776 ret++; 757 ret++;
777 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) 758 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
778 ret++; 759 ret++;
779 if (ret || nilfs_doing_gc()) 760 if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat))
780 if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs))) 761 ret++;
781 ret++;
782 return ret; 762 return ret;
783} 763}
784 764
@@ -814,7 +794,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
814 nilfs_mdt_clear_dirty(sci->sc_root->ifile); 794 nilfs_mdt_clear_dirty(sci->sc_root->ifile);
815 nilfs_mdt_clear_dirty(nilfs->ns_cpfile); 795 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
816 nilfs_mdt_clear_dirty(nilfs->ns_sufile); 796 nilfs_mdt_clear_dirty(nilfs->ns_sufile);
817 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs)); 797 nilfs_mdt_clear_dirty(nilfs->ns_dat);
818} 798}
819 799
820static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) 800static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
@@ -923,7 +903,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
923 nilfs->ns_nongc_ctime : sci->sc_seg_ctime); 903 nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
924 raw_sr->sr_flags = 0; 904 raw_sr->sr_flags = 0;
925 905
926 nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr + 906 nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr +
927 NILFS_SR_DAT_OFFSET(isz), 1); 907 NILFS_SR_DAT_OFFSET(isz), 1);
928 nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + 908 nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
929 NILFS_SR_CPFILE_OFFSET(isz), 1); 909 NILFS_SR_CPFILE_OFFSET(isz), 1);
@@ -1179,7 +1159,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1179 sci->sc_stage.scnt++; /* Fall through */ 1159 sci->sc_stage.scnt++; /* Fall through */
1180 case NILFS_ST_DAT: 1160 case NILFS_ST_DAT:
1181 dat_stage: 1161 dat_stage:
1182 err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs), 1162 err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
1183 &nilfs_sc_dat_ops); 1163 &nilfs_sc_dat_ops);
1184 if (unlikely(err)) 1164 if (unlikely(err))
1185 break; 1165 break;
@@ -1563,7 +1543,6 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1563 return 0; 1543 return 0;
1564 1544
1565 failed_bmap: 1545 failed_bmap:
1566 err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
1567 return err; 1546 return err;
1568} 1547}
1569 1548
@@ -1783,6 +1762,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1783 if (!err) { 1762 if (!err) {
1784 set_buffer_uptodate(bh); 1763 set_buffer_uptodate(bh);
1785 clear_buffer_dirty(bh); 1764 clear_buffer_dirty(bh);
1765 clear_buffer_delay(bh);
1786 clear_buffer_nilfs_volatile(bh); 1766 clear_buffer_nilfs_volatile(bh);
1787 } 1767 }
1788 brelse(bh); /* for b_assoc_buffers */ 1768 brelse(bh); /* for b_assoc_buffers */
@@ -1909,6 +1889,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1909 b_assoc_buffers) { 1889 b_assoc_buffers) {
1910 set_buffer_uptodate(bh); 1890 set_buffer_uptodate(bh);
1911 clear_buffer_dirty(bh); 1891 clear_buffer_dirty(bh);
1892 clear_buffer_delay(bh);
1912 clear_buffer_nilfs_volatile(bh); 1893 clear_buffer_nilfs_volatile(bh);
1913 clear_buffer_nilfs_redirected(bh); 1894 clear_buffer_nilfs_redirected(bh);
1914 if (bh == segbuf->sb_super_root) { 1895 if (bh == segbuf->sb_super_root) {
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index e2dcc9c733f7..0994f6a76c07 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -47,7 +47,6 @@
47#include <linux/crc32.h> 47#include <linux/crc32.h>
48#include <linux/vfs.h> 48#include <linux/vfs.h>
49#include <linux/writeback.h> 49#include <linux/writeback.h>
50#include <linux/kobject.h>
51#include <linux/seq_file.h> 50#include <linux/seq_file.h>
52#include <linux/mount.h> 51#include <linux/mount.h>
53#include "nilfs.h" 52#include "nilfs.h"
@@ -111,12 +110,17 @@ void nilfs_error(struct super_block *sb, const char *function,
111 const char *fmt, ...) 110 const char *fmt, ...)
112{ 111{
113 struct nilfs_sb_info *sbi = NILFS_SB(sb); 112 struct nilfs_sb_info *sbi = NILFS_SB(sb);
113 struct va_format vaf;
114 va_list args; 114 va_list args;
115 115
116 va_start(args, fmt); 116 va_start(args, fmt);
117 printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function); 117
118 vprintk(fmt, args); 118 vaf.fmt = fmt;
119 printk("\n"); 119 vaf.va = &args;
120
121 printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n",
122 sb->s_id, function, &vaf);
123
120 va_end(args); 124 va_end(args);
121 125
122 if (!(sb->s_flags & MS_RDONLY)) { 126 if (!(sb->s_flags & MS_RDONLY)) {
@@ -136,13 +140,17 @@ void nilfs_error(struct super_block *sb, const char *function,
136void nilfs_warning(struct super_block *sb, const char *function, 140void nilfs_warning(struct super_block *sb, const char *function,
137 const char *fmt, ...) 141 const char *fmt, ...)
138{ 142{
143 struct va_format vaf;
139 va_list args; 144 va_list args;
140 145
141 va_start(args, fmt); 146 va_start(args, fmt);
142 printk(KERN_WARNING "NILFS warning (device %s): %s: ", 147
143 sb->s_id, function); 148 vaf.fmt = fmt;
144 vprintk(fmt, args); 149 vaf.va = &args;
145 printk("\n"); 150
151 printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n",
152 sb->s_id, function, &vaf);
153
146 va_end(args); 154 va_end(args);
147} 155}
148 156
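Both conversions above use the %pV printk extension: the caller's format/argument pair is wrapped in a struct va_format and expanded inside a single printk() call, so the device prefix and the message can no longer be interleaved by concurrent printks. The idiom in isolation, as a generic sketch rather than the nilfs code:

#include <linux/kernel.h>

void my_warn(const char *prefix, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* %pV expands the embedded format and arguments in place */
	printk(KERN_WARNING "%s: %pV\n", prefix, &vaf);
	va_end(args);
}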
@@ -1010,11 +1018,11 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1010 struct nilfs_sb_info *sbi = NILFS_SB(sb); 1018 struct nilfs_sb_info *sbi = NILFS_SB(sb);
1011 struct the_nilfs *nilfs = sbi->s_nilfs; 1019 struct the_nilfs *nilfs = sbi->s_nilfs;
1012 unsigned long old_sb_flags; 1020 unsigned long old_sb_flags;
1013 struct nilfs_mount_options old_opts; 1021 unsigned long old_mount_opt;
1014 int err; 1022 int err;
1015 1023
1016 old_sb_flags = sb->s_flags; 1024 old_sb_flags = sb->s_flags;
1017 old_opts.mount_opt = sbi->s_mount_opt; 1025 old_mount_opt = sbi->s_mount_opt;
1018 1026
1019 if (!parse_options(data, sb, 1)) { 1027 if (!parse_options(data, sb, 1)) {
1020 err = -EINVAL; 1028 err = -EINVAL;
@@ -1083,7 +1091,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1083 1091
1084 restore_opts: 1092 restore_opts:
1085 sb->s_flags = old_sb_flags; 1093 sb->s_flags = old_sb_flags;
1086 sbi->s_mount_opt = old_opts.mount_opt; 1094 sbi->s_mount_opt = old_mount_opt;
1087 return err; 1095 return err;
1088} 1096}
1089 1097
@@ -1155,14 +1163,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
1155{ 1163{
1156 struct nilfs_super_data sd; 1164 struct nilfs_super_data sd;
1157 struct super_block *s; 1165 struct super_block *s;
1158 fmode_t mode = FMODE_READ; 1166 fmode_t mode = FMODE_READ | FMODE_EXCL;
1159 struct dentry *root_dentry; 1167 struct dentry *root_dentry;
1160 int err, s_new = false; 1168 int err, s_new = false;
1161 1169
1162 if (!(flags & MS_RDONLY)) 1170 if (!(flags & MS_RDONLY))
1163 mode |= FMODE_WRITE; 1171 mode |= FMODE_WRITE;
1164 1172
1165 sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); 1173 sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type);
1166 if (IS_ERR(sd.bdev)) 1174 if (IS_ERR(sd.bdev))
1167 return ERR_CAST(sd.bdev); 1175 return ERR_CAST(sd.bdev);
1168 1176
@@ -1241,7 +1249,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
1241 } 1249 }
1242 1250
1243 if (!s_new) 1251 if (!s_new)
1244 close_bdev_exclusive(sd.bdev, mode); 1252 blkdev_put(sd.bdev, mode);
1245 1253
1246 return root_dentry; 1254 return root_dentry;
1247 1255
@@ -1250,7 +1258,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
1250 1258
1251 failed: 1259 failed:
1252 if (!s_new) 1260 if (!s_new)
1253 close_bdev_exclusive(sd.bdev, mode); 1261 blkdev_put(sd.bdev, mode);
1254 return ERR_PTR(err); 1262 return ERR_PTR(err);
1255} 1263}
1256 1264
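This hunk tracks the block-layer API change that folded exclusive opens into blkdev_get_by_path(): exclusivity is now requested with FMODE_EXCL in the mode mask plus a holder cookie (here fs_type), and the matching blkdev_put() must pass the same mode. The pairing, reduced to a fragment:

	fmode_t mode = FMODE_READ | FMODE_EXCL;
	struct block_device *bdev;

	if (!(flags & MS_RDONLY))
		mode |= FMODE_WRITE;

	bdev = blkdev_get_by_path(dev_name, mode, fs_type); /* fs_type = holder */
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);
	/* ... use the device ... */
	blkdev_put(bdev, mode);		/* mode must match the get */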
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 0254be2d73c6..ad4ac607cf57 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -329,7 +329,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
329 printk(KERN_INFO "NILFS: recovery complete.\n"); 329 printk(KERN_INFO "NILFS: recovery complete.\n");
330 330
331 skip_recovery: 331 skip_recovery:
332 set_nilfs_loaded(nilfs);
333 nilfs_clear_recovery_info(&ri); 332 nilfs_clear_recovery_info(&ri);
334 sbi->s_super->s_flags = s_flags; 333 sbi->s_super->s_flags = s_flags;
335 return 0; 334 return 0;
@@ -651,12 +650,11 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
651 650
652int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) 651int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
653{ 652{
654 struct inode *dat = nilfs_dat_inode(nilfs);
655 unsigned long ncleansegs; 653 unsigned long ncleansegs;
656 654
657 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 655 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
658 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); 656 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
659 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 657 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
660 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; 658 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
661 return 0; 659 return 0;
662} 660}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 69226e14b745..fd85e4c05c6b 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -36,8 +36,6 @@
36/* the_nilfs struct */ 36/* the_nilfs struct */
37enum { 37enum {
38 THE_NILFS_INIT = 0, /* Information from super_block is set */ 38 THE_NILFS_INIT = 0, /* Information from super_block is set */
39 THE_NILFS_LOADED, /* Roll-back/roll-forward has done and
40 the latest checkpoint was loaded */
41 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
42 THE_NILFS_GC_RUNNING, /* gc process is running */ 40 THE_NILFS_GC_RUNNING, /* gc process is running */
43 THE_NILFS_SB_DIRTY, /* super block is dirty */ 41 THE_NILFS_SB_DIRTY, /* super block is dirty */
@@ -178,7 +176,6 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \
178} 176}
179 177
180THE_NILFS_FNS(INIT, init) 178THE_NILFS_FNS(INIT, init)
181THE_NILFS_FNS(LOADED, loaded)
182THE_NILFS_FNS(DISCONTINUED, discontinued) 179THE_NILFS_FNS(DISCONTINUED, discontinued)
183THE_NILFS_FNS(GC_RUNNING, gc_running) 180THE_NILFS_FNS(GC_RUNNING, gc_running)
184THE_NILFS_FNS(SB_DIRTY, sb_dirty) 181THE_NILFS_FNS(SB_DIRTY, sb_dirty)
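THE_NILFS_FNS() is the usual bit-flag accessor generator: each invocation emits set_nilfs_*()/clear_nilfs_*()/nilfs_*() helpers over the ns_flags word, which is why deleting the LOADED line removes the whole set_nilfs_loaded() family at once. Roughly (reconstructed from the visible call sites, so treat the exact body as an assumption):

#define THE_NILFS_FNS(bit, name)					\
static inline void set_nilfs_##name(struct the_nilfs *nilfs)		\
{									\
	set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags);			\
}									\
static inline void clear_nilfs_##name(struct the_nilfs *nilfs)		\
{									\
	clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags);			\
}									\
static inline int nilfs_##name(struct the_nilfs *nilfs)		\
{									\
	return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags);		\
}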
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 3ac36b7bf6b9..7dceff005a67 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -6,7 +6,7 @@ config FANOTIFY
6 ---help--- 6 ---help---
7 Say Y here to enable fanotify support. fanotify is a file access 7 Say Y here to enable fanotify support. fanotify is a file access
8 notification system which differs from inotify in that it sends 8 notification system which differs from inotify in that it sends
9 and open file descriptor to the userspace listener along with 9 an open file descriptor to the userspace listener along with
10 the event. 10 the event.
11 11
12 If unsure, say Y. 12 If unsure, say Y.
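The corrected help text is the whole point of this hunk: each fanotify event carries an already-open descriptor on the accessed file. A minimal listener showing that contract, as a sketch (requires CAP_SYS_ADMIN; error handling omitted):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/fanotify.h>

int main(void)
{
	char buf[4096];
	int fan = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY);

	fanotify_mark(fan, FAN_MARK_ADD | FAN_MARK_MOUNT,
		      FAN_OPEN | FAN_CLOSE, AT_FDCWD, "/");

	for (;;) {
		ssize_t len = read(fan, buf, sizeof(buf));
		struct fanotify_event_metadata *md = (void *)buf;

		for (; FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
			printf("mask 0x%llx on fd %d\n",
			       (unsigned long long)md->mask, md->fd);
			if (md->fd >= 0)
				close(md->fd);	/* the fd sent with the event */
		}
	}
}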
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 58b6be992544..4ff028fcfd6e 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ 6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
7 unistr.o upcase.o 7 unistr.o upcase.o
8 8
9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.29\" 9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\"
10 10
11ifeq ($(CONFIG_NTFS_DEBUG),y) 11ifeq ($(CONFIG_NTFS_DEBUG),y)
12EXTRA_CFLAGS += -DDEBUG 12EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 113ebd9f25a4..f4b1057abdd2 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. 2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2007 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * 5 *
6 * This program/include file is free software; you can redistribute it and/or 6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published 7 * modify it under the terms of the GNU General Public License as published
@@ -1380,15 +1380,14 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp,
1380 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s 1380 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
1381 * single-segment behaviour. 1381 * single-segment behaviour.
1382 * 1382 *
1383 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both 1383 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
1384 * when atomic and when not atomic. This is ok because 1384 * atomic and when not atomic. This is ok because it calls
1385 * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic() 1385 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
1386 * and it is ok to call this when non-atomic. 1386 * fact, the only difference between __copy_from_user_inatomic() and
1387 * Infact, the only difference between __copy_from_user_inatomic() and
1388 * __copy_from_user() is that the latter calls might_sleep() and the former 1387 * __copy_from_user() is that the latter calls might_sleep() and the former
1389 * should not zero the tail of the buffer on error. And on many 1388 * should not zero the tail of the buffer on error. And on many architectures
1390 * architectures __copy_from_user_inatomic() is just defined to 1389 * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
1391 * __copy_from_user() so it makes no difference at all on those architectures. 1390 * makes no difference at all on those architectures.
1392 */ 1391 */
1393static inline size_t ntfs_copy_from_user_iovec(struct page **pages, 1392static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1394 unsigned nr_pages, unsigned ofs, const struct iovec **iov, 1393 unsigned nr_pages, unsigned ofs, const struct iovec **iov,
@@ -1409,28 +1408,28 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1409 if (unlikely(copied != len)) { 1408 if (unlikely(copied != len)) {
1410 /* Do it the slow way. */ 1409 /* Do it the slow way. */
1411 addr = kmap(*pages); 1410 addr = kmap(*pages);
1412 copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs, 1411 copied = __ntfs_copy_from_user_iovec_inatomic(addr +
1413 *iov, *iov_ofs, len); 1412 ofs, *iov, *iov_ofs, len);
1414 /*
1415 * Zero the rest of the target like __copy_from_user().
1416 */
1417 memset(addr + ofs + copied, 0, len - copied);
1418 kunmap(*pages);
1419 if (unlikely(copied != len)) 1413 if (unlikely(copied != len))
1420 goto err_out; 1414 goto err_out;
1415 kunmap(*pages);
1421 } 1416 }
1422 total += len; 1417 total += len;
1418 ntfs_set_next_iovec(iov, iov_ofs, len);
1423 bytes -= len; 1419 bytes -= len;
1424 if (!bytes) 1420 if (!bytes)
1425 break; 1421 break;
1426 ntfs_set_next_iovec(iov, iov_ofs, len);
1427 ofs = 0; 1422 ofs = 0;
1428 } while (++pages < last_page); 1423 } while (++pages < last_page);
1429out: 1424out:
1430 return total; 1425 return total;
1431err_out: 1426err_out:
1432 total += copied; 1427 BUG_ON(copied > len);
1433 /* Zero the rest of the target like __copy_from_user(). */ 1428 /* Zero the rest of the target like __copy_from_user(). */
1429 memset(addr + ofs + copied, 0, len - copied);
1430 kunmap(*pages);
1431 total += copied;
1432 ntfs_set_next_iovec(iov, iov_ofs, copied);
1434 while (++pages < last_page) { 1433 while (++pages < last_page) {
1435 bytes -= len; 1434 bytes -= len;
1436 if (!bytes) 1435 if (!bytes)
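The rewritten comment describes the standard two-tier copy pattern: attempt the copy under an atomic mapping first, and fall back to the sleeping kmap()/__copy_from_user() path only when the inatomic copy comes up short. Schematically, for the 2.6.38-era API (a generic sketch, not the NTFS helper itself):

	size_t left;
	char *addr;

	addr = kmap_atomic(page, KM_USER0);
	left = __copy_from_user_inatomic(addr + ofs, ubuf, len);
	kunmap_atomic(addr, KM_USER0);

	if (unlikely(left)) {
		/* Slow way: kmap() may sleep, which is fine here. */
		addr = kmap(page);
		left = __copy_from_user(addr + ofs, ubuf, len);
		kunmap(page);
	}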
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index a30ecacc01f2..29099a07b9fe 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. 2 * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2007 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * Copyright (c) 2001,2002 Richard Russon 5 * Copyright (c) 2001,2002 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -3193,8 +3193,8 @@ static void __exit exit_ntfs_fs(void)
3193 ntfs_sysctl(0); 3193 ntfs_sysctl(0);
3194} 3194}
3195 3195
3196MODULE_AUTHOR("Anton Altaparmakov <aia21@cantab.net>"); 3196MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
3197MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2007 Anton Altaparmakov"); 3197MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.");
3198MODULE_VERSION(NTFS_VERSION); 3198MODULE_VERSION(NTFS_VERSION);
3199MODULE_LICENSE("GPL"); 3199MODULE_LICENSE("GPL");
3200#ifdef DEBUG 3200#ifdef DEBUG
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 0d840669698e..77a8de5f7119 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -1,7 +1,6 @@
1config OCFS2_FS 1config OCFS2_FS
2 tristate "OCFS2 file system support" 2 tristate "OCFS2 file system support"
3 depends on NET && SYSFS 3 depends on NET && SYSFS && CONFIGFS_FS
4 select CONFIGFS_FS
5 select JBD2 4 select JBD2
6 select CRC32 5 select CRC32
7 select QUOTA 6 select QUOTA
@@ -51,7 +50,7 @@ config OCFS2_FS_USERSPACE_CLUSTER
51 50
52config OCFS2_FS_STATS 51config OCFS2_FS_STATS
53 bool "OCFS2 statistics" 52 bool "OCFS2 statistics"
54 depends on OCFS2_FS 53 depends on OCFS2_FS && DEBUG_FS
55 default y 54 default y
56 help 55 help
57 This option allows some fs statistics to be captured. Enabling 56 This option allows some fs statistics to be captured. Enabling
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 592fae5007d1..e4984e259cb6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -565,7 +565,6 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
565 return ret; 565 return ret;
566} 566}
567 567
568static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
569static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 568static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
570 struct ocfs2_extent_block *eb); 569 struct ocfs2_extent_block *eb);
571static void ocfs2_adjust_rightmost_records(handle_t *handle, 570static void ocfs2_adjust_rightmost_records(handle_t *handle,
@@ -5858,6 +5857,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5858 5857
5859 ocfs2_journal_dirty(handle, tl_bh); 5858 ocfs2_journal_dirty(handle, tl_bh);
5860 5859
5860 osb->truncated_clusters += num_clusters;
5861bail: 5861bail:
5862 mlog_exit(status); 5862 mlog_exit(status);
5863 return status; 5863 return status;
@@ -5929,6 +5929,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5929 i--; 5929 i--;
5930 } 5930 }
5931 5931
5932 osb->truncated_clusters = 0;
5933
5932bail: 5934bail:
5933 mlog_exit(status); 5935 mlog_exit(status);
5934 return status; 5936 return status;
@@ -7139,64 +7141,6 @@ bail:
7139} 7141}
7140 7142
7141/* 7143/*
7142 * Expects the inode to already be locked.
7143 */
7144int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7145 struct inode *inode,
7146 struct buffer_head *fe_bh,
7147 struct ocfs2_truncate_context **tc)
7148{
7149 int status;
7150 unsigned int new_i_clusters;
7151 struct ocfs2_dinode *fe;
7152 struct ocfs2_extent_block *eb;
7153 struct buffer_head *last_eb_bh = NULL;
7154
7155 mlog_entry_void();
7156
7157 *tc = NULL;
7158
7159 new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7160 i_size_read(inode));
7161 fe = (struct ocfs2_dinode *) fe_bh->b_data;
7162
7163 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
7164 "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7165 (unsigned long long)le64_to_cpu(fe->i_size));
7166
7167 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7168 if (!(*tc)) {
7169 status = -ENOMEM;
7170 mlog_errno(status);
7171 goto bail;
7172 }
7173 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7174
7175 if (fe->id2.i_list.l_tree_depth) {
7176 status = ocfs2_read_extent_block(INODE_CACHE(inode),
7177 le64_to_cpu(fe->i_last_eb_blk),
7178 &last_eb_bh);
7179 if (status < 0) {
7180 mlog_errno(status);
7181 goto bail;
7182 }
7183 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7184 }
7185
7186 (*tc)->tc_last_eb_bh = last_eb_bh;
7187
7188 status = 0;
7189bail:
7190 if (status < 0) {
7191 if (*tc)
7192 ocfs2_free_truncate_context(*tc);
7193 *tc = NULL;
7194 }
7195 mlog_exit_void();
7196 return status;
7197}
7198
7199/*
7200 * 'start' is inclusive, 'end' is not. 7144 * 'start' is inclusive, 'end' is not.
7201 */ 7145 */
7202int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 7146int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
@@ -7270,18 +7214,3 @@ out_commit:
7270out: 7214out:
7271 return ret; 7215 return ret;
7272} 7216}
7273
7274static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7275{
7276 /*
7277 * The caller is responsible for completing deallocation
7278 * before freeing the context.
7279 */
7280 if (tc->tc_dealloc.c_first_suballocator != NULL)
7281 mlog(ML_NOTICE,
7282 "Truncate completion has non-empty dealloc context\n");
7283
7284 brelse(tc->tc_last_eb_bh);
7285
7286 kfree(tc);
7287}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 55762b554b99..3bd08a03251c 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -228,10 +228,6 @@ struct ocfs2_truncate_context {
228 228
229int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, 229int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
230 u64 range_start, u64 range_end); 230 u64 range_start, u64 range_end);
231int ocfs2_prepare_truncate(struct ocfs2_super *osb,
232 struct inode *inode,
233 struct buffer_head *fe_bh,
234 struct ocfs2_truncate_context **tc);
235int ocfs2_commit_truncate(struct ocfs2_super *osb, 231int ocfs2_commit_truncate(struct ocfs2_super *osb,
236 struct inode *inode, 232 struct inode *inode,
237 struct buffer_head *di_bh); 233 struct buffer_head *di_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0d7c5540ad66..1fbb0e20131b 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1630,6 +1630,43 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
1630 return ret; 1630 return ret;
1631} 1631}
1632 1632
1633/*
1634 * Try to flush the truncate log if we can free enough clusters from it.
1635 * As for the return value, "< 0" means error, "0" means no space, and "1"
1636 * means we have freed enough space and the caller may try to allocate again.
1637 */
1638static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
1639 unsigned int needed)
1640{
1641 tid_t target;
1642 int ret = 0;
1643 unsigned int truncated_clusters;
1644
1645 mutex_lock(&osb->osb_tl_inode->i_mutex);
1646 truncated_clusters = osb->truncated_clusters;
1647 mutex_unlock(&osb->osb_tl_inode->i_mutex);
1648
1649 /*
1650 * Check whether we can succeed in allocating if we free
1651 * the truncate log.
1652 */
1653 if (truncated_clusters < needed)
1654 goto out;
1655
1656 ret = ocfs2_flush_truncate_log(osb);
1657 if (ret) {
1658 mlog_errno(ret);
1659 goto out;
1660 }
1661
1662 if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
1663 jbd2_log_wait_commit(osb->journal->j_journal, target);
1664 ret = 1;
1665 }
1666out:
1667 return ret;
1668}
1669
1633int ocfs2_write_begin_nolock(struct file *filp, 1670int ocfs2_write_begin_nolock(struct file *filp,
1634 struct address_space *mapping, 1671 struct address_space *mapping,
1635 loff_t pos, unsigned len, unsigned flags, 1672 loff_t pos, unsigned len, unsigned flags,
@@ -1637,7 +1674,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
1637 struct buffer_head *di_bh, struct page *mmap_page) 1674 struct buffer_head *di_bh, struct page *mmap_page)
1638{ 1675{
1639 int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; 1676 int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
1640 unsigned int clusters_to_alloc, extents_to_split; 1677 unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
1641 struct ocfs2_write_ctxt *wc; 1678 struct ocfs2_write_ctxt *wc;
1642 struct inode *inode = mapping->host; 1679 struct inode *inode = mapping->host;
1643 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1680 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1646,7 +1683,9 @@ int ocfs2_write_begin_nolock(struct file *filp,
1646 struct ocfs2_alloc_context *meta_ac = NULL; 1683 struct ocfs2_alloc_context *meta_ac = NULL;
1647 handle_t *handle; 1684 handle_t *handle;
1648 struct ocfs2_extent_tree et; 1685 struct ocfs2_extent_tree et;
1686 int try_free = 1, ret1;
1649 1687
1688try_again:
1650 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1689 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1651 if (ret) { 1690 if (ret) {
1652 mlog_errno(ret); 1691 mlog_errno(ret);
@@ -1681,6 +1720,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
1681 mlog_errno(ret); 1720 mlog_errno(ret);
1682 goto out; 1721 goto out;
1683 } else if (ret == 1) { 1722 } else if (ret == 1) {
1723 clusters_need = wc->w_clen;
1684 ret = ocfs2_refcount_cow(inode, filp, di_bh, 1724 ret = ocfs2_refcount_cow(inode, filp, di_bh,
1685 wc->w_cpos, wc->w_clen, UINT_MAX); 1725 wc->w_cpos, wc->w_clen, UINT_MAX);
1686 if (ret) { 1726 if (ret) {
@@ -1695,6 +1735,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
1695 mlog_errno(ret); 1735 mlog_errno(ret);
1696 goto out; 1736 goto out;
1697 } 1737 }
1738 clusters_need += clusters_to_alloc;
1698 1739
1699 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; 1740 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1700 1741
@@ -1817,6 +1858,22 @@ out:
1817 ocfs2_free_alloc_context(data_ac); 1858 ocfs2_free_alloc_context(data_ac);
1818 if (meta_ac) 1859 if (meta_ac)
1819 ocfs2_free_alloc_context(meta_ac); 1860 ocfs2_free_alloc_context(meta_ac);
1861
1862 if (ret == -ENOSPC && try_free) {
1863 /*
1864 * Try to free some truncate log so that we can have enough
1865 * clusters to allocate.
1866 */
1867 try_free = 0;
1868
1869 ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
1870 if (ret1 == 1)
1871 goto try_again;
1872
1873 if (ret1 < 0)
1874 mlog_errno(ret1);
1875 }
1876
1820 return ret; 1877 return ret;
1821} 1878}
1822 1879
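The out-path addition gives ocfs2_write_begin_nolock() exactly one retry: on -ENOSPC it flushes the truncate log (try_free drops to 0 so the path cannot loop) and restarts from try_again. The control flow, reduced to a skeleton (do_allocation() is a hypothetical stand-in for the real write path):

	int try_free = 1, ret, ret1;

try_again:
	ret = do_allocation();			/* hypothetical stand-in */
	/* ... */
out:
	if (ret == -ENOSPC && try_free) {
		try_free = 0;			/* at most one retry */
		ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
		if (ret1 == 1)
			goto try_again;		/* enough clusters freed */
		if (ret1 < 0)
			mlog_errno(ret1);
	}
	return ret;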
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9e3d45bcb5fd..b108e863d8f6 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -82,6 +82,7 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
82#define O2HB_DB_TYPE_REGION_LIVENODES 4 82#define O2HB_DB_TYPE_REGION_LIVENODES 4
83#define O2HB_DB_TYPE_REGION_NUMBER 5 83#define O2HB_DB_TYPE_REGION_NUMBER 5
84#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6 84#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
85#define O2HB_DB_TYPE_REGION_PINNED 7
85struct o2hb_debug_buf { 86struct o2hb_debug_buf {
86 int db_type; 87 int db_type;
87 int db_size; 88 int db_size;
@@ -101,6 +102,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions;
101#define O2HB_DEBUG_FAILEDREGIONS "failed_regions" 102#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
102#define O2HB_DEBUG_REGION_NUMBER "num" 103#define O2HB_DEBUG_REGION_NUMBER "num"
103#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms" 104#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
105#define O2HB_DEBUG_REGION_PINNED "pinned"
104 106
105static struct dentry *o2hb_debug_dir; 107static struct dentry *o2hb_debug_dir;
106static struct dentry *o2hb_debug_livenodes; 108static struct dentry *o2hb_debug_livenodes;
@@ -132,6 +134,33 @@ char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
132unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; 134unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
133unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; 135unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
134 136
137/*
138 * o2hb_dependent_users tracks the number of registered callbacks that depend
139 * on heartbeat. o2net and o2dlm are two entities that register this callback.
140 * However only o2dlm depends on the heartbeat. It does not want the heartbeat
141 * to stop while a dlm domain is still active.
142 */
143unsigned int o2hb_dependent_users;
144
145/*
146 * In global heartbeat mode, all regions are pinned if there are one or more
147 * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All
148 * regions are unpinned if the region count exceeds the cut off or the number
149 * of dependent users falls to zero.
150 */
151#define O2HB_PIN_CUT_OFF 3
152
153/*
154 * In local heartbeat mode, we assume the dlm domain name to be the same as
155 * region uuid. This is true for domains created for the file system but not
156 * necessarily true for userdlm domains. This is a known limitation.
157 *
158 * In global heartbeat mode, we pin/unpin all o2hb regions. This solution
159 * works for both file system and userdlm domains.
160 */
161static int o2hb_region_pin(const char *region_uuid);
162static void o2hb_region_unpin(const char *region_uuid);
163
135/* Only sets a new threshold if there are no active regions. 164/* Only sets a new threshold if there are no active regions.
136 * 165 *
137 * No locking or otherwise interesting code is required for reading 166 * No locking or otherwise interesting code is required for reading
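The two comment blocks above reduce to a single predicate for global heartbeat mode: regions are pinned exactly while at least one dependent user exists and the quorum region count is at most O2HB_PIN_CUT_OFF. As a sketch (the helper name is illustrative; the patch open-codes this check at each transition point):

	/* Called under o2hb_live_lock, global heartbeat mode only. */
	static void o2hb_recheck_pinning(void)
	{
		int quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
					    O2NM_MAX_REGIONS);

		if (o2hb_dependent_users && quorum <= O2HB_PIN_CUT_OFF)
			o2hb_region_pin(NULL);		/* pin all regions */
		else
			o2hb_region_unpin(NULL);	/* unpin all regions */
	}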
@@ -186,7 +215,9 @@ struct o2hb_region {
186 struct config_item hr_item; 215 struct config_item hr_item;
187 216
188 struct list_head hr_all_item; 217 struct list_head hr_all_item;
189 unsigned hr_unclean_stop:1; 218 unsigned hr_unclean_stop:1,
219 hr_item_pinned:1,
220 hr_item_dropped:1;
190 221
191 /* protected by the hr_callback_sem */ 222 /* protected by the hr_callback_sem */
192 struct task_struct *hr_task; 223 struct task_struct *hr_task;
@@ -212,9 +243,11 @@ struct o2hb_region {
212 struct dentry *hr_debug_livenodes; 243 struct dentry *hr_debug_livenodes;
213 struct dentry *hr_debug_regnum; 244 struct dentry *hr_debug_regnum;
214 struct dentry *hr_debug_elapsed_time; 245 struct dentry *hr_debug_elapsed_time;
246 struct dentry *hr_debug_pinned;
215 struct o2hb_debug_buf *hr_db_livenodes; 247 struct o2hb_debug_buf *hr_db_livenodes;
216 struct o2hb_debug_buf *hr_db_regnum; 248 struct o2hb_debug_buf *hr_db_regnum;
217 struct o2hb_debug_buf *hr_db_elapsed_time; 249 struct o2hb_debug_buf *hr_db_elapsed_time;
250 struct o2hb_debug_buf *hr_db_pinned;
218 251
219 /* let the person setting up hb wait for it to return until it 252 /* let the person setting up hb wait for it to return until it
220 * has reached a 'steady' state. This will be fixed when we have 253 * has reached a 'steady' state. This will be fixed when we have
@@ -701,6 +734,14 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
701 config_item_name(&reg->hr_item)); 734 config_item_name(&reg->hr_item));
702 735
703 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); 736 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
737
738 /*
739 * If global heartbeat active, unpin all regions if the
740 * region count > CUT_OFF
741 */
742 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
743 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
744 o2hb_region_unpin(NULL);
704} 745}
705 746
706static int o2hb_check_slot(struct o2hb_region *reg, 747static int o2hb_check_slot(struct o2hb_region *reg,
@@ -1041,6 +1082,9 @@ static int o2hb_thread(void *data)
1041 1082
1042 set_user_nice(current, -20); 1083 set_user_nice(current, -20);
1043 1084
1085 /* Pin node */
1086 o2nm_depend_this_node();
1087
1044 while (!kthread_should_stop() && !reg->hr_unclean_stop) { 1088 while (!kthread_should_stop() && !reg->hr_unclean_stop) {
1045 /* We track the time spent inside 1089 /* We track the time spent inside
1046 * o2hb_do_disk_heartbeat so that we avoid more than 1090 * o2hb_do_disk_heartbeat so that we avoid more than
@@ -1090,6 +1134,9 @@ static int o2hb_thread(void *data)
1090 mlog_errno(ret); 1134 mlog_errno(ret);
1091 } 1135 }
1092 1136
1137 /* Unpin node */
1138 o2nm_undepend_this_node();
1139
1093 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); 1140 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
1094 1141
1095 return 0; 1142 return 0;
@@ -1142,6 +1189,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
1142 reg->hr_last_timeout_start)); 1189 reg->hr_last_timeout_start));
1143 goto done; 1190 goto done;
1144 1191
1192 case O2HB_DB_TYPE_REGION_PINNED:
1193 reg = (struct o2hb_region *)db->db_data;
1194 out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
1195 !!reg->hr_item_pinned);
1196 goto done;
1197
1145 default: 1198 default:
1146 goto done; 1199 goto done;
1147 } 1200 }
@@ -1315,6 +1368,8 @@ int o2hb_init(void)
1315 memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); 1368 memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
1316 memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); 1369 memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
1317 1370
1371 o2hb_dependent_users = 0;
1372
1318 return o2hb_debug_init(); 1373 return o2hb_debug_init();
1319} 1374}
1320 1375
@@ -1384,6 +1439,7 @@ static void o2hb_region_release(struct config_item *item)
1384 debugfs_remove(reg->hr_debug_livenodes); 1439 debugfs_remove(reg->hr_debug_livenodes);
1385 debugfs_remove(reg->hr_debug_regnum); 1440 debugfs_remove(reg->hr_debug_regnum);
1386 debugfs_remove(reg->hr_debug_elapsed_time); 1441 debugfs_remove(reg->hr_debug_elapsed_time);
1442 debugfs_remove(reg->hr_debug_pinned);
1387 debugfs_remove(reg->hr_debug_dir); 1443 debugfs_remove(reg->hr_debug_dir);
1388 1444
1389 spin_lock(&o2hb_live_lock); 1445 spin_lock(&o2hb_live_lock);
@@ -1673,7 +1729,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1673 goto out; 1729 goto out;
1674 1730
1675 reg->hr_bdev = I_BDEV(filp->f_mapping->host); 1731 reg->hr_bdev = I_BDEV(filp->f_mapping->host);
1676 ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ); 1732 ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
1677 if (ret) { 1733 if (ret) {
1678 reg->hr_bdev = NULL; 1734 reg->hr_bdev = NULL;
1679 goto out; 1735 goto out;
@@ -1948,6 +2004,18 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
1948 goto bail; 2004 goto bail;
1949 } 2005 }
1950 2006
2007 reg->hr_debug_pinned =
2008 o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
2009 reg->hr_debug_dir,
2010 &(reg->hr_db_pinned),
2011 sizeof(*(reg->hr_db_pinned)),
2012 O2HB_DB_TYPE_REGION_PINNED,
2013 0, 0, reg);
2014 if (!reg->hr_debug_pinned) {
2015 mlog_errno(ret);
2016 goto bail;
2017 }
2018
1951 ret = 0; 2019 ret = 0;
1952bail: 2020bail:
1953 return ret; 2021 return ret;
@@ -2002,15 +2070,20 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2002{ 2070{
2003 struct task_struct *hb_task; 2071 struct task_struct *hb_task;
2004 struct o2hb_region *reg = to_o2hb_region(item); 2072 struct o2hb_region *reg = to_o2hb_region(item);
2073 int quorum_region = 0;
2005 2074
2006 /* stop the thread when the user removes the region dir */ 2075 /* stop the thread when the user removes the region dir */
2007 spin_lock(&o2hb_live_lock); 2076 spin_lock(&o2hb_live_lock);
2008 if (o2hb_global_heartbeat_active()) { 2077 if (o2hb_global_heartbeat_active()) {
2009 clear_bit(reg->hr_region_num, o2hb_region_bitmap); 2078 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2010 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); 2079 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2080 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2081 quorum_region = 1;
2082 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2011 } 2083 }
2012 hb_task = reg->hr_task; 2084 hb_task = reg->hr_task;
2013 reg->hr_task = NULL; 2085 reg->hr_task = NULL;
2086 reg->hr_item_dropped = 1;
2014 spin_unlock(&o2hb_live_lock); 2087 spin_unlock(&o2hb_live_lock);
2015 2088
2016 if (hb_task) 2089 if (hb_task)
@@ -2028,7 +2101,27 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2028 if (o2hb_global_heartbeat_active()) 2101 if (o2hb_global_heartbeat_active())
2029 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", 2102 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
2030 config_item_name(&reg->hr_item)); 2103 config_item_name(&reg->hr_item));
2104
2031 config_item_put(item); 2105 config_item_put(item);
2106
2107 if (!o2hb_global_heartbeat_active() || !quorum_region)
2108 return;
2109
2110 /*
2111 * If global heartbeat active and there are dependent users,
2112 * pin all regions if quorum region count <= CUT_OFF
2113 */
2114 spin_lock(&o2hb_live_lock);
2115
2116 if (!o2hb_dependent_users)
2117 goto unlock;
2118
2119 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
2120 O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
2121 o2hb_region_pin(NULL);
2122
2123unlock:
2124 spin_unlock(&o2hb_live_lock);
2032} 2125}
2033 2126
2034struct o2hb_heartbeat_group_attribute { 2127struct o2hb_heartbeat_group_attribute {
@@ -2214,63 +2307,138 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
2214} 2307}
2215EXPORT_SYMBOL_GPL(o2hb_setup_callback); 2308EXPORT_SYMBOL_GPL(o2hb_setup_callback);
2216 2309
2217static struct o2hb_region *o2hb_find_region(const char *region_uuid) 2310/*
2311 * In local heartbeat mode, region_uuid passed matches the dlm domain name.
2312 * In global heartbeat mode, region_uuid passed is NULL.
2313 *
2314 * In local, we only pin the matching region. In global we pin all the active
2315 * regions.
2316 */
2317static int o2hb_region_pin(const char *region_uuid)
2218{ 2318{
2219 struct o2hb_region *p, *reg = NULL; 2319 int ret = 0, found = 0;
2320 struct o2hb_region *reg;
2321 char *uuid;
2220 2322
2221 assert_spin_locked(&o2hb_live_lock); 2323 assert_spin_locked(&o2hb_live_lock);
2222 2324
2223 list_for_each_entry(p, &o2hb_all_regions, hr_all_item) { 2325 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2224 if (!strcmp(region_uuid, config_item_name(&p->hr_item))) { 2326 uuid = config_item_name(&reg->hr_item);
2225 reg = p; 2327
2226 break; 2328 /* local heartbeat */
2329 if (region_uuid) {
2330 if (strcmp(region_uuid, uuid))
2331 continue;
2332 found = 1;
2333 }
2334
2335 if (reg->hr_item_pinned || reg->hr_item_dropped)
2336 goto skip_pin;
2337
2338 /* Ignore ENOENT only for local hb (userdlm domain) */
2339 ret = o2nm_depend_item(&reg->hr_item);
2340 if (!ret) {
2341 mlog(ML_CLUSTER, "Pin region %s\n", uuid);
2342 reg->hr_item_pinned = 1;
2343 } else {
2344 if (ret == -ENOENT && found)
2345 ret = 0;
2346 else {
2347 mlog(ML_ERROR, "Pin region %s fails with %d\n",
2348 uuid, ret);
2349 break;
2350 }
2227 } 2351 }
2352skip_pin:
2353 if (found)
2354 break;
2228 } 2355 }
2229 2356
2230 return reg; 2357 return ret;
2231} 2358}
2232 2359
2233static int o2hb_region_get(const char *region_uuid) 2360/*
2361 * In local heartbeat mode, region_uuid passed matches the dlm domain name.
2362 * In global heartbeat mode, region_uuid passed is NULL.
2363 *
2364 * In local, we only unpin the matching region. In global we unpin all the
2365 * active regions.
2366 */
2367static void o2hb_region_unpin(const char *region_uuid)
2234{ 2368{
2235 int ret = 0;
2236 struct o2hb_region *reg; 2369 struct o2hb_region *reg;
2370 char *uuid;
2371 int found = 0;
2237 2372
2238 spin_lock(&o2hb_live_lock); 2373 assert_spin_locked(&o2hb_live_lock);
2239 2374
2240 reg = o2hb_find_region(region_uuid); 2375 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2241 if (!reg) 2376 uuid = config_item_name(&reg->hr_item);
2242 ret = -ENOENT; 2377 if (region_uuid) {
2243 spin_unlock(&o2hb_live_lock); 2378 if (strcmp(region_uuid, uuid))
2379 continue;
2380 found = 1;
2381 }
2244 2382
2245 if (ret) 2383 if (reg->hr_item_pinned) {
2246 goto out; 2384 mlog(ML_CLUSTER, "Unpin region %s\n", uuid);
2385 o2nm_undepend_item(&reg->hr_item);
2386 reg->hr_item_pinned = 0;
2387 }
2388 if (found)
2389 break;
2390 }
2391}
2247 2392
2248 ret = o2nm_depend_this_node(); 2393static int o2hb_region_inc_user(const char *region_uuid)
2249 if (ret) 2394{
2250 goto out; 2395 int ret = 0;
2251 2396
2252 ret = o2nm_depend_item(&reg->hr_item); 2397 spin_lock(&o2hb_live_lock);
2253 if (ret)
2254 o2nm_undepend_this_node();
2255 2398
2256out: 2399 /* local heartbeat */
2400 if (!o2hb_global_heartbeat_active()) {
2401 ret = o2hb_region_pin(region_uuid);
2402 goto unlock;
2403 }
2404
2405 /*
2406 * if global heartbeat active and this is the first dependent user,
2407 * pin all regions if quorum region count <= CUT_OFF
2408 */
2409 o2hb_dependent_users++;
2410 if (o2hb_dependent_users > 1)
2411 goto unlock;
2412
2413 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
2414 O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
2415 ret = o2hb_region_pin(NULL);
2416
2417unlock:
2418 spin_unlock(&o2hb_live_lock);
2257 return ret; 2419 return ret;
2258} 2420}
2259 2421
2260static void o2hb_region_put(const char *region_uuid) 2422void o2hb_region_dec_user(const char *region_uuid)
2261{ 2423{
2262 struct o2hb_region *reg;
2263
2264 spin_lock(&o2hb_live_lock); 2424 spin_lock(&o2hb_live_lock);
2265 2425
2266 reg = o2hb_find_region(region_uuid); 2426 /* local heartbeat */
2427 if (!o2hb_global_heartbeat_active()) {
2428 o2hb_region_unpin(region_uuid);
2429 goto unlock;
2430 }
2267 2431
2268 spin_unlock(&o2hb_live_lock); 2432 /*
2433 * if global heartbeat active and there are no dependent users,
2434 * unpin all quorum regions
2435 */
2436 o2hb_dependent_users--;
2437 if (!o2hb_dependent_users)
2438 o2hb_region_unpin(NULL);
2269 2439
2270 if (reg) { 2440unlock:
2271 o2nm_undepend_item(&reg->hr_item); 2441 spin_unlock(&o2hb_live_lock);
2272 o2nm_undepend_this_node();
2273 }
2274} 2442}
2275 2443
2276int o2hb_register_callback(const char *region_uuid, 2444int o2hb_register_callback(const char *region_uuid,
@@ -2291,9 +2459,11 @@ int o2hb_register_callback(const char *region_uuid,
2291 } 2459 }
2292 2460
2293 if (region_uuid) { 2461 if (region_uuid) {
2294 ret = o2hb_region_get(region_uuid); 2462 ret = o2hb_region_inc_user(region_uuid);
2295 if (ret) 2463 if (ret) {
2464 mlog_errno(ret);
2296 goto out; 2465 goto out;
2466 }
2297 } 2467 }
2298 2468
2299 down_write(&o2hb_callback_sem); 2469 down_write(&o2hb_callback_sem);
@@ -2311,7 +2481,7 @@ int o2hb_register_callback(const char *region_uuid,
2311 up_write(&o2hb_callback_sem); 2481 up_write(&o2hb_callback_sem);
2312 ret = 0; 2482 ret = 0;
2313out: 2483out:
2314 mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n", 2484 mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n",
2315 ret, __builtin_return_address(0), hc); 2485 ret, __builtin_return_address(0), hc);
2316 return ret; 2486 return ret;
2317} 2487}
@@ -2322,7 +2492,7 @@ void o2hb_unregister_callback(const char *region_uuid,
2322{ 2492{
2323 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); 2493 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
2324 2494
2325 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", 2495 mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n",
2326 __builtin_return_address(0), hc); 2496 __builtin_return_address(0), hc);
2327 2497
2328 /* XXX Can this happen _with_ a region reference? */ 2498 /* XXX Can this happen _with_ a region reference? */
@@ -2330,7 +2500,7 @@ void o2hb_unregister_callback(const char *region_uuid,
2330 return; 2500 return;
2331 2501
2332 if (region_uuid) 2502 if (region_uuid)
2333 o2hb_region_put(region_uuid); 2503 o2hb_region_dec_user(region_uuid);
2334 2504
2335 down_write(&o2hb_callback_sem); 2505 down_write(&o2hb_callback_sem);
2336 2506
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index a3f150e52b02..3a5835904b3d 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -46,10 +46,15 @@
46#define O2NET_DEBUG_DIR "o2net" 46#define O2NET_DEBUG_DIR "o2net"
47#define SC_DEBUG_NAME "sock_containers" 47#define SC_DEBUG_NAME "sock_containers"
48#define NST_DEBUG_NAME "send_tracking" 48#define NST_DEBUG_NAME "send_tracking"
49#define STATS_DEBUG_NAME "stats"
50
51#define SHOW_SOCK_CONTAINERS 0
52#define SHOW_SOCK_STATS 1
49 53
50static struct dentry *o2net_dentry; 54static struct dentry *o2net_dentry;
51static struct dentry *sc_dentry; 55static struct dentry *sc_dentry;
52static struct dentry *nst_dentry; 56static struct dentry *nst_dentry;
57static struct dentry *stats_dentry;
53 58
54static DEFINE_SPINLOCK(o2net_debug_lock); 59static DEFINE_SPINLOCK(o2net_debug_lock);
55 60
@@ -123,37 +128,42 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
123static int nst_seq_show(struct seq_file *seq, void *v) 128static int nst_seq_show(struct seq_file *seq, void *v)
124{ 129{
125 struct o2net_send_tracking *nst, *dummy_nst = seq->private; 130 struct o2net_send_tracking *nst, *dummy_nst = seq->private;
131 ktime_t now;
132 s64 sock, send, status;
126 133
127 spin_lock(&o2net_debug_lock); 134 spin_lock(&o2net_debug_lock);
128 nst = next_nst(dummy_nst); 135 nst = next_nst(dummy_nst);
136 if (!nst)
137 goto out;
129 138
130 if (nst != NULL) { 139 now = ktime_get();
131 /* get_task_comm isn't exported. oh well. */ 140 sock = ktime_to_us(ktime_sub(now, nst->st_sock_time));
132 seq_printf(seq, "%p:\n" 141 send = ktime_to_us(ktime_sub(now, nst->st_send_time));
133 " pid: %lu\n" 142 status = ktime_to_us(ktime_sub(now, nst->st_status_time));
134 " tgid: %lu\n" 143
135 " process name: %s\n" 144 /* get_task_comm isn't exported. oh well. */
136 " node: %u\n" 145 seq_printf(seq, "%p:\n"
137 " sc: %p\n" 146 " pid: %lu\n"
138 " message id: %d\n" 147 " tgid: %lu\n"
139 " message type: %u\n" 148 " process name: %s\n"
140 " message key: 0x%08x\n" 149 " node: %u\n"
141 " sock acquiry: %lu.%ld\n" 150 " sc: %p\n"
142 " send start: %lu.%ld\n" 151 " message id: %d\n"
143 " wait start: %lu.%ld\n", 152 " message type: %u\n"
144 nst, (unsigned long)nst->st_task->pid, 153 " message key: 0x%08x\n"
145 (unsigned long)nst->st_task->tgid, 154 " sock acquiry: %lld usecs ago\n"
146 nst->st_task->comm, nst->st_node, 155 " send start: %lld usecs ago\n"
147 nst->st_sc, nst->st_id, nst->st_msg_type, 156 " wait start: %lld usecs ago\n",
148 nst->st_msg_key, 157 nst, (unsigned long)task_pid_nr(nst->st_task),
149 nst->st_sock_time.tv_sec, 158 (unsigned long)nst->st_task->tgid,
150 (long)nst->st_sock_time.tv_usec, 159 nst->st_task->comm, nst->st_node,
151 nst->st_send_time.tv_sec, 160 nst->st_sc, nst->st_id, nst->st_msg_type,
152 (long)nst->st_send_time.tv_usec, 161 nst->st_msg_key,
153 nst->st_status_time.tv_sec, 162 (long long)sock,
154 (long)nst->st_status_time.tv_usec); 163 (long long)send,
155 } 164 (long long)status);
156 165
166out:
157 spin_unlock(&o2net_debug_lock); 167 spin_unlock(&o2net_debug_lock);
158 168
159 return 0; 169 return 0;
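
The reworked nst_seq_show() computes each "usecs ago" value by subtracting the stored ktime_t stamp from a fresh ktime_get() and converting to microseconds. A minimal userspace sketch of that pattern, using clock_gettime(CLOCK_MONOTONIC) as a stand-in for ktime_get(); mono_ns() and st_sock_time here are illustrative names, not kernel APIs:

#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <unistd.h>

/* Stand-in for ktime_get(): monotonic time as signed nanoseconds. */
static int64_t mono_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

int main(void)
{
	int64_t st_sock_time = mono_ns();	/* stamped at "sock acquiry" */
	usleep(1500);				/* pretend work happens */
	int64_t now = mono_ns();

	/* ktime_to_us(ktime_sub(now, then)) is just (now - then) / 1000 */
	printf(" sock acquiry: %lld usecs ago\n",
	       (long long)((now - st_sock_time) / 1000));
	return 0;
}

A monotonic source also removes the wall-clock jumps that could make the old timeval-based deltas go negative.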
@@ -228,6 +238,11 @@ void o2net_debug_del_sc(struct o2net_sock_container *sc)
228 spin_unlock(&o2net_debug_lock); 238 spin_unlock(&o2net_debug_lock);
229} 239}
230 240
241struct o2net_sock_debug {
242 int dbg_ctxt;
243 struct o2net_sock_container *dbg_sock;
244};
245
231static struct o2net_sock_container 246static struct o2net_sock_container
232 *next_sc(struct o2net_sock_container *sc_start) 247 *next_sc(struct o2net_sock_container *sc_start)
233{ 248{
@@ -253,7 +268,8 @@ static struct o2net_sock_container
253 268
254static void *sc_seq_start(struct seq_file *seq, loff_t *pos) 269static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
255{ 270{
256 struct o2net_sock_container *sc, *dummy_sc = seq->private; 271 struct o2net_sock_debug *sd = seq->private;
272 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
257 273
258 spin_lock(&o2net_debug_lock); 274 spin_lock(&o2net_debug_lock);
259 sc = next_sc(dummy_sc); 275 sc = next_sc(dummy_sc);
@@ -264,7 +280,8 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
264 280
265static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) 281static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266{ 282{
267 struct o2net_sock_container *sc, *dummy_sc = seq->private; 283 struct o2net_sock_debug *sd = seq->private;
284 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
268 285
269 spin_lock(&o2net_debug_lock); 286 spin_lock(&o2net_debug_lock);
270 sc = next_sc(dummy_sc); 287 sc = next_sc(dummy_sc);
@@ -276,65 +293,107 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
276 return sc; /* unused, just needs to be null when done */ 293 return sc; /* unused, just needs to be null when done */
277} 294}
278 295
279#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec 296#ifdef CONFIG_OCFS2_FS_STATS
297# define sc_send_count(_s) ((_s)->sc_send_count)
298# define sc_recv_count(_s) ((_s)->sc_recv_count)
299# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total))
300# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total))
301# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total))
302# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total))
303#else
304# define sc_send_count(_s) (0U)
305# define sc_recv_count(_s) (0U)
306# define sc_tv_acquiry_total_ns(_s) (0LL)
307# define sc_tv_send_total_ns(_s) (0LL)
308# define sc_tv_status_total_ns(_s) (0LL)
309# define sc_tv_process_total_ns(_s) (0LL)
310#endif
311
312/* So that debugfs.ocfs2 can determine which format is being used */
313#define O2NET_STATS_STR_VERSION 1
314static void sc_show_sock_stats(struct seq_file *seq,
315 struct o2net_sock_container *sc)
316{
317 if (!sc)
318 return;
319
320 seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION,
321 sc->sc_node->nd_num, (unsigned long)sc_send_count(sc),
322 (long long)sc_tv_acquiry_total_ns(sc),
323 (long long)sc_tv_send_total_ns(sc),
324 (long long)sc_tv_status_total_ns(sc),
325 (unsigned long)sc_recv_count(sc),
326 (long long)sc_tv_process_total_ns(sc));
327}
328
329static void sc_show_sock_container(struct seq_file *seq,
330 struct o2net_sock_container *sc)
331{
332 struct inet_sock *inet = NULL;
333 __be32 saddr = 0, daddr = 0;
334 __be16 sport = 0, dport = 0;
335
336 if (!sc)
337 return;
338
339 if (sc->sc_sock) {
340 inet = inet_sk(sc->sc_sock->sk);
341 /* the stack's structs aren't sparse endian clean */
342 saddr = (__force __be32)inet->inet_saddr;
343 daddr = (__force __be32)inet->inet_daddr;
344 sport = (__force __be16)inet->inet_sport;
345 dport = (__force __be16)inet->inet_dport;
346 }
347
348 /* XXX sigh, inet-> doesn't have sparse annotation so any
349 * use of it here generates a warning with -Wbitwise */
350 seq_printf(seq, "%p:\n"
351 " krefs: %d\n"
352 " sock: %pI4:%u -> "
353 "%pI4:%u\n"
354 " remote node: %s\n"
355 " page off: %zu\n"
356 " handshake ok: %u\n"
357 " timer: %lld usecs\n"
358 " data ready: %lld usecs\n"
359 " advance start: %lld usecs\n"
360 " advance stop: %lld usecs\n"
361 " func start: %lld usecs\n"
362 " func stop: %lld usecs\n"
363 " func key: 0x%08x\n"
364 " func type: %u\n",
365 sc,
366 atomic_read(&sc->sc_kref.refcount),
367 &saddr, inet ? ntohs(sport) : 0,
368 &daddr, inet ? ntohs(dport) : 0,
369 sc->sc_node->nd_name,
370 sc->sc_page_off,
371 sc->sc_handshake_ok,
372 (long long)ktime_to_us(sc->sc_tv_timer),
373 (long long)ktime_to_us(sc->sc_tv_data_ready),
374 (long long)ktime_to_us(sc->sc_tv_advance_start),
375 (long long)ktime_to_us(sc->sc_tv_advance_stop),
376 (long long)ktime_to_us(sc->sc_tv_func_start),
377 (long long)ktime_to_us(sc->sc_tv_func_stop),
378 sc->sc_msg_key,
379 sc->sc_msg_type);
380}
280 381
281static int sc_seq_show(struct seq_file *seq, void *v) 382static int sc_seq_show(struct seq_file *seq, void *v)
282{ 383{
283 struct o2net_sock_container *sc, *dummy_sc = seq->private; 384 struct o2net_sock_debug *sd = seq->private;
385 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
284 386
285 spin_lock(&o2net_debug_lock); 387 spin_lock(&o2net_debug_lock);
286 sc = next_sc(dummy_sc); 388 sc = next_sc(dummy_sc);
287 389
288 if (sc != NULL) { 390 if (sc) {
289 struct inet_sock *inet = NULL; 391 if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
290 392 sc_show_sock_container(seq, sc);
291 __be32 saddr = 0, daddr = 0; 393 else
292 __be16 sport = 0, dport = 0; 394 sc_show_sock_stats(seq, sc);
293
294 if (sc->sc_sock) {
295 inet = inet_sk(sc->sc_sock->sk);
296 /* the stack's structs aren't sparse endian clean */
297 saddr = (__force __be32)inet->inet_saddr;
298 daddr = (__force __be32)inet->inet_daddr;
299 sport = (__force __be16)inet->inet_sport;
300 dport = (__force __be16)inet->inet_dport;
301 }
302
303 /* XXX sigh, inet-> doesn't have sparse annotation so any
304 * use of it here generates a warning with -Wbitwise */
305 seq_printf(seq, "%p:\n"
306 " krefs: %d\n"
307 " sock: %pI4:%u -> "
308 "%pI4:%u\n"
309 " remote node: %s\n"
310 " page off: %zu\n"
311 " handshake ok: %u\n"
312 " timer: %lu.%ld\n"
313 " data ready: %lu.%ld\n"
314 " advance start: %lu.%ld\n"
315 " advance stop: %lu.%ld\n"
316 " func start: %lu.%ld\n"
317 " func stop: %lu.%ld\n"
318 " func key: %u\n"
319 " func type: %u\n",
320 sc,
321 atomic_read(&sc->sc_kref.refcount),
322 &saddr, inet ? ntohs(sport) : 0,
323 &daddr, inet ? ntohs(dport) : 0,
324 sc->sc_node->nd_name,
325 sc->sc_page_off,
326 sc->sc_handshake_ok,
327 TV_SEC_USEC(sc->sc_tv_timer),
328 TV_SEC_USEC(sc->sc_tv_data_ready),
329 TV_SEC_USEC(sc->sc_tv_advance_start),
330 TV_SEC_USEC(sc->sc_tv_advance_stop),
331 TV_SEC_USEC(sc->sc_tv_func_start),
332 TV_SEC_USEC(sc->sc_tv_func_stop),
333 sc->sc_msg_key,
334 sc->sc_msg_type);
335 } 395 }
336 396
337
338 spin_unlock(&o2net_debug_lock); 397 spin_unlock(&o2net_debug_lock);
339 398
340 return 0; 399 return 0;
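
The sc_send_count()/sc_tv_*_ns() accessor macros above let sc_show_sock_stats() keep one fixed, versioned output format whether or not CONFIG_OCFS2_FS_STATS is set: with stats compiled out, every field reads as zero instead of sprouting #ifdefs at each call site. A compressed userspace sketch of the idiom (HAVE_STATS and the conn names are invented for illustration):

#include <stdio.h>

/* #define HAVE_STATS */	/* toggle, standing in for CONFIG_OCFS2_FS_STATS */

struct conn {
	int node;
#ifdef HAVE_STATS
	unsigned long send_count;
	long long send_total_ns;
#endif
};

#ifdef HAVE_STATS
# define conn_send_count(c)	((c)->send_count)
# define conn_send_total_ns(c)	((c)->send_total_ns)
#else
# define conn_send_count(c)	(0UL)
# define conn_send_total_ns(c)	(0LL)
#endif

int main(void)
{
	struct conn c = { .node = 7 };
	/* One format string serves both configurations; with stats
	 * compiled out every counter simply reads as zero. */
	printf("1,%d,%lu,%lld\n", c.node,
	       conn_send_count(&c), conn_send_total_ns(&c));
	return 0;
}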
@@ -351,7 +410,7 @@ static const struct seq_operations sc_seq_ops = {
351 .show = sc_seq_show, 410 .show = sc_seq_show,
352}; 411};
353 412
354static int sc_fop_open(struct inode *inode, struct file *file) 413static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
355{ 414{
356 struct o2net_sock_container *dummy_sc; 415 struct o2net_sock_container *dummy_sc;
357 struct seq_file *seq; 416 struct seq_file *seq;
@@ -369,7 +428,8 @@ static int sc_fop_open(struct inode *inode, struct file *file)
369 goto out; 428 goto out;
370 429
371 seq = file->private_data; 430 seq = file->private_data;
372 seq->private = dummy_sc; 431 seq->private = sd;
432 sd->dbg_sock = dummy_sc;
373 o2net_debug_add_sc(dummy_sc); 433 o2net_debug_add_sc(dummy_sc);
374 434
375 dummy_sc = NULL; 435 dummy_sc = NULL;
@@ -382,12 +442,48 @@ out:
382static int sc_fop_release(struct inode *inode, struct file *file) 442static int sc_fop_release(struct inode *inode, struct file *file)
383{ 443{
384 struct seq_file *seq = file->private_data; 444 struct seq_file *seq = file->private_data;
385 struct o2net_sock_container *dummy_sc = seq->private; 445 struct o2net_sock_debug *sd = seq->private;
446 struct o2net_sock_container *dummy_sc = sd->dbg_sock;
386 447
387 o2net_debug_del_sc(dummy_sc); 448 o2net_debug_del_sc(dummy_sc);
388 return seq_release_private(inode, file); 449 return seq_release_private(inode, file);
389} 450}
390 451
452static int stats_fop_open(struct inode *inode, struct file *file)
453{
454 struct o2net_sock_debug *sd;
455
456 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
457 if (sd == NULL)
458 return -ENOMEM;
459
460 sd->dbg_ctxt = SHOW_SOCK_STATS;
461 sd->dbg_sock = NULL;
462
463 return sc_common_open(file, sd);
464}
465
466static const struct file_operations stats_seq_fops = {
467 .open = stats_fop_open,
468 .read = seq_read,
469 .llseek = seq_lseek,
470 .release = sc_fop_release,
471};
472
473static int sc_fop_open(struct inode *inode, struct file *file)
474{
475 struct o2net_sock_debug *sd;
476
477 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
478 if (sd == NULL)
479 return -ENOMEM;
480
481 sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
482 sd->dbg_sock = NULL;
483
484 return sc_common_open(file, sd);
485}
486
391static const struct file_operations sc_seq_fops = { 487static const struct file_operations sc_seq_fops = {
392 .open = sc_fop_open, 488 .open = sc_fop_open,
393 .read = seq_read, 489 .read = seq_read,
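
With sc_common_open(), the "sock_containers" and "stats" files share one open path and one show routine; the per-open o2net_sock_debug carries only the display mode, and sc_seq_show() branches on dbg_ctxt. The shape of that refactor, reduced to plain runnable C (all names here are illustrative, not the kernel seq_file API):

#include <stdio.h>
#include <stdlib.h>

enum { SHOW_CONTAINERS, SHOW_STATS };

struct sock_debug {
	int ctxt;		/* which view this open() wants */
	/* the dummy list anchor would live here too */
};

static void show(const struct sock_debug *sd)
{
	if (sd->ctxt == SHOW_CONTAINERS)
		printf("full container dump\n");
	else
		printf("1,<node>,<counts>,<totals>\n");	/* versioned CSV */
}

static struct sock_debug *common_open(int ctxt)
{
	struct sock_debug *sd = malloc(sizeof(*sd));
	if (sd)
		sd->ctxt = ctxt;
	return sd;
}

int main(void)
{
	struct sock_debug *a = common_open(SHOW_CONTAINERS);
	struct sock_debug *b = common_open(SHOW_STATS);
	if (!a || !b)
		return 1;
	show(a);
	show(b);
	free(a);
	free(b);
	return 0;
}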
@@ -419,25 +515,29 @@ int o2net_debugfs_init(void)
419 goto bail; 515 goto bail;
420 } 516 }
421 517
518 stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR,
519 o2net_dentry, NULL,
520 &stats_seq_fops);
521 if (!stats_dentry) {
522 mlog_errno(-ENOMEM);
523 goto bail;
524 }
525
422 return 0; 526 return 0;
423bail: 527bail:
424 if (sc_dentry) 528 debugfs_remove(stats_dentry);
425 debugfs_remove(sc_dentry); 529 debugfs_remove(sc_dentry);
426 if (nst_dentry) 530 debugfs_remove(nst_dentry);
427 debugfs_remove(nst_dentry); 531 debugfs_remove(o2net_dentry);
428 if (o2net_dentry)
429 debugfs_remove(o2net_dentry);
430 return -ENOMEM; 532 return -ENOMEM;
431} 533}
432 534
433void o2net_debugfs_exit(void) 535void o2net_debugfs_exit(void)
434{ 536{
435 if (sc_dentry) 537 debugfs_remove(stats_dentry);
436 debugfs_remove(sc_dentry); 538 debugfs_remove(sc_dentry);
437 if (nst_dentry) 539 debugfs_remove(nst_dentry);
438 debugfs_remove(nst_dentry); 540 debugfs_remove(o2net_dentry);
439 if (o2net_dentry)
440 debugfs_remove(o2net_dentry);
441} 541}
442 542
443#endif /* CONFIG_DEBUG_FS */ 543#endif /* CONFIG_DEBUG_FS */
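
The simplified bail/exit paths above rely on debugfs_remove() being a no-op when handed a NULL dentry, so every handle can be removed unconditionally, in reverse creation order, with no if-guards. The same property holds for free() in userspace, which makes for a runnable illustration (the struct and names are invented):

#include <stdlib.h>

struct files { char *nst, *sc, *stats; };

static int debug_init(struct files *f)
{
	f->nst = f->sc = f->stats = NULL;
	if (!(f->nst = malloc(16)))
		goto bail;
	if (!(f->sc = malloc(16)))
		goto bail;
	if (!(f->stats = malloc(16)))
		goto bail;
	return 0;
bail:
	/* free(NULL) is a no-op, the same property the patch leans on
	 * for debugfs_remove(): no guards, reverse order of creation. */
	free(f->stats);
	free(f->sc);
	free(f->nst);
	return -1;
}

static void debug_exit(struct files *f)
{
	free(f->stats);
	free(f->sc);
	free(f->nst);
}

int main(void)
{
	struct files f;
	if (debug_init(&f))
		return 1;
	debug_exit(&f);
	return 0;
}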
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 9aa426e42123..3b11cb1e38fc 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -153,63 +153,114 @@ static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
153 nst->st_node = node; 153 nst->st_node = node;
154} 154}
155 155
156static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 156static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
157{ 157{
158 do_gettimeofday(&nst->st_sock_time); 158 nst->st_sock_time = ktime_get();
159} 159}
160 160
161static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 161static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
162{ 162{
163 do_gettimeofday(&nst->st_send_time); 163 nst->st_send_time = ktime_get();
164} 164}
165 165
166static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 166static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
167{ 167{
168 do_gettimeofday(&nst->st_status_time); 168 nst->st_status_time = ktime_get();
169} 169}
170 170
171static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 171static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
172 struct o2net_sock_container *sc) 172 struct o2net_sock_container *sc)
173{ 173{
174 nst->st_sc = sc; 174 nst->st_sc = sc;
175} 175}
176 176
177static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) 177static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
178 u32 msg_id)
178{ 179{
179 nst->st_id = msg_id; 180 nst->st_id = msg_id;
180} 181}
181 182
182#else /* CONFIG_DEBUG_FS */ 183static inline void o2net_set_sock_timer(struct o2net_sock_container *sc)
183
184static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
185 u32 msgkey, struct task_struct *task, u8 node)
186{ 184{
185 sc->sc_tv_timer = ktime_get();
187} 186}
188 187
189static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 188static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc)
190{ 189{
190 sc->sc_tv_data_ready = ktime_get();
191} 191}
192 192
193static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 193static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc)
194{ 194{
195 sc->sc_tv_advance_start = ktime_get();
195} 196}
196 197
197static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 198static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc)
198{ 199{
200 sc->sc_tv_advance_stop = ktime_get();
199} 201}
200 202
201static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 203static inline void o2net_set_func_start_time(struct o2net_sock_container *sc)
202 struct o2net_sock_container *sc)
203{ 204{
205 sc->sc_tv_func_start = ktime_get();
204} 206}
205 207
206static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, 208static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc)
207 u32 msg_id)
208{ 209{
210 sc->sc_tv_func_stop = ktime_get();
209} 211}
210 212
213static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
214{
215 return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
216}
217#else /* CONFIG_DEBUG_FS */
218# define o2net_init_nst(a, b, c, d, e)
219# define o2net_set_nst_sock_time(a)
220# define o2net_set_nst_send_time(a)
221# define o2net_set_nst_status_time(a)
222# define o2net_set_nst_sock_container(a, b)
223# define o2net_set_nst_msg_id(a, b)
224# define o2net_set_sock_timer(a)
225# define o2net_set_data_ready_time(a)
226# define o2net_set_advance_start_time(a)
227# define o2net_set_advance_stop_time(a)
228# define o2net_set_func_start_time(a)
229# define o2net_set_func_stop_time(a)
230# define o2net_get_func_run_time(a) (ktime_t)0
211#endif /* CONFIG_DEBUG_FS */ 231#endif /* CONFIG_DEBUG_FS */
212 232
233#ifdef CONFIG_OCFS2_FS_STATS
234static void o2net_update_send_stats(struct o2net_send_tracking *nst,
235 struct o2net_sock_container *sc)
236{
237 sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total,
238 ktime_sub(ktime_get(),
239 nst->st_status_time));
240 sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total,
241 ktime_sub(nst->st_status_time,
242 nst->st_send_time));
243 sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total,
244 ktime_sub(nst->st_send_time,
245 nst->st_sock_time));
246 sc->sc_send_count++;
247}
248
249static void o2net_update_recv_stats(struct o2net_sock_container *sc)
250{
251 sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total,
252 o2net_get_func_run_time(sc));
253 sc->sc_recv_count++;
254}
255
256#else
257
258# define o2net_update_send_stats(a, b)
259
260# define o2net_update_recv_stats(sc)
261
262#endif /* CONFIG_OCFS2_FS_STATS */
263
213static inline int o2net_reconnect_delay(void) 264static inline int o2net_reconnect_delay(void)
214{ 265{
215 return o2nm_single_cluster->cl_reconnect_delay_ms; 266 return o2nm_single_cluster->cl_reconnect_delay_ms;
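
o2net_update_send_stats() splits each send into three intervals from the per-send stamps: sock_time to send_time (acquiry), send_time to status_time (send), and status_time to now (status wait), accumulating each into the per-socket totals. A userspace sketch of the same bookkeeping, with mono_ns() again standing in for ktime_get() and the usleep() calls faking the real work:

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static int64_t mono_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

/* Per-connection running totals, as in sc_tv_*_total. */
static int64_t acquiry_total, send_total, status_total;
static unsigned long send_count;

static void one_send(void)
{
	int64_t sock_time = mono_ns();	/* o2net_set_nst_sock_time() */
	usleep(100);			/* ...acquire the socket... */
	int64_t send_time = mono_ns();	/* o2net_set_nst_send_time() */
	usleep(200);			/* ...transmit... */
	int64_t status_time = mono_ns();/* o2net_set_nst_status_time() */
	usleep(300);			/* ...wait for remote status... */

	/* The same three deltas o2net_update_send_stats() accumulates. */
	status_total  += mono_ns() - status_time;
	send_total    += status_time - send_time;
	acquiry_total += send_time - sock_time;
	send_count++;
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		one_send();
	printf("%lu sends: acquiry %lld ns, send %lld ns, status %lld ns\n",
	       send_count, (long long)acquiry_total,
	       (long long)send_total, (long long)status_total);
	return 0;
}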
@@ -355,6 +406,7 @@ static void sc_kref_release(struct kref *kref)
355 sc->sc_sock = NULL; 406 sc->sc_sock = NULL;
356 } 407 }
357 408
409 o2nm_undepend_item(&sc->sc_node->nd_item);
358 o2nm_node_put(sc->sc_node); 410 o2nm_node_put(sc->sc_node);
359 sc->sc_node = NULL; 411 sc->sc_node = NULL;
360 412
@@ -376,6 +428,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
376{ 428{
377 struct o2net_sock_container *sc, *ret = NULL; 429 struct o2net_sock_container *sc, *ret = NULL;
378 struct page *page = NULL; 430 struct page *page = NULL;
431 int status = 0;
379 432
380 page = alloc_page(GFP_NOFS); 433 page = alloc_page(GFP_NOFS);
381 sc = kzalloc(sizeof(*sc), GFP_NOFS); 434 sc = kzalloc(sizeof(*sc), GFP_NOFS);
@@ -386,6 +439,13 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
386 o2nm_node_get(node); 439 o2nm_node_get(node);
387 sc->sc_node = node; 440 sc->sc_node = node;
388 441
442 /* pin the node item of the remote node */
443 status = o2nm_depend_item(&node->nd_item);
444 if (status) {
445 mlog_errno(status);
446 o2nm_node_put(node);
447 goto out;
448 }
389 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed); 449 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed);
390 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty); 450 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty);
391 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); 451 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
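
sc_alloc() now pins the remote node's configfs item for the socket container's lifetime, and the failure path drops the node reference taken just before; sc_kref_release() correspondingly undepends before o2nm_node_put(), tearing down in reverse order of acquisition. A generic runnable sketch of that acquire/unwind discipline (all functions here are invented stand-ins, not the o2nm API):

#include <stdio.h>

struct node { int refs; int pinned; };

static void node_get(struct node *n) { n->refs++; }
static void node_put(struct node *n) { n->refs--; }

/* Stand-in for o2nm_depend_item(): may fail. */
static int depend_item(struct node *n, int fail)
{
	if (fail)
		return -1;
	n->pinned++;
	return 0;
}

static void undepend_item(struct node *n) { n->pinned--; }

static int sc_alloc_like(struct node *n, int fail)
{
	node_get(n);
	if (depend_item(n, fail)) {
		node_put(n);	/* unwind the ref taken just above */
		return -1;
	}
	return 0;
}

static void sc_release_like(struct node *n)
{
	undepend_item(n);	/* reverse order of acquisition */
	node_put(n);
}

int main(void)
{
	struct node n = { 0, 0 };
	if (sc_alloc_like(&n, 0) == 0)
		sc_release_like(&n);
	printf("refs=%d pinned=%d\n", n.refs, n.pinned);	/* both 0 */
	return (sc_alloc_like(&n, 1) == -1 && n.refs == 0) ? 0 : 1;
}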
@@ -546,7 +606,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
546 if (sk->sk_user_data) { 606 if (sk->sk_user_data) {
547 struct o2net_sock_container *sc = sk->sk_user_data; 607 struct o2net_sock_container *sc = sk->sk_user_data;
548 sclog(sc, "data_ready hit\n"); 608 sclog(sc, "data_ready hit\n");
549 do_gettimeofday(&sc->sc_tv_data_ready); 609 o2net_set_data_ready_time(sc);
550 o2net_sc_queue_work(sc, &sc->sc_rx_work); 610 o2net_sc_queue_work(sc, &sc->sc_rx_work);
551 ready = sc->sc_data_ready; 611 ready = sc->sc_data_ready;
552 } else { 612 } else {
@@ -1070,6 +1130,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
1070 o2net_set_nst_status_time(&nst); 1130 o2net_set_nst_status_time(&nst);
1071 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); 1131 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
1072 1132
1133 o2net_update_send_stats(&nst, sc);
1134
1073 /* Note that we avoid overwriting the callers status return 1135 /* Note that we avoid overwriting the callers status return
1074 * variable if a system error was reported on the other 1136 * variable if a system error was reported on the other
1075 * side. Callers beware. */ 1137 * side. Callers beware. */
@@ -1183,13 +1245,15 @@ static int o2net_process_message(struct o2net_sock_container *sc,
1183 if (syserr != O2NET_ERR_NONE) 1245 if (syserr != O2NET_ERR_NONE)
1184 goto out_respond; 1246 goto out_respond;
1185 1247
1186 do_gettimeofday(&sc->sc_tv_func_start); 1248 o2net_set_func_start_time(sc);
1187 sc->sc_msg_key = be32_to_cpu(hdr->key); 1249 sc->sc_msg_key = be32_to_cpu(hdr->key);
1188 sc->sc_msg_type = be16_to_cpu(hdr->msg_type); 1250 sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
1189 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + 1251 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
1190 be16_to_cpu(hdr->data_len), 1252 be16_to_cpu(hdr->data_len),
1191 nmh->nh_func_data, &ret_data); 1253 nmh->nh_func_data, &ret_data);
1192 do_gettimeofday(&sc->sc_tv_func_stop); 1254 o2net_set_func_stop_time(sc);
1255
1256 o2net_update_recv_stats(sc);
1193 1257
1194out_respond: 1258out_respond:
1195 /* this destroys the hdr, so don't use it after this */ 1259 /* this destroys the hdr, so don't use it after this */
@@ -1300,7 +1364,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
1300 size_t datalen; 1364 size_t datalen;
1301 1365
1302 sclog(sc, "receiving\n"); 1366 sclog(sc, "receiving\n");
1303 do_gettimeofday(&sc->sc_tv_advance_start); 1367 o2net_set_advance_start_time(sc);
1304 1368
1305 if (unlikely(sc->sc_handshake_ok == 0)) { 1369 if (unlikely(sc->sc_handshake_ok == 0)) {
1306 if(sc->sc_page_off < sizeof(struct o2net_handshake)) { 1370 if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
@@ -1375,7 +1439,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
1375 1439
1376out: 1440out:
1377 sclog(sc, "ret = %d\n", ret); 1441 sclog(sc, "ret = %d\n", ret);
1378 do_gettimeofday(&sc->sc_tv_advance_stop); 1442 o2net_set_advance_stop_time(sc);
1379 return ret; 1443 return ret;
1380} 1444}
1381 1445
@@ -1475,27 +1539,28 @@ static void o2net_idle_timer(unsigned long data)
1475{ 1539{
1476 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1540 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1477 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1541 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1478 struct timeval now;
1479 1542
1480 do_gettimeofday(&now); 1543#ifdef CONFIG_DEBUG_FS
1544 ktime_t now = ktime_get();
1545#endif
1481 1546
1482 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1547 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
1483 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1548 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
1484 o2net_idle_timeout() / 1000, 1549 o2net_idle_timeout() / 1000,
1485 o2net_idle_timeout() % 1000); 1550 o2net_idle_timeout() % 1000);
1486 mlog(ML_NOTICE, "here are some times that might help debug the " 1551
1487 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " 1552#ifdef CONFIG_DEBUG_FS
1488 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", 1553 mlog(ML_NOTICE, "Here are some times that might help debug the "
1489 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, 1554 "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
1490 now.tv_sec, (long) now.tv_usec, 1555 "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
1491 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, 1556 (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
1492 sc->sc_tv_advance_start.tv_sec, 1557 (long long)ktime_to_us(sc->sc_tv_data_ready),
1493 (long) sc->sc_tv_advance_start.tv_usec, 1558 (long long)ktime_to_us(sc->sc_tv_advance_start),
1494 sc->sc_tv_advance_stop.tv_sec, 1559 (long long)ktime_to_us(sc->sc_tv_advance_stop),
1495 (long) sc->sc_tv_advance_stop.tv_usec,
1496 sc->sc_msg_key, sc->sc_msg_type, 1560 sc->sc_msg_key, sc->sc_msg_type,
1497 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, 1561 (long long)ktime_to_us(sc->sc_tv_func_start),
1498 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); 1562 (long long)ktime_to_us(sc->sc_tv_func_stop));
1563#endif
1499 1564
1500 /* 1565 /*
1501 * Initialize the nn_timeout so that the next connection attempt 1566 * Initialize the nn_timeout so that the next connection attempt
@@ -1511,7 +1576,7 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
1511 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); 1576 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
1512 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, 1577 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
1513 msecs_to_jiffies(o2net_keepalive_delay())); 1578 msecs_to_jiffies(o2net_keepalive_delay()));
1514 do_gettimeofday(&sc->sc_tv_timer); 1579 o2net_set_sock_timer(sc);
1515 mod_timer(&sc->sc_idle_timeout, 1580 mod_timer(&sc->sc_idle_timeout,
1516 jiffies + msecs_to_jiffies(o2net_idle_timeout())); 1581 jiffies + msecs_to_jiffies(o2net_idle_timeout()));
1517} 1582}
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 15fdbdf9eb4b..4cbcb65784a3 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -166,18 +166,27 @@ struct o2net_sock_container {
166 /* original handlers for the sockets */ 166 /* original handlers for the sockets */
167 void (*sc_state_change)(struct sock *sk); 167 void (*sc_state_change)(struct sock *sk);
168 void (*sc_data_ready)(struct sock *sk, int bytes); 168 void (*sc_data_ready)(struct sock *sk, int bytes);
169#ifdef CONFIG_DEBUG_FS 169
170 struct list_head sc_net_debug_item;
171#endif
172 struct timeval sc_tv_timer;
173 struct timeval sc_tv_data_ready;
174 struct timeval sc_tv_advance_start;
175 struct timeval sc_tv_advance_stop;
176 struct timeval sc_tv_func_start;
177 struct timeval sc_tv_func_stop;
178 u32 sc_msg_key; 170 u32 sc_msg_key;
179 u16 sc_msg_type; 171 u16 sc_msg_type;
180 172
173#ifdef CONFIG_DEBUG_FS
174 struct list_head sc_net_debug_item;
175 ktime_t sc_tv_timer;
176 ktime_t sc_tv_data_ready;
177 ktime_t sc_tv_advance_start;
178 ktime_t sc_tv_advance_stop;
179 ktime_t sc_tv_func_start;
180 ktime_t sc_tv_func_stop;
181#endif
182#ifdef CONFIG_OCFS2_FS_STATS
183 ktime_t sc_tv_acquiry_total;
184 ktime_t sc_tv_send_total;
185 ktime_t sc_tv_status_total;
186 u32 sc_send_count;
187 u32 sc_recv_count;
188 ktime_t sc_tv_process_total;
189#endif
181 struct mutex sc_send_lock; 190 struct mutex sc_send_lock;
182}; 191};
183 192
@@ -220,9 +229,9 @@ struct o2net_send_tracking {
220 u32 st_msg_type; 229 u32 st_msg_type;
221 u32 st_msg_key; 230 u32 st_msg_key;
222 u8 st_node; 231 u8 st_node;
223 struct timeval st_sock_time; 232 ktime_t st_sock_time;
224 struct timeval st_send_time; 233 ktime_t st_send_time;
225 struct timeval st_status_time; 234 ktime_t st_status_time;
226}; 235};
227#else 236#else
228struct o2net_send_tracking { 237struct o2net_send_tracking {
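
Switching the st_*_time and sc_tv_* fields from struct timeval to ktime_t trades two wall-clock fields per stamp for a single monotonic value, and all the arithmetic reduces to integer math on nanoseconds. A tiny model of the two helpers used throughout this patch; note ktime_t's exact in-kernel representation has varied, so this treats it simply as signed 64-bit nanoseconds, which is its arithmetic behavior:

#include <stdint.h>
#include <stdio.h>

typedef int64_t ktime_model_t;	/* ktime_t modeled as signed ns */

static ktime_model_t ktime_sub_model(ktime_model_t a, ktime_model_t b)
{
	return a - b;		/* ktime_sub(): plain ns subtraction */
}

static int64_t ktime_to_us_model(ktime_model_t k)
{
	return k / 1000;	/* ktime_to_us(): ns -> us */
}

int main(void)
{
	ktime_model_t start = 5000000, stop = 7500000;	/* ns */
	printf("%lld usecs\n",
	       (long long)ktime_to_us_model(ktime_sub_model(stop, start)));
	return 0;	/* prints 2500 */
}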
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index f44999156839..3a3ed4bb794b 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -90,19 +90,29 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
90 90
91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
92{ 92{
93 mlog_entry_void(); 93 struct dlm_lock_resource *res;
94 94
95 BUG_ON(!dlm); 95 BUG_ON(!dlm);
96 BUG_ON(!lock); 96 BUG_ON(!lock);
97 97
98 res = lock->lockres;
99
98 assert_spin_locked(&dlm->ast_lock); 100 assert_spin_locked(&dlm->ast_lock);
101
99 if (!list_empty(&lock->ast_list)) { 102 if (!list_empty(&lock->ast_list)) {
100 mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n", 103 mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, "
104 "AST list not empty, pending %d, newlevel %d\n",
105 dlm->name, res->lockname.len, res->lockname.name,
106 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
107 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
101 lock->ast_pending, lock->ml.type); 108 lock->ast_pending, lock->ml.type);
102 BUG(); 109 BUG();
103 } 110 }
104 if (lock->ast_pending) 111 if (lock->ast_pending)
105 mlog(0, "lock has an ast getting flushed right now\n"); 112 mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n",
113 dlm->name, res->lockname.len, res->lockname.name,
114 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
115 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
106 116
107 /* putting lock on list, add a ref */ 117 /* putting lock on list, add a ref */
108 dlm_lock_get(lock); 118 dlm_lock_get(lock);
@@ -110,9 +120,10 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
110 120
111 /* check to see if this ast obsoletes the bast */ 121 /* check to see if this ast obsoletes the bast */
112 if (dlm_should_cancel_bast(dlm, lock)) { 122 if (dlm_should_cancel_bast(dlm, lock)) {
113 struct dlm_lock_resource *res = lock->lockres; 123 mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n",
114 mlog(0, "%s: cancelling bast for %.*s\n", 124 dlm->name, res->lockname.len, res->lockname.name,
115 dlm->name, res->lockname.len, res->lockname.name); 125 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
126 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
116 lock->bast_pending = 0; 127 lock->bast_pending = 0;
117 list_del_init(&lock->bast_list); 128 list_del_init(&lock->bast_list);
118 lock->ml.highest_blocked = LKM_IVMODE; 129 lock->ml.highest_blocked = LKM_IVMODE;
@@ -134,8 +145,6 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
134 145
135void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 146void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
136{ 147{
137 mlog_entry_void();
138
139 BUG_ON(!dlm); 148 BUG_ON(!dlm);
140 BUG_ON(!lock); 149 BUG_ON(!lock);
141 150
@@ -147,15 +156,21 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
147 156
148void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 157void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
149{ 158{
150 mlog_entry_void(); 159 struct dlm_lock_resource *res;
151 160
152 BUG_ON(!dlm); 161 BUG_ON(!dlm);
153 BUG_ON(!lock); 162 BUG_ON(!lock);
163
154 assert_spin_locked(&dlm->ast_lock); 164 assert_spin_locked(&dlm->ast_lock);
155 165
166 res = lock->lockres;
167
156 BUG_ON(!list_empty(&lock->bast_list)); 168 BUG_ON(!list_empty(&lock->bast_list));
157 if (lock->bast_pending) 169 if (lock->bast_pending)
158 mlog(0, "lock has a bast getting flushed right now\n"); 170 mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n",
171 dlm->name, res->lockname.len, res->lockname.name,
172 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
173 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
159 174
160 /* putting lock on list, add a ref */ 175 /* putting lock on list, add a ref */
161 dlm_lock_get(lock); 176 dlm_lock_get(lock);
@@ -167,8 +182,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
167 182
168void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 183void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
169{ 184{
170 mlog_entry_void();
171
172 BUG_ON(!dlm); 185 BUG_ON(!dlm);
173 BUG_ON(!lock); 186 BUG_ON(!lock);
174 187
@@ -213,7 +226,10 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
213 dlm_astlockfunc_t *fn; 226 dlm_astlockfunc_t *fn;
214 struct dlm_lockstatus *lksb; 227 struct dlm_lockstatus *lksb;
215 228
216 mlog_entry_void(); 229 mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
230 res->lockname.len, res->lockname.name,
231 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
232 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
217 233
218 lksb = lock->lksb; 234 lksb = lock->lksb;
219 fn = lock->ast; 235 fn = lock->ast;
@@ -231,7 +247,10 @@ int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
231 struct dlm_lockstatus *lksb; 247 struct dlm_lockstatus *lksb;
232 int lksbflags; 248 int lksbflags;
233 249
234 mlog_entry_void(); 250 mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name,
251 res->lockname.len, res->lockname.name,
252 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
253 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
235 254
236 lksb = lock->lksb; 255 lksb = lock->lksb;
237 BUG_ON(lock->ml.node == dlm->node_num); 256 BUG_ON(lock->ml.node == dlm->node_num);
@@ -250,9 +269,14 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
250{ 269{
251 dlm_bastlockfunc_t *fn = lock->bast; 270 dlm_bastlockfunc_t *fn = lock->bast;
252 271
253 mlog_entry_void();
254 BUG_ON(lock->ml.node != dlm->node_num); 272 BUG_ON(lock->ml.node != dlm->node_num);
255 273
274 mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n",
275 dlm->name, res->lockname.len, res->lockname.name,
276 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
277 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
278 blocked_type);
279
256 (*fn)(lock->astdata, blocked_type); 280 (*fn)(lock->astdata, blocked_type);
257} 281}
258 282
@@ -332,7 +356,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
332 /* cannot get a proxy ast message if this node owns it */ 356 /* cannot get a proxy ast message if this node owns it */
333 BUG_ON(res->owner == dlm->node_num); 357 BUG_ON(res->owner == dlm->node_num);
334 358
335 mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name); 359 mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
360 res->lockname.name);
336 361
337 spin_lock(&res->spinlock); 362 spin_lock(&res->spinlock);
338 if (res->state & DLM_LOCK_RES_RECOVERING) { 363 if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -382,8 +407,12 @@ do_ast:
382 if (past->type == DLM_AST) { 407 if (past->type == DLM_AST) {
383 /* do not alter lock refcount. switching lists. */ 408 /* do not alter lock refcount. switching lists. */
384 list_move_tail(&lock->list, &res->granted); 409 list_move_tail(&lock->list, &res->granted);
385 mlog(0, "ast: Adding to granted list... type=%d, " 410 mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n",
386 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); 411 dlm->name, res->lockname.len, res->lockname.name,
412 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
413 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
414 lock->ml.type, lock->ml.convert_type);
415
387 if (lock->ml.convert_type != LKM_IVMODE) { 416 if (lock->ml.convert_type != LKM_IVMODE) {
388 lock->ml.type = lock->ml.convert_type; 417 lock->ml.type = lock->ml.convert_type;
389 lock->ml.convert_type = LKM_IVMODE; 418 lock->ml.convert_type = LKM_IVMODE;
@@ -426,9 +455,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
426 size_t veclen = 1; 455 size_t veclen = 1;
427 int status; 456 int status;
428 457
429 mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n", 458 mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name,
430 res->lockname.len, res->lockname.name, lock->ml.node, 459 res->lockname.len, res->lockname.name, lock->ml.node, msg_type,
431 msg_type, blocked_type); 460 blocked_type);
432 461
433 memset(&past, 0, sizeof(struct dlm_proxy_ast)); 462 memset(&past, 0, sizeof(struct dlm_proxy_ast));
434 past.node_idx = dlm->node_num; 463 past.node_idx = dlm->node_num;
@@ -441,7 +470,6 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
441 vec[0].iov_len = sizeof(struct dlm_proxy_ast); 470 vec[0].iov_len = sizeof(struct dlm_proxy_ast);
442 vec[0].iov_base = &past; 471 vec[0].iov_base = &past;
443 if (flags & DLM_LKSB_GET_LVB) { 472 if (flags & DLM_LKSB_GET_LVB) {
444 mlog(0, "returning requested LVB data\n");
445 be32_add_cpu(&past.flags, LKM_GET_LVB); 473 be32_add_cpu(&past.flags, LKM_GET_LVB);
446 vec[1].iov_len = DLM_LVB_LEN; 474 vec[1].iov_len = DLM_LVB_LEN;
447 vec[1].iov_base = lock->lksb->lvb; 475 vec[1].iov_base = lock->lksb->lvb;
@@ -451,8 +479,8 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, 479 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
452 lock->ml.node, &status); 480 lock->ml.node, &status);
453 if (ret < 0) 481 if (ret < 0)
454 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 482 mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n",
455 "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key, 483 dlm->name, res->lockname.len, res->lockname.name, ret,
456 lock->ml.node); 484 lock->ml.node);
457 else { 485 else {
458 if (status == DLM_RECOVERING) { 486 if (status == DLM_RECOVERING) {
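
The reworked dlmast.c messages identify every lock as "node:seq", decoded from the 64-bit cookie by dlm_get_lock_cookie_node()/dlm_get_lock_cookie_seq(). Assuming the usual ocfs2 packing (owning node number in the top byte, per-node sequence in the remaining bits; dlmcommon.h holds the authoritative macros), the split looks like this illustrative sketch:

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: mirrors the assumed cookie layout, node number
 * in the high byte, sequence in the low 56 bits. */
static uint8_t cookie_node(uint64_t cookie)
{
	return (uint8_t)(cookie >> 56);
}

static uint64_t cookie_seq(uint64_t cookie)
{
	return cookie & ((1ULL << 56) - 1);
}

int main(void)
{
	uint64_t cookie = ((uint64_t)3 << 56) | 42;	/* node 3, seq 42 */
	printf("lock %u:%llu\n", cookie_node(cookie),
	       (unsigned long long)cookie_seq(cookie));
	return 0;
}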
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index b36d0bf77a5a..4bdf7baee344 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -50,10 +50,10 @@
50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) 50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
51 51
52enum dlm_mle_type { 52enum dlm_mle_type {
53 DLM_MLE_BLOCK, 53 DLM_MLE_BLOCK = 0,
54 DLM_MLE_MASTER, 54 DLM_MLE_MASTER = 1,
55 DLM_MLE_MIGRATION, 55 DLM_MLE_MIGRATION = 2,
56 DLM_MLE_NUM_TYPES 56 DLM_MLE_NUM_TYPES = 3,
57}; 57};
58 58
59struct dlm_master_list_entry { 59struct dlm_master_list_entry {
@@ -82,8 +82,8 @@ struct dlm_master_list_entry {
82 82
83enum dlm_ast_type { 83enum dlm_ast_type {
84 DLM_AST = 0, 84 DLM_AST = 0,
85 DLM_BAST, 85 DLM_BAST = 1,
86 DLM_ASTUNLOCK 86 DLM_ASTUNLOCK = 2,
87}; 87};
88 88
89 89
@@ -119,9 +119,9 @@ struct dlm_recovery_ctxt
119 119
120enum dlm_ctxt_state { 120enum dlm_ctxt_state {
121 DLM_CTXT_NEW = 0, 121 DLM_CTXT_NEW = 0,
122 DLM_CTXT_JOINED, 122 DLM_CTXT_JOINED = 1,
123 DLM_CTXT_IN_SHUTDOWN, 123 DLM_CTXT_IN_SHUTDOWN = 2,
124 DLM_CTXT_LEAVING, 124 DLM_CTXT_LEAVING = 3,
125}; 125};
126 126
127struct dlm_ctxt 127struct dlm_ctxt
@@ -388,8 +388,8 @@ struct dlm_lock
388 388
389enum dlm_lockres_list { 389enum dlm_lockres_list {
390 DLM_GRANTED_LIST = 0, 390 DLM_GRANTED_LIST = 0,
391 DLM_CONVERTING_LIST, 391 DLM_CONVERTING_LIST = 1,
392 DLM_BLOCKED_LIST 392 DLM_BLOCKED_LIST = 2,
393}; 393};
394 394
395static inline int dlm_lvb_is_empty(char *lvb) 395static inline int dlm_lvb_is_empty(char *lvb)
@@ -427,27 +427,27 @@ struct dlm_node_iter
427 427
428 428
429enum { 429enum {
430 DLM_MASTER_REQUEST_MSG = 500, 430 DLM_MASTER_REQUEST_MSG = 500,
431 DLM_UNUSED_MSG1, /* 501 */ 431 DLM_UNUSED_MSG1 = 501,
432 DLM_ASSERT_MASTER_MSG, /* 502 */ 432 DLM_ASSERT_MASTER_MSG = 502,
433 DLM_CREATE_LOCK_MSG, /* 503 */ 433 DLM_CREATE_LOCK_MSG = 503,
434 DLM_CONVERT_LOCK_MSG, /* 504 */ 434 DLM_CONVERT_LOCK_MSG = 504,
435 DLM_PROXY_AST_MSG, /* 505 */ 435 DLM_PROXY_AST_MSG = 505,
436 DLM_UNLOCK_LOCK_MSG, /* 506 */ 436 DLM_UNLOCK_LOCK_MSG = 506,
437 DLM_DEREF_LOCKRES_MSG, /* 507 */ 437 DLM_DEREF_LOCKRES_MSG = 507,
438 DLM_MIGRATE_REQUEST_MSG, /* 508 */ 438 DLM_MIGRATE_REQUEST_MSG = 508,
439 DLM_MIG_LOCKRES_MSG, /* 509 */ 439 DLM_MIG_LOCKRES_MSG = 509,
440 DLM_QUERY_JOIN_MSG, /* 510 */ 440 DLM_QUERY_JOIN_MSG = 510,
441 DLM_ASSERT_JOINED_MSG, /* 511 */ 441 DLM_ASSERT_JOINED_MSG = 511,
442 DLM_CANCEL_JOIN_MSG, /* 512 */ 442 DLM_CANCEL_JOIN_MSG = 512,
443 DLM_EXIT_DOMAIN_MSG, /* 513 */ 443 DLM_EXIT_DOMAIN_MSG = 513,
444 DLM_MASTER_REQUERY_MSG, /* 514 */ 444 DLM_MASTER_REQUERY_MSG = 514,
445 DLM_LOCK_REQUEST_MSG, /* 515 */ 445 DLM_LOCK_REQUEST_MSG = 515,
446 DLM_RECO_DATA_DONE_MSG, /* 516 */ 446 DLM_RECO_DATA_DONE_MSG = 516,
447 DLM_BEGIN_RECO_MSG, /* 517 */ 447 DLM_BEGIN_RECO_MSG = 517,
448 DLM_FINALIZE_RECO_MSG, /* 518 */ 448 DLM_FINALIZE_RECO_MSG = 518,
449 DLM_QUERY_REGION, /* 519 */ 449 DLM_QUERY_REGION = 519,
450 DLM_QUERY_NODEINFO, /* 520 */ 450 DLM_QUERY_NODEINFO = 520,
451}; 451};
452 452
453struct dlm_reco_node_data 453struct dlm_reco_node_data
@@ -460,19 +460,19 @@ struct dlm_reco_node_data
460enum { 460enum {
461 DLM_RECO_NODE_DATA_DEAD = -1, 461 DLM_RECO_NODE_DATA_DEAD = -1,
462 DLM_RECO_NODE_DATA_INIT = 0, 462 DLM_RECO_NODE_DATA_INIT = 0,
463 DLM_RECO_NODE_DATA_REQUESTING, 463 DLM_RECO_NODE_DATA_REQUESTING = 1,
464 DLM_RECO_NODE_DATA_REQUESTED, 464 DLM_RECO_NODE_DATA_REQUESTED = 2,
465 DLM_RECO_NODE_DATA_RECEIVING, 465 DLM_RECO_NODE_DATA_RECEIVING = 3,
466 DLM_RECO_NODE_DATA_DONE, 466 DLM_RECO_NODE_DATA_DONE = 4,
467 DLM_RECO_NODE_DATA_FINALIZE_SENT, 467 DLM_RECO_NODE_DATA_FINALIZE_SENT = 5,
468}; 468};
469 469
470 470
471enum { 471enum {
472 DLM_MASTER_RESP_NO = 0, 472 DLM_MASTER_RESP_NO = 0,
473 DLM_MASTER_RESP_YES, 473 DLM_MASTER_RESP_YES = 1,
474 DLM_MASTER_RESP_MAYBE, 474 DLM_MASTER_RESP_MAYBE = 2,
475 DLM_MASTER_RESP_ERROR 475 DLM_MASTER_RESP_ERROR = 3,
476}; 476};
477 477
478 478
@@ -649,9 +649,9 @@ struct dlm_proxy_ast
649#define DLM_MOD_KEY (0x666c6172) 649#define DLM_MOD_KEY (0x666c6172)
650enum dlm_query_join_response_code { 650enum dlm_query_join_response_code {
651 JOIN_DISALLOW = 0, 651 JOIN_DISALLOW = 0,
652 JOIN_OK, 652 JOIN_OK = 1,
653 JOIN_OK_NO_MAP, 653 JOIN_OK_NO_MAP = 2,
654 JOIN_PROTOCOL_MISMATCH, 654 JOIN_PROTOCOL_MISMATCH = 3,
655}; 655};
656 656
657struct dlm_query_join_packet { 657struct dlm_query_join_packet {
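
Pinning every dlm message number and enum value explicitly documents the wire protocol in the header itself and guards against an insertion silently renumbering everything after it, which is what implicit enumeration allows. A small illustration of the failure mode the explicit form prevents (the MSG_* names are invented):

#include <stdio.h>

/* Implicit numbering: adding a member in the middle shifts everything
 * after it, breaking interop with peers built from the older header. */
enum msg_implicit {
	MSG_A = 500,
	/* MSG_NEW, */	/* uncommenting this renumbers MSG_B and MSG_C */
	MSG_B,		/* 501 today, 502 after the insertion */
	MSG_C,		/* 502 today, 503 after the insertion */
};

/* Explicit numbering: an insertion must pick a fresh value; existing
 * constants cannot drift. */
enum msg_explicit {
	XMSG_A = 500,
	XMSG_B = 501,
	XMSG_C = 502,
};

int main(void)
{
	printf("MSG_B=%d XMSG_B=%d\n", MSG_B, XMSG_B);
	return 0;
}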
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 272ec8631a51..04a32be0aeb9 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -370,92 +370,46 @@ static void dlm_debug_get(struct dlm_debug_ctxt *dc)
370 kref_get(&dc->debug_refcnt); 370 kref_get(&dc->debug_refcnt);
371} 371}
372 372
373static struct debug_buffer *debug_buffer_allocate(void) 373static int debug_release(struct inode *inode, struct file *file)
374{ 374{
375 struct debug_buffer *db = NULL; 375 free_page((unsigned long)file->private_data);
376 376 return 0;
377 db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
378 if (!db)
379 goto bail;
380
381 db->len = PAGE_SIZE;
382 db->buf = kmalloc(db->len, GFP_KERNEL);
383 if (!db->buf)
384 goto bail;
385
386 return db;
387bail:
388 kfree(db);
389 return NULL;
390}
391
392static ssize_t debug_buffer_read(struct file *file, char __user *buf,
393 size_t nbytes, loff_t *ppos)
394{
395 struct debug_buffer *db = file->private_data;
396
397 return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
398}
399
400static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
401{
402 struct debug_buffer *db = file->private_data;
403 loff_t new = -1;
404
405 switch (whence) {
406 case 0:
407 new = off;
408 break;
409 case 1:
410 new = file->f_pos + off;
411 break;
412 }
413
414 if (new < 0 || new > db->len)
415 return -EINVAL;
416
417 return (file->f_pos = new);
418} 377}
419 378
420static int debug_buffer_release(struct inode *inode, struct file *file) 379static ssize_t debug_read(struct file *file, char __user *buf,
380 size_t nbytes, loff_t *ppos)
421{ 381{
422 struct debug_buffer *db = file->private_data; 382 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
423 383 i_size_read(file->f_mapping->host));
424 if (db)
425 kfree(db->buf);
426 kfree(db);
427
428 return 0;
429} 384}
430/* end - util funcs */ 385/* end - util funcs */
431 386
432/* begin - purge list funcs */ 387/* begin - purge list funcs */
433static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 388static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
434{ 389{
435 struct dlm_lock_resource *res; 390 struct dlm_lock_resource *res;
436 int out = 0; 391 int out = 0;
437 unsigned long total = 0; 392 unsigned long total = 0;
438 393
439 out += snprintf(db->buf + out, db->len - out, 394 out += snprintf(buf + out, len - out,
440 "Dumping Purgelist for Domain: %s\n", dlm->name); 395 "Dumping Purgelist for Domain: %s\n", dlm->name);
441 396
442 spin_lock(&dlm->spinlock); 397 spin_lock(&dlm->spinlock);
443 list_for_each_entry(res, &dlm->purge_list, purge) { 398 list_for_each_entry(res, &dlm->purge_list, purge) {
444 ++total; 399 ++total;
445 if (db->len - out < 100) 400 if (len - out < 100)
446 continue; 401 continue;
447 spin_lock(&res->spinlock); 402 spin_lock(&res->spinlock);
448 out += stringify_lockname(res->lockname.name, 403 out += stringify_lockname(res->lockname.name,
449 res->lockname.len, 404 res->lockname.len,
450 db->buf + out, db->len - out); 405 buf + out, len - out);
451 out += snprintf(db->buf + out, db->len - out, "\t%ld\n", 406 out += snprintf(buf + out, len - out, "\t%ld\n",
452 (jiffies - res->last_used)/HZ); 407 (jiffies - res->last_used)/HZ);
453 spin_unlock(&res->spinlock); 408 spin_unlock(&res->spinlock);
454 } 409 }
455 spin_unlock(&dlm->spinlock); 410 spin_unlock(&dlm->spinlock);
456 411
457 out += snprintf(db->buf + out, db->len - out, 412 out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
458 "Total on list: %ld\n", total);
459 413
460 return out; 414 return out;
461} 415}
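
The dlmdebug rewrite drops struct debug_buffer for a bare zeroed page: each open() formats into it with the out += snprintf(buf + out, len - out, ...) idiom, records are skipped once remaining space runs low, the final length is published via i_size_write(), and reads become simple_read_from_buffer() over the page. A userspace sketch of the accumulation idiom (PAGE_SZ and the record format are invented; the low-space guard mirrors the one in debug_purgelist_print()):

#include <stdio.h>

#define PAGE_SZ 4096

/* Format a sequence of records into one fixed buffer, skipping records
 * once space runs low, the same guard the purgelist dump uses. */
static int dump(char *buf, int len)
{
	int out = 0;

	out += snprintf(buf + out, len - out, "Dumping list\n");
	for (int i = 0; i < 200; i++) {
		if (len - out < 100)	/* low-space guard */
			continue;
		out += snprintf(buf + out, len - out, "res%03d\t%d\n", i, i);
	}
	out += snprintf(buf + out, len - out, "Total: 200\n");
	return out;		/* becomes the i_size readers see */
}

int main(void)
{
	static char page[PAGE_SZ];
	int size = dump(page, PAGE_SZ - 1);
	fwrite(page, 1, (size_t)size, stdout);
	printf("-- %d bytes --\n", size);
	return 0;
}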
@@ -463,15 +417,15 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
463static int debug_purgelist_open(struct inode *inode, struct file *file) 417static int debug_purgelist_open(struct inode *inode, struct file *file)
464{ 418{
465 struct dlm_ctxt *dlm = inode->i_private; 419 struct dlm_ctxt *dlm = inode->i_private;
466 struct debug_buffer *db; 420 char *buf = NULL;
467 421
468 db = debug_buffer_allocate(); 422 buf = (char *) get_zeroed_page(GFP_NOFS);
469 if (!db) 423 if (!buf)
470 goto bail; 424 goto bail;
471 425
472 db->len = debug_purgelist_print(dlm, db); 426 i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1));
473 427
474 file->private_data = db; 428 file->private_data = buf;
475 429
476 return 0; 430 return 0;
477bail: 431bail:
@@ -480,14 +434,14 @@ bail:
480 434
481static const struct file_operations debug_purgelist_fops = { 435static const struct file_operations debug_purgelist_fops = {
482 .open = debug_purgelist_open, 436 .open = debug_purgelist_open,
483 .release = debug_buffer_release, 437 .release = debug_release,
484 .read = debug_buffer_read, 438 .read = debug_read,
485 .llseek = debug_buffer_llseek, 439 .llseek = generic_file_llseek,
486}; 440};
487/* end - purge list funcs */ 441/* end - purge list funcs */
488 442
489/* begin - debug mle funcs */ 443/* begin - debug mle funcs */
490static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 444static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
491{ 445{
492 struct dlm_master_list_entry *mle; 446 struct dlm_master_list_entry *mle;
493 struct hlist_head *bucket; 447 struct hlist_head *bucket;
@@ -495,7 +449,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
495 int i, out = 0; 449 int i, out = 0;
496 unsigned long total = 0, longest = 0, bucket_count = 0; 450 unsigned long total = 0, longest = 0, bucket_count = 0;
497 451
498 out += snprintf(db->buf + out, db->len - out, 452 out += snprintf(buf + out, len - out,
499 "Dumping MLEs for Domain: %s\n", dlm->name); 453 "Dumping MLEs for Domain: %s\n", dlm->name);
500 454
501 spin_lock(&dlm->master_lock); 455 spin_lock(&dlm->master_lock);
@@ -506,16 +460,16 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
506 master_hash_node); 460 master_hash_node);
507 ++total; 461 ++total;
508 ++bucket_count; 462 ++bucket_count;
509 if (db->len - out < 200) 463 if (len - out < 200)
510 continue; 464 continue;
511 out += dump_mle(mle, db->buf + out, db->len - out); 465 out += dump_mle(mle, buf + out, len - out);
512 } 466 }
513 longest = max(longest, bucket_count); 467 longest = max(longest, bucket_count);
514 bucket_count = 0; 468 bucket_count = 0;
515 } 469 }
516 spin_unlock(&dlm->master_lock); 470 spin_unlock(&dlm->master_lock);
517 471
518 out += snprintf(db->buf + out, db->len - out, 472 out += snprintf(buf + out, len - out,
519 "Total: %ld, Longest: %ld\n", total, longest); 473 "Total: %ld, Longest: %ld\n", total, longest);
520 return out; 474 return out;
521} 475}
@@ -523,15 +477,15 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
523static int debug_mle_open(struct inode *inode, struct file *file) 477static int debug_mle_open(struct inode *inode, struct file *file)
524{ 478{
525 struct dlm_ctxt *dlm = inode->i_private; 479 struct dlm_ctxt *dlm = inode->i_private;
526 struct debug_buffer *db; 480 char *buf = NULL;
527 481
528 db = debug_buffer_allocate(); 482 buf = (char *) get_zeroed_page(GFP_NOFS);
529 if (!db) 483 if (!buf)
530 goto bail; 484 goto bail;
531 485
532 db->len = debug_mle_print(dlm, db); 486 i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1));
533 487
534 file->private_data = db; 488 file->private_data = buf;
535 489
536 return 0; 490 return 0;
537bail: 491bail:
@@ -540,9 +494,9 @@ bail:
540 494
541static const struct file_operations debug_mle_fops = { 495static const struct file_operations debug_mle_fops = {
542 .open = debug_mle_open, 496 .open = debug_mle_open,
543 .release = debug_buffer_release, 497 .release = debug_release,
544 .read = debug_buffer_read, 498 .read = debug_read,
545 .llseek = debug_buffer_llseek, 499 .llseek = generic_file_llseek,
546}; 500};
547 501
548/* end - debug mle funcs */ 502/* end - debug mle funcs */
@@ -757,7 +711,7 @@ static const struct file_operations debug_lockres_fops = {
757/* end - debug lockres funcs */ 711/* end - debug lockres funcs */
758 712
759/* begin - debug state funcs */ 713/* begin - debug state funcs */
760static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 714static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
761{ 715{
762 int out = 0; 716 int out = 0;
763 struct dlm_reco_node_data *node; 717 struct dlm_reco_node_data *node;
@@ -781,35 +735,35 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
781 } 735 }
782 736
783 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ 737 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */
784 out += snprintf(db->buf + out, db->len - out, 738 out += snprintf(buf + out, len - out,
785 "Domain: %s Key: 0x%08x Protocol: %d.%d\n", 739 "Domain: %s Key: 0x%08x Protocol: %d.%d\n",
786 dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, 740 dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
787 dlm->dlm_locking_proto.pv_minor); 741 dlm->dlm_locking_proto.pv_minor);
788 742
789 /* Thread Pid: xxx Node: xxx State: xxxxx */ 743 /* Thread Pid: xxx Node: xxx State: xxxxx */
790 out += snprintf(db->buf + out, db->len - out, 744 out += snprintf(buf + out, len - out,
791 "Thread Pid: %d Node: %d State: %s\n", 745 "Thread Pid: %d Node: %d State: %s\n",
792 dlm->dlm_thread_task->pid, dlm->node_num, state); 746 task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state);
793 747
794 /* Number of Joins: xxx Joining Node: xxx */ 748 /* Number of Joins: xxx Joining Node: xxx */
795 out += snprintf(db->buf + out, db->len - out, 749 out += snprintf(buf + out, len - out,
796 "Number of Joins: %d Joining Node: %d\n", 750 "Number of Joins: %d Joining Node: %d\n",
797 dlm->num_joins, dlm->joining_node); 751 dlm->num_joins, dlm->joining_node);
798 752
799 /* Domain Map: xx xx xx */ 753 /* Domain Map: xx xx xx */
800 out += snprintf(db->buf + out, db->len - out, "Domain Map: "); 754 out += snprintf(buf + out, len - out, "Domain Map: ");
801 out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, 755 out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
802 db->buf + out, db->len - out); 756 buf + out, len - out);
803 out += snprintf(db->buf + out, db->len - out, "\n"); 757 out += snprintf(buf + out, len - out, "\n");
804 758
805 /* Live Map: xx xx xx */ 759 /* Live Map: xx xx xx */
806 out += snprintf(db->buf + out, db->len - out, "Live Map: "); 760 out += snprintf(buf + out, len - out, "Live Map: ");
807 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, 761 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
808 db->buf + out, db->len - out); 762 buf + out, len - out);
809 out += snprintf(db->buf + out, db->len - out, "\n"); 763 out += snprintf(buf + out, len - out, "\n");
810 764
811 /* Lock Resources: xxx (xxx) */ 765 /* Lock Resources: xxx (xxx) */
812 out += snprintf(db->buf + out, db->len - out, 766 out += snprintf(buf + out, len - out,
813 "Lock Resources: %d (%d)\n", 767 "Lock Resources: %d (%d)\n",
814 atomic_read(&dlm->res_cur_count), 768 atomic_read(&dlm->res_cur_count),
815 atomic_read(&dlm->res_tot_count)); 769 atomic_read(&dlm->res_tot_count));
@@ -821,29 +775,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
821 cur_mles += atomic_read(&dlm->mle_cur_count[i]); 775 cur_mles += atomic_read(&dlm->mle_cur_count[i]);
822 776
823 /* MLEs: xxx (xxx) */ 777 /* MLEs: xxx (xxx) */
824 out += snprintf(db->buf + out, db->len - out, 778 out += snprintf(buf + out, len - out,
825 "MLEs: %d (%d)\n", cur_mles, tot_mles); 779 "MLEs: %d (%d)\n", cur_mles, tot_mles);
826 780
827 /* Blocking: xxx (xxx) */ 781 /* Blocking: xxx (xxx) */
828 out += snprintf(db->buf + out, db->len - out, 782 out += snprintf(buf + out, len - out,
829 " Blocking: %d (%d)\n", 783 " Blocking: %d (%d)\n",
830 atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), 784 atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
831 atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); 785 atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
832 786
833 /* Mastery: xxx (xxx) */ 787 /* Mastery: xxx (xxx) */
834 out += snprintf(db->buf + out, db->len - out, 788 out += snprintf(buf + out, len - out,
835 " Mastery: %d (%d)\n", 789 " Mastery: %d (%d)\n",
836 atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), 790 atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
837 atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); 791 atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
838 792
839 /* Migration: xxx (xxx) */ 793 /* Migration: xxx (xxx) */
840 out += snprintf(db->buf + out, db->len - out, 794 out += snprintf(buf + out, len - out,
841 " Migration: %d (%d)\n", 795 " Migration: %d (%d)\n",
842 atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), 796 atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
843 atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); 797 atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
844 798
845 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ 799 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
846 out += snprintf(db->buf + out, db->len - out, 800 out += snprintf(buf + out, len - out,
847 "Lists: Dirty=%s Purge=%s PendingASTs=%s " 801 "Lists: Dirty=%s Purge=%s PendingASTs=%s "
848 "PendingBASTs=%s\n", 802 "PendingBASTs=%s\n",
849 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), 803 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
@@ -852,12 +806,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
852 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); 806 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
853 807
854 /* Purge Count: xxx Refs: xxx */ 808 /* Purge Count: xxx Refs: xxx */
855 out += snprintf(db->buf + out, db->len - out, 809 out += snprintf(buf + out, len - out,
856 "Purge Count: %d Refs: %d\n", dlm->purge_count, 810 "Purge Count: %d Refs: %d\n", dlm->purge_count,
857 atomic_read(&dlm->dlm_refs.refcount)); 811 atomic_read(&dlm->dlm_refs.refcount));
858 812
859 /* Dead Node: xxx */ 813 /* Dead Node: xxx */
860 out += snprintf(db->buf + out, db->len - out, 814 out += snprintf(buf + out, len - out,
861 "Dead Node: %d\n", dlm->reco.dead_node); 815 "Dead Node: %d\n", dlm->reco.dead_node);
862 816
863 /* What about DLM_RECO_STATE_FINALIZE? */ 817 /* What about DLM_RECO_STATE_FINALIZE? */
@@ -867,19 +821,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
867 state = "INACTIVE"; 821 state = "INACTIVE";
868 822
869 /* Recovery Pid: xxxx Master: xxx State: xxxx */ 823 /* Recovery Pid: xxxx Master: xxx State: xxxx */
870 out += snprintf(db->buf + out, db->len - out, 824 out += snprintf(buf + out, len - out,
871 "Recovery Pid: %d Master: %d State: %s\n", 825 "Recovery Pid: %d Master: %d State: %s\n",
872 dlm->dlm_reco_thread_task->pid, 826 task_pid_nr(dlm->dlm_reco_thread_task),
873 dlm->reco.new_master, state); 827 dlm->reco.new_master, state);
874 828
875 /* Recovery Map: xx xx */ 829 /* Recovery Map: xx xx */
876 out += snprintf(db->buf + out, db->len - out, "Recovery Map: "); 830 out += snprintf(buf + out, len - out, "Recovery Map: ");
877 out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, 831 out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
878 db->buf + out, db->len - out); 832 buf + out, len - out);
879 out += snprintf(db->buf + out, db->len - out, "\n"); 833 out += snprintf(buf + out, len - out, "\n");
880 834
881 /* Recovery Node State: */ 835 /* Recovery Node State: */
882 out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n"); 836 out += snprintf(buf + out, len - out, "Recovery Node State:\n");
883 list_for_each_entry(node, &dlm->reco.node_data, list) { 837 list_for_each_entry(node, &dlm->reco.node_data, list) {
884 switch (node->state) { 838 switch (node->state) {
885 case DLM_RECO_NODE_DATA_INIT: 839 case DLM_RECO_NODE_DATA_INIT:
@@ -907,7 +861,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
907 state = "BAD"; 861 state = "BAD";
908 break; 862 break;
909 } 863 }
910 out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n", 864 out += snprintf(buf + out, len - out, "\t%u - %s\n",
911 node->node_num, state); 865 node->node_num, state);
912 } 866 }
913 867
@@ -919,15 +873,15 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
919static int debug_state_open(struct inode *inode, struct file *file) 873static int debug_state_open(struct inode *inode, struct file *file)
920{ 874{
921 struct dlm_ctxt *dlm = inode->i_private; 875 struct dlm_ctxt *dlm = inode->i_private;
922 struct debug_buffer *db = NULL; 876 char *buf = NULL;
923 877
924 db = debug_buffer_allocate(); 878 buf = (char *) get_zeroed_page(GFP_NOFS);
925 if (!db) 879 if (!buf)
926 goto bail; 880 goto bail;
927 881
928 db->len = debug_state_print(dlm, db); 882 i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1));
929 883
930 file->private_data = db; 884 file->private_data = buf;
931 885
932 return 0; 886 return 0;
933bail: 887bail:
@@ -936,9 +890,9 @@ bail:
936 890
937static const struct file_operations debug_state_fops = { 891static const struct file_operations debug_state_fops = {
938 .open = debug_state_open, 892 .open = debug_state_open,
939 .release = debug_buffer_release, 893 .release = debug_release,
940 .read = debug_buffer_read, 894 .read = debug_read,
941 .llseek = debug_buffer_llseek, 895 .llseek = generic_file_llseek,
942}; 896};
943/* end - debug state funcs */ 897/* end - debug state funcs */
944 898
@@ -1002,14 +956,10 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
1002 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; 956 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
1003 957
1004 if (dc) { 958 if (dc) {
1005 if (dc->debug_purgelist_dentry) 959 debugfs_remove(dc->debug_purgelist_dentry);
1006 debugfs_remove(dc->debug_purgelist_dentry); 960 debugfs_remove(dc->debug_mle_dentry);
1007 if (dc->debug_mle_dentry) 961 debugfs_remove(dc->debug_lockres_dentry);
1008 debugfs_remove(dc->debug_mle_dentry); 962 debugfs_remove(dc->debug_state_dentry);
1009 if (dc->debug_lockres_dentry)
1010 debugfs_remove(dc->debug_lockres_dentry);
1011 if (dc->debug_state_dentry)
1012 debugfs_remove(dc->debug_state_dentry);
1013 dlm_debug_put(dc); 963 dlm_debug_put(dc);
1014 } 964 }
1015} 965}
@@ -1040,8 +990,7 @@ bail:
1040 990
1041void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) 991void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
1042{ 992{
1043 if (dlm->dlm_debugfs_subroot) 993 debugfs_remove(dlm->dlm_debugfs_subroot);
1044 debugfs_remove(dlm->dlm_debugfs_subroot);
1045} 994}
1046 995
1047/* debugfs root */ 996/* debugfs root */
@@ -1057,7 +1006,6 @@ int dlm_create_debugfs_root(void)
1057 1006
1058void dlm_destroy_debugfs_root(void) 1007void dlm_destroy_debugfs_root(void)
1059{ 1008{
1060 if (dlm_debugfs_root) 1009 debugfs_remove(dlm_debugfs_root);
1061 debugfs_remove(dlm_debugfs_root);
1062} 1010}
1063#endif /* CONFIG_DEBUG_FS */ 1011#endif /* CONFIG_DEBUG_FS */
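The dlmdebug.c rework above drops the intermediate struct debug_buffer: the open handler formats straight into a zeroed page, records the formatted length via i_size_write(), and hands read/llseek off to the shared debug_read/generic_file_llseek helpers. The if-wrappers around debugfs_remove() disappear because debugfs_remove() is a no-op on a NULL dentry. A minimal sketch of the adopted pattern, with my_print() standing in for debug_state_print() (illustrative name, not part of the patch):

static int my_debug_open(struct inode *inode, struct file *file)
{
	char *buf = (char *)get_zeroed_page(GFP_NOFS);

	if (!buf)
		return -ENOMEM;
	/* Publish the formatted length as i_size so generic_file_llseek()
	 * sees real file bounds; the matching ->release must free_page()
	 * file->private_data. */
	i_size_write(inode, my_print(inode->i_private, buf, PAGE_SIZE - 1));
	file->private_data = buf;
	return 0;
}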
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 8c686d22f9c7..1f27c4812d1a 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -37,11 +37,6 @@ struct dlm_debug_ctxt {
37 struct dentry *debug_purgelist_dentry; 37 struct dentry *debug_purgelist_dentry;
38}; 38};
39 39
40struct debug_buffer {
41 int len;
42 char *buf;
43};
44
45struct debug_lockres { 40struct debug_lockres {
46 int dl_len; 41 int dl_len;
47 char *dl_buf; 42 char *dl_buf;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index cc2aaa96cfe5..7e38a072d720 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -460,8 +460,6 @@ redo_bucket:
460 } 460 }
461 cond_resched_lock(&dlm->spinlock); 461 cond_resched_lock(&dlm->spinlock);
462 num += n; 462 num += n;
463 mlog(0, "%s: touched %d lockreses in bucket %d "
464 "(tot=%d)\n", dlm->name, n, i, num);
465 } 463 }
466 spin_unlock(&dlm->spinlock); 464 spin_unlock(&dlm->spinlock);
467 wake_up(&dlm->dlm_thread_wq); 465 wake_up(&dlm->dlm_thread_wq);
@@ -1661,8 +1659,8 @@ bail:
1661 1659
1662static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) 1660static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
1663{ 1661{
1664 o2hb_unregister_callback(NULL, &dlm->dlm_hb_up); 1662 o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up);
1665 o2hb_unregister_callback(NULL, &dlm->dlm_hb_down); 1663 o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down);
1666 o2net_unregister_handler_list(&dlm->dlm_domain_handlers); 1664 o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
1667} 1665}
1668 1666
@@ -1674,13 +1672,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1674 1672
1675 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, 1673 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
1676 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); 1674 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
1677 status = o2hb_register_callback(NULL, &dlm->dlm_hb_down); 1675 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
1678 if (status) 1676 if (status)
1679 goto bail; 1677 goto bail;
1680 1678
1681 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, 1679 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
1682 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); 1680 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
1683 status = o2hb_register_callback(NULL, &dlm->dlm_hb_up); 1681 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
1684 if (status) 1682 if (status)
1685 goto bail; 1683 goto bail;
1686 1684
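The dlmdomain.c change registers the heartbeat up/down callbacks against the domain's own region key (dlm->name) instead of NULL, which appears intended to let o2hb track the region per domain; what matters for correctness is that registration and unregistration use the same key. Condensed sketch, error handling trimmed:

o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
		    dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
/* ... teardown must mirror the registration key: */
o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down);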
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 69cf369961c4..7009292aac5a 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -106,6 +106,9 @@ static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
106 106
107 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) 107 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
108 return 0; 108 return 0;
109 if (!dlm_lock_compatible(tmplock->ml.convert_type,
110 lock->ml.type))
111 return 0;
109 } 112 }
110 113
111 return 1; 114 return 1;
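The dlmlock.c hunk closes a grant-ordering hole: a new lock request was checked only against the mode each queued lock currently holds, not the mode it is converting to, so a new lock could be granted ahead of a pending conversion it conflicts with. A condensed sketch of the converting-queue walk after the fix (the real function also walks the granted queue):

static int can_grant_new_lock(struct dlm_lock_resource *res,
			      struct dlm_lock *lock)
{
	struct dlm_lock *tmplock;

	list_for_each_entry(tmplock, &res->converting, list) {
		/* incompatible with the currently granted mode... */
		if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
			return 0;
		/* ...or with the mode being converted to (the new check) */
		if (!dlm_lock_compatible(tmplock->ml.convert_type,
					 lock->ml.type))
			return 0;
	}
	return 1;
}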
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2211acf33d9b..1d6d1d22c471 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -122,15 +122,13 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
122void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, 122void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
123 struct dlm_lock_resource *res) 123 struct dlm_lock_resource *res)
124{ 124{
125 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
126
127 assert_spin_locked(&dlm->spinlock); 125 assert_spin_locked(&dlm->spinlock);
128 assert_spin_locked(&res->spinlock); 126 assert_spin_locked(&res->spinlock);
129 127
130 if (__dlm_lockres_unused(res)){ 128 if (__dlm_lockres_unused(res)){
131 if (list_empty(&res->purge)) { 129 if (list_empty(&res->purge)) {
132 mlog(0, "putting lockres %.*s:%p onto purge list\n", 130 mlog(0, "%s: Adding res %.*s to purge list\n",
133 res->lockname.len, res->lockname.name, res); 131 dlm->name, res->lockname.len, res->lockname.name);
134 132
135 res->last_used = jiffies; 133 res->last_used = jiffies;
136 dlm_lockres_get(res); 134 dlm_lockres_get(res);
@@ -138,8 +136,8 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
138 dlm->purge_count++; 136 dlm->purge_count++;
139 } 137 }
140 } else if (!list_empty(&res->purge)) { 138 } else if (!list_empty(&res->purge)) {
141 mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n", 139 mlog(0, "%s: Removing res %.*s from purge list\n",
142 res->lockname.len, res->lockname.name, res, res->owner); 140 dlm->name, res->lockname.len, res->lockname.name);
143 141
144 list_del_init(&res->purge); 142 list_del_init(&res->purge);
145 dlm_lockres_put(res); 143 dlm_lockres_put(res);
@@ -150,7 +148,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
150void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, 148void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
151 struct dlm_lock_resource *res) 149 struct dlm_lock_resource *res)
152{ 150{
153 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
154 spin_lock(&dlm->spinlock); 151 spin_lock(&dlm->spinlock);
155 spin_lock(&res->spinlock); 152 spin_lock(&res->spinlock);
156 153
@@ -171,9 +168,8 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
171 168
172 master = (res->owner == dlm->node_num); 169 master = (res->owner == dlm->node_num);
173 170
174 171 mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name,
175 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, 172 res->lockname.len, res->lockname.name, master);
176 res->lockname.name, master);
177 173
178 if (!master) { 174 if (!master) {
179 res->state |= DLM_LOCK_RES_DROPPING_REF; 175 res->state |= DLM_LOCK_RES_DROPPING_REF;
@@ -189,27 +185,25 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
189 /* clear our bit from the master's refmap, ignore errors */ 185 /* clear our bit from the master's refmap, ignore errors */
190 ret = dlm_drop_lockres_ref(dlm, res); 186 ret = dlm_drop_lockres_ref(dlm, res);
191 if (ret < 0) { 187 if (ret < 0) {
192 mlog_errno(ret); 188 mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
189 res->lockname.len, res->lockname.name, ret);
193 if (!dlm_is_host_down(ret)) 190 if (!dlm_is_host_down(ret))
194 BUG(); 191 BUG();
195 } 192 }
196 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
197 dlm->name, res->lockname.len, res->lockname.name, ret);
198 spin_lock(&dlm->spinlock); 193 spin_lock(&dlm->spinlock);
199 spin_lock(&res->spinlock); 194 spin_lock(&res->spinlock);
200 } 195 }
201 196
202 if (!list_empty(&res->purge)) { 197 if (!list_empty(&res->purge)) {
203 mlog(0, "removing lockres %.*s:%p from purgelist, " 198 mlog(0, "%s: Removing res %.*s from purgelist, master %d\n",
204 "master = %d\n", res->lockname.len, res->lockname.name, 199 dlm->name, res->lockname.len, res->lockname.name, master);
205 res, master);
206 list_del_init(&res->purge); 200 list_del_init(&res->purge);
207 dlm_lockres_put(res); 201 dlm_lockres_put(res);
208 dlm->purge_count--; 202 dlm->purge_count--;
209 } 203 }
210 204
211 if (!__dlm_lockres_unused(res)) { 205 if (!__dlm_lockres_unused(res)) {
212 mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n", 206 mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
213 dlm->name, res->lockname.len, res->lockname.name); 207 dlm->name, res->lockname.len, res->lockname.name);
214 __dlm_print_one_lock_resource(res); 208 __dlm_print_one_lock_resource(res);
215 BUG(); 209 BUG();
@@ -266,10 +260,10 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
266 unused = __dlm_lockres_unused(lockres); 260 unused = __dlm_lockres_unused(lockres);
267 if (!unused || 261 if (!unused ||
268 (lockres->state & DLM_LOCK_RES_MIGRATING)) { 262 (lockres->state & DLM_LOCK_RES_MIGRATING)) {
269 mlog(0, "lockres %s:%.*s: is in use or " 263 mlog(0, "%s: res %.*s is in use or being remastered, "
270 "being remastered, used %d, state %d\n", 264 "used %d, state %d\n", dlm->name,
271 dlm->name, lockres->lockname.len, 265 lockres->lockname.len, lockres->lockname.name,
272 lockres->lockname.name, !unused, lockres->state); 266 !unused, lockres->state);
273 list_move_tail(&dlm->purge_list, &lockres->purge); 267 list_move_tail(&dlm->purge_list, &lockres->purge);
274 spin_unlock(&lockres->spinlock); 268 spin_unlock(&lockres->spinlock);
275 continue; 269 continue;
@@ -296,15 +290,12 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
296 struct list_head *head; 290 struct list_head *head;
297 int can_grant = 1; 291 int can_grant = 1;
298 292
299 //mlog(0, "res->lockname.len=%d\n", res->lockname.len); 293 /*
300 //mlog(0, "res->lockname.name=%p\n", res->lockname.name); 294 * Because this function is called with the lockres
301 //mlog(0, "shuffle res %.*s\n", res->lockname.len,
302 // res->lockname.name);
303
304 /* because this function is called with the lockres
305 * spinlock, and because we know that it is not migrating/ 295 * spinlock, and because we know that it is not migrating/
306 * recovering/in-progress, it is fine to reserve asts and 296 * recovering/in-progress, it is fine to reserve asts and
307 * basts right before queueing them all throughout */ 297 * basts right before queueing them all throughout
298 */
308 assert_spin_locked(&dlm->ast_lock); 299 assert_spin_locked(&dlm->ast_lock);
309 assert_spin_locked(&res->spinlock); 300 assert_spin_locked(&res->spinlock);
310 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| 301 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
@@ -314,13 +305,13 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
314converting: 305converting:
315 if (list_empty(&res->converting)) 306 if (list_empty(&res->converting))
316 goto blocked; 307 goto blocked;
317 mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len, 308 mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name,
318 res->lockname.name); 309 res->lockname.len, res->lockname.name);
319 310
320 target = list_entry(res->converting.next, struct dlm_lock, list); 311 target = list_entry(res->converting.next, struct dlm_lock, list);
321 if (target->ml.convert_type == LKM_IVMODE) { 312 if (target->ml.convert_type == LKM_IVMODE) {
322 mlog(ML_ERROR, "%.*s: converting a lock with no " 313 mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n",
323 "convert_type!\n", res->lockname.len, res->lockname.name); 314 dlm->name, res->lockname.len, res->lockname.name);
324 BUG(); 315 BUG();
325 } 316 }
326 head = &res->granted; 317 head = &res->granted;
@@ -365,9 +356,12 @@ converting:
365 spin_lock(&target->spinlock); 356 spin_lock(&target->spinlock);
366 BUG_ON(target->ml.highest_blocked != LKM_IVMODE); 357 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
367 358
368 mlog(0, "calling ast for converting lock: %.*s, have: %d, " 359 mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type "
369 "granting: %d, node: %u\n", res->lockname.len, 360 "%d => %d, node %u\n", dlm->name, res->lockname.len,
370 res->lockname.name, target->ml.type, 361 res->lockname.name,
362 dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
363 dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
364 target->ml.type,
371 target->ml.convert_type, target->ml.node); 365 target->ml.convert_type, target->ml.node);
372 366
373 target->ml.type = target->ml.convert_type; 367 target->ml.type = target->ml.convert_type;
@@ -428,11 +422,14 @@ blocked:
428 spin_lock(&target->spinlock); 422 spin_lock(&target->spinlock);
429 BUG_ON(target->ml.highest_blocked != LKM_IVMODE); 423 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
430 424
431 mlog(0, "calling ast for blocked lock: %.*s, granting: %d, " 425 mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, "
432 "node: %u\n", res->lockname.len, res->lockname.name, 426 "node %u\n", dlm->name, res->lockname.len,
427 res->lockname.name,
428 dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
429 dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
433 target->ml.type, target->ml.node); 430 target->ml.type, target->ml.node);
434 431
435 // target->ml.type is already correct 432 /* target->ml.type is already correct */
436 list_move_tail(&target->list, &res->granted); 433 list_move_tail(&target->list, &res->granted);
437 434
438 BUG_ON(!target->lksb); 435 BUG_ON(!target->lksb);
@@ -453,7 +450,6 @@ leave:
453/* must have NO locks when calling this with res !=NULL * */ 450/* must have NO locks when calling this with res !=NULL * */
454void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 451void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
455{ 452{
456 mlog_entry("dlm=%p, res=%p\n", dlm, res);
457 if (res) { 453 if (res) {
458 spin_lock(&dlm->spinlock); 454 spin_lock(&dlm->spinlock);
459 spin_lock(&res->spinlock); 455 spin_lock(&res->spinlock);
@@ -466,8 +462,6 @@ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
466 462
467void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 463void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
468{ 464{
469 mlog_entry("dlm=%p, res=%p\n", dlm, res);
470
471 assert_spin_locked(&dlm->spinlock); 465 assert_spin_locked(&dlm->spinlock);
472 assert_spin_locked(&res->spinlock); 466 assert_spin_locked(&res->spinlock);
473 467
@@ -484,13 +478,16 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
484 res->state |= DLM_LOCK_RES_DIRTY; 478 res->state |= DLM_LOCK_RES_DIRTY;
485 } 479 }
486 } 480 }
481
482 mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
483 res->lockname.name);
487} 484}
488 485
489 486
490/* Launch the NM thread for the mounted volume */ 487/* Launch the NM thread for the mounted volume */
491int dlm_launch_thread(struct dlm_ctxt *dlm) 488int dlm_launch_thread(struct dlm_ctxt *dlm)
492{ 489{
493 mlog(0, "starting dlm thread...\n"); 490 mlog(0, "Starting dlm_thread...\n");
494 491
495 dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); 492 dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
496 if (IS_ERR(dlm->dlm_thread_task)) { 493 if (IS_ERR(dlm->dlm_thread_task)) {
@@ -505,7 +502,7 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
505void dlm_complete_thread(struct dlm_ctxt *dlm) 502void dlm_complete_thread(struct dlm_ctxt *dlm)
506{ 503{
507 if (dlm->dlm_thread_task) { 504 if (dlm->dlm_thread_task) {
508 mlog(ML_KTHREAD, "waiting for dlm thread to exit\n"); 505 mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n");
509 kthread_stop(dlm->dlm_thread_task); 506 kthread_stop(dlm->dlm_thread_task);
510 dlm->dlm_thread_task = NULL; 507 dlm->dlm_thread_task = NULL;
511 } 508 }
@@ -536,7 +533,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
536 /* get an extra ref on lock */ 533 /* get an extra ref on lock */
537 dlm_lock_get(lock); 534 dlm_lock_get(lock);
538 res = lock->lockres; 535 res = lock->lockres;
539 mlog(0, "delivering an ast for this lockres\n"); 536 mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, "
537 "node %u\n", dlm->name, res->lockname.len,
538 res->lockname.name,
539 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
540 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
541 lock->ml.type, lock->ml.node);
540 542
541 BUG_ON(!lock->ast_pending); 543 BUG_ON(!lock->ast_pending);
542 544
@@ -557,9 +559,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
557 /* possible that another ast was queued while 559 /* possible that another ast was queued while
558 * we were delivering the last one */ 560 * we were delivering the last one */
559 if (!list_empty(&lock->ast_list)) { 561 if (!list_empty(&lock->ast_list)) {
560 mlog(0, "aha another ast got queued while " 562 mlog(0, "%s: res %.*s, AST queued while flushing last "
561 "we were finishing the last one. will " 563 "one\n", dlm->name, res->lockname.len,
562 "keep the ast_pending flag set.\n"); 564 res->lockname.name);
563 } else 565 } else
564 lock->ast_pending = 0; 566 lock->ast_pending = 0;
565 567
@@ -590,8 +592,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
590 dlm_lock_put(lock); 592 dlm_lock_put(lock);
591 spin_unlock(&dlm->ast_lock); 593 spin_unlock(&dlm->ast_lock);
592 594
593 mlog(0, "delivering a bast for this lockres " 595 mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, "
594 "(blocked = %d\n", hi); 596 "blocked %d, node %u\n",
597 dlm->name, res->lockname.len, res->lockname.name,
598 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
599 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
600 hi, lock->ml.node);
595 601
596 if (lock->ml.node != dlm->node_num) { 602 if (lock->ml.node != dlm->node_num) {
597 ret = dlm_send_proxy_bast(dlm, res, lock, hi); 603 ret = dlm_send_proxy_bast(dlm, res, lock, hi);
@@ -605,9 +611,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
605 /* possible that another bast was queued while 611 /* possible that another bast was queued while
606 * we were delivering the last one */ 612 * we were delivering the last one */
607 if (!list_empty(&lock->bast_list)) { 613 if (!list_empty(&lock->bast_list)) {
608 mlog(0, "aha another bast got queued while " 614 mlog(0, "%s: res %.*s, BAST queued while flushing last "
609 "we were finishing the last one. will " 615 "one\n", dlm->name, res->lockname.len,
610 "keep the bast_pending flag set.\n"); 616 res->lockname.name);
611 } else 617 } else
612 lock->bast_pending = 0; 618 lock->bast_pending = 0;
613 619
@@ -675,11 +681,12 @@ static int dlm_thread(void *data)
675 spin_lock(&res->spinlock); 681 spin_lock(&res->spinlock);
676 if (res->owner != dlm->node_num) { 682 if (res->owner != dlm->node_num) {
677 __dlm_print_one_lock_resource(res); 683 __dlm_print_one_lock_resource(res);
678 mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n", 684 mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d,"
679 res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no", 685 " dirty %d\n", dlm->name,
680 res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no", 686 !!(res->state & DLM_LOCK_RES_IN_PROGRESS),
681 res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no", 687 !!(res->state & DLM_LOCK_RES_MIGRATING),
682 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); 688 !!(res->state & DLM_LOCK_RES_RECOVERING),
689 !!(res->state & DLM_LOCK_RES_DIRTY));
683 } 690 }
684 BUG_ON(res->owner != dlm->node_num); 691 BUG_ON(res->owner != dlm->node_num);
685 692
@@ -693,8 +700,8 @@ static int dlm_thread(void *data)
693 res->state &= ~DLM_LOCK_RES_DIRTY; 700 res->state &= ~DLM_LOCK_RES_DIRTY;
694 spin_unlock(&res->spinlock); 701 spin_unlock(&res->spinlock);
695 spin_unlock(&dlm->ast_lock); 702 spin_unlock(&dlm->ast_lock);
696 mlog(0, "delaying list shuffling for in-" 703 mlog(0, "%s: res %.*s, inprogress, delay list "
697 "progress lockres %.*s, state=%d\n", 704 "shuffle, state %d\n", dlm->name,
698 res->lockname.len, res->lockname.name, 705 res->lockname.len, res->lockname.name,
699 res->state); 706 res->state);
700 delay = 1; 707 delay = 1;
@@ -706,10 +713,6 @@ static int dlm_thread(void *data)
706 * spinlock and do NOT have the dlm lock. 713 * spinlock and do NOT have the dlm lock.
707 * safe to reserve/queue asts and run the lists. */ 714 * safe to reserve/queue asts and run the lists. */
708 715
709 mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
710 "res=%.*s\n", dlm->name,
711 res->lockname.len, res->lockname.name);
712
713 /* called while holding lockres lock */ 716 /* called while holding lockres lock */
714 dlm_shuffle_lists(dlm, res); 717 dlm_shuffle_lists(dlm, res);
715 res->state &= ~DLM_LOCK_RES_DIRTY; 718 res->state &= ~DLM_LOCK_RES_DIRTY;
@@ -733,7 +736,8 @@ in_progress:
733 /* unlikely, but we may need to give time to 736 /* unlikely, but we may need to give time to
734 * other tasks */ 737 * other tasks */
735 if (!--n) { 738 if (!--n) {
736 mlog(0, "throttling dlm_thread\n"); 739 mlog(0, "%s: Throttling dlm thread\n",
740 dlm->name);
737 break; 741 break;
738 } 742 }
739 } 743 }
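The dlmthread.c changes are a logging cleanup: mlog_entry() calls and ad-hoc messages converge on one "domain: res name" convention, and a lock is identified by node:sequence decoded from its big-endian cookie. An illustrative helper capturing the convention (not a function added by the patch):

static void log_lock_event(struct dlm_ctxt *dlm,
			   struct dlm_lock_resource *res,
			   struct dlm_lock *lock, const char *event)
{
	u64 cookie = be64_to_cpu(lock->ml.cookie);

	mlog(0, "%s: res %.*s, %s lock %u:%llu, type %d, node %u\n",
	     dlm->name, res->lockname.len, res->lockname.name, event,
	     dlm_get_lock_cookie_node(cookie),
	     dlm_get_lock_cookie_seq(cookie),
	     lock->ml.type, lock->ml.node);
}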
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 6adafa576065..5dbc3062b4fd 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -137,9 +137,7 @@ check_gen:
137 } 137 }
138 138
139 result = d_obtain_alias(inode); 139 result = d_obtain_alias(inode);
140 if (!IS_ERR(result)) 140 if (IS_ERR(result))
141 d_set_d_op(result, &ocfs2_dentry_ops);
142 else
143 mlog_errno(PTR_ERR(result)); 141 mlog_errno(PTR_ERR(result));
144 142
145bail: 143bail:
@@ -175,8 +173,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
175 } 173 }
176 174
177 parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); 175 parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
178 if (!IS_ERR(parent))
179 d_set_d_op(parent, &ocfs2_dentry_ops);
180 176
181bail_unlock: 177bail_unlock:
182 ocfs2_inode_unlock(dir, 0); 178 ocfs2_inode_unlock(dir, 0);
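The export.c deletions are one piece of the s_d_op conversion completed in super.c below: once the superblock carries the dentry operations, d_obtain_alias() attaches them automatically. Before/after shape, condensed for illustration:

/* before: every caller tagged the dentry by hand */
result = d_obtain_alias(inode);
if (!IS_ERR(result))
	d_set_d_op(result, &ocfs2_dentry_ops);

/* after: sb->s_d_op is inherited, only the error report remains */
result = d_obtain_alias(inode);
if (IS_ERR(result))
	mlog_errno(PTR_ERR(result));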
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index bdadbae09094..a6651956482e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1989,28 +1989,32 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1989 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1989 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1990} 1990}
1991 1991
1992static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, 1992static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
1993 loff_t len) 1993 loff_t len)
1994{ 1994{
1995 struct inode *inode = file->f_path.dentry->d_inode;
1995 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1996 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1996 struct ocfs2_space_resv sr; 1997 struct ocfs2_space_resv sr;
1997 int change_size = 1; 1998 int change_size = 1;
1999 int cmd = OCFS2_IOC_RESVSP64;
1998 2000
2001 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2002 return -EOPNOTSUPP;
1999 if (!ocfs2_writes_unwritten_extents(osb)) 2003 if (!ocfs2_writes_unwritten_extents(osb))
2000 return -EOPNOTSUPP; 2004 return -EOPNOTSUPP;
2001 2005
2002 if (S_ISDIR(inode->i_mode))
2003 return -ENODEV;
2004
2005 if (mode & FALLOC_FL_KEEP_SIZE) 2006 if (mode & FALLOC_FL_KEEP_SIZE)
2006 change_size = 0; 2007 change_size = 0;
2007 2008
2009 if (mode & FALLOC_FL_PUNCH_HOLE)
2010 cmd = OCFS2_IOC_UNRESVSP64;
2011
2008 sr.l_whence = 0; 2012 sr.l_whence = 0;
2009 sr.l_start = (s64)offset; 2013 sr.l_start = (s64)offset;
2010 sr.l_len = (s64)len; 2014 sr.l_len = (s64)len;
2011 2015
2012 return __ocfs2_change_file_space(NULL, inode, offset, 2016 return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
2013 OCFS2_IOC_RESVSP64, &sr, change_size); 2017 change_size);
2014} 2018}
2015 2019
2016int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, 2020int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
@@ -2606,7 +2610,6 @@ const struct inode_operations ocfs2_file_iops = {
2606 .getxattr = generic_getxattr, 2610 .getxattr = generic_getxattr,
2607 .listxattr = ocfs2_listxattr, 2611 .listxattr = ocfs2_listxattr,
2608 .removexattr = generic_removexattr, 2612 .removexattr = generic_removexattr,
2609 .fallocate = ocfs2_fallocate,
2610 .fiemap = ocfs2_fiemap, 2613 .fiemap = ocfs2_fiemap,
2611}; 2614};
2612 2615
@@ -2638,6 +2641,7 @@ const struct file_operations ocfs2_fops = {
2638 .flock = ocfs2_flock, 2641 .flock = ocfs2_flock,
2639 .splice_read = ocfs2_file_splice_read, 2642 .splice_read = ocfs2_file_splice_read,
2640 .splice_write = ocfs2_file_splice_write, 2643 .splice_write = ocfs2_file_splice_write,
2644 .fallocate = ocfs2_fallocate,
2641}; 2645};
2642 2646
2643const struct file_operations ocfs2_dops = { 2647const struct file_operations ocfs2_dops = {
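ocfs2_fallocate() moves from inode_operations to file_operations, takes the file rather than the inode, and learns FALLOC_FL_PUNCH_HOLE by mapping it onto the existing OCFS2_IOC_UNRESVSP64 path; the S_ISDIR() check can go because the VFS-level do_fallocate() (changed later in this patch) already rejects non-regular files. Generic shape of a file_operations-based handler after this change, with illustrative names that are not ocfs2 code:

static long myfs_fallocate(struct file *file, int mode, loff_t offset,
			   loff_t len)
{
	struct inode *inode = file->f_path.dentry->d_inode;

	/* reject mode bits this filesystem does not implement */
	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;
	/* ... reserve or unreserve [offset, offset + len) on inode ... */
	return 0;
}

static const struct file_operations myfs_fops = {
	.fallocate	= myfs_fallocate,	/* was in inode_operations */
};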
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f935fd6600dd..4068c6c4c6f6 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -434,7 +434,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
434 * #1 and #2 can be simply solved by never taking the lock 434 * #1 and #2 can be simply solved by never taking the lock
435 * here for system files (which are the only type we read 435 * here for system files (which are the only type we read
436 * during mount). It's a heavier approach, but our main 436 * during mount). It's a heavier approach, but our main
437 * concern is user-accesible files anyway. 437 * concern is user-accessible files anyway.
438 * 438 *
439 * #3 works itself out because we'll eventually take the 439 * #3 works itself out because we'll eventually take the
440 * cluster lock before trusting anything anyway. 440 * cluster lock before trusting anything anyway.
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d14cad6e2e41..849fb4a2e814 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -147,7 +147,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
147 spin_unlock(&oi->ip_lock); 147 spin_unlock(&oi->ip_lock);
148 148
149bail_add: 149bail_add:
150 d_set_d_op(dentry, &ocfs2_dentry_ops);
151 ret = d_splice_alias(inode, dentry); 150 ret = d_splice_alias(inode, dentry);
152 151
153 if (inode) { 152 if (inode) {
@@ -415,7 +414,6 @@ static int ocfs2_mknod(struct inode *dir,
415 mlog_errno(status); 414 mlog_errno(status);
416 goto leave; 415 goto leave;
417 } 416 }
418 d_set_d_op(dentry, &ocfs2_dentry_ops);
419 417
420 status = ocfs2_add_entry(handle, dentry, inode, 418 status = ocfs2_add_entry(handle, dentry, inode,
421 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 419 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
@@ -743,7 +741,6 @@ static int ocfs2_link(struct dentry *old_dentry,
743 } 741 }
744 742
745 ihold(inode); 743 ihold(inode);
746 d_set_d_op(dentry, &ocfs2_dentry_ops);
747 d_instantiate(dentry, inode); 744 d_instantiate(dentry, inode);
748 745
749out_commit: 746out_commit:
@@ -1017,8 +1014,11 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1017 * An error return must mean that no cluster locks 1014 * An error return must mean that no cluster locks
1018 * were held on function exit. 1015 * were held on function exit.
1019 */ 1016 */
1020 if (oi1->ip_blkno != oi2->ip_blkno) 1017 if (oi1->ip_blkno != oi2->ip_blkno) {
1021 ocfs2_inode_unlock(inode2, 1); 1018 ocfs2_inode_unlock(inode2, 1);
1019 brelse(*bh2);
1020 *bh2 = NULL;
1021 }
1022 1022
1023 if (status != -ENOENT) 1023 if (status != -ENOENT)
1024 mlog_errno(status); 1024 mlog_errno(status);
@@ -1794,7 +1794,6 @@ static int ocfs2_symlink(struct inode *dir,
1794 mlog_errno(status); 1794 mlog_errno(status);
1795 goto bail; 1795 goto bail;
1796 } 1796 }
1797 d_set_d_op(dentry, &ocfs2_dentry_ops);
1798 1797
1799 status = ocfs2_add_entry(handle, dentry, inode, 1798 status = ocfs2_add_entry(handle, dentry, inode,
1800 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1799 le64_to_cpu(fe->i_blkno), parent_fe_bh,
@@ -2459,7 +2458,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2459 goto out_commit; 2458 goto out_commit;
2460 } 2459 }
2461 2460
2462 d_set_d_op(dentry, &ocfs2_dentry_ops);
2463 d_instantiate(dentry, inode); 2461 d_instantiate(dentry, inode);
2464 status = 0; 2462 status = 0;
2465out_commit: 2463out_commit:
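Besides more d_set_d_op() removals, namei.c fixes the ocfs2_double_lock() error path: when the second inode's cluster lock is dropped, the buffer head read under that lock is now released and cleared too, so the caller can never act on a stale *bh2. The general pattern, condensed (the real function's lock-ordering details are elided):

status = ocfs2_inode_lock(inode2, bh2, 1);
if (status < 0)
	goto bail;
status = ocfs2_inode_lock(inode1, bh1, 1);
if (status < 0 && oi1->ip_blkno != oi2->ip_blkno) {
	/* drop lock 2 and everything materialized under it */
	ocfs2_inode_unlock(inode2, 1);
	brelse(*bh2);
	*bh2 = NULL;
}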
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 70dd3b1798f1..51cd6898e7f1 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -420,6 +420,11 @@ struct ocfs2_super
420 struct inode *osb_tl_inode; 420 struct inode *osb_tl_inode;
421 struct buffer_head *osb_tl_bh; 421 struct buffer_head *osb_tl_bh;
422 struct delayed_work osb_truncate_log_wq; 422 struct delayed_work osb_truncate_log_wq;
423 /*
424 * How many clusters in our truncate log.
425 * It must be protected by osb_tl_inode->i_mutex.
426 */
427 unsigned int truncated_clusters;
423 428
424 struct ocfs2_node_map osb_recovering_orphan_dirs; 429 struct ocfs2_node_map osb_recovering_orphan_dirs;
425 unsigned int *osb_orphan_wipes; 430 unsigned int *osb_orphan_wipes;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 5fed60de7630..71998d4d61d5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1916,7 +1916,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1916 if (res->sr_bg_blkno) { 1916 if (res->sr_bg_blkno) {
1917 /* Attempt to short-circuit the usual search mechanism 1917 /* Attempt to short-circuit the usual search mechanism
1918 * by jumping straight to the most recently used 1918 * by jumping straight to the most recently used
1919 * allocation group. This helps us mantain some 1919 * allocation group. This helps us maintain some
1920 * contiguousness across allocations. */ 1920 * contiguousness across allocations. */
1921 status = ocfs2_search_one_group(ac, handle, bits_wanted, 1921 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1922 min_bits, res, &bits_left); 1922 min_bits, res, &bits_left);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 17ff46fa8a10..38f986d2447e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -993,8 +993,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
993} 993}
994 994
995/* Handle quota on quotactl */ 995/* Handle quota on quotactl */
996static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, 996static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
997 char *path)
998{ 997{
999 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 998 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
1000 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 999 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -1013,7 +1012,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type)
1013} 1012}
1014 1013
1015static const struct quotactl_ops ocfs2_quotactl_ops = { 1014static const struct quotactl_ops ocfs2_quotactl_ops = {
1016 .quota_on = ocfs2_quota_on, 1015 .quota_on_meta = ocfs2_quota_on,
1017 .quota_off = ocfs2_quota_off, 1016 .quota_off = ocfs2_quota_off,
1018 .quota_sync = dquot_quota_sync, 1017 .quota_sync = dquot_quota_sync,
1019 .get_info = dquot_get_dqinfo, 1018 .get_info = dquot_get_dqinfo,
@@ -2097,6 +2096,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2097 2096
2098 sb->s_fs_info = osb; 2097 sb->s_fs_info = osb;
2099 sb->s_op = &ocfs2_sops; 2098 sb->s_op = &ocfs2_sops;
2099 sb->s_d_op = &ocfs2_dentry_ops;
2100 sb->s_export_op = &ocfs2_export_ops; 2100 sb->s_export_op = &ocfs2_export_ops;
2101 sb->s_qcop = &ocfs2_quotactl_ops; 2101 sb->s_qcop = &ocfs2_quotactl_ops;
2102 sb->dq_op = &ocfs2_quota_operations; 2102 sb->dq_op = &ocfs2_quota_operations;
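Two independent changes land in super.c: quota-on switches to the ->quota_on_meta callback, whose (sb, type, format_id) signature drops the userspace path argument because ocfs2 keeps its quota files in internal system inodes, and the superblock now carries the dentry operations, the one-line replacement for all the d_set_d_op() calls removed above. Illustrative mount-time skeleton (names other than the struct fields are placeholders):

static int myfs_fill_super(struct super_block *sb, void *data, int silent)
{
	sb->s_op = &myfs_sops;
	sb->s_d_op = &myfs_dentry_ops;	/* inherited by every new dentry */
	sb->s_export_op = &myfs_export_ops;
	/* ... */
	return 0;
}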
diff --git a/fs/open.c b/fs/open.c
index 4197b9ed023d..e52389e1f05b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -223,7 +223,12 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
223 return -EINVAL; 223 return -EINVAL;
224 224
225 /* Return error if mode is not supported */ 225 /* Return error if mode is not supported */
226 if (mode && !(mode & FALLOC_FL_KEEP_SIZE)) 226 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
227 return -EOPNOTSUPP;
228
229 /* Punch hole must have keep size set */
230 if ((mode & FALLOC_FL_PUNCH_HOLE) &&
231 !(mode & FALLOC_FL_KEEP_SIZE))
227 return -EOPNOTSUPP; 232 return -EOPNOTSUPP;
228 233
229 if (!(file->f_mode & FMODE_WRITE)) 234 if (!(file->f_mode & FMODE_WRITE))
@@ -250,10 +255,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
250 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 255 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
251 return -EFBIG; 256 return -EFBIG;
252 257
253 if (!inode->i_op->fallocate) 258 if (!file->f_op->fallocate)
254 return -EOPNOTSUPP; 259 return -EOPNOTSUPP;
255 260
256 return inode->i_op->fallocate(inode, mode, offset, len); 261 return file->f_op->fallocate(file, mode, offset, len);
257} 262}
258 263
259SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) 264SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
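do_fallocate() now validates the full mode mask, requires FALLOC_FL_KEEP_SIZE whenever FALLOC_FL_PUNCH_HOLE is set, and dispatches through file->f_op->fallocate instead of the old inode_operations hook. From userspace the new capability looks like the sketch below; the mount point and sizes are arbitrary examples:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("/mnt/ocfs2/testfile", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* punch a 1 MiB hole at offset 4 MiB; omitting KEEP_SIZE here
	 * now fails with EOPNOTSUPP per the check added above */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4 << 20, 1 << 20) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}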
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0a8b0ad0c7e2..9c21119512b9 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -237,6 +237,13 @@ ssize_t part_size_show(struct device *dev,
237 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 237 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
238} 238}
239 239
240ssize_t part_ro_show(struct device *dev,
241 struct device_attribute *attr, char *buf)
242{
243 struct hd_struct *p = dev_to_part(dev);
244 return sprintf(buf, "%d\n", p->policy ? 1 : 0);
245}
246
240ssize_t part_alignment_offset_show(struct device *dev, 247ssize_t part_alignment_offset_show(struct device *dev,
241 struct device_attribute *attr, char *buf) 248 struct device_attribute *attr, char *buf)
242{ 249{
@@ -312,6 +319,7 @@ ssize_t part_fail_store(struct device *dev,
312static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); 319static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
313static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); 320static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
314static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 321static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
322static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
315static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); 323static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
316static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, 324static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
317 NULL); 325 NULL);
@@ -326,6 +334,7 @@ static struct attribute *part_attrs[] = {
326 &dev_attr_partition.attr, 334 &dev_attr_partition.attr,
327 &dev_attr_start.attr, 335 &dev_attr_start.attr,
328 &dev_attr_size.attr, 336 &dev_attr_size.attr,
337 &dev_attr_ro.attr,
329 &dev_attr_alignment_offset.attr, 338 &dev_attr_alignment_offset.attr,
330 &dev_attr_discard_alignment.attr, 339 &dev_attr_discard_alignment.attr,
331 &dev_attr_stat.attr, 340 &dev_attr_stat.attr,
@@ -372,6 +381,11 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
372 put_device(part_to_dev(part)); 381 put_device(part_to_dev(part));
373} 382}
374 383
384void __delete_partition(struct hd_struct *part)
385{
386 call_rcu(&part->rcu_head, delete_partition_rcu_cb);
387}
388
375void delete_partition(struct gendisk *disk, int partno) 389void delete_partition(struct gendisk *disk, int partno)
376{ 390{
377 struct disk_part_tbl *ptbl = disk->part_tbl; 391 struct disk_part_tbl *ptbl = disk->part_tbl;
@@ -390,7 +404,7 @@ void delete_partition(struct gendisk *disk, int partno)
390 kobject_put(part->holder_dir); 404 kobject_put(part->holder_dir);
391 device_del(part_to_dev(part)); 405 device_del(part_to_dev(part));
392 406
393 call_rcu(&part->rcu_head, delete_partition_rcu_cb); 407 hd_struct_put(part);
394} 408}
395 409
396static ssize_t whole_disk_show(struct device *dev, 410static ssize_t whole_disk_show(struct device *dev,
@@ -489,6 +503,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
489 if (!dev_get_uevent_suppress(ddev)) 503 if (!dev_get_uevent_suppress(ddev))
490 kobject_uevent(&pdev->kobj, KOBJ_ADD); 504 kobject_uevent(&pdev->kobj, KOBJ_ADD);
491 505
506 hd_ref_init(p);
492 return p; 507 return p;
493 508
494out_free_info: 509out_free_info:
@@ -507,65 +522,6 @@ out_put:
507 return ERR_PTR(err); 522 return ERR_PTR(err);
508} 523}
509 524
510/* Not exported, helper to add_disk(). */
511void register_disk(struct gendisk *disk)
512{
513 struct device *ddev = disk_to_dev(disk);
514 struct block_device *bdev;
515 struct disk_part_iter piter;
516 struct hd_struct *part;
517 int err;
518
519 ddev->parent = disk->driverfs_dev;
520
521 dev_set_name(ddev, disk->disk_name);
522
523 /* delay uevents, until we scanned partition table */
524 dev_set_uevent_suppress(ddev, 1);
525
526 if (device_add(ddev))
527 return;
528 if (!sysfs_deprecated) {
529 err = sysfs_create_link(block_depr, &ddev->kobj,
530 kobject_name(&ddev->kobj));
531 if (err) {
532 device_del(ddev);
533 return;
534 }
535 }
536 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
537 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
538
539 /* No minors to use for partitions */
540 if (!disk_partitionable(disk))
541 goto exit;
542
543 /* No such device (e.g., media were just removed) */
544 if (!get_capacity(disk))
545 goto exit;
546
547 bdev = bdget_disk(disk, 0);
548 if (!bdev)
549 goto exit;
550
551 bdev->bd_invalidated = 1;
552 err = blkdev_get(bdev, FMODE_READ);
553 if (err < 0)
554 goto exit;
555 blkdev_put(bdev, FMODE_READ);
556
557exit:
558 /* announce disk after possible partitions are created */
559 dev_set_uevent_suppress(ddev, 0);
560 kobject_uevent(&ddev->kobj, KOBJ_ADD);
561
562 /* announce possible partitions */
563 disk_part_iter_init(&piter, disk, 0);
564 while ((part = disk_part_iter_next(&piter)))
565 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
566 disk_part_iter_exit(&piter);
567}
568
569static bool disk_unlock_native_capacity(struct gendisk *disk) 525static bool disk_unlock_native_capacity(struct gendisk *disk)
570{ 526{
571 const struct block_device_operations *bdops = disk->fops; 527 const struct block_device_operations *bdops = disk->fops;
@@ -728,33 +684,3 @@ fail:
728} 684}
729 685
730EXPORT_SYMBOL(read_dev_sector); 686EXPORT_SYMBOL(read_dev_sector);
731
732void del_gendisk(struct gendisk *disk)
733{
734 struct disk_part_iter piter;
735 struct hd_struct *part;
736
737 /* invalidate stuff */
738 disk_part_iter_init(&piter, disk,
739 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
740 while ((part = disk_part_iter_next(&piter))) {
741 invalidate_partition(disk, part->partno);
742 delete_partition(disk, part->partno);
743 }
744 disk_part_iter_exit(&piter);
745
746 invalidate_partition(disk, 0);
747 blk_free_devt(disk_to_dev(disk)->devt);
748 set_capacity(disk, 0);
749 disk->flags &= ~GENHD_FL_UP;
750 unlink_gendisk(disk);
751 part_stat_set_all(&disk->part0, 0);
752 disk->part0.stamp = 0;
753
754 kobject_put(disk->part0.holder_dir);
755 kobject_put(disk->slave_dir);
756 disk->driverfs_dev = NULL;
757 if (!sysfs_deprecated)
758 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
759 device_del(disk_to_dev(disk));
760}
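The partitions/check.c hunk does three things: exposes the per-partition read-only flag as a new "ro" sysfs attribute, converts partition teardown from a direct call_rcu() to reference counting (delete_partition() now drops a reference, and the final hd_struct_put() reaches __delete_partition(), which schedules the RCU free), and removes register_disk()/del_gendisk() from this file, presumably consolidating them elsewhere in the block layer. Reading the new attribute from userspace, with an illustrative device name:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/sda/sda1/ro", "r");
	int ro;

	if (f && fscanf(f, "%d", &ro) == 1)
		printf("sda1 is %s\n", ro ? "read-only" : "read-write");
	if (f)
		fclose(f);
	return 0;
}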
diff --git a/fs/pipe.c b/fs/pipe.c
index 68f1f8e4e23b..da42f7db50de 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -441,7 +441,7 @@ redo:
441 break; 441 break;
442 } 442 }
443 if (do_wakeup) { 443 if (do_wakeup) {
444 wake_up_interruptible_sync(&pipe->wait); 444 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
445 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 445 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
446 } 446 }
447 pipe_wait(pipe); 447 pipe_wait(pipe);
@@ -450,7 +450,7 @@ redo:
450 450
451 /* Signal writers asynchronously that there is more room. */ 451 /* Signal writers asynchronously that there is more room. */
452 if (do_wakeup) { 452 if (do_wakeup) {
453 wake_up_interruptible_sync(&pipe->wait); 453 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
454 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 454 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
455 } 455 }
456 if (ret > 0) 456 if (ret > 0)
@@ -612,7 +612,7 @@ redo2:
612 break; 612 break;
613 } 613 }
614 if (do_wakeup) { 614 if (do_wakeup) {
615 wake_up_interruptible_sync(&pipe->wait); 615 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
616 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 616 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
617 do_wakeup = 0; 617 do_wakeup = 0;
618 } 618 }
@@ -623,7 +623,7 @@ redo2:
623out: 623out:
624 mutex_unlock(&inode->i_mutex); 624 mutex_unlock(&inode->i_mutex);
625 if (do_wakeup) { 625 if (do_wakeup) {
626 wake_up_interruptible_sync(&pipe->wait); 626 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
627 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 627 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
628 } 628 }
629 if (ret > 0) 629 if (ret > 0)
@@ -715,7 +715,7 @@ pipe_release(struct inode *inode, int decr, int decw)
715 if (!pipe->readers && !pipe->writers) { 715 if (!pipe->readers && !pipe->writers) {
716 free_pipe_info(inode); 716 free_pipe_info(inode);
717 } else { 717 } else {
718 wake_up_interruptible_sync(&pipe->wait); 718 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
719 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 719 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
720 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 720 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
721 } 721 }
@@ -1004,7 +1004,6 @@ struct file *create_write_pipe(int flags)
1004 goto err_inode; 1004 goto err_inode;
1005 path.mnt = mntget(pipe_mnt); 1005 path.mnt = mntget(pipe_mnt);
1006 1006
1007 d_set_d_op(path.dentry, &pipefs_dentry_operations);
1008 d_instantiate(path.dentry, inode); 1007 d_instantiate(path.dentry, inode);
1009 1008
1010 err = -ENFILE; 1009 err = -ENFILE;
@@ -1266,7 +1265,8 @@ static const struct super_operations pipefs_ops = {
1266static struct dentry *pipefs_mount(struct file_system_type *fs_type, 1265static struct dentry *pipefs_mount(struct file_system_type *fs_type,
1267 int flags, const char *dev_name, void *data) 1266 int flags, const char *dev_name, void *data)
1268{ 1267{
1269 return mount_pseudo(fs_type, "pipe:", &pipefs_ops, PIPEFS_MAGIC); 1268 return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
1269 &pipefs_dentry_operations, PIPEFS_MAGIC);
1270} 1270}
1271 1271
1272static struct file_system_type pipe_fs_type = { 1272static struct file_system_type pipe_fs_type = {
@@ -1292,7 +1292,7 @@ static int __init init_pipe_fs(void)
1292static void __exit exit_pipe_fs(void) 1292static void __exit exit_pipe_fs(void)
1293{ 1293{
1294 unregister_filesystem(&pipe_fs_type); 1294 unregister_filesystem(&pipe_fs_type);
1295 mntput_long(pipe_mnt); 1295 mntput(pipe_mnt);
1296} 1296}
1297 1297
1298fs_initcall(init_pipe_fs); 1298fs_initcall(init_pipe_fs);
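The pipe.c wakeups switch to the _poll variants so the poll mask travels as the wakeup key: wait entries that filter on the key (epoll's callback does) are only woken when the event could satisfy them, rather than on every state change of the shared pipe->wait queue. Conceptually:

/* a reader consumed data; say exactly what became possible */
wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
/* an epoll waiter registered for POLLIN on this pipe stays asleep;
 * one registered for POLLOUT runs */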
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 6a0068841d96..15af6222f8a4 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -1,5 +1,5 @@
1config PROC_FS 1config PROC_FS
2 bool "/proc file system support" if EMBEDDED 2 bool "/proc file system support" if EXPERT
3 default y 3 default y
4 help 4 help
5 This is a virtual file system providing information about the status 5 This is a virtual file system providing information about the status
@@ -40,7 +40,7 @@ config PROC_VMCORE
40 Exports the dump image of crashed kernel in ELF format. 40 Exports the dump image of crashed kernel in ELF format.
41 41
42config PROC_SYSCTL 42config PROC_SYSCTL
43 bool "Sysctl support (/proc/sys)" if EMBEDDED 43 bool "Sysctl support (/proc/sys)" if EXPERT
44 depends on PROC_FS 44 depends on PROC_FS
45 select SYSCTL 45 select SYSCTL
46 default y 46 default y
@@ -61,7 +61,7 @@ config PROC_SYSCTL
61config PROC_PAGE_MONITOR 61config PROC_PAGE_MONITOR
62 default y 62 default y
63 depends on PROC_FS && MMU 63 depends on PROC_FS && MMU
64 bool "Enable /proc page monitoring" if EMBEDDED 64 bool "Enable /proc page monitoring" if EXPERT
65 help 65 help
66 Various /proc files exist to monitor process memory utilization: 66 Various /proc files exist to monitor process memory utilization:
67 /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, 67 /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 288a49e098bf..df434c5f28fb 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -10,12 +10,12 @@ proc-$(CONFIG_MMU) := mmu.o task_mmu.o
10proc-y += inode.o root.o base.o generic.o array.o \ 10proc-y += inode.o root.o base.o generic.o array.o \
11 proc_tty.o 11 proc_tty.o
12proc-y += cmdline.o 12proc-y += cmdline.o
13proc-y += consoles.o
13proc-y += cpuinfo.o 14proc-y += cpuinfo.o
14proc-y += devices.o 15proc-y += devices.o
15proc-y += interrupts.o 16proc-y += interrupts.o
16proc-y += loadavg.o 17proc-y += loadavg.o
17proc-y += meminfo.o 18proc-y += meminfo.o
18proc-y += proc_console.o
19proc-y += stat.o 19proc-y += stat.o
20proc-y += uptime.o 20proc-y += uptime.o
21proc-y += version.o 21proc-y += version.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fff6572676ae..df2b703b9d0f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -95,7 +95,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
95 95
96 get_task_comm(tcomm, p); 96 get_task_comm(tcomm, p);
97 97
98 seq_printf(m, "Name:\t"); 98 seq_puts(m, "Name:\t");
99 end = m->buf + m->size; 99 end = m->buf + m->size;
100 buf = m->buf + m->count; 100 buf = m->buf + m->count;
101 name = tcomm; 101 name = tcomm;
@@ -122,7 +122,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
122 buf++; 122 buf++;
123 } 123 }
124 m->count = buf - m->buf; 124 m->count = buf - m->buf;
125 seq_printf(m, "\n"); 125 seq_putc(m, '\n');
126} 126}
127 127
128/* 128/*
@@ -208,7 +208,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
208 seq_printf(m, "%d ", GROUP_AT(group_info, g)); 208 seq_printf(m, "%d ", GROUP_AT(group_info, g));
209 put_cred(cred); 209 put_cred(cred);
210 210
211 seq_printf(m, "\n"); 211 seq_putc(m, '\n');
212} 212}
213 213
214static void render_sigset_t(struct seq_file *m, const char *header, 214static void render_sigset_t(struct seq_file *m, const char *header,
@@ -216,7 +216,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
216{ 216{
217 int i; 217 int i;
218 218
219 seq_printf(m, "%s", header); 219 seq_puts(m, header);
220 220
221 i = _NSIG; 221 i = _NSIG;
222 do { 222 do {
@@ -230,7 +230,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
230 seq_printf(m, "%x", x); 230 seq_printf(m, "%x", x);
231 } while (i >= 4); 231 } while (i >= 4);
232 232
233 seq_printf(m, "\n"); 233 seq_putc(m, '\n');
234} 234}
235 235
236static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, 236static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
@@ -291,12 +291,12 @@ static void render_cap_t(struct seq_file *m, const char *header,
291{ 291{
292 unsigned __capi; 292 unsigned __capi;
293 293
294 seq_printf(m, "%s", header); 294 seq_puts(m, header);
295 CAP_FOR_EACH_U32(__capi) { 295 CAP_FOR_EACH_U32(__capi) {
296 seq_printf(m, "%08x", 296 seq_printf(m, "%08x",
297 a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); 297 a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]);
298 } 298 }
299 seq_printf(m, "\n"); 299 seq_putc(m, '\n');
300} 300}
301 301
302static inline void task_cap(struct seq_file *m, struct task_struct *p) 302static inline void task_cap(struct seq_file *m, struct task_struct *p)
@@ -329,12 +329,12 @@ static inline void task_context_switch_counts(struct seq_file *m,
329 329
330static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 330static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
331{ 331{
332 seq_printf(m, "Cpus_allowed:\t"); 332 seq_puts(m, "Cpus_allowed:\t");
333 seq_cpumask(m, &task->cpus_allowed); 333 seq_cpumask(m, &task->cpus_allowed);
334 seq_printf(m, "\n"); 334 seq_putc(m, '\n');
335 seq_printf(m, "Cpus_allowed_list:\t"); 335 seq_puts(m, "Cpus_allowed_list:\t");
336 seq_cpumask_list(m, &task->cpus_allowed); 336 seq_cpumask_list(m, &task->cpus_allowed);
337 seq_printf(m, "\n"); 337 seq_putc(m, '\n');
338} 338}
339 339
340int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 340int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
@@ -535,15 +535,15 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
535int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, 535int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
536 struct pid *pid, struct task_struct *task) 536 struct pid *pid, struct task_struct *task)
537{ 537{
538 int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; 538 unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
539 struct mm_struct *mm = get_task_mm(task); 539 struct mm_struct *mm = get_task_mm(task);
540 540
541 if (mm) { 541 if (mm) {
542 size = task_statm(mm, &shared, &text, &data, &resident); 542 size = task_statm(mm, &shared, &text, &data, &resident);
543 mmput(mm); 543 mmput(mm);
544 } 544 }
545 seq_printf(m, "%d %d %d %d %d %d %d\n", 545 seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
546 size, resident, shared, text, lib, data, 0); 546 size, resident, shared, text, data);
547 547
548 return 0; 548 return 0;
549} 549}
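proc/array.c swaps seq_printf() for seq_puts()/seq_putc() wherever the output is a constant string or single character, skipping format parsing, and task_statm() switches to unsigned long counts while the unused lib and dirty columns are emitted as literal zeros. The output pattern in miniature (my_show is an illustrative name):

static int my_show(struct seq_file *m, void *v)
{
	seq_puts(m, "Name:\t");		/* constant string: no format parse */
	seq_printf(m, "%d", 42);	/* real formatting keeps seq_printf */
	seq_putc(m, '\n');		/* single character */
	return 0;
}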
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b20962c71a52..9d096e82b201 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -373,26 +373,20 @@ static int lstats_show_proc(struct seq_file *m, void *v)
373 return -ESRCH; 373 return -ESRCH;
374 seq_puts(m, "Latency Top version : v0.1\n"); 374 seq_puts(m, "Latency Top version : v0.1\n");
375 for (i = 0; i < 32; i++) { 375 for (i = 0; i < 32; i++) {
376 if (task->latency_record[i].backtrace[0]) { 376 struct latency_record *lr = &task->latency_record[i];
377 if (lr->backtrace[0]) {
377 int q; 378 int q;
378 seq_printf(m, "%i %li %li ", 379 seq_printf(m, "%i %li %li",
379 task->latency_record[i].count, 380 lr->count, lr->time, lr->max);
380 task->latency_record[i].time,
381 task->latency_record[i].max);
382 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 381 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
383 char sym[KSYM_SYMBOL_LEN]; 382 unsigned long bt = lr->backtrace[q];
384 char *c; 383 if (!bt)
385 if (!task->latency_record[i].backtrace[q])
386 break; 384 break;
387 if (task->latency_record[i].backtrace[q] == ULONG_MAX) 385 if (bt == ULONG_MAX)
388 break; 386 break;
389 sprint_symbol(sym, task->latency_record[i].backtrace[q]); 387 seq_printf(m, " %ps", (void *)bt);
390 c = strchr(sym, '+');
391 if (c)
392 *c = 0;
393 seq_printf(m, "%s ", sym);
394 } 388 }
395 seq_printf(m, "\n"); 389 seq_putc(m, '\n');
396 } 390 }
397 391
398 } 392 }
@@ -751,14 +745,7 @@ static int proc_single_show(struct seq_file *m, void *v)
751 745
752static int proc_single_open(struct inode *inode, struct file *filp) 746static int proc_single_open(struct inode *inode, struct file *filp)
753{ 747{
754 int ret; 748 return single_open(filp, proc_single_show, inode);
755 ret = single_open(filp, proc_single_show, NULL);
756 if (!ret) {
757 struct seq_file *m = filp->private_data;
758
759 m->private = inode;
760 }
761 return ret;
762} 749}
763 750
764static const struct file_operations proc_single_file_operations = { 751static const struct file_operations proc_single_file_operations = {
@@ -1164,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1164 goto err_task_lock; 1151 goto err_task_lock;
1165 } 1152 }
1166 1153
1167 if (oom_score_adj < task->signal->oom_score_adj && 1154 if (oom_score_adj < task->signal->oom_score_adj_min &&
1168 !capable(CAP_SYS_RESOURCE)) { 1155 !capable(CAP_SYS_RESOURCE)) {
1169 err = -EACCES; 1156 err = -EACCES;
1170 goto err_sighand; 1157 goto err_sighand;
@@ -1177,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1177 atomic_dec(&task->mm->oom_disable_count); 1164 atomic_dec(&task->mm->oom_disable_count);
1178 } 1165 }
1179 task->signal->oom_score_adj = oom_score_adj; 1166 task->signal->oom_score_adj = oom_score_adj;
1167 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
1168 task->signal->oom_score_adj_min = oom_score_adj;
1180 /* 1169 /*
1181 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is 1170 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
1182 * always attainable. 1171 * always attainable.
@@ -1386,15 +1375,7 @@ sched_write(struct file *file, const char __user *buf,
1386 1375
1387static int sched_open(struct inode *inode, struct file *filp) 1376static int sched_open(struct inode *inode, struct file *filp)
1388{ 1377{
1389 int ret; 1378 return single_open(filp, sched_show, inode);
1390
1391 ret = single_open(filp, sched_show, NULL);
1392 if (!ret) {
1393 struct seq_file *m = filp->private_data;
1394
1395 m->private = inode;
1396 }
1397 return ret;
1398} 1379}
1399 1380
1400static const struct file_operations proc_pid_sched_operations = { 1381static const struct file_operations proc_pid_sched_operations = {
@@ -1530,15 +1511,7 @@ static int comm_show(struct seq_file *m, void *v)
1530 1511
1531static int comm_open(struct inode *inode, struct file *filp) 1512static int comm_open(struct inode *inode, struct file *filp)
1532{ 1513{
1533 int ret; 1514 return single_open(filp, comm_show, inode);
1534
1535 ret = single_open(filp, comm_show, NULL);
1536 if (!ret) {
1537 struct seq_file *m = filp->private_data;
1538
1539 m->private = inode;
1540 }
1541 return ret;
1542} 1515}
1543 1516
1544static const struct file_operations proc_pid_set_comm_operations = { 1517static const struct file_operations proc_pid_set_comm_operations = {
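Two simplifications recur through proc/base.c: single_open() already stores its third argument in seq_file->private, so the open-then-patch-m->private dance collapses to a single call, and the %ps vsnprintf extension prints a symbol name without its offset, replacing the sprint_symbol() buffer plus the manual '+' truncation. In miniature (my_show/my_open are illustrative names):

static int my_open(struct inode *inode, struct file *filp)
{
	/* inode lands in ((struct seq_file *)filp->private_data)->private */
	return single_open(filp, my_show, inode);
}

/* symbol name only, no +offset/size suffix: */
seq_printf(m, " %ps", (void *)lr->backtrace[q]);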
diff --git a/fs/proc/proc_console.c b/fs/proc/consoles.c
index 8a707609f528..eafc22ab1fdd 100644
--- a/fs/proc/proc_console.c
+++ b/fs/proc/consoles.c
@@ -106,9 +106,9 @@ static const struct file_operations proc_consoles_operations = {
106 .release = seq_release, 106 .release = seq_release,
107}; 107};
108 108
109static int register_proc_consoles(void) 109static int __init proc_consoles_init(void)
110{ 110{
111 proc_create("consoles", 0, NULL, &proc_consoles_operations); 111 proc_create("consoles", 0, NULL, &proc_consoles_operations);
112 return 0; 112 return 0;
113} 113}
114module_init(register_proc_consoles); 114module_init(proc_consoles_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 59ee7da959c9..b14347167c35 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -9,14 +9,14 @@ static int devinfo_show(struct seq_file *f, void *v)
9 9
10 if (i < CHRDEV_MAJOR_HASH_SIZE) { 10 if (i < CHRDEV_MAJOR_HASH_SIZE) {
11 if (i == 0) 11 if (i == 0)
12 seq_printf(f, "Character devices:\n"); 12 seq_puts(f, "Character devices:\n");
13 chrdev_show(f, i); 13 chrdev_show(f, i);
14 } 14 }
15#ifdef CONFIG_BLOCK 15#ifdef CONFIG_BLOCK
16 else { 16 else {
17 i -= CHRDEV_MAJOR_HASH_SIZE; 17 i -= CHRDEV_MAJOR_HASH_SIZE;
18 if (i == 0) 18 if (i == 0)
19 seq_printf(f, "\nBlock devices:\n"); 19 seq_puts(f, "\nBlock devices:\n");
20 blkdev_show(f, i); 20 blkdev_show(f, i);
21 } 21 }
22#endif 22#endif
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index f766be29d2c7..01e07f2a188f 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -425,13 +425,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
425 if (de->namelen != dentry->d_name.len) 425 if (de->namelen != dentry->d_name.len)
426 continue; 426 continue;
427 if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { 427 if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
428 unsigned int ino;
429
430 ino = de->low_ino;
431 pde_get(de); 428 pde_get(de);
432 spin_unlock(&proc_subdir_lock); 429 spin_unlock(&proc_subdir_lock);
433 error = -EINVAL; 430 error = -EINVAL;
434 inode = proc_get_inode(dir->i_sb, ino, de); 431 inode = proc_get_inode(dir->i_sb, de);
435 goto out_unlock; 432 goto out_unlock;
436 } 433 }
437 } 434 }
@@ -768,12 +765,7 @@ EXPORT_SYMBOL(proc_create_data);
768 765
769static void free_proc_entry(struct proc_dir_entry *de) 766static void free_proc_entry(struct proc_dir_entry *de)
770{ 767{
771 unsigned int ino = de->low_ino; 768 release_inode_number(de->low_ino);
772
773 if (ino < PROC_DYNAMIC_FIRST)
774 return;
775
776 release_inode_number(ino);
777 769
778 if (S_ISLNK(de->mode)) 770 if (S_ISLNK(de->mode))
779 kfree(de->data); 771 kfree(de->data);
@@ -834,12 +826,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
834 826
835 wait_for_completion(de->pde_unload_completion); 827 wait_for_completion(de->pde_unload_completion);
836 828
837 goto continue_removing; 829 spin_lock(&de->pde_unload_lock);
838 } 830 }
839 spin_unlock(&de->pde_unload_lock);
840 831
841continue_removing:
842 spin_lock(&de->pde_unload_lock);
843 while (!list_empty(&de->pde_openers)) { 832 while (!list_empty(&de->pde_openers)) {
844 struct pde_opener *pdeo; 833 struct pde_opener *pdeo;
845 834
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 6bcb926b101b..176ce4cda68a 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -416,12 +416,11 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
416}; 416};
417#endif 417#endif
418 418
419struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, 419struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
420 struct proc_dir_entry *de)
421{ 420{
422 struct inode * inode; 421 struct inode * inode;
423 422
424 inode = iget_locked(sb, ino); 423 inode = iget_locked(sb, de->low_ino);
425 if (!inode) 424 if (!inode)
426 return NULL; 425 return NULL;
427 if (inode->i_state & I_NEW) { 426 if (inode->i_state & I_NEW) {
@@ -471,7 +470,7 @@ int proc_fill_super(struct super_block *s)
471 s->s_time_gran = 1; 470 s->s_time_gran = 1;
472 471
473 pde_get(&proc_root); 472 pde_get(&proc_root);
474 root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); 473 root_inode = proc_get_inode(s, &proc_root);
475 if (!root_inode) 474 if (!root_inode)
476 goto out_no_root; 475 goto out_no_root;
477 root_inode->i_uid = 0; 476 root_inode->i_uid = 0;
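After the signature change the inode number is always taken from the entry itself. A condensed sketch of the iget_locked() idiom that proc_get_inode() follows, with the initialisation details elided:

struct inode *example_get_inode(struct super_block *sb,
				struct proc_dir_entry *de)
{
	struct inode *inode = iget_locked(sb, de->low_ino);

	if (!inode)
		return NULL;
	if (inode->i_state & I_NEW) {
		inode->i_mtime = inode->i_atime = inode->i_ctime =
			CURRENT_TIME;
		/* ... attach de and copy i_mode/i_uid/i_gid/i_op/i_fop ... */
		unlock_new_inode(inode);
	}
	return inode;
}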
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1f24a3eddd12..9ad561ded409 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -96,7 +96,8 @@ extern spinlock_t proc_subdir_lock;
96struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); 96struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
97int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); 97int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
98unsigned long task_vsize(struct mm_struct *); 98unsigned long task_vsize(struct mm_struct *);
99int task_statm(struct mm_struct *, int *, int *, int *, int *); 99unsigned long task_statm(struct mm_struct *,
100 unsigned long *, unsigned long *, unsigned long *, unsigned long *);
100void task_mem(struct seq_file *, struct mm_struct *); 101void task_mem(struct seq_file *, struct mm_struct *);
101 102
102static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) 103static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
@@ -108,7 +109,7 @@ void pde_put(struct proc_dir_entry *pde);
108 109
109extern struct vfsmount *proc_mnt; 110extern struct vfsmount *proc_mnt;
110int proc_fill_super(struct super_block *); 111int proc_fill_super(struct super_block *);
111struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); 112struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
112 113
113/* 114/*
114 * These are generic /proc routines that use the internal 115 * These are generic /proc routines that use the internal
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6f37c391468d..d245cb23dd72 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -558,7 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
558static const struct file_operations proc_kcore_operations = { 558static const struct file_operations proc_kcore_operations = {
559 .read = read_kcore, 559 .read = read_kcore,
560 .open = open_kcore, 560 .open = open_kcore,
561 .llseek = generic_file_llseek, 561 .llseek = default_llseek,
562}; 562};
563 563
564#ifdef CONFIG_MEMORY_HOTPLUG 564#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a65239cfd97e..ed257d141568 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
101#ifdef CONFIG_MEMORY_FAILURE 101#ifdef CONFIG_MEMORY_FAILURE
102 "HardwareCorrupted: %5lu kB\n" 102 "HardwareCorrupted: %5lu kB\n"
103#endif 103#endif
104#ifdef CONFIG_TRANSPARENT_HUGEPAGE
105 "AnonHugePages: %8lu kB\n"
106#endif
104 , 107 ,
105 K(i.totalram), 108 K(i.totalram),
106 K(i.freeram), 109 K(i.freeram),
@@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
128 K(i.freeswap), 131 K(i.freeswap),
129 K(global_page_state(NR_FILE_DIRTY)), 132 K(global_page_state(NR_FILE_DIRTY)),
130 K(global_page_state(NR_WRITEBACK)), 133 K(global_page_state(NR_WRITEBACK)),
131 K(global_page_state(NR_ANON_PAGES)), 134 K(global_page_state(NR_ANON_PAGES)
135#ifdef CONFIG_TRANSPARENT_HUGEPAGE
136 + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
137 HPAGE_PMD_NR
138#endif
139 ),
132 K(global_page_state(NR_FILE_MAPPED)), 140 K(global_page_state(NR_FILE_MAPPED)),
133 K(global_page_state(NR_SHMEM)), 141 K(global_page_state(NR_SHMEM)),
134 K(global_page_state(NR_SLAB_RECLAIMABLE) + 142 K(global_page_state(NR_SLAB_RECLAIMABLE) +
@@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
151#ifdef CONFIG_MEMORY_FAILURE 159#ifdef CONFIG_MEMORY_FAILURE
152 ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) 160 ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
153#endif 161#endif
162#ifdef CONFIG_TRANSPARENT_HUGEPAGE
163 ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
164 HPAGE_PMD_NR)
165#endif
154 ); 166 );
155 167
156 hugetlb_report_meminfo(m); 168 hugetlb_report_meminfo(m);
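The new AnonHugePages field only appears on CONFIG_TRANSPARENT_HUGEPAGE kernels. A minimal userspace reader:

#include <stdio.h>

int main(void)
{
	char line[256];
	unsigned long kb;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "AnonHugePages: %lu kB", &kb) == 1) {
			printf("anon THP: %lu kB\n", kb);
			break;
		}
	}
	fclose(f);
	return 0;
}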
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 3b8b45660331..6d8e6a9e93ab 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -40,7 +40,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
40 ppage = pfn_to_page(pfn); 40 ppage = pfn_to_page(pfn);
41 else 41 else
42 ppage = NULL; 42 ppage = NULL;
43 if (!ppage) 43 if (!ppage || PageSlab(ppage))
44 pcount = 0; 44 pcount = 0;
45 else 45 else
46 pcount = page_mapcount(ppage); 46 pcount = page_mapcount(ppage);
@@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page)
116 if (PageHuge(page)) 116 if (PageHuge(page))
117 u |= 1 << KPF_HUGE; 117 u |= 1 << KPF_HUGE;
118 118
119 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
120
121 /* 119 /*
122 * Caveats on high order pages: 120 * Caveats on high order pages: page->_count will only be set
123 * PG_buddy will only be set on the head page; SLUB/SLQB do the same 121 * -1 on the head page; SLUB/SLQB do the same for PG_slab;
124 * for PG_slab; SLOB won't set PG_slab at all on compound pages. 122 * SLOB won't set PG_slab at all on compound pages.
125 */ 123 */
124 if (PageBuddy(page))
125 u |= 1 << KPF_BUDDY;
126
127 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
128
126 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); 129 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
127 u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy);
128 130
129 u |= kpf_copy_bit(k, KPF_ERROR, PG_error); 131 u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
130 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); 132 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
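With the PageSlab() check, slab pages now report a map count of 0 rather than a misleading page_mapcount(). /proc/kpagecount is a flat array of u64 counts indexed by page frame number, as in this sketch (needs root):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	uint64_t pfn = argc > 1 ? strtoull(argv[1], NULL, 0) : 0;
	uint64_t count;
	int fd = open("/proc/kpagecount", O_RDONLY);

	/* each pfn owns one 8-byte slot */
	if (fd < 0 || pread(fd, &count, 8, pfn * 8) != 8) {
		perror("kpagecount");
		return 1;
	}
	printf("pfn %llu mapcount %llu\n",
	       (unsigned long long)pfn, (unsigned long long)count);
	return 0;
}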
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 83adcc869437..cb761f010300 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -36,27 +36,27 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p,
36 } 36 }
37 switch (p->type) { 37 switch (p->type) {
38 case TTY_DRIVER_TYPE_SYSTEM: 38 case TTY_DRIVER_TYPE_SYSTEM:
39 seq_printf(m, "system"); 39 seq_puts(m, "system");
40 if (p->subtype == SYSTEM_TYPE_TTY) 40 if (p->subtype == SYSTEM_TYPE_TTY)
41 seq_printf(m, ":/dev/tty"); 41 seq_puts(m, ":/dev/tty");
42 else if (p->subtype == SYSTEM_TYPE_SYSCONS) 42 else if (p->subtype == SYSTEM_TYPE_SYSCONS)
43 seq_printf(m, ":console"); 43 seq_puts(m, ":console");
44 else if (p->subtype == SYSTEM_TYPE_CONSOLE) 44 else if (p->subtype == SYSTEM_TYPE_CONSOLE)
45 seq_printf(m, ":vtmaster"); 45 seq_puts(m, ":vtmaster");
46 break; 46 break;
47 case TTY_DRIVER_TYPE_CONSOLE: 47 case TTY_DRIVER_TYPE_CONSOLE:
48 seq_printf(m, "console"); 48 seq_puts(m, "console");
49 break; 49 break;
50 case TTY_DRIVER_TYPE_SERIAL: 50 case TTY_DRIVER_TYPE_SERIAL:
51 seq_printf(m, "serial"); 51 seq_puts(m, "serial");
52 break; 52 break;
53 case TTY_DRIVER_TYPE_PTY: 53 case TTY_DRIVER_TYPE_PTY:
54 if (p->subtype == PTY_TYPE_MASTER) 54 if (p->subtype == PTY_TYPE_MASTER)
55 seq_printf(m, "pty:master"); 55 seq_puts(m, "pty:master");
56 else if (p->subtype == PTY_TYPE_SLAVE) 56 else if (p->subtype == PTY_TYPE_SLAVE)
57 seq_printf(m, "pty:slave"); 57 seq_puts(m, "pty:slave");
58 else 58 else
59 seq_printf(m, "pty"); 59 seq_puts(m, "pty");
60 break; 60 break;
61 default: 61 default:
62 seq_printf(m, "type:%d.%d", p->type, p->subtype); 62 seq_printf(m, "type:%d.%d", p->type, p->subtype);
@@ -74,19 +74,19 @@ static int show_tty_driver(struct seq_file *m, void *v)
74 /* pseudo-drivers first */ 74 /* pseudo-drivers first */
75 seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty"); 75 seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty");
76 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0); 76 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0);
77 seq_printf(m, "system:/dev/tty\n"); 77 seq_puts(m, "system:/dev/tty\n");
78 seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console"); 78 seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console");
79 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1); 79 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1);
80 seq_printf(m, "system:console\n"); 80 seq_puts(m, "system:console\n");
81#ifdef CONFIG_UNIX98_PTYS 81#ifdef CONFIG_UNIX98_PTYS
82 seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx"); 82 seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx");
83 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2); 83 seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2);
84 seq_printf(m, "system\n"); 84 seq_puts(m, "system\n");
85#endif 85#endif
86#ifdef CONFIG_VT 86#ifdef CONFIG_VT
87 seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0"); 87 seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0");
88 seq_printf(m, "%3d %7d ", TTY_MAJOR, 0); 88 seq_printf(m, "%3d %7d ", TTY_MAJOR, 0);
89 seq_printf(m, "system:vtmaster\n"); 89 seq_puts(m, "system:vtmaster\n");
90#endif 90#endif
91 } 91 }
92 92
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 37994737c983..62604be9f58d 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,16 +10,16 @@ static int show_softirqs(struct seq_file *p, void *v)
10{ 10{
11 int i, j; 11 int i, j;
12 12
13 seq_printf(p, " "); 13 seq_puts(p, " ");
14 for_each_possible_cpu(i) 14 for_each_possible_cpu(i)
15 seq_printf(p, "CPU%-8d", i); 15 seq_printf(p, "CPU%-8d", i);
16 seq_printf(p, "\n"); 16 seq_putc(p, '\n');
17 17
18 for (i = 0; i < NR_SOFTIRQS; i++) { 18 for (i = 0; i < NR_SOFTIRQS; i++) {
19 seq_printf(p, "%12s:", softirq_to_name[i]); 19 seq_printf(p, "%12s:", softirq_to_name[i]);
20 for_each_possible_cpu(j) 20 for_each_possible_cpu(j)
21 seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); 21 seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
22 seq_printf(p, "\n"); 22 seq_putc(p, '\n');
23 } 23 }
24 return 0; 24 return 0;
25} 25}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index e15a19c93bae..1cffa2b8a2fc 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -126,7 +126,7 @@ static int show_stat(struct seq_file *p, void *v)
126 126
127 for (i = 0; i < NR_SOFTIRQS; i++) 127 for (i = 0; i < NR_SOFTIRQS; i++)
128 seq_printf(p, " %u", per_softirq_sums[i]); 128 seq_printf(p, " %u", per_softirq_sums[i]);
129 seq_printf(p, "\n"); 129 seq_putc(p, '\n');
130 130
131 return 0; 131 return 0;
132} 132}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c126c83b9a45..60b914860f81 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -66,8 +66,9 @@ unsigned long task_vsize(struct mm_struct *mm)
66 return PAGE_SIZE * mm->total_vm; 66 return PAGE_SIZE * mm->total_vm;
67} 67}
68 68
69int task_statm(struct mm_struct *mm, int *shared, int *text, 69unsigned long task_statm(struct mm_struct *mm,
70 int *data, int *resident) 70 unsigned long *shared, unsigned long *text,
71 unsigned long *data, unsigned long *resident)
71{ 72{
72 *shared = get_mm_counter(mm, MM_FILEPAGES); 73 *shared = get_mm_counter(mm, MM_FILEPAGES);
73 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) 74 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
@@ -417,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v)
417 "Anonymous: %8lu kB\n" 418 "Anonymous: %8lu kB\n"
418 "Swap: %8lu kB\n" 419 "Swap: %8lu kB\n"
419 "KernelPageSize: %8lu kB\n" 420 "KernelPageSize: %8lu kB\n"
420 "MMUPageSize: %8lu kB\n", 421 "MMUPageSize: %8lu kB\n"
422 "Locked: %8lu kB\n",
421 (vma->vm_end - vma->vm_start) >> 10, 423 (vma->vm_end - vma->vm_start) >> 10,
422 mss.resident >> 10, 424 mss.resident >> 10,
423 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), 425 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -429,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v)
429 mss.anonymous >> 10, 431 mss.anonymous >> 10,
430 mss.swap >> 10, 432 mss.swap >> 10,
431 vma_kernel_pagesize(vma) >> 10, 433 vma_kernel_pagesize(vma) >> 10,
432 vma_mmu_pagesize(vma) >> 10); 434 vma_mmu_pagesize(vma) >> 10,
435 (vma->vm_flags & VM_LOCKED) ?
436 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
433 437
434 if (m->count < m->size) /* vma is copied successfully */ 438 if (m->count < m->size) /* vma is copied successfully */
435 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; 439 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
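The int-to-unsigned-long widening matters because statm reports page counts: with 4 KiB pages a 32-bit counter overflows past 2^31 pages, i.e. 8 TiB of address space. The fields were always printed in decimal, so existing readers like this sketch are unaffected:

#include <stdio.h>

int main(void)
{
	unsigned long size, resident, shared, text, lib, data, dt;
	FILE *f = fopen("/proc/self/statm", "r");

	if (!f || fscanf(f, "%lu %lu %lu %lu %lu %lu %lu",
			 &size, &resident, &shared, &text,
			 &lib, &data, &dt) != 7)
		return 1;
	printf("vsize %lu pages, rss %lu pages\n", size, resident);
	fclose(f);
	return 0;
}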
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index cb6306e63843..b535d3e5d5f1 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -92,13 +92,14 @@ unsigned long task_vsize(struct mm_struct *mm)
92 return vsize; 92 return vsize;
93} 93}
94 94
95int task_statm(struct mm_struct *mm, int *shared, int *text, 95unsigned long task_statm(struct mm_struct *mm,
96 int *data, int *resident) 96 unsigned long *shared, unsigned long *text,
97 unsigned long *data, unsigned long *resident)
97{ 98{
98 struct vm_area_struct *vma; 99 struct vm_area_struct *vma;
99 struct vm_region *region; 100 struct vm_region *region;
100 struct rb_node *p; 101 struct rb_node *p;
101 int size = kobjsize(mm); 102 unsigned long size = kobjsize(mm);
102 103
103 down_read(&mm->mmap_sem); 104 down_read(&mm->mmap_sem);
104 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { 105 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 0fed41e6efcd..a2a622e079f0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -133,16 +133,20 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
133EXPORT_SYMBOL(dq_data_lock); 133EXPORT_SYMBOL(dq_data_lock);
134 134
135void __quota_error(struct super_block *sb, const char *func, 135void __quota_error(struct super_block *sb, const char *func,
136 const char *fmt, ...) 136 const char *fmt, ...)
137{ 137{
138 va_list args;
139
140 if (printk_ratelimit()) { 138 if (printk_ratelimit()) {
139 va_list args;
140 struct va_format vaf;
141
141 va_start(args, fmt); 142 va_start(args, fmt);
142 printk(KERN_ERR "Quota error (device %s): %s: ", 143
143 sb->s_id, func); 144 vaf.fmt = fmt;
144 vprintk(fmt, args); 145 vaf.va = &args;
145 printk("\n"); 146
147 printk(KERN_ERR "Quota error (device %s): %s: %pV\n",
148 sb->s_id, func, &vaf);
149
146 va_end(args); 150 va_end(args);
147 } 151 }
148} 152}
@@ -2185,8 +2189,8 @@ int dquot_resume(struct super_block *sb, int type)
2185} 2189}
2186EXPORT_SYMBOL(dquot_resume); 2190EXPORT_SYMBOL(dquot_resume);
2187 2191
2188int dquot_quota_on_path(struct super_block *sb, int type, int format_id, 2192int dquot_quota_on(struct super_block *sb, int type, int format_id,
2189 struct path *path) 2193 struct path *path)
2190{ 2194{
2191 int error = security_quota_on(path->dentry); 2195 int error = security_quota_on(path->dentry);
2192 if (error) 2196 if (error)
@@ -2200,20 +2204,6 @@ int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
2200 DQUOT_LIMITS_ENABLED); 2204 DQUOT_LIMITS_ENABLED);
2201 return error; 2205 return error;
2202} 2206}
2203EXPORT_SYMBOL(dquot_quota_on_path);
2204
2205int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
2206{
2207 struct path path;
2208 int error;
2209
2210 error = kern_path(name, LOOKUP_FOLLOW, &path);
2211 if (!error) {
2212 error = dquot_quota_on_path(sb, type, format_id, &path);
2213 path_put(&path);
2214 }
2215 return error;
2216}
2217EXPORT_SYMBOL(dquot_quota_on); 2207EXPORT_SYMBOL(dquot_quota_on);
2218 2208
2219/* 2209/*
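The %pV conversion folds the prefix and the caller's format into a single printk(), so a ratelimited message can no longer be split or interleaved with other output. The idiom, as a self-contained sketch (example_error is a hypothetical name):

void example_error(struct super_block *sb, const char *func,
		   const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* %pV expands the wrapped format/va_list in place */
	printk(KERN_ERR "Example error (device %s): %s: %pV\n",
	       sb->s_id, func, &vaf);
	va_end(args);
}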
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b299961e1edb..b34bdb25490c 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -64,18 +64,15 @@ static int quota_sync_all(int type)
64} 64}
65 65
66static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, 66static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
67 void __user *addr) 67 struct path *path)
68{ 68{
69 char *pathname; 69 if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta)
70 int ret = -ENOSYS; 70 return -ENOSYS;
71 71 if (sb->s_qcop->quota_on_meta)
72 pathname = getname(addr); 72 return sb->s_qcop->quota_on_meta(sb, type, id);
73 if (IS_ERR(pathname)) 73 if (IS_ERR(path))
74 return PTR_ERR(pathname); 74 return PTR_ERR(path);
75 if (sb->s_qcop->quota_on) 75 return sb->s_qcop->quota_on(sb, type, id, path);
76 ret = sb->s_qcop->quota_on(sb, type, id, pathname);
77 putname(pathname);
78 return ret;
79} 76}
80 77
81static int quota_getfmt(struct super_block *sb, int type, void __user *addr) 78static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
@@ -241,7 +238,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
241 238
242/* Copy parameters and call proper function */ 239/* Copy parameters and call proper function */
243static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, 240static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
244 void __user *addr) 241 void __user *addr, struct path *path)
245{ 242{
246 int ret; 243 int ret;
247 244
@@ -256,7 +253,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
256 253
257 switch (cmd) { 254 switch (cmd) {
258 case Q_QUOTAON: 255 case Q_QUOTAON:
259 return quota_quotaon(sb, type, cmd, id, addr); 256 return quota_quotaon(sb, type, cmd, id, path);
260 case Q_QUOTAOFF: 257 case Q_QUOTAOFF:
261 if (!sb->s_qcop->quota_off) 258 if (!sb->s_qcop->quota_off)
262 return -ENOSYS; 259 return -ENOSYS;
@@ -335,6 +332,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
335{ 332{
336 uint cmds, type; 333 uint cmds, type;
337 struct super_block *sb = NULL; 334 struct super_block *sb = NULL;
335 struct path path, *pathp = NULL;
338 int ret; 336 int ret;
339 337
340 cmds = cmd >> SUBCMDSHIFT; 338 cmds = cmd >> SUBCMDSHIFT;
@@ -351,12 +349,27 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
351 return -ENODEV; 349 return -ENODEV;
352 } 350 }
353 351
352 /*
353 * Path for quotaon has to be resolved before grabbing superblock
354 * because that gets s_umount sem which is also possibly needed by path
355 * resolution (think about autofs) and thus deadlocks could arise.
356 */
357 if (cmds == Q_QUOTAON) {
358 ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path);
359 if (ret)
360 pathp = ERR_PTR(ret);
361 else
362 pathp = &path;
363 }
364
354 sb = quotactl_block(special); 365 sb = quotactl_block(special);
355 if (IS_ERR(sb)) 366 if (IS_ERR(sb))
356 return PTR_ERR(sb); 367 return PTR_ERR(sb);
357 368
358 ret = do_quotactl(sb, type, cmds, id, addr); 369 ret = do_quotactl(sb, type, cmds, id, addr, pathp);
359 370
360 drop_super(sb); 371 drop_super(sb);
372 if (pathp && !IS_ERR(pathp))
373 path_put(pathp);
361 return ret; 374 return ret;
362} 375}
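From userspace nothing changes: Q_QUOTAON still passes the quota file path in addr, the kernel merely resolves it before taking s_umount. A hedged usage sketch; the device and quota file paths are placeholders, and QFMT_VFS_V0 may need <linux/quota.h> on some libcs:

#include <stdio.h>
#include <sys/types.h>
#include <sys/quota.h>

#ifndef QFMT_VFS_V0
#define QFMT_VFS_V0 2	/* vfsv0 quota format, from <linux/quota.h> */
#endif

int main(void)
{
	/* hypothetical device and quota file */
	if (quotactl(QCMD(Q_QUOTAON, USRQUOTA), "/dev/sda1",
		     QFMT_VFS_V0, (caddr_t)"/mnt/aquota.user") < 0) {
		perror("quotactl");
		return 1;
	}
	return 0;
}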
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 9e48874eabcc..e41c1becf096 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -468,8 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
468 return -ENOMEM; 468 return -ENOMEM;
469 ret = read_blk(info, *blk, buf); 469 ret = read_blk(info, *blk, buf);
470 if (ret < 0) { 470 if (ret < 0) {
471 quota_error(dquot->dq_sb, "Can't read quota data " 471 quota_error(dquot->dq_sb, "Can't read quota data block %u",
472 "block %u", blk); 472 *blk);
473 goto out_buf; 473 goto out_buf;
474 } 474 }
475 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); 475 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -493,8 +493,9 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
493 } else { 493 } else {
494 ret = write_blk(info, *blk, buf); 494 ret = write_blk(info, *blk, buf);
495 if (ret < 0) 495 if (ret < 0)
496 quota_error(dquot->dq_sb, "Can't write quota " 496 quota_error(dquot->dq_sb,
497 "tree block %u", blk); 497 "Can't write quota tree block %u",
498 *blk);
498 } 499 }
499 } 500 }
500out_buf: 501out_buf:
diff --git a/fs/read_write.c b/fs/read_write.c
index 5d431bacbea9..5520f8ad5504 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -30,18 +30,9 @@ const struct file_operations generic_ro_fops = {
30 30
31EXPORT_SYMBOL(generic_ro_fops); 31EXPORT_SYMBOL(generic_ro_fops);
32 32
33static int 33static inline int unsigned_offsets(struct file *file)
34__negative_fpos_check(struct file *file, loff_t pos, size_t count)
35{ 34{
36 /* 35 return file->f_mode & FMODE_UNSIGNED_OFFSET;
37 * pos or pos+count is negative here, check overflow.
38 * too big "count" will be caught in rw_verify_area().
39 */
40 if ((pos < 0) && (pos + count < pos))
41 return -EOVERFLOW;
42 if (file->f_mode & FMODE_UNSIGNED_OFFSET)
43 return 0;
44 return -EINVAL;
45} 36}
46 37
47/** 38/**
@@ -75,7 +66,7 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
75 break; 66 break;
76 } 67 }
77 68
78 if (offset < 0 && __negative_fpos_check(file, offset, 0)) 69 if (offset < 0 && !unsigned_offsets(file))
79 return -EINVAL; 70 return -EINVAL;
80 if (offset > inode->i_sb->s_maxbytes) 71 if (offset > inode->i_sb->s_maxbytes)
81 return -EINVAL; 72 return -EINVAL;
@@ -152,7 +143,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
152 offset += file->f_pos; 143 offset += file->f_pos;
153 } 144 }
154 retval = -EINVAL; 145 retval = -EINVAL;
155 if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) { 146 if (offset >= 0 || unsigned_offsets(file)) {
156 if (offset != file->f_pos) { 147 if (offset != file->f_pos) {
157 file->f_pos = offset; 148 file->f_pos = offset;
158 file->f_version = 0; 149 file->f_version = 0;
@@ -252,9 +243,13 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
252 if (unlikely((ssize_t) count < 0)) 243 if (unlikely((ssize_t) count < 0))
253 return retval; 244 return retval;
254 pos = *ppos; 245 pos = *ppos;
255 if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) { 246 if (unlikely(pos < 0)) {
256 retval = __negative_fpos_check(file, pos, count); 247 if (!unsigned_offsets(file))
257 if (retval) 248 return retval;
249 if (count >= -pos) /* both values are in 0..LLONG_MAX */
250 return -EOVERFLOW;
251 } else if (unlikely((loff_t) (pos + count) < 0)) {
252 if (!unsigned_offsets(file))
258 return retval; 253 return retval;
259 } 254 }
260 255
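The rewritten check splits the old combined test: a negative pos is only legal for FMODE_UNSIGNED_OFFSET files, and even then pos + count must not wrap back into the non-negative range. Since -pos and count both lie in 0..LLONG_MAX, "count >= -pos" is exactly the wrapping case. A userspace analogue of the logic (it assumes pos > LLONG_MIN, as the kernel's arithmetic effectively does):

#include <limits.h>

static int example_verify(long long pos, unsigned long long count,
			  int unsigned_offsets)
{
	if (pos < 0) {
		if (!unsigned_offsets)
			return -1;			/* -EINVAL */
		if (count >= (unsigned long long)-pos)
			return -2;			/* -EOVERFLOW: wraps past 0 */
	} else if (count > (unsigned long long)(LLONG_MAX - pos)) {
		if (!unsigned_offsets)
			return -1;			/* -EINVAL: pos + count < 0 */
	}
	return 0;
}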
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index d31bce1a9f90..3eea859e6990 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2551,8 +2551,6 @@ static int release_journal_dev(struct super_block *super,
2551 result = 0; 2551 result = 0;
2552 2552
2553 if (journal->j_dev_bd != NULL) { 2553 if (journal->j_dev_bd != NULL) {
2554 if (journal->j_dev_bd->bd_dev != super->s_dev)
2555 bd_release(journal->j_dev_bd);
2556 result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode); 2554 result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
2557 journal->j_dev_bd = NULL; 2555 journal->j_dev_bd = NULL;
2558 } 2556 }
@@ -2570,7 +2568,7 @@ static int journal_init_dev(struct super_block *super,
2570{ 2568{
2571 int result; 2569 int result;
2572 dev_t jdev; 2570 dev_t jdev;
2573 fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE; 2571 fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
2574 char b[BDEVNAME_SIZE]; 2572 char b[BDEVNAME_SIZE];
2575 2573
2576 result = 0; 2574 result = 0;
@@ -2584,7 +2582,10 @@ static int journal_init_dev(struct super_block *super,
2584 2582
2585 /* there is no "jdev" option and journal is on separate device */ 2583 /* there is no "jdev" option and journal is on separate device */
2586 if ((!jdev_name || !jdev_name[0])) { 2584 if ((!jdev_name || !jdev_name[0])) {
2587 journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode); 2585 if (jdev == super->s_dev)
2586 blkdev_mode &= ~FMODE_EXCL;
2587 journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode,
2588 journal);
2588 journal->j_dev_mode = blkdev_mode; 2589 journal->j_dev_mode = blkdev_mode;
2589 if (IS_ERR(journal->j_dev_bd)) { 2590 if (IS_ERR(journal->j_dev_bd)) {
2590 result = PTR_ERR(journal->j_dev_bd); 2591 result = PTR_ERR(journal->j_dev_bd);
@@ -2593,22 +2594,14 @@ static int journal_init_dev(struct super_block *super,
2593 "cannot init journal device '%s': %i", 2594 "cannot init journal device '%s': %i",
2594 __bdevname(jdev, b), result); 2595 __bdevname(jdev, b), result);
2595 return result; 2596 return result;
2596 } else if (jdev != super->s_dev) { 2597 } else if (jdev != super->s_dev)
2597 result = bd_claim(journal->j_dev_bd, journal);
2598 if (result) {
2599 blkdev_put(journal->j_dev_bd, blkdev_mode);
2600 return result;
2601 }
2602
2603 set_blocksize(journal->j_dev_bd, super->s_blocksize); 2598 set_blocksize(journal->j_dev_bd, super->s_blocksize);
2604 }
2605 2599
2606 return 0; 2600 return 0;
2607 } 2601 }
2608 2602
2609 journal->j_dev_mode = blkdev_mode; 2603 journal->j_dev_mode = blkdev_mode;
2610 journal->j_dev_bd = open_bdev_exclusive(jdev_name, 2604 journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal);
2611 blkdev_mode, journal);
2612 if (IS_ERR(journal->j_dev_bd)) { 2605 if (IS_ERR(journal->j_dev_bd)) {
2613 result = PTR_ERR(journal->j_dev_bd); 2606 result = PTR_ERR(journal->j_dev_bd);
2614 journal->j_dev_bd = NULL; 2607 journal->j_dev_bd = NULL;
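This is part of the tree-wide move from open_by_devnum()/open_bdev_exclusive()/bd_claim() to blkdev_get_by_dev()/blkdev_get_by_path() with FMODE_EXCL and a holder; note the FMODE_EXCL strip when the journal shares the filesystem device, since a second exclusive claim with a different holder would fail. The shape of the new API, sketched (example_probe is hypothetical):

static int example_probe(const char *path, void *holder)
{
	fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
	struct block_device *bdev;

	bdev = blkdev_get_by_path(path, mode, holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* ... the device is exclusively claimed by 'holder' here ... */
	blkdev_put(bdev, mode);	/* mode must still include FMODE_EXCL */
	return 0;
}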
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index adbc6f538515..45de98b59466 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -586,13 +586,13 @@ void print_block(struct buffer_head *bh, ...) //int print_mode, int first, int l
586 va_list args; 586 va_list args;
587 int mode, first, last; 587 int mode, first, last;
588 588
589 va_start(args, bh);
590
591 if (!bh) { 589 if (!bh) {
592 printk("print_block: buffer is NULL\n"); 590 printk("print_block: buffer is NULL\n");
593 return; 591 return;
594 } 592 }
595 593
594 va_start(args, bh);
595
596 mode = va_arg(args, int); 596 mode = va_arg(args, int);
597 first = va_arg(args, int); 597 first = va_arg(args, int);
598 last = va_arg(args, int); 598 last = va_arg(args, int);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 2575682a9ead..0aab04f46827 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -632,7 +632,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
632static int reiserfs_release_dquot(struct dquot *); 632static int reiserfs_release_dquot(struct dquot *);
633static int reiserfs_mark_dquot_dirty(struct dquot *); 633static int reiserfs_mark_dquot_dirty(struct dquot *);
634static int reiserfs_write_info(struct super_block *, int); 634static int reiserfs_write_info(struct super_block *, int);
635static int reiserfs_quota_on(struct super_block *, int, int, char *); 635static int reiserfs_quota_on(struct super_block *, int, int, struct path *);
636 636
637static const struct dquot_operations reiserfs_quota_operations = { 637static const struct dquot_operations reiserfs_quota_operations = {
638 .write_dquot = reiserfs_write_dquot, 638 .write_dquot = reiserfs_write_dquot,
@@ -2048,25 +2048,21 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type)
2048 * Standard function to be called on quota_on 2048 * Standard function to be called on quota_on
2049 */ 2049 */
2050static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, 2050static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2051 char *name) 2051 struct path *path)
2052{ 2052{
2053 int err; 2053 int err;
2054 struct path path;
2055 struct inode *inode; 2054 struct inode *inode;
2056 struct reiserfs_transaction_handle th; 2055 struct reiserfs_transaction_handle th;
2057 2056
2058 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) 2057 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
2059 return -EINVAL; 2058 return -EINVAL;
2060 2059
2061 err = kern_path(name, LOOKUP_FOLLOW, &path);
2062 if (err)
2063 return err;
2064 /* Quotafile not on the same filesystem? */ 2060 /* Quotafile not on the same filesystem? */
2065 if (path.mnt->mnt_sb != sb) { 2061 if (path->mnt->mnt_sb != sb) {
2066 err = -EXDEV; 2062 err = -EXDEV;
2067 goto out; 2063 goto out;
2068 } 2064 }
2069 inode = path.dentry->d_inode; 2065 inode = path->dentry->d_inode;
2070 /* We must not pack tails for quota files on reiserfs for quota IO to work */ 2066 /* We must not pack tails for quota files on reiserfs for quota IO to work */
2071 if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { 2067 if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
2072 err = reiserfs_unpack(inode, NULL); 2068 err = reiserfs_unpack(inode, NULL);
@@ -2082,7 +2078,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2082 /* Journaling quota? */ 2078 /* Journaling quota? */
2083 if (REISERFS_SB(sb)->s_qf_names[type]) { 2079 if (REISERFS_SB(sb)->s_qf_names[type]) {
2084 /* Quotafile not of fs root? */ 2080 /* Quotafile not of fs root? */
2085 if (path.dentry->d_parent != sb->s_root) 2081 if (path->dentry->d_parent != sb->s_root)
2086 reiserfs_warning(sb, "super-6521", 2082 reiserfs_warning(sb, "super-6521",
2087 "Quota file not on filesystem root. " 2083 "Quota file not on filesystem root. "
2088 "Journalled quota will not work."); 2084 "Journalled quota will not work.");
@@ -2101,9 +2097,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2101 if (err) 2097 if (err)
2102 goto out; 2098 goto out;
2103 } 2099 }
2104 err = dquot_quota_on_path(sb, type, format_id, &path); 2100 err = dquot_quota_on(sb, type, format_id, path);
2105out: 2101out:
2106 path_put(&path);
2107 return err; 2102 return err;
2108} 2103}
2109 2104
diff --git a/fs/select.c b/fs/select.c
index b7b10aa30861..e56560d2b08a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -306,6 +306,8 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
306 rts.tv_sec = rts.tv_nsec = 0; 306 rts.tv_sec = rts.tv_nsec = 0;
307 307
308 if (timeval) { 308 if (timeval) {
309 if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
310 memset(&rtv, 0, sizeof(rtv));
309 rtv.tv_sec = rts.tv_sec; 311 rtv.tv_sec = rts.tv_sec;
310 rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; 312 rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
311 313
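The memset closes a kernel stack information leak: on ABIs where struct timeval carries padding (for instance a 32-bit tv_usec in an 8-byte-aligned struct), the pad bytes were copied to userspace uninitialised. The sizeof test is a compile-time constant, so unaffected architectures pay nothing. A userspace probe for the padding condition:

#include <stdio.h>
#include <sys/time.h>

int main(void)
{
	struct timeval tv;
	size_t fields = sizeof(tv.tv_sec) + sizeof(tv.tv_usec);

	printf("fields %zu bytes, struct %zu bytes: %s padding\n",
	       fields, sizeof(tv),
	       sizeof(tv) > fields ? "has" : "no");
	return 0;
}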
diff --git a/fs/splice.c b/fs/splice.c
index ce2f02579e35..50a5d978da16 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -682,19 +682,14 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
682{ 682{
683 struct file *file = sd->u.file; 683 struct file *file = sd->u.file;
684 loff_t pos = sd->pos; 684 loff_t pos = sd->pos;
685 int ret, more; 685 int more;
686
687 ret = buf->ops->confirm(pipe, buf);
688 if (!ret) {
689 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
690 if (file->f_op && file->f_op->sendpage)
691 ret = file->f_op->sendpage(file, buf->page, buf->offset,
692 sd->len, &pos, more);
693 else
694 ret = -EINVAL;
695 }
696 686
697 return ret; 687 if (!likely(file->f_op && file->f_op->sendpage))
688 return -EINVAL;
689
690 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
691 return file->f_op->sendpage(file, buf->page, buf->offset,
692 sd->len, &pos, more);
698} 693}
699 694
700/* 695/*
@@ -727,13 +722,6 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
727 void *fsdata; 722 void *fsdata;
728 int ret; 723 int ret;
729 724
730 /*
731 * make sure the data in this buffer is uptodate
732 */
733 ret = buf->ops->confirm(pipe, buf);
734 if (unlikely(ret))
735 return ret;
736
737 offset = sd->pos & ~PAGE_CACHE_MASK; 725 offset = sd->pos & ~PAGE_CACHE_MASK;
738 726
739 this_len = sd->len; 727 this_len = sd->len;
@@ -805,12 +793,17 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
805 if (sd->len > sd->total_len) 793 if (sd->len > sd->total_len)
806 sd->len = sd->total_len; 794 sd->len = sd->total_len;
807 795
808 ret = actor(pipe, buf, sd); 796 ret = buf->ops->confirm(pipe, buf);
809 if (ret <= 0) { 797 if (unlikely(ret)) {
810 if (ret == -ENODATA) 798 if (ret == -ENODATA)
811 ret = 0; 799 ret = 0;
812 return ret; 800 return ret;
813 } 801 }
802
803 ret = actor(pipe, buf, sd);
804 if (ret <= 0)
805 return ret;
806
814 buf->offset += ret; 807 buf->offset += ret;
815 buf->len -= ret; 808 buf->len -= ret;
816 809
@@ -1044,10 +1037,6 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1044 int ret; 1037 int ret;
1045 void *data; 1038 void *data;
1046 1039
1047 ret = buf->ops->confirm(pipe, buf);
1048 if (ret)
1049 return ret;
1050
1051 data = buf->ops->map(pipe, buf, 0); 1040 data = buf->ops->map(pipe, buf, 0);
1052 ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); 1041 ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
1053 buf->ops->unmap(pipe, buf, data); 1042 buf->ops->unmap(pipe, buf, data);
@@ -1495,10 +1484,6 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1495 char *src; 1484 char *src;
1496 int ret; 1485 int ret;
1497 1486
1498 ret = buf->ops->confirm(pipe, buf);
1499 if (unlikely(ret))
1500 return ret;
1501
1502 /* 1487 /*
1503 * See if we can use the atomic maps, by prefaulting in the 1488 * See if we can use the atomic maps, by prefaulting in the
1504 * pages and doing an atomic copy 1489 * pages and doing an atomic copy
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index e5f63da64d04..aa68a8a31518 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -29,7 +29,6 @@ config SQUASHFS
29config SQUASHFS_XATTR 29config SQUASHFS_XATTR
30 bool "Squashfs XATTR support" 30 bool "Squashfs XATTR support"
31 depends on SQUASHFS 31 depends on SQUASHFS
32 default n
33 help 32 help
34 Saying Y here includes support for extended attributes (xattrs). 33 Saying Y here includes support for extended attributes (xattrs).
35 Xattrs are name:value pairs associated with inodes by 34 Xattrs are name:value pairs associated with inodes by
@@ -40,7 +39,6 @@ config SQUASHFS_XATTR
40config SQUASHFS_LZO 39config SQUASHFS_LZO
41 bool "Include support for LZO compressed file systems" 40 bool "Include support for LZO compressed file systems"
42 depends on SQUASHFS 41 depends on SQUASHFS
43 default n
44 select LZO_DECOMPRESS 42 select LZO_DECOMPRESS
45 help 43 help
46 Saying Y here includes support for reading Squashfs file systems 44 Saying Y here includes support for reading Squashfs file systems
@@ -53,10 +51,24 @@ config SQUASHFS_LZO
53 51
54 If unsure, say N. 52 If unsure, say N.
55 53
54config SQUASHFS_XZ
55 bool "Include support for XZ compressed file systems"
56 depends on SQUASHFS
57 select XZ_DEC
58 help
59 Saying Y here includes support for reading Squashfs file systems
60 compressed with XZ compression. XZ gives better compression than
61 the default zlib compression, at the expense of greater CPU and
62 memory overhead.
63
64 XZ is not the standard compression used in Squashfs and so most
65 file systems will be readable without selecting this option.
66
67 If unsure, say N.
68
56config SQUASHFS_EMBEDDED 69config SQUASHFS_EMBEDDED
57 bool "Additional option for memory-constrained systems" 70 bool "Additional option for memory-constrained systems"
58 depends on SQUASHFS 71 depends on SQUASHFS
59 default n
60 help 72 help
61 Saying Y here allows you to specify cache size. 73 Saying Y here allows you to specify cache size.
62 74
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 7672bac8d328..cecf2bea07af 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -7,3 +7,4 @@ squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o 7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
8squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o 8squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
9squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o 9squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
10squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 653c030eb840..2fb2882f0fa7 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -34,7 +34,6 @@
34 34
35#include "squashfs_fs.h" 35#include "squashfs_fs.h"
36#include "squashfs_fs_sb.h" 36#include "squashfs_fs_sb.h"
37#include "squashfs_fs_i.h"
38#include "squashfs.h" 37#include "squashfs.h"
39#include "decompressor.h" 38#include "decompressor.h"
40 39
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 57314bee9059..26b15ae34d6f 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -55,7 +55,6 @@
55 55
56#include "squashfs_fs.h" 56#include "squashfs_fs.h"
57#include "squashfs_fs_sb.h" 57#include "squashfs_fs_sb.h"
58#include "squashfs_fs_i.h"
59#include "squashfs.h" 58#include "squashfs.h"
60 59
61/* 60/*
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 24af9ce9722f..a5940e54c4dd 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -27,7 +27,6 @@
27 27
28#include "squashfs_fs.h" 28#include "squashfs_fs.h"
29#include "squashfs_fs_sb.h" 29#include "squashfs_fs_sb.h"
30#include "squashfs_fs_i.h"
31#include "decompressor.h" 30#include "decompressor.h"
32#include "squashfs.h" 31#include "squashfs.h"
33 32
@@ -41,23 +40,26 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
41}; 40};
42 41
43#ifndef CONFIG_SQUASHFS_LZO 42#ifndef CONFIG_SQUASHFS_LZO
44static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = { 43static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
45 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 44 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
46}; 45};
47#endif 46#endif
48 47
48#ifndef CONFIG_SQUASHFS_XZ
49static const struct squashfs_decompressor squashfs_xz_comp_ops = {
50 NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
51};
52#endif
53
49static const struct squashfs_decompressor squashfs_unknown_comp_ops = { 54static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
50 NULL, NULL, NULL, 0, "unknown", 0 55 NULL, NULL, NULL, 0, "unknown", 0
51}; 56};
52 57
53static const struct squashfs_decompressor *decompressor[] = { 58static const struct squashfs_decompressor *decompressor[] = {
54 &squashfs_zlib_comp_ops, 59 &squashfs_zlib_comp_ops,
55 &squashfs_lzma_unsupported_comp_ops,
56#ifdef CONFIG_SQUASHFS_LZO
57 &squashfs_lzo_comp_ops, 60 &squashfs_lzo_comp_ops,
58#else 61 &squashfs_xz_comp_ops,
59 &squashfs_lzo_unsupported_comp_ops, 62 &squashfs_lzma_unsupported_comp_ops,
60#endif
61 &squashfs_unknown_comp_ops 63 &squashfs_unknown_comp_ops
62}; 64};
63 65
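The table rework means every compression id, supported or not, resolves to a named entry, with the zero-id "unknown" entry acting as the sentinel. Modelled on squashfs_lookup_decompressor(), the scan looks like:

static const struct squashfs_decompressor *example_lookup(int id)
{
	int i;

	for (i = 0; decompressor[i]->id; i++)
		if (id == decompressor[i]->id)
			break;

	/* falls through to squashfs_unknown_comp_ops (id 0) */
	return decompressor[i];
}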
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 7425f80783f6..3b305a70f7aa 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -52,4 +52,13 @@ static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
52 return msblk->decompressor->decompress(msblk, buffer, bh, b, offset, 52 return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
53 length, srclength, pages); 53 length, srclength, pages);
54} 54}
55
56#ifdef CONFIG_SQUASHFS_XZ
57extern const struct squashfs_decompressor squashfs_xz_comp_ops;
58#endif
59
60#ifdef CONFIG_SQUASHFS_LZO
61extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
62#endif
63
55#endif 64#endif
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 7c90bbd6879d..7eef571443c6 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -39,7 +39,6 @@
39 39
40#include "squashfs_fs.h" 40#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h"
43#include "squashfs.h" 42#include "squashfs.h"
44 43
45/* 44/*
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index b7f64bcd2b70..d8f32452638e 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -37,7 +37,6 @@
37 37
38#include "squashfs_fs.h" 38#include "squashfs_fs.h"
39#include "squashfs_fs_sb.h" 39#include "squashfs_fs_sb.h"
40#include "squashfs_fs_i.h"
41#include "squashfs.h" 40#include "squashfs.h"
42 41
43/* 42/*
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 5d87789bf1c1..7da759e34c52 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -29,7 +29,6 @@
29 29
30#include "squashfs_fs.h" 30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h" 32#include "squashfs.h"
34#include "decompressor.h" 33#include "decompressor.h"
35 34
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 5d45569d5f72..ba729d808876 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -27,11 +27,6 @@
27 27
28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) 28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args)
29 29
30static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
31{
32 return list_entry(inode, struct squashfs_inode_info, vfs_inode);
33}
34
35/* block.c */ 30/* block.c */
36extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, 31extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
37 int, int); 32 int, int);
@@ -104,6 +99,3 @@ extern const struct xattr_handler *squashfs_xattr_handlers[];
104 99
105/* zlib_wrapper.c */ 100/* zlib_wrapper.c */
106extern const struct squashfs_decompressor squashfs_zlib_comp_ops; 101extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
107
108/* lzo_wrapper.c */
109extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index c5137fc9ab11..39533feffd6d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -238,6 +238,7 @@ struct meta_index {
238#define ZLIB_COMPRESSION 1 238#define ZLIB_COMPRESSION 1
239#define LZMA_COMPRESSION 2 239#define LZMA_COMPRESSION 2
240#define LZO_COMPRESSION 3 240#define LZO_COMPRESSION 3
241#define XZ_COMPRESSION 4
241 242
242struct squashfs_super_block { 243struct squashfs_super_block {
243 __le32 s_magic; 244 __le32 s_magic;
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index d3e3a37f28a1..359baefc01fc 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -45,4 +45,10 @@ struct squashfs_inode_info {
45 }; 45 };
46 struct inode vfs_inode; 46 struct inode vfs_inode;
47}; 47};
48
49
50static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
51{
52 return list_entry(inode, struct squashfs_inode_info, vfs_inode);
53}
48#endif 54#endif
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index d33be5dd6c32..05385dbe1465 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -32,7 +32,6 @@
32 32
33#include "squashfs_fs.h" 33#include "squashfs_fs.h"
34#include "squashfs_fs_sb.h" 34#include "squashfs_fs_sb.h"
35#include "squashfs_fs_i.h"
36#include "squashfs.h" 35#include "squashfs.h"
37#include "xattr.h" 36#include "xattr.h"
38 37
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
new file mode 100644
index 000000000000..856756ca5ee4
--- /dev/null
+++ b/fs/squashfs/xz_wrapper.c
@@ -0,0 +1,153 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xz_wrapper.c
22 */
23
24
25#include <linux/mutex.h>
26#include <linux/buffer_head.h>
27#include <linux/slab.h>
28#include <linux/xz.h>
29
30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h"
34#include "decompressor.h"
35
36struct squashfs_xz {
37 struct xz_dec *state;
38 struct xz_buf buf;
39};
40
41static void *squashfs_xz_init(struct squashfs_sb_info *msblk)
42{
43 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
44
45 struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL);
46 if (stream == NULL)
47 goto failed;
48
49 stream->state = xz_dec_init(XZ_PREALLOC, block_size);
50 if (stream->state == NULL)
51 goto failed;
52
53 return stream;
54
55failed:
56 ERROR("Failed to allocate xz workspace\n");
57 kfree(stream);
58 return NULL;
59}
60
61
62static void squashfs_xz_free(void *strm)
63{
64 struct squashfs_xz *stream = strm;
65
66 if (stream) {
67 xz_dec_end(stream->state);
68 kfree(stream);
69 }
70}
71
72
73static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
74 struct buffer_head **bh, int b, int offset, int length, int srclength,
75 int pages)
76{
77 enum xz_ret xz_err;
78 int avail, total = 0, k = 0, page = 0;
79 struct squashfs_xz *stream = msblk->stream;
80
81 mutex_lock(&msblk->read_data_mutex);
82
83 xz_dec_reset(stream->state);
84 stream->buf.in_pos = 0;
85 stream->buf.in_size = 0;
86 stream->buf.out_pos = 0;
87 stream->buf.out_size = PAGE_CACHE_SIZE;
88 stream->buf.out = buffer[page++];
89
90 do {
91 if (stream->buf.in_pos == stream->buf.in_size && k < b) {
92 avail = min(length, msblk->devblksize - offset);
93 length -= avail;
94 wait_on_buffer(bh[k]);
95 if (!buffer_uptodate(bh[k]))
96 goto release_mutex;
97
98 if (avail == 0) {
99 offset = 0;
100 put_bh(bh[k++]);
101 continue;
102 }
103
104 stream->buf.in = bh[k]->b_data + offset;
105 stream->buf.in_size = avail;
106 stream->buf.in_pos = 0;
107 offset = 0;
108 }
109
110 if (stream->buf.out_pos == stream->buf.out_size
111 && page < pages) {
112 stream->buf.out = buffer[page++];
113 stream->buf.out_pos = 0;
114 total += PAGE_CACHE_SIZE;
115 }
116
117 xz_err = xz_dec_run(stream->state, &stream->buf);
118
119 if (stream->buf.in_pos == stream->buf.in_size && k < b)
120 put_bh(bh[k++]);
121 } while (xz_err == XZ_OK);
122
123 if (xz_err != XZ_STREAM_END) {
124 ERROR("xz_dec_run error, data probably corrupt\n");
125 goto release_mutex;
126 }
127
128 if (k < b) {
129 ERROR("xz_uncompress error, input remaining\n");
130 goto release_mutex;
131 }
132
133 total += stream->buf.out_pos;
134 mutex_unlock(&msblk->read_data_mutex);
135 return total;
136
137release_mutex:
138 mutex_unlock(&msblk->read_data_mutex);
139
140 for (; k < b; k++)
141 put_bh(bh[k]);
142
143 return -EIO;
144}
145
146const struct squashfs_decompressor squashfs_xz_comp_ops = {
147 .init = squashfs_xz_init,
148 .free = squashfs_xz_free,
149 .decompress = squashfs_xz_uncompress,
150 .id = XZ_COMPRESSION,
151 .name = "xz",
152 .supported = 1
153};
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 7a603874e483..818a5e063faf 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -29,7 +29,6 @@
29 29
30#include "squashfs_fs.h" 30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h" 32#include "squashfs.h"
34#include "decompressor.h" 33#include "decompressor.h"
35 34
@@ -66,8 +65,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
66 struct buffer_head **bh, int b, int offset, int length, int srclength, 65 struct buffer_head **bh, int b, int offset, int length, int srclength,
67 int pages) 66 int pages)
68{ 67{
69 int zlib_err = 0, zlib_init = 0; 68 int zlib_err, zlib_init = 0;
70 int avail, bytes, k = 0, page = 0; 69 int k = 0, page = 0;
71 z_stream *stream = msblk->stream; 70 z_stream *stream = msblk->stream;
72 71
73 mutex_lock(&msblk->read_data_mutex); 72 mutex_lock(&msblk->read_data_mutex);
@@ -75,11 +74,10 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
75 stream->avail_out = 0; 74 stream->avail_out = 0;
76 stream->avail_in = 0; 75 stream->avail_in = 0;
77 76
78 bytes = length;
79 do { 77 do {
80 if (stream->avail_in == 0 && k < b) { 78 if (stream->avail_in == 0 && k < b) {
81 avail = min(bytes, msblk->devblksize - offset); 79 int avail = min(length, msblk->devblksize - offset);
82 bytes -= avail; 80 length -= avail;
83 wait_on_buffer(bh[k]); 81 wait_on_buffer(bh[k]);
84 if (!buffer_uptodate(bh[k])) 82 if (!buffer_uptodate(bh[k]))
85 goto release_mutex; 83 goto release_mutex;
@@ -128,6 +126,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
128 goto release_mutex; 126 goto release_mutex;
129 } 127 }
130 128
129 if (k < b) {
130 ERROR("zlib_uncompress error, data remaining\n");
131 goto release_mutex;
132 }
133
131 length = stream->total_out; 134 length = stream->total_out;
132 mutex_unlock(&msblk->read_data_mutex); 135 mutex_unlock(&msblk->read_data_mutex);
133 return length; 136 return length;
diff --git a/fs/stat.c b/fs/stat.c
index 12e90e213900..d5c61cf2b703 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,11 +75,13 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
75 int error = -EINVAL; 75 int error = -EINVAL;
76 int lookup_flags = 0; 76 int lookup_flags = 0;
77 77
78 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) 78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0)
79 goto out; 79 goto out;
80 80
81 if (!(flag & AT_SYMLINK_NOFOLLOW)) 81 if (!(flag & AT_SYMLINK_NOFOLLOW))
82 lookup_flags |= LOOKUP_FOLLOW; 82 lookup_flags |= LOOKUP_FOLLOW;
83 if (flag & AT_NO_AUTOMOUNT)
84 lookup_flags |= LOOKUP_NO_AUTOMOUNT;
83 85
84 error = user_path_at(dfd, filename, lookup_flags, &path); 86 error = user_path_at(dfd, filename, lookup_flags, &path);
85 if (error) 87 if (error)
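AT_NO_AUTOMOUNT lets the stat family examine an automount point without triggering the automount. A userspace sketch; the fallback flag value mirrors the kernel ABI for libcs that do not define it yet:

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <sys/stat.h>

#ifndef AT_NO_AUTOMOUNT
#define AT_NO_AUTOMOUNT 0x800
#endif

int main(int argc, char **argv)
{
	struct stat st;

	if (argc < 2 || fstatat(AT_FDCWD, argv[1], &st, AT_NO_AUTOMOUNT) < 0) {
		perror("fstatat");
		return 1;
	}
	printf("ino %llu\n", (unsigned long long)st.st_ino);
	return 0;
}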
diff --git a/fs/super.c b/fs/super.c
index 823e061faa87..74e149efed81 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -767,13 +767,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
767{ 767{
768 struct block_device *bdev; 768 struct block_device *bdev;
769 struct super_block *s; 769 struct super_block *s;
770 fmode_t mode = FMODE_READ; 770 fmode_t mode = FMODE_READ | FMODE_EXCL;
771 int error = 0; 771 int error = 0;
772 772
773 if (!(flags & MS_RDONLY)) 773 if (!(flags & MS_RDONLY))
774 mode |= FMODE_WRITE; 774 mode |= FMODE_WRITE;
775 775
776 bdev = open_bdev_exclusive(dev_name, mode, fs_type); 776 bdev = blkdev_get_by_path(dev_name, mode, fs_type);
777 if (IS_ERR(bdev)) 777 if (IS_ERR(bdev))
778 return ERR_CAST(bdev); 778 return ERR_CAST(bdev);
779 779
@@ -802,13 +802,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
802 802
803 /* 803 /*
804 * s_umount nests inside bd_mutex during 804 * s_umount nests inside bd_mutex during
805 * __invalidate_device(). close_bdev_exclusive() 805 * __invalidate_device(). blkdev_put() acquires
806 * acquires bd_mutex and can't be called under 806 * bd_mutex and can't be called under s_umount. Drop
807 * s_umount. Drop s_umount temporarily. This is safe 807 * s_umount temporarily. This is safe as we're
808 * as we're holding an active reference. 808 * holding an active reference.
809 */ 809 */
810 up_write(&s->s_umount); 810 up_write(&s->s_umount);
811 close_bdev_exclusive(bdev, mode); 811 blkdev_put(bdev, mode);
812 down_write(&s->s_umount); 812 down_write(&s->s_umount);
813 } else { 813 } else {
814 char b[BDEVNAME_SIZE]; 814 char b[BDEVNAME_SIZE];
@@ -832,7 +832,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
832error_s: 832error_s:
833 error = PTR_ERR(s); 833 error = PTR_ERR(s);
834error_bdev: 834error_bdev:
835 close_bdev_exclusive(bdev, mode); 835 blkdev_put(bdev, mode);
836error: 836error:
837 return ERR_PTR(error); 837 return ERR_PTR(error);
838} 838}
@@ -863,7 +863,8 @@ void kill_block_super(struct super_block *sb)
863 bdev->bd_super = NULL; 863 bdev->bd_super = NULL;
864 generic_shutdown_super(sb); 864 generic_shutdown_super(sb);
865 sync_blockdev(bdev); 865 sync_blockdev(bdev);
866 close_bdev_exclusive(bdev, mode); 866 WARN_ON_ONCE(!(mode & FMODE_EXCL));
867 blkdev_put(bdev, mode | FMODE_EXCL);
867} 868}
868 869
869EXPORT_SYMBOL(kill_block_super); 870EXPORT_SYMBOL(kill_block_super);
@@ -1140,7 +1141,7 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1140 return mnt; 1141 return mnt;
1141 1142
1142 err: 1143 err:
1143 mntput_long(mnt); 1144 mntput(mnt);
1144 return ERR_PTR(err); 1145 return ERR_PTR(err);
1145} 1146}
1146 1147
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
index f4b67588b9d6..8c41feacbac5 100644
--- a/fs/sysfs/Kconfig
+++ b/fs/sysfs/Kconfig
@@ -1,5 +1,5 @@
1config SYSFS 1config SYSFS
2 bool "sysfs file system support" if EMBEDDED 2 bool "sysfs file system support" if EXPERT
3 default y 3 default y
4 help 4 help
5 The sysfs filesystem is a virtual filesystem that the kernel uses to 5 The sysfs filesystem is a virtual filesystem that the kernel uses to
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 442f34ff1af8..c8769dc222d8 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -165,10 +165,7 @@ int sysfs_merge_group(struct kobject *kobj,
165 struct attribute *const *attr; 165 struct attribute *const *attr;
166 int i; 166 int i;
167 167
168 if (grp) 168 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
169 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
170 else
171 dir_sd = sysfs_get(kobj->sd);
172 if (!dir_sd) 169 if (!dir_sd)
173 return -ENOENT; 170 return -ENOENT;
174 171
@@ -195,10 +192,7 @@ void sysfs_unmerge_group(struct kobject *kobj,
195 struct sysfs_dirent *dir_sd; 192 struct sysfs_dirent *dir_sd;
196 struct attribute *const *attr; 193 struct attribute *const *attr;
197 194
198 if (grp) 195 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
199 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
200 else
201 dir_sd = sysfs_get(kobj->sd);
202 if (dir_sd) { 196 if (dir_sd) {
203 for (attr = grp->attrs; *attr; ++attr) 197 for (attr = grp->attrs; *attr; ++attr)
204 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); 198 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 30ac27345586..0a12eb89cd32 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -19,6 +19,7 @@
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/sysfs.h>
22#include <linux/xattr.h> 23#include <linux/xattr.h>
23#include <linux/security.h> 24#include <linux/security.h>
24#include "sysfs.h" 25#include "sysfs.h"
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index ffaaa816bfba..3d28af31d863 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/lockdep.h> 11#include <linux/lockdep.h>
12#include <linux/kobject_ns.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13 14
14struct sysfs_open_dirent; 15struct sysfs_open_dirent;
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index b5e68da2db32..b427b1208c26 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -48,7 +48,6 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, st
48 struct inode * inode = NULL; 48 struct inode * inode = NULL;
49 ino_t ino; 49 ino_t ino;
50 50
51 d_set_d_op(dentry, dir->i_sb->s_root->d_op);
52 if (dentry->d_name.len > SYSV_NAMELEN) 51 if (dentry->d_name.len > SYSV_NAMELEN)
53 return ERR_PTR(-ENAMETOOLONG); 52 return ERR_PTR(-ENAMETOOLONG);
54 ino = sysv_inode_by_name(dentry); 53 ino = sysv_inode_by_name(dentry);
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 76712aefc4ab..f60c196913ea 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -332,6 +332,10 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
332 sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type; 332 sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type;
333 /* set up enough so that it can read an inode */ 333 /* set up enough so that it can read an inode */
334 sb->s_op = &sysv_sops; 334 sb->s_op = &sysv_sops;
335 if (sbi->s_forced_ro)
336 sb->s_flags |= MS_RDONLY;
337 if (sbi->s_truncate)
338 sb->s_d_op = &sysv_dentry_operations;
335 root_inode = sysv_iget(sb, SYSV_ROOT_INO); 339 root_inode = sysv_iget(sb, SYSV_ROOT_INO);
336 if (IS_ERR(root_inode)) { 340 if (IS_ERR(root_inode)) {
337 printk("SysV FS: get root inode failed\n"); 341 printk("SysV FS: get root inode failed\n");
@@ -343,10 +347,6 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
343 printk("SysV FS: get root dentry failed\n"); 347 printk("SysV FS: get root dentry failed\n");
344 return 0; 348 return 0;
345 } 349 }
346 if (sbi->s_forced_ro)
347 sb->s_flags |= MS_RDONLY;
348 if (sbi->s_truncate)
349 d_set_d_op(sb->s_root, &sysv_dentry_operations);
350 return 1; 350 return 1;
351} 351}
352 352
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index f8def3c8ea4c..0e0e99bd6bce 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,6 +1,5 @@
1config UDF_FS 1config UDF_FS
2 tristate "UDF file system support" 2 tristate "UDF file system support"
3 depends on BKL # needs serious work to remove
4 select CRC_ITU_T 3 select CRC_ITU_T
5 help 4 help
6 This is the new file system used on some CD-ROMs and DVDs. Say Y if 5 This is the new file system used on some CD-ROMs and DVDs. Say Y if
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index b608efaa4cee..306ee39ef2c3 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -157,10 +157,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
157 udf_debug("bit %ld already set\n", bit + i); 157 udf_debug("bit %ld already set\n", bit + i);
158 udf_debug("byte=%2x\n", 158 udf_debug("byte=%2x\n",
159 ((char *)bh->b_data)[(bit + i) >> 3]); 159 ((char *)bh->b_data)[(bit + i) >> 3]);
160 } else {
161 udf_add_free_space(sb, sbi->s_partition, 1);
162 } 160 }
163 } 161 }
162 udf_add_free_space(sb, sbi->s_partition, count);
164 mark_buffer_dirty(bh); 163 mark_buffer_dirty(bh);
165 if (overflow) { 164 if (overflow) {
166 block += count; 165 block += count;
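
The balloc.c hunk batches the free-space accounting: one udf_add_free_space() call for the whole extent instead of one per bit, with already-set bits (a corruption indicator, since a set bit in the UDF space bitmap means "free") now only logged but still included in the count. In miniature, under hypothetical names:

	#include <linux/bitops.h>

	/* Freeing marks bits set; a bit that was already set means the
	 * block was already free. Hedged sketch, not the UDF code. */
	static void free_range(unsigned long *map, long bit, long count,
			       long *free_count)
	{
		long i;

		for (i = 0; i < count; i++)
			if (test_and_set_bit(bit + i, map))
				/* already free: report only, keep going */;
		*free_count += count;	/* single batched update, as above */
	}
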
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 51552bf50225..eb8bfe2b89a5 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -30,7 +30,6 @@
30#include <linux/errno.h> 30#include <linux/errno.h>
31#include <linux/mm.h> 31#include <linux/mm.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/smp_lock.h>
34#include <linux/buffer_head.h> 33#include <linux/buffer_head.h>
35 34
36#include "udf_i.h" 35#include "udf_i.h"
@@ -190,18 +189,14 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
190 struct inode *dir = filp->f_path.dentry->d_inode; 189 struct inode *dir = filp->f_path.dentry->d_inode;
191 int result; 190 int result;
192 191
193 lock_kernel();
194
195 if (filp->f_pos == 0) { 192 if (filp->f_pos == 0) {
196 if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) { 193 if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) {
197 unlock_kernel();
198 return 0; 194 return 0;
199 } 195 }
200 filp->f_pos++; 196 filp->f_pos++;
201 } 197 }
202 198
203 result = do_udf_readdir(dir, filp, filldir, dirent); 199 result = do_udf_readdir(dir, filp, filldir, dirent);
204 unlock_kernel();
205 return result; 200 return result;
206} 201}
207 202
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 66b9e7e7e4c5..89c78486cbbe 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -32,7 +32,6 @@
32#include <linux/string.h> /* memset */ 32#include <linux/string.h> /* memset */
33#include <linux/capability.h> 33#include <linux/capability.h>
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 35#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
38#include <linux/aio.h> 37#include <linux/aio.h>
@@ -114,6 +113,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
114 size_t count = iocb->ki_left; 113 size_t count = iocb->ki_left;
115 struct udf_inode_info *iinfo = UDF_I(inode); 114 struct udf_inode_info *iinfo = UDF_I(inode);
116 115
116 down_write(&iinfo->i_data_sem);
117 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 117 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
118 if (file->f_flags & O_APPEND) 118 if (file->f_flags & O_APPEND)
119 pos = inode->i_size; 119 pos = inode->i_size;
@@ -126,6 +126,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
126 udf_expand_file_adinicb(inode, pos + count, &err); 126 udf_expand_file_adinicb(inode, pos + count, &err);
127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
128 udf_debug("udf_expand_adinicb: err=%d\n", err); 128 udf_debug("udf_expand_adinicb: err=%d\n", err);
129 up_write(&iinfo->i_data_sem);
129 return err; 130 return err;
130 } 131 }
131 } else { 132 } else {
@@ -135,6 +136,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
135 iinfo->i_lenAlloc = inode->i_size; 136 iinfo->i_lenAlloc = inode->i_size;
136 } 137 }
137 } 138 }
139 up_write(&iinfo->i_data_sem);
138 140
139 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); 141 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos);
140 if (retval > 0) 142 if (retval > 0)
@@ -149,8 +151,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
149 long old_block, new_block; 151 long old_block, new_block;
150 int result = -EINVAL; 152 int result = -EINVAL;
151 153
152 lock_kernel();
153
154 if (file_permission(filp, MAY_READ) != 0) { 154 if (file_permission(filp, MAY_READ) != 0) {
155 udf_debug("no permission to access inode %lu\n", inode->i_ino); 155 udf_debug("no permission to access inode %lu\n", inode->i_ino);
156 result = -EPERM; 156 result = -EPERM;
@@ -196,7 +196,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
196 } 196 }
197 197
198out: 198out:
199 unlock_kernel();
200 return result; 199 return result;
201} 200}
202 201
@@ -204,10 +203,10 @@ static int udf_release_file(struct inode *inode, struct file *filp)
204{ 203{
205 if (filp->f_mode & FMODE_WRITE) { 204 if (filp->f_mode & FMODE_WRITE) {
206 mutex_lock(&inode->i_mutex); 205 mutex_lock(&inode->i_mutex);
207 lock_kernel(); 206 down_write(&UDF_I(inode)->i_data_sem);
208 udf_discard_prealloc(inode); 207 udf_discard_prealloc(inode);
209 udf_truncate_tail_extent(inode); 208 udf_truncate_tail_extent(inode);
210 unlock_kernel(); 209 up_write(&UDF_I(inode)->i_data_sem);
211 mutex_unlock(&inode->i_mutex); 210 mutex_unlock(&inode->i_mutex);
212 } 211 }
213 return 0; 212 return 0;
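
The file.c conversion is representative of the whole series: the BKL around in-ICB state becomes the per-inode i_data_sem, and the early-return error path in the middle of the function must now unlock explicitly. The shape, hedged and with hypothetical helpers:

	#include <linux/rwsem.h>

	struct my_inode_info { struct rw_semaphore i_data_sem; /* ... */ };
	/* hypothetical helpers standing in for the ICB checks above */
	int stored_in_inode(struct my_inode_info *ii);
	size_t inline_capacity(struct my_inode_info *ii);
	int expand_to_blocks(struct my_inode_info *ii, loff_t new_size);

	static ssize_t write_begin_sketch(struct my_inode_info *ii,
					  loff_t pos, size_t count)
	{
		down_write(&ii->i_data_sem);	/* protects in-inode layout */
		if (stored_in_inode(ii) && pos + count > inline_capacity(ii)) {
			int err = expand_to_blocks(ii, pos + count);
			if (err) {
				up_write(&ii->i_data_sem); /* every exit unlocks */
				return err;
			}
		}
		up_write(&ii->i_data_sem);
		return 0;	/* the generic write path runs without the rwsem */
	}
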
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 75d9304d0dc3..6fb7e0adcda0 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -92,28 +92,19 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
92 return NULL; 92 return NULL;
93 } 93 }
94 94
95 mutex_lock(&sbi->s_alloc_mutex);
96 if (sbi->s_lvid_bh) { 95 if (sbi->s_lvid_bh) {
97 struct logicalVolIntegrityDesc *lvid = 96 struct logicalVolIntegrityDescImpUse *lvidiu;
98 (struct logicalVolIntegrityDesc *) 97
99 sbi->s_lvid_bh->b_data; 98 iinfo->i_unique = lvid_get_unique_id(sb);
100 struct logicalVolIntegrityDescImpUse *lvidiu = 99 mutex_lock(&sbi->s_alloc_mutex);
101 udf_sb_lvidiu(sbi); 100 lvidiu = udf_sb_lvidiu(sbi);
102 struct logicalVolHeaderDesc *lvhd;
103 uint64_t uniqueID;
104 lvhd = (struct logicalVolHeaderDesc *)
105 (lvid->logicalVolContentsUse);
106 if (S_ISDIR(mode)) 101 if (S_ISDIR(mode))
107 le32_add_cpu(&lvidiu->numDirs, 1); 102 le32_add_cpu(&lvidiu->numDirs, 1);
108 else 103 else
109 le32_add_cpu(&lvidiu->numFiles, 1); 104 le32_add_cpu(&lvidiu->numFiles, 1);
110 iinfo->i_unique = uniqueID = le64_to_cpu(lvhd->uniqueID);
111 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
112 uniqueID += 16;
113 lvhd->uniqueID = cpu_to_le64(uniqueID);
114 udf_updated_lvid(sb); 105 udf_updated_lvid(sb);
106 mutex_unlock(&sbi->s_alloc_mutex);
115 } 107 }
116 mutex_unlock(&sbi->s_alloc_mutex);
117 108
118 inode_init_owner(inode, dir, mode); 109 inode_init_owner(inode, dir, mode);
119 110
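
The open-coded uniqueID bumping in udf_new_inode() moves into lvid_get_unique_id() (added in the super.c hunks below), shrinking the s_alloc_mutex hold time to the LVID update itself. The helper keeps the UDF quirk that the low 32 bits of uniqueID skip 0-15 on wraparound, since those values are reserved by the spec; a worked check:

	#include <stdint.h>

	static uint64_t next_unique(uint64_t id)
	{
		if (!(++id & 0xFFFFFFFFULL))	/* low 32 bits wrapped to zero */
			id += 16;		/* skip reserved IDs 0..15 */
		return id;
	}
	/* next_unique(0xFFFFFFFF) == 0x100000010, not 0x100000000 */
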
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index fc48f37aa2dd..c6a2e782b97b 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -31,7 +31,6 @@
31 31
32#include "udfdecl.h" 32#include "udfdecl.h"
33#include <linux/mm.h> 33#include <linux/mm.h>
34#include <linux/smp_lock.h>
35#include <linux/module.h> 34#include <linux/module.h>
36#include <linux/pagemap.h> 35#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
@@ -51,6 +50,7 @@ MODULE_LICENSE("GPL");
51static mode_t udf_convert_permissions(struct fileEntry *); 50static mode_t udf_convert_permissions(struct fileEntry *);
52static int udf_update_inode(struct inode *, int); 51static int udf_update_inode(struct inode *, int);
53static void udf_fill_inode(struct inode *, struct buffer_head *); 52static void udf_fill_inode(struct inode *, struct buffer_head *);
53static int udf_sync_inode(struct inode *inode);
54static int udf_alloc_i_data(struct inode *inode, size_t size); 54static int udf_alloc_i_data(struct inode *inode, size_t size);
55static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, 55static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
56 sector_t *, int *); 56 sector_t *, int *);
@@ -79,9 +79,7 @@ void udf_evict_inode(struct inode *inode)
79 want_delete = 1; 79 want_delete = 1;
80 inode->i_size = 0; 80 inode->i_size = 0;
81 udf_truncate(inode); 81 udf_truncate(inode);
82 lock_kernel();
83 udf_update_inode(inode, IS_SYNC(inode)); 82 udf_update_inode(inode, IS_SYNC(inode));
84 unlock_kernel();
85 } 83 }
86 invalidate_inode_buffers(inode); 84 invalidate_inode_buffers(inode);
87 end_writeback(inode); 85 end_writeback(inode);
@@ -97,9 +95,7 @@ void udf_evict_inode(struct inode *inode)
97 kfree(iinfo->i_ext.i_data); 95 kfree(iinfo->i_ext.i_data);
98 iinfo->i_ext.i_data = NULL; 96 iinfo->i_ext.i_data = NULL;
99 if (want_delete) { 97 if (want_delete) {
100 lock_kernel();
101 udf_free_inode(inode); 98 udf_free_inode(inode);
102 unlock_kernel();
103 } 99 }
104} 100}
105 101
@@ -302,10 +298,9 @@ static int udf_get_block(struct inode *inode, sector_t block,
302 err = -EIO; 298 err = -EIO;
303 new = 0; 299 new = 0;
304 bh = NULL; 300 bh = NULL;
305
306 lock_kernel();
307
308 iinfo = UDF_I(inode); 301 iinfo = UDF_I(inode);
302
303 down_write(&iinfo->i_data_sem);
309 if (block == iinfo->i_next_alloc_block + 1) { 304 if (block == iinfo->i_next_alloc_block + 1) {
310 iinfo->i_next_alloc_block++; 305 iinfo->i_next_alloc_block++;
311 iinfo->i_next_alloc_goal++; 306 iinfo->i_next_alloc_goal++;
@@ -324,7 +319,7 @@ static int udf_get_block(struct inode *inode, sector_t block,
324 map_bh(bh_result, inode->i_sb, phys); 319 map_bh(bh_result, inode->i_sb, phys);
325 320
326abort: 321abort:
327 unlock_kernel(); 322 up_write(&iinfo->i_data_sem);
328 return err; 323 return err;
329} 324}
330 325
@@ -1022,16 +1017,16 @@ void udf_truncate(struct inode *inode)
1022 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 1017 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1023 return; 1018 return;
1024 1019
1025 lock_kernel();
1026 iinfo = UDF_I(inode); 1020 iinfo = UDF_I(inode);
1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1021 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1022 down_write(&iinfo->i_data_sem);
1028 if (inode->i_sb->s_blocksize < 1023 if (inode->i_sb->s_blocksize <
1029 (udf_file_entry_alloc_offset(inode) + 1024 (udf_file_entry_alloc_offset(inode) +
1030 inode->i_size)) { 1025 inode->i_size)) {
1031 udf_expand_file_adinicb(inode, inode->i_size, &err); 1026 udf_expand_file_adinicb(inode, inode->i_size, &err);
1032 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1033 inode->i_size = iinfo->i_lenAlloc; 1028 inode->i_size = iinfo->i_lenAlloc;
1034 unlock_kernel(); 1029 up_write(&iinfo->i_data_sem);
1035 return; 1030 return;
1036 } else 1031 } else
1037 udf_truncate_extents(inode); 1032 udf_truncate_extents(inode);
@@ -1042,10 +1037,13 @@ void udf_truncate(struct inode *inode)
1042 offset - udf_file_entry_alloc_offset(inode)); 1037 offset - udf_file_entry_alloc_offset(inode));
1043 iinfo->i_lenAlloc = inode->i_size; 1038 iinfo->i_lenAlloc = inode->i_size;
1044 } 1039 }
1040 up_write(&iinfo->i_data_sem);
1045 } else { 1041 } else {
1046 block_truncate_page(inode->i_mapping, inode->i_size, 1042 block_truncate_page(inode->i_mapping, inode->i_size,
1047 udf_get_block); 1043 udf_get_block);
1044 down_write(&iinfo->i_data_sem);
1048 udf_truncate_extents(inode); 1045 udf_truncate_extents(inode);
1046 up_write(&iinfo->i_data_sem);
1049 } 1047 }
1050 1048
1051 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 1049 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
@@ -1053,7 +1051,6 @@ void udf_truncate(struct inode *inode)
1053 udf_sync_inode(inode); 1051 udf_sync_inode(inode);
1054 else 1052 else
1055 mark_inode_dirty(inode); 1053 mark_inode_dirty(inode);
1056 unlock_kernel();
1057} 1054}
1058 1055
1059static void __udf_read_inode(struct inode *inode) 1056static void __udf_read_inode(struct inode *inode)
@@ -1202,6 +1199,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1202 return; 1199 return;
1203 } 1200 }
1204 1201
1202 read_lock(&sbi->s_cred_lock);
1205 inode->i_uid = le32_to_cpu(fe->uid); 1203 inode->i_uid = le32_to_cpu(fe->uid);
1206 if (inode->i_uid == -1 || 1204 if (inode->i_uid == -1 ||
1207 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || 1205 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) ||
@@ -1214,13 +1212,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1214 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) 1212 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET))
1215 inode->i_gid = UDF_SB(inode->i_sb)->s_gid; 1213 inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
1216 1214
1217 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1218 if (!inode->i_nlink)
1219 inode->i_nlink = 1;
1220
1221 inode->i_size = le64_to_cpu(fe->informationLength);
1222 iinfo->i_lenExtents = inode->i_size;
1223
1224 if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && 1215 if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
1225 sbi->s_fmode != UDF_INVALID_MODE) 1216 sbi->s_fmode != UDF_INVALID_MODE)
1226 inode->i_mode = sbi->s_fmode; 1217 inode->i_mode = sbi->s_fmode;
@@ -1230,6 +1221,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1230 else 1221 else
1231 inode->i_mode = udf_convert_permissions(fe); 1222 inode->i_mode = udf_convert_permissions(fe);
1232 inode->i_mode &= ~sbi->s_umask; 1223 inode->i_mode &= ~sbi->s_umask;
1224 read_unlock(&sbi->s_cred_lock);
1225
1226 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1227 if (!inode->i_nlink)
1228 inode->i_nlink = 1;
1229
1230 inode->i_size = le64_to_cpu(fe->informationLength);
1231 iinfo->i_lenExtents = inode->i_size;
1233 1232
1234 if (iinfo->i_efe == 0) { 1233 if (iinfo->i_efe == 0) {
1235 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << 1234 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1373,16 +1372,10 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
1373 1372
1374int udf_write_inode(struct inode *inode, struct writeback_control *wbc) 1373int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
1375{ 1374{
1376 int ret; 1375 return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1377
1378 lock_kernel();
1379 ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1380 unlock_kernel();
1381
1382 return ret;
1383} 1376}
1384 1377
1385int udf_sync_inode(struct inode *inode) 1378static int udf_sync_inode(struct inode *inode)
1386{ 1379{
1387 return udf_update_inode(inode, 1); 1380 return udf_update_inode(inode, 1);
1388} 1381}
@@ -2048,7 +2041,7 @@ long udf_block_map(struct inode *inode, sector_t block)
2048 struct extent_position epos = {}; 2041 struct extent_position epos = {};
2049 int ret; 2042 int ret;
2050 2043
2051 lock_kernel(); 2044 down_read(&UDF_I(inode)->i_data_sem);
2052 2045
2053 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == 2046 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
2054 (EXT_RECORDED_ALLOCATED >> 30)) 2047 (EXT_RECORDED_ALLOCATED >> 30))
@@ -2056,7 +2049,7 @@ long udf_block_map(struct inode *inode, sector_t block)
2056 else 2049 else
2057 ret = 0; 2050 ret = 0;
2058 2051
2059 unlock_kernel(); 2052 up_read(&UDF_I(inode)->i_data_sem);
2060 brelse(epos.bh); 2053 brelse(epos.bh);
2061 2054
2062 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) 2055 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV))
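
Note the asymmetry the inode.c hunks establish: udf_block_map() only walks extents, so it takes i_data_sem shared, while udf_get_block() may allocate and takes it exclusive. Reduced to its essentials (names hypothetical):

	#include <linux/rwsem.h>

	struct extents;				/* hypothetical */
	long lookup_extent(struct extents *e, long blk);
	long alloc_extent(struct extents *e, long blk);

	static long map_block(struct rw_semaphore *sem, struct extents *e,
			      long blk)
	{
		long phys;

		down_read(sem);		/* extent list is only read */
		phys = lookup_extent(e, blk);
		up_read(sem);
		return phys;
	}

	static long get_block(struct rw_semaphore *sem, struct extents *e,
			      long blk)
	{
		long phys;

		down_write(sem);	/* may append to the extent list */
		phys = alloc_extent(e, blk);
		up_write(sem);
		return phys;
	}
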
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 6d8dc02baebb..2be0f9eb86d2 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/sched.h> 31#include <linux/sched.h>
33#include <linux/crc-itu-t.h> 32#include <linux/crc-itu-t.h>
@@ -228,10 +227,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
228 } 227 }
229 228
230 if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && 229 if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) &&
231 isdotdot) { 230 isdotdot)
232 brelse(epos.bh); 231 goto out_ok;
233 return fi;
234 }
235 232
236 if (!lfi) 233 if (!lfi)
237 continue; 234 continue;
@@ -263,7 +260,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
263 if (dentry->d_name.len > UDF_NAME_LEN - 2) 260 if (dentry->d_name.len > UDF_NAME_LEN - 2)
264 return ERR_PTR(-ENAMETOOLONG); 261 return ERR_PTR(-ENAMETOOLONG);
265 262
266 lock_kernel();
267#ifdef UDF_RECOVERY 263#ifdef UDF_RECOVERY
268 /* temporary shorthand for specifying files by inode number */ 264 /* temporary shorthand for specifying files by inode number */
269 if (!strncmp(dentry->d_name.name, ".B=", 3)) { 265 if (!strncmp(dentry->d_name.name, ".B=", 3)) {
@@ -275,7 +271,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
275 }; 271 };
276 inode = udf_iget(dir->i_sb, lb); 272 inode = udf_iget(dir->i_sb, lb);
277 if (!inode) { 273 if (!inode) {
278 unlock_kernel();
279 return ERR_PTR(-EACCES); 274 return ERR_PTR(-EACCES);
280 } 275 }
281 } else 276 } else
@@ -291,11 +286,9 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
291 loc = lelb_to_cpu(cfi.icb.extLocation); 286 loc = lelb_to_cpu(cfi.icb.extLocation);
292 inode = udf_iget(dir->i_sb, &loc); 287 inode = udf_iget(dir->i_sb, &loc);
293 if (!inode) { 288 if (!inode) {
294 unlock_kernel();
295 return ERR_PTR(-EACCES); 289 return ERR_PTR(-EACCES);
296 } 290 }
297 } 291 }
298 unlock_kernel();
299 292
300 return d_splice_alias(inode, dentry); 293 return d_splice_alias(inode, dentry);
301} 294}
@@ -476,15 +469,19 @@ add:
476 f_pos >> dir->i_sb->s_blocksize_bits, 1, err); 469 f_pos >> dir->i_sb->s_blocksize_bits, 1, err);
477 if (!fibh->ebh) 470 if (!fibh->ebh)
478 goto out_err; 471 goto out_err;
472 /* Extents could have been merged, invalidate our position */
473 brelse(epos.bh);
474 epos.bh = NULL;
475 epos.block = dinfo->i_location;
476 epos.offset = udf_file_entry_alloc_offset(dir);
479 477
480 if (!fibh->soffset) { 478 if (!fibh->soffset) {
481 if (udf_next_aext(dir, &epos, &eloc, &elen, 1) == 479 /* Find the freshly allocated block */
482 (EXT_RECORDED_ALLOCATED >> 30)) { 480 while (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
483 block = eloc.logicalBlockNum + ((elen - 1) >> 481 (EXT_RECORDED_ALLOCATED >> 30))
482 ;
483 block = eloc.logicalBlockNum + ((elen - 1) >>
484 dir->i_sb->s_blocksize_bits); 484 dir->i_sb->s_blocksize_bits);
485 } else
486 block++;
487
488 brelse(fibh->sbh); 485 brelse(fibh->sbh);
489 fibh->sbh = fibh->ebh; 486 fibh->sbh = fibh->ebh;
490 fi = (struct fileIdentDesc *)(fibh->sbh->b_data); 487 fi = (struct fileIdentDesc *)(fibh->sbh->b_data);
@@ -562,10 +559,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
562 int err; 559 int err;
563 struct udf_inode_info *iinfo; 560 struct udf_inode_info *iinfo;
564 561
565 lock_kernel();
566 inode = udf_new_inode(dir, mode, &err); 562 inode = udf_new_inode(dir, mode, &err);
567 if (!inode) { 563 if (!inode) {
568 unlock_kernel();
569 return err; 564 return err;
570 } 565 }
571 566
@@ -583,7 +578,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
583 inode->i_nlink--; 578 inode->i_nlink--;
584 mark_inode_dirty(inode); 579 mark_inode_dirty(inode);
585 iput(inode); 580 iput(inode);
586 unlock_kernel();
587 return err; 581 return err;
588 } 582 }
589 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 583 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -596,7 +590,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
596 if (fibh.sbh != fibh.ebh) 590 if (fibh.sbh != fibh.ebh)
597 brelse(fibh.ebh); 591 brelse(fibh.ebh);
598 brelse(fibh.sbh); 592 brelse(fibh.sbh);
599 unlock_kernel();
600 d_instantiate(dentry, inode); 593 d_instantiate(dentry, inode);
601 594
602 return 0; 595 return 0;
@@ -614,7 +607,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
614 if (!old_valid_dev(rdev)) 607 if (!old_valid_dev(rdev))
615 return -EINVAL; 608 return -EINVAL;
616 609
617 lock_kernel();
618 err = -EIO; 610 err = -EIO;
619 inode = udf_new_inode(dir, mode, &err); 611 inode = udf_new_inode(dir, mode, &err);
620 if (!inode) 612 if (!inode)
@@ -627,7 +619,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
627 inode->i_nlink--; 619 inode->i_nlink--;
628 mark_inode_dirty(inode); 620 mark_inode_dirty(inode);
629 iput(inode); 621 iput(inode);
630 unlock_kernel();
631 return err; 622 return err;
632 } 623 }
633 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 624 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -646,7 +637,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
646 err = 0; 637 err = 0;
647 638
648out: 639out:
649 unlock_kernel();
650 return err; 640 return err;
651} 641}
652 642
@@ -659,7 +649,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
659 struct udf_inode_info *dinfo = UDF_I(dir); 649 struct udf_inode_info *dinfo = UDF_I(dir);
660 struct udf_inode_info *iinfo; 650 struct udf_inode_info *iinfo;
661 651
662 lock_kernel();
663 err = -EMLINK; 652 err = -EMLINK;
664 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 653 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
665 goto out; 654 goto out;
@@ -712,7 +701,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
712 err = 0; 701 err = 0;
713 702
714out: 703out:
715 unlock_kernel();
716 return err; 704 return err;
717} 705}
718 706
@@ -794,7 +782,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
794 struct kernel_lb_addr tloc; 782 struct kernel_lb_addr tloc;
795 783
796 retval = -ENOENT; 784 retval = -ENOENT;
797 lock_kernel();
798 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 785 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
799 if (!fi) 786 if (!fi)
800 goto out; 787 goto out;
@@ -826,7 +813,6 @@ end_rmdir:
826 brelse(fibh.sbh); 813 brelse(fibh.sbh);
827 814
828out: 815out:
829 unlock_kernel();
830 return retval; 816 return retval;
831} 817}
832 818
@@ -840,7 +826,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
840 struct kernel_lb_addr tloc; 826 struct kernel_lb_addr tloc;
841 827
842 retval = -ENOENT; 828 retval = -ENOENT;
843 lock_kernel();
844 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 829 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
845 if (!fi) 830 if (!fi)
846 goto out; 831 goto out;
@@ -870,7 +855,6 @@ end_unlink:
870 brelse(fibh.sbh); 855 brelse(fibh.sbh);
871 856
872out: 857out:
873 unlock_kernel();
874 return retval; 858 return retval;
875} 859}
876 860
@@ -890,21 +874,21 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
890 int block; 874 int block;
891 unsigned char *name = NULL; 875 unsigned char *name = NULL;
892 int namelen; 876 int namelen;
893 struct buffer_head *bh;
894 struct udf_inode_info *iinfo; 877 struct udf_inode_info *iinfo;
878 struct super_block *sb = dir->i_sb;
895 879
896 lock_kernel();
897 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); 880 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
898 if (!inode) 881 if (!inode)
899 goto out; 882 goto out;
900 883
884 iinfo = UDF_I(inode);
885 down_write(&iinfo->i_data_sem);
901 name = kmalloc(UDF_NAME_LEN, GFP_NOFS); 886 name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
902 if (!name) { 887 if (!name) {
903 err = -ENOMEM; 888 err = -ENOMEM;
904 goto out_no_entry; 889 goto out_no_entry;
905 } 890 }
906 891
907 iinfo = UDF_I(inode);
908 inode->i_data.a_ops = &udf_symlink_aops; 892 inode->i_data.a_ops = &udf_symlink_aops;
909 inode->i_op = &udf_symlink_inode_operations; 893 inode->i_op = &udf_symlink_inode_operations;
910 894
@@ -912,7 +896,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
912 struct kernel_lb_addr eloc; 896 struct kernel_lb_addr eloc;
913 uint32_t bsize; 897 uint32_t bsize;
914 898
915 block = udf_new_block(inode->i_sb, inode, 899 block = udf_new_block(sb, inode,
916 iinfo->i_location.partitionReferenceNum, 900 iinfo->i_location.partitionReferenceNum,
917 iinfo->i_location.logicalBlockNum, &err); 901 iinfo->i_location.logicalBlockNum, &err);
918 if (!block) 902 if (!block)
@@ -923,17 +907,17 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
923 eloc.logicalBlockNum = block; 907 eloc.logicalBlockNum = block;
924 eloc.partitionReferenceNum = 908 eloc.partitionReferenceNum =
925 iinfo->i_location.partitionReferenceNum; 909 iinfo->i_location.partitionReferenceNum;
926 bsize = inode->i_sb->s_blocksize; 910 bsize = sb->s_blocksize;
927 iinfo->i_lenExtents = bsize; 911 iinfo->i_lenExtents = bsize;
928 udf_add_aext(inode, &epos, &eloc, bsize, 0); 912 udf_add_aext(inode, &epos, &eloc, bsize, 0);
929 brelse(epos.bh); 913 brelse(epos.bh);
930 914
931 block = udf_get_pblock(inode->i_sb, block, 915 block = udf_get_pblock(sb, block,
932 iinfo->i_location.partitionReferenceNum, 916 iinfo->i_location.partitionReferenceNum,
933 0); 917 0);
934 epos.bh = udf_tgetblk(inode->i_sb, block); 918 epos.bh = udf_tgetblk(sb, block);
935 lock_buffer(epos.bh); 919 lock_buffer(epos.bh);
936 memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize); 920 memset(epos.bh->b_data, 0x00, bsize);
937 set_buffer_uptodate(epos.bh); 921 set_buffer_uptodate(epos.bh);
938 unlock_buffer(epos.bh); 922 unlock_buffer(epos.bh);
939 mark_buffer_dirty_inode(epos.bh, inode); 923 mark_buffer_dirty_inode(epos.bh, inode);
@@ -941,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
941 } else 925 } else
942 ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr; 926 ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
943 927
944 eoffset = inode->i_sb->s_blocksize - udf_ext0_offset(inode); 928 eoffset = sb->s_blocksize - udf_ext0_offset(inode);
945 pc = (struct pathComponent *)ea; 929 pc = (struct pathComponent *)ea;
946 930
947 if (*symname == '/') { 931 if (*symname == '/') {
@@ -981,7 +965,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
981 } 965 }
982 966
983 if (pc->componentType == 5) { 967 if (pc->componentType == 5) {
984 namelen = udf_put_filename(inode->i_sb, compstart, name, 968 namelen = udf_put_filename(sb, compstart, name,
985 symname - compstart); 969 symname - compstart);
986 if (!namelen) 970 if (!namelen)
987 goto out_no_entry; 971 goto out_no_entry;
@@ -1015,27 +999,16 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1015 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 999 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1016 if (!fi) 1000 if (!fi)
1017 goto out_no_entry; 1001 goto out_no_entry;
1018 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 1002 cfi.icb.extLength = cpu_to_le32(sb->s_blocksize);
1019 cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); 1003 cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
1020 bh = UDF_SB(inode->i_sb)->s_lvid_bh; 1004 if (UDF_SB(inode->i_sb)->s_lvid_bh) {
1021 if (bh) {
1022 struct logicalVolIntegrityDesc *lvid =
1023 (struct logicalVolIntegrityDesc *)bh->b_data;
1024 struct logicalVolHeaderDesc *lvhd;
1025 uint64_t uniqueID;
1026 lvhd = (struct logicalVolHeaderDesc *)
1027 lvid->logicalVolContentsUse;
1028 uniqueID = le64_to_cpu(lvhd->uniqueID);
1029 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = 1005 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
1030 cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); 1006 cpu_to_le32(lvid_get_unique_id(sb));
1031 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
1032 uniqueID += 16;
1033 lvhd->uniqueID = cpu_to_le64(uniqueID);
1034 mark_buffer_dirty(bh);
1035 } 1007 }
1036 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); 1008 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
1037 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1009 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
1038 mark_inode_dirty(dir); 1010 mark_inode_dirty(dir);
1011 up_write(&iinfo->i_data_sem);
1039 if (fibh.sbh != fibh.ebh) 1012 if (fibh.sbh != fibh.ebh)
1040 brelse(fibh.ebh); 1013 brelse(fibh.ebh);
1041 brelse(fibh.sbh); 1014 brelse(fibh.sbh);
@@ -1044,10 +1017,10 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1044 1017
1045out: 1018out:
1046 kfree(name); 1019 kfree(name);
1047 unlock_kernel();
1048 return err; 1020 return err;
1049 1021
1050out_no_entry: 1022out_no_entry:
1023 up_write(&iinfo->i_data_sem);
1051 inode_dec_link_count(inode); 1024 inode_dec_link_count(inode);
1052 iput(inode); 1025 iput(inode);
1053 goto out; 1026 goto out;
@@ -1060,36 +1033,20 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1060 struct udf_fileident_bh fibh; 1033 struct udf_fileident_bh fibh;
1061 struct fileIdentDesc cfi, *fi; 1034 struct fileIdentDesc cfi, *fi;
1062 int err; 1035 int err;
1063 struct buffer_head *bh;
1064 1036
1065 lock_kernel();
1066 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1037 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1067 unlock_kernel();
1068 return -EMLINK; 1038 return -EMLINK;
1069 } 1039 }
1070 1040
1071 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 1041 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1072 if (!fi) { 1042 if (!fi) {
1073 unlock_kernel();
1074 return err; 1043 return err;
1075 } 1044 }
1076 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 1045 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
1077 cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); 1046 cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location);
1078 bh = UDF_SB(inode->i_sb)->s_lvid_bh; 1047 if (UDF_SB(inode->i_sb)->s_lvid_bh) {
1079 if (bh) {
1080 struct logicalVolIntegrityDesc *lvid =
1081 (struct logicalVolIntegrityDesc *)bh->b_data;
1082 struct logicalVolHeaderDesc *lvhd;
1083 uint64_t uniqueID;
1084 lvhd = (struct logicalVolHeaderDesc *)
1085 (lvid->logicalVolContentsUse);
1086 uniqueID = le64_to_cpu(lvhd->uniqueID);
1087 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = 1048 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
1088 cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); 1049 cpu_to_le32(lvid_get_unique_id(inode->i_sb));
1089 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
1090 uniqueID += 16;
1091 lvhd->uniqueID = cpu_to_le64(uniqueID);
1092 mark_buffer_dirty(bh);
1093 } 1050 }
1094 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); 1051 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
1095 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1052 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
@@ -1103,7 +1060,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1103 mark_inode_dirty(inode); 1060 mark_inode_dirty(inode);
1104 ihold(inode); 1061 ihold(inode);
1105 d_instantiate(dentry, inode); 1062 d_instantiate(dentry, inode);
1106 unlock_kernel();
1107 1063
1108 return 0; 1064 return 0;
1109} 1065}
@@ -1124,7 +1080,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1124 struct kernel_lb_addr tloc; 1080 struct kernel_lb_addr tloc;
1125 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1081 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1126 1082
1127 lock_kernel();
1128 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1083 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1129 if (ofi) { 1084 if (ofi) {
1130 if (ofibh.sbh != ofibh.ebh) 1085 if (ofibh.sbh != ofibh.ebh)
@@ -1248,7 +1203,6 @@ end_rename:
1248 brelse(nfibh.ebh); 1203 brelse(nfibh.ebh);
1249 brelse(nfibh.sbh); 1204 brelse(nfibh.sbh);
1250 } 1205 }
1251 unlock_kernel();
1252 1206
1253 return retval; 1207 return retval;
1254} 1208}
@@ -1261,7 +1215,6 @@ static struct dentry *udf_get_parent(struct dentry *child)
1261 struct fileIdentDesc cfi; 1215 struct fileIdentDesc cfi;
1262 struct udf_fileident_bh fibh; 1216 struct udf_fileident_bh fibh;
1263 1217
1264 lock_kernel();
1265 if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) 1218 if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
1266 goto out_unlock; 1219 goto out_unlock;
1267 1220
@@ -1273,11 +1226,9 @@ static struct dentry *udf_get_parent(struct dentry *child)
1273 inode = udf_iget(child->d_inode->i_sb, &tloc); 1226 inode = udf_iget(child->d_inode->i_sb, &tloc);
1274 if (!inode) 1227 if (!inode)
1275 goto out_unlock; 1228 goto out_unlock;
1276 unlock_kernel();
1277 1229
1278 return d_obtain_alias(inode); 1230 return d_obtain_alias(inode);
1279out_unlock: 1231out_unlock:
1280 unlock_kernel();
1281 return ERR_PTR(-EACCES); 1232 return ERR_PTR(-EACCES);
1282} 1233}
1283 1234
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 745eb209be0c..a71090ea0e07 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -25,6 +25,7 @@
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/mutex.h>
28 29
29uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, 30uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
30 uint16_t partition, uint32_t offset) 31 uint16_t partition, uint32_t offset)
@@ -159,7 +160,9 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
159 struct udf_sb_info *sbi = UDF_SB(sb); 160 struct udf_sb_info *sbi = UDF_SB(sb);
160 u16 reallocationTableLen; 161 u16 reallocationTableLen;
161 struct buffer_head *bh; 162 struct buffer_head *bh;
163 int ret = 0;
162 164
165 mutex_lock(&sbi->s_alloc_mutex);
163 for (i = 0; i < sbi->s_partitions; i++) { 166 for (i = 0; i < sbi->s_partitions; i++) {
164 struct udf_part_map *map = &sbi->s_partmaps[i]; 167 struct udf_part_map *map = &sbi->s_partmaps[i];
165 if (old_block > map->s_partition_root && 168 if (old_block > map->s_partition_root &&
@@ -175,8 +178,10 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
175 break; 178 break;
176 } 179 }
177 180
178 if (!st) 181 if (!st) {
179 return 1; 182 ret = 1;
183 goto out;
184 }
180 185
181 reallocationTableLen = 186 reallocationTableLen =
182 le16_to_cpu(st->reallocationTableLen); 187 le16_to_cpu(st->reallocationTableLen);
@@ -207,14 +212,16 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
207 ((old_block - 212 ((old_block -
208 map->s_partition_root) & 213 map->s_partition_root) &
209 (sdata->s_packet_len - 1)); 214 (sdata->s_packet_len - 1));
210 return 0; 215 ret = 0;
216 goto out;
211 } else if (origLoc == packet) { 217 } else if (origLoc == packet) {
212 *new_block = le32_to_cpu( 218 *new_block = le32_to_cpu(
213 entry->mappedLocation) + 219 entry->mappedLocation) +
214 ((old_block - 220 ((old_block -
215 map->s_partition_root) & 221 map->s_partition_root) &
216 (sdata->s_packet_len - 1)); 222 (sdata->s_packet_len - 1));
217 return 0; 223 ret = 0;
224 goto out;
218 } else if (origLoc > packet) 225 } else if (origLoc > packet)
219 break; 226 break;
220 } 227 }
@@ -251,20 +258,24 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
251 st->mapEntry[k].mappedLocation) + 258 st->mapEntry[k].mappedLocation) +
252 ((old_block - map->s_partition_root) & 259 ((old_block - map->s_partition_root) &
253 (sdata->s_packet_len - 1)); 260 (sdata->s_packet_len - 1));
254 return 0; 261 ret = 0;
262 goto out;
255 } 263 }
256 264
257 return 1; 265 ret = 1;
266 goto out;
258 } /* if old_block */ 267 } /* if old_block */
259 } 268 }
260 269
261 if (i == sbi->s_partitions) { 270 if (i == sbi->s_partitions) {
262 /* outside of partitions */ 271 /* outside of partitions */
263 /* for now, fail =) */ 272 /* for now, fail =) */
264 return 1; 273 ret = 1;
265 } 274 }
266 275
267 return 0; 276out:
277 mutex_unlock(&sbi->s_alloc_mutex);
278 return ret;
268} 279}
269 280
270static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block, 281static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
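
udf_relocate_blocks() gains s_alloc_mutex, so its many early returns are rewritten as ret/goto and the lock is released on every path. The generic single-exit shape, as a hedged sketch with hypothetical lookups:

	#include <linux/mutex.h>

	int find_table(void);		/* hypothetical */
	int do_relocation(void);

	static int relocate_sketch(struct mutex *lock)
	{
		int ret = 0;

		mutex_lock(lock);
		if (!find_table()) {
			ret = 1;		/* was: return 1 */
			goto out;
		}
		ret = do_relocation();		/* was: return 0 / return 1 */
	out:
		mutex_unlock(lock);
		return ret;
	}
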
diff --git a/fs/udf/super.c b/fs/udf/super.c
index b539d53320fb..7b27b063ff6d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -48,7 +48,6 @@
48#include <linux/stat.h> 48#include <linux/stat.h>
49#include <linux/cdrom.h> 49#include <linux/cdrom.h>
50#include <linux/nls.h> 50#include <linux/nls.h>
51#include <linux/smp_lock.h>
52#include <linux/buffer_head.h> 51#include <linux/buffer_head.h>
53#include <linux/vfs.h> 52#include <linux/vfs.h>
54#include <linux/vmalloc.h> 53#include <linux/vmalloc.h>
@@ -135,6 +134,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
135 ei->i_next_alloc_block = 0; 134 ei->i_next_alloc_block = 0;
136 ei->i_next_alloc_goal = 0; 135 ei->i_next_alloc_goal = 0;
137 ei->i_strat4096 = 0; 136 ei->i_strat4096 = 0;
137 init_rwsem(&ei->i_data_sem);
138 138
139 return &ei->vfs_inode; 139 return &ei->vfs_inode;
140} 140}
@@ -574,13 +574,14 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
574 if (!udf_parse_options(options, &uopt, true)) 574 if (!udf_parse_options(options, &uopt, true))
575 return -EINVAL; 575 return -EINVAL;
576 576
577 lock_kernel(); 577 write_lock(&sbi->s_cred_lock);
578 sbi->s_flags = uopt.flags; 578 sbi->s_flags = uopt.flags;
579 sbi->s_uid = uopt.uid; 579 sbi->s_uid = uopt.uid;
580 sbi->s_gid = uopt.gid; 580 sbi->s_gid = uopt.gid;
581 sbi->s_umask = uopt.umask; 581 sbi->s_umask = uopt.umask;
582 sbi->s_fmode = uopt.fmode; 582 sbi->s_fmode = uopt.fmode;
583 sbi->s_dmode = uopt.dmode; 583 sbi->s_dmode = uopt.dmode;
584 write_unlock(&sbi->s_cred_lock);
584 585
585 if (sbi->s_lvid_bh) { 586 if (sbi->s_lvid_bh) {
586 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); 587 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -597,7 +598,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
597 udf_open_lvid(sb); 598 udf_open_lvid(sb);
598 599
599out_unlock: 600out_unlock:
600 unlock_kernel();
601 return error; 601 return error;
602} 602}
603 603
@@ -966,9 +966,9 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
966 (sizeof(struct buffer_head *) * nr_groups); 966 (sizeof(struct buffer_head *) * nr_groups);
967 967
968 if (size <= PAGE_SIZE) 968 if (size <= PAGE_SIZE)
969 bitmap = kmalloc(size, GFP_KERNEL); 969 bitmap = kzalloc(size, GFP_KERNEL);
970 else 970 else
971 bitmap = vmalloc(size); /* TODO: get rid of vmalloc */ 971 bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
972 972
973 if (bitmap == NULL) { 973 if (bitmap == NULL) {
974 udf_error(sb, __func__, 974 udf_error(sb, __func__,
@@ -977,7 +977,6 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
977 return NULL; 977 return NULL;
978 } 978 }
979 979
980 memset(bitmap, 0x00, size);
981 bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1); 980 bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);
982 bitmap->s_nr_groups = nr_groups; 981 bitmap->s_nr_groups = nr_groups;
983 return bitmap; 982 return bitmap;
@@ -1781,6 +1780,8 @@ static void udf_open_lvid(struct super_block *sb)
1781 1780
1782 if (!bh) 1781 if (!bh)
1783 return; 1782 return;
1783
1784 mutex_lock(&sbi->s_alloc_mutex);
1784 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1785 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1785 lvidiu = udf_sb_lvidiu(sbi); 1786 lvidiu = udf_sb_lvidiu(sbi);
1786 1787
@@ -1797,6 +1798,7 @@ static void udf_open_lvid(struct super_block *sb)
1797 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1798 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1798 mark_buffer_dirty(bh); 1799 mark_buffer_dirty(bh);
1799 sbi->s_lvid_dirty = 0; 1800 sbi->s_lvid_dirty = 0;
1801 mutex_unlock(&sbi->s_alloc_mutex);
1800} 1802}
1801 1803
1802static void udf_close_lvid(struct super_block *sb) 1804static void udf_close_lvid(struct super_block *sb)
@@ -1809,6 +1811,7 @@ static void udf_close_lvid(struct super_block *sb)
1809 if (!bh) 1811 if (!bh)
1810 return; 1812 return;
1811 1813
1814 mutex_lock(&sbi->s_alloc_mutex);
1812 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1815 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1813 lvidiu = udf_sb_lvidiu(sbi); 1816 lvidiu = udf_sb_lvidiu(sbi);
1814 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1817 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
@@ -1829,6 +1832,34 @@ static void udf_close_lvid(struct super_block *sb)
1829 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1832 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1830 mark_buffer_dirty(bh); 1833 mark_buffer_dirty(bh);
1831 sbi->s_lvid_dirty = 0; 1834 sbi->s_lvid_dirty = 0;
1835 mutex_unlock(&sbi->s_alloc_mutex);
1836}
1837
1838u64 lvid_get_unique_id(struct super_block *sb)
1839{
1840 struct buffer_head *bh;
1841 struct udf_sb_info *sbi = UDF_SB(sb);
1842 struct logicalVolIntegrityDesc *lvid;
1843 struct logicalVolHeaderDesc *lvhd;
1844 u64 uniqueID;
1845 u64 ret;
1846
1847 bh = sbi->s_lvid_bh;
1848 if (!bh)
1849 return 0;
1850
1851 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1852 lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse;
1853
1854 mutex_lock(&sbi->s_alloc_mutex);
1855 ret = uniqueID = le64_to_cpu(lvhd->uniqueID);
1856 if (!(++uniqueID & 0xFFFFFFFF))
1857 uniqueID += 16;
1858 lvhd->uniqueID = cpu_to_le64(uniqueID);
1859 mutex_unlock(&sbi->s_alloc_mutex);
1860 mark_buffer_dirty(bh);
1861
1862 return ret;
1832} 1863}
1833 1864
1834static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) 1865static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1886,8 +1917,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1886 struct kernel_lb_addr rootdir, fileset; 1917 struct kernel_lb_addr rootdir, fileset;
1887 struct udf_sb_info *sbi; 1918 struct udf_sb_info *sbi;
1888 1919
1889 lock_kernel();
1890
1891 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); 1920 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
1892 uopt.uid = -1; 1921 uopt.uid = -1;
1893 uopt.gid = -1; 1922 uopt.gid = -1;
@@ -1896,10 +1925,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1896 uopt.dmode = UDF_INVALID_MODE; 1925 uopt.dmode = UDF_INVALID_MODE;
1897 1926
1898 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); 1927 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
1899 if (!sbi) { 1928 if (!sbi)
1900 unlock_kernel();
1901 return -ENOMEM; 1929 return -ENOMEM;
1902 }
1903 1930
1904 sb->s_fs_info = sbi; 1931 sb->s_fs_info = sbi;
1905 1932
@@ -1936,6 +1963,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1936 sbi->s_fmode = uopt.fmode; 1963 sbi->s_fmode = uopt.fmode;
1937 sbi->s_dmode = uopt.dmode; 1964 sbi->s_dmode = uopt.dmode;
1938 sbi->s_nls_map = uopt.nls_map; 1965 sbi->s_nls_map = uopt.nls_map;
1966 rwlock_init(&sbi->s_cred_lock);
1939 1967
1940 if (uopt.session == 0xFFFFFFFF) 1968 if (uopt.session == 0xFFFFFFFF)
1941 sbi->s_session = udf_get_last_session(sb); 1969 sbi->s_session = udf_get_last_session(sb);
@@ -2045,7 +2073,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2045 goto error_out; 2073 goto error_out;
2046 } 2074 }
2047 sb->s_maxbytes = MAX_LFS_FILESIZE; 2075 sb->s_maxbytes = MAX_LFS_FILESIZE;
2048 unlock_kernel();
2049 return 0; 2076 return 0;
2050 2077
2051error_out: 2078error_out:
@@ -2066,7 +2093,6 @@ error_out:
2066 kfree(sbi); 2093 kfree(sbi);
2067 sb->s_fs_info = NULL; 2094 sb->s_fs_info = NULL;
2068 2095
2069 unlock_kernel();
2070 return -EINVAL; 2096 return -EINVAL;
2071} 2097}
2072 2098
@@ -2105,8 +2131,6 @@ static void udf_put_super(struct super_block *sb)
2105 2131
2106 sbi = UDF_SB(sb); 2132 sbi = UDF_SB(sb);
2107 2133
2108 lock_kernel();
2109
2110 if (sbi->s_vat_inode) 2134 if (sbi->s_vat_inode)
2111 iput(sbi->s_vat_inode); 2135 iput(sbi->s_vat_inode);
2112 if (sbi->s_partitions) 2136 if (sbi->s_partitions)
@@ -2122,8 +2146,6 @@ static void udf_put_super(struct super_block *sb)
2122 kfree(sbi->s_partmaps); 2146 kfree(sbi->s_partmaps);
2123 kfree(sb->s_fs_info); 2147 kfree(sb->s_fs_info);
2124 sb->s_fs_info = NULL; 2148 sb->s_fs_info = NULL;
2125
2126 unlock_kernel();
2127} 2149}
2128 2150
2129static int udf_sync_fs(struct super_block *sb, int wait) 2151static int udf_sync_fs(struct super_block *sb, int wait)
@@ -2186,8 +2208,6 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2186 uint16_t ident; 2208 uint16_t ident;
2187 struct spaceBitmapDesc *bm; 2209 struct spaceBitmapDesc *bm;
2188 2210
2189 lock_kernel();
2190
2191 loc.logicalBlockNum = bitmap->s_extPosition; 2211 loc.logicalBlockNum = bitmap->s_extPosition;
2192 loc.partitionReferenceNum = UDF_SB(sb)->s_partition; 2212 loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
2193 bh = udf_read_ptagged(sb, &loc, 0, &ident); 2213 bh = udf_read_ptagged(sb, &loc, 0, &ident);
@@ -2224,10 +2244,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2224 } 2244 }
2225 } 2245 }
2226 brelse(bh); 2246 brelse(bh);
2227
2228out: 2247out:
2229 unlock_kernel();
2230
2231 return accum; 2248 return accum;
2232} 2249}
2233 2250
@@ -2240,8 +2257,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2240 int8_t etype; 2257 int8_t etype;
2241 struct extent_position epos; 2258 struct extent_position epos;
2242 2259
2243 lock_kernel(); 2260 mutex_lock(&UDF_SB(sb)->s_alloc_mutex);
2244
2245 epos.block = UDF_I(table)->i_location; 2261 epos.block = UDF_I(table)->i_location;
2246 epos.offset = sizeof(struct unallocSpaceEntry); 2262 epos.offset = sizeof(struct unallocSpaceEntry);
2247 epos.bh = NULL; 2263 epos.bh = NULL;
@@ -2250,8 +2266,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2250 accum += (elen >> table->i_sb->s_blocksize_bits); 2266 accum += (elen >> table->i_sb->s_blocksize_bits);
2251 2267
2252 brelse(epos.bh); 2268 brelse(epos.bh);
2253 2269 mutex_unlock(&UDF_SB(sb)->s_alloc_mutex);
2254 unlock_kernel();
2255 2270
2256 return accum; 2271 return accum;
2257} 2272}
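
The new s_cred_lock that threads through these super.c hunks is a plain rwlock around the mount-option credentials: remount is the rare writer, every udf_fill_inode() a reader, so uid/gid/umask are seen as a consistent set without the BKL. In outline (fields hypothetical):

	#include <linux/spinlock.h>

	struct cred_opts { int uid, gid, umask; };
	static DEFINE_RWLOCK(cred_lock);

	static void remount_creds(struct cred_opts *c, int uid, int gid,
				  int umask)
	{
		write_lock(&cred_lock);		/* writer excludes everyone */
		c->uid = uid;
		c->gid = gid;
		c->umask = umask;
		write_unlock(&cred_lock);
	}

	static int effective_mode(const struct cred_opts *c, int mode)
	{
		int m;

		read_lock(&cred_lock);		/* readers run concurrently */
		m = mode & ~c->umask;
		read_unlock(&cred_lock);
		return m;
	}
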
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 16064787d2b7..b1d4488b0f14 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -27,7 +27,6 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/pagemap.h> 29#include <linux/pagemap.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include "udf_i.h" 31#include "udf_i.h"
33 32
@@ -78,13 +77,16 @@ static int udf_symlink_filler(struct file *file, struct page *page)
78 int err = -EIO; 77 int err = -EIO;
79 unsigned char *p = kmap(page); 78 unsigned char *p = kmap(page);
80 struct udf_inode_info *iinfo; 79 struct udf_inode_info *iinfo;
80 uint32_t pos;
81 81
82 lock_kernel();
83 iinfo = UDF_I(inode); 82 iinfo = UDF_I(inode);
83 pos = udf_block_map(inode, 0);
84
85 down_read(&iinfo->i_data_sem);
84 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 86 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
85 symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr; 87 symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
86 } else { 88 } else {
87 bh = sb_bread(inode->i_sb, udf_block_map(inode, 0)); 89 bh = sb_bread(inode->i_sb, pos);
88 90
89 if (!bh) 91 if (!bh)
90 goto out; 92 goto out;
@@ -95,14 +97,14 @@ static int udf_symlink_filler(struct file *file, struct page *page)
95 udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p); 97 udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p);
96 brelse(bh); 98 brelse(bh);
97 99
98 unlock_kernel(); 100 up_read(&iinfo->i_data_sem);
99 SetPageUptodate(page); 101 SetPageUptodate(page);
100 kunmap(page); 102 kunmap(page);
101 unlock_page(page); 103 unlock_page(page);
102 return 0; 104 return 0;
103 105
104out: 106out:
105 unlock_kernel(); 107 up_read(&iinfo->i_data_sem);
106 SetPageError(page); 108 SetPageError(page);
107 kunmap(page); 109 kunmap(page);
108 unlock_page(page); 110 unlock_page(page);
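
The reordering in udf_symlink_filler() matters: udf_block_map() now takes i_data_sem for reading itself (see the inode.c hunk above), so it has to run before the filler's own down_read(). Taking the same rwsem for reading twice can deadlock once a writer queues in between; illustrated with a deliberately broken sketch:

	#include <linux/rwsem.h>

	static void recursive_read_deadlock(struct rw_semaphore *sem)
	{
		down_read(sem);		/* first shared acquisition succeeds */
		/*
		 * If another task calls down_write(sem) here, it queues and
		 * blocks readers arriving after it; our second down_read()
		 * then waits on the writer, which waits on us: deadlock.
		 */
		down_read(sem);		/* do not do this */
		up_read(sem);
		up_read(sem);
	}
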
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index e58d1de41073..d1bd31ea724e 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -1,6 +1,18 @@
1#ifndef _UDF_I_H 1#ifndef _UDF_I_H
2#define _UDF_I_H 2#define _UDF_I_H
3 3
4/*
5 * The i_data_sem and i_mutex protect the allocation information of
6 * regular files and symlinks. This includes all extents belonging to
7 * the file/symlink, whether the data lives in-inode or in external
8 * data blocks, preallocation, and goal block information... When
9 * extents are read, i_mutex or i_data_sem must be held (holding
10 * i_data_sem for reading is enough). When extents are changed,
11 * i_data_sem must be held for writing and i_mutex must be held as well.
12 *
13 * For directories, i_mutex is used for all the necessary protection.
14 */
15
4struct udf_inode_info { 16struct udf_inode_info {
5 struct timespec i_crtime; 17 struct timespec i_crtime;
6 /* Physical address of inode */ 18 /* Physical address of inode */
@@ -21,6 +33,7 @@ struct udf_inode_info {
21 struct long_ad *i_lad; 33 struct long_ad *i_lad;
22 __u8 *i_data; 34 __u8 *i_data;
23 } i_ext; 35 } i_ext;
36 struct rw_semaphore i_data_sem;
24 struct inode vfs_inode; 37 struct inode vfs_inode;
25}; 38};
26 39
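
Applied literally, the comment above gives writers a two-lock discipline: i_mutex serializes writers against each other, while i_data_sem held for writing excludes the lock-light readers. A sketch of an extent-changing path under that rule (the modification itself is elided):

	#include <linux/fs.h>
	#include <linux/mutex.h>
	#include <linux/rwsem.h>

	static void change_extents(struct inode *inode,
				   struct rw_semaphore *data_sem)
	{
		mutex_lock(&inode->i_mutex);	/* writer vs. writer */
		down_write(data_sem);		/* writer vs. reader */
		/* ... modify the extent list here ... */
		up_write(data_sem);
		mutex_unlock(&inode->i_mutex);
	}
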
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index d113b72c2768..4858c191242b 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -2,6 +2,7 @@
2#define __LINUX_UDF_SB_H 2#define __LINUX_UDF_SB_H
3 3
4#include <linux/mutex.h> 4#include <linux/mutex.h>
5#include <linux/bitops.h>
5 6
6/* Since UDF 2.01 is ISO 13346 based... */ 7/* Since UDF 2.01 is ISO 13346 based... */
7#define UDF_SUPER_MAGIC 0x15013346 8#define UDF_SUPER_MAGIC 0x15013346
@@ -128,6 +129,8 @@ struct udf_sb_info {
128 uid_t s_uid; 129 uid_t s_uid;
129 mode_t s_fmode; 130 mode_t s_fmode;
130 mode_t s_dmode; 131 mode_t s_dmode;
132 /* Lock protecting consistency of above permission settings */
133 rwlock_t s_cred_lock;
131 134
132 /* Root Info */ 135 /* Root Info */
133 struct timespec s_record_time; 136 struct timespec s_record_time;
@@ -139,7 +142,7 @@ struct udf_sb_info {
139 __u16 s_udfrev; 142 __u16 s_udfrev;
140 143
141 /* Miscellaneous flags */ 144 /* Miscellaneous flags */
142 __u32 s_flags; 145 unsigned long s_flags;
143 146
144 /* Encoding info */ 147 /* Encoding info */
145 struct nls_table *s_nls_map; 148 struct nls_table *s_nls_map;
@@ -161,8 +164,19 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi);
161 164
162int udf_compute_nr_groups(struct super_block *sb, u32 partition); 165int udf_compute_nr_groups(struct super_block *sb, u32 partition);
163 166
164#define UDF_QUERY_FLAG(X,Y) ( UDF_SB(X)->s_flags & ( 1 << (Y) ) ) 167static inline int UDF_QUERY_FLAG(struct super_block *sb, int flag)
165#define UDF_SET_FLAG(X,Y) ( UDF_SB(X)->s_flags |= ( 1 << (Y) ) ) 168{
166#define UDF_CLEAR_FLAG(X,Y) ( UDF_SB(X)->s_flags &= ~( 1 << (Y) ) ) 169 return test_bit(flag, &UDF_SB(sb)->s_flags);
170}
171
172static inline void UDF_SET_FLAG(struct super_block *sb, int flag)
173{
174 set_bit(flag, &UDF_SB(sb)->s_flags);
175}
176
177static inline void UDF_CLEAR_FLAG(struct super_block *sb, int flag)
178{
179 clear_bit(flag, &UDF_SB(sb)->s_flags);
180}
167 181
168#endif /* __LINUX_UDF_SB_H */ 182#endif /* __LINUX_UDF_SB_H */
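
Converting the UDF flag macros to test_bit()/set_bit()/clear_bit() is what forces s_flags from __u32 to unsigned long: the atomic bitops operate on unsigned long words. It also makes flag updates safe without the BKL, since set_bit() is an atomic read-modify-write. A toy equivalent:

	#include <linux/bitops.h>

	static unsigned long flags;	/* bitops require unsigned long */

	static void toggle_demo(int flag)
	{
		set_bit(flag, &flags);		/* atomic; no lock needed */
		if (test_bit(flag, &flags))
			clear_bit(flag, &flags);
	}
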
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 6995ab1f4305..eba48209f9f3 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -111,6 +111,8 @@ struct extent_position {
111}; 111};
112 112
113/* super.c */ 113/* super.c */
114
115__attribute__((format(printf, 3, 4)))
114extern void udf_warning(struct super_block *, const char *, const char *, ...); 116extern void udf_warning(struct super_block *, const char *, const char *, ...);
115static inline void udf_updated_lvid(struct super_block *sb) 117static inline void udf_updated_lvid(struct super_block *sb)
116{ 118{
@@ -123,6 +125,7 @@ static inline void udf_updated_lvid(struct super_block *sb)
123 sb->s_dirt = 1; 125 sb->s_dirt = 1;
124 UDF_SB(sb)->s_lvid_dirty = 1; 126 UDF_SB(sb)->s_lvid_dirty = 1;
125} 127}
128extern u64 lvid_get_unique_id(struct super_block *sb);
126 129
127/* namei.c */ 130/* namei.c */
128extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, 131extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -133,7 +136,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
133extern long udf_ioctl(struct file *, unsigned int, unsigned long); 136extern long udf_ioctl(struct file *, unsigned int, unsigned long);
134/* inode.c */ 137/* inode.c */
135extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
136extern int udf_sync_inode(struct inode *);
137extern void udf_expand_file_adinicb(struct inode *, int, int *); 139extern void udf_expand_file_adinicb(struct inode *, int, int *);
138extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
139extern struct buffer_head *udf_bread(struct inode *, int, int, int *); 141extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
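
The format attribute added before udf_warning() lets the compiler type-check every call site's varargs against the format string, which is cheap insurance once warnings are printed from many newly unlocked paths. A deliberately bad call it would now catch:

	struct super_block;

	__attribute__((format(printf, 3, 4)))
	void udf_warning(struct super_block *sb, const char *function,
			 const char *fmt, ...);

	static void demo(struct super_block *sb)
	{
		/* -Wformat: '%d' expects int, but the argument is long */
		udf_warning(sb, __func__, "bad block %d\n", 123L);
	}
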
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0dce969d6cad..faca44997099 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -98,6 +98,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
98 kmem.o \ 98 kmem.o \
99 xfs_aops.o \ 99 xfs_aops.o \
100 xfs_buf.o \ 100 xfs_buf.o \
101 xfs_discard.o \
101 xfs_export.o \ 102 xfs_export.o \
102 xfs_file.o \ 103 xfs_file.o \
103 xfs_fs_subr.o \ 104 xfs_fs_subr.o \
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c370819..000000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_SV_H__
19#define __XFS_SUPPORT_SV_H__
20
21#include <linux/wait.h>
22#include <linux/sched.h>
23#include <linux/spinlock.h>
24
25/*
26 * Synchronisation variables.
27 *
28 * (Parameters "pri", "svf" and "rts" are not implemented)
29 */
30
31typedef struct sv_s {
32 wait_queue_head_t waiters;
33} sv_t;
34
35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36{
37 DECLARE_WAITQUEUE(wait, current);
38
39 add_wait_queue_exclusive(&sv->waiters, &wait);
40 __set_current_state(TASK_UNINTERRUPTIBLE);
41 spin_unlock(lock);
42
43 schedule();
44
45 remove_wait_queue(&sv->waiters, &wait);
46}
47
48#define sv_init(sv,flag,name) \
49 init_waitqueue_head(&(sv)->waiters)
50#define sv_destroy(sv) \
51 /*NOTHING*/
52#define sv_wait(sv, pri, lock, s) \
53 _sv_wait(sv, lock)
54#define sv_signal(sv) \
55 wake_up(&(sv)->waiters)
56#define sv_broadcast(sv) \
57 wake_up_all(&(sv)->waiters)
58
59#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 691f61223ed6..ec7bbb5645b6 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
38#include <linux/pagevec.h> 38#include <linux/pagevec.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40 40
41/*
42 * Types of I/O for bmap clustering and I/O completion tracking.
43 */
44enum {
45 IO_READ, /* mapping for a read */
46 IO_DELAY, /* mapping covers delalloc region */
47 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
48 IO_NEW /* just allocated */
49};
50 41
51/* 42/*
52 * Prime number of hash buckets since address is used as the key. 43 * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
182 xfs_inode_t *ip = XFS_I(ioend->io_inode); 173 xfs_inode_t *ip = XFS_I(ioend->io_inode);
183 xfs_fsize_t isize; 174 xfs_fsize_t isize;
184 175
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IO_READ);
187
188 if (unlikely(ioend->io_error)) 176 if (unlikely(ioend->io_error))
189 return 0; 177 return 0;
190 178
@@ -244,10 +232,8 @@ xfs_end_io(
244 * We might have to update the on-disk file size after extending 232 * We might have to update the on-disk file size after extending
245 * writes. 233 * writes.
246 */ 234 */
247 if (ioend->io_type != IO_READ) { 235 error = xfs_setfilesize(ioend);
248 error = xfs_setfilesize(ioend); 236 ASSERT(!error || error == EAGAIN);
249 ASSERT(!error || error == EAGAIN);
250 }
251 237
252 /* 238 /*
253 * If we didn't complete processing of the ioend, requeue it to the 239 * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
318xfs_map_blocks( 304xfs_map_blocks(
319 struct inode *inode, 305 struct inode *inode,
320 loff_t offset, 306 loff_t offset,
321 ssize_t count,
322 struct xfs_bmbt_irec *imap, 307 struct xfs_bmbt_irec *imap,
323 int flags) 308 int type,
309 int nonblocking)
324{ 310{
325 int nmaps = 1; 311 struct xfs_inode *ip = XFS_I(inode);
326 int new = 0; 312 struct xfs_mount *mp = ip->i_mount;
313 ssize_t count = 1 << inode->i_blkbits;
314 xfs_fileoff_t offset_fsb, end_fsb;
315 int error = 0;
316 int bmapi_flags = XFS_BMAPI_ENTIRE;
317 int nimaps = 1;
318
319 if (XFS_FORCED_SHUTDOWN(mp))
320 return -XFS_ERROR(EIO);
321
322 if (type == IO_UNWRITTEN)
323 bmapi_flags |= XFS_BMAPI_IGSTATE;
324
325 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
326 if (nonblocking)
327 return -XFS_ERROR(EAGAIN);
328 xfs_ilock(ip, XFS_ILOCK_SHARED);
329 }
327 330
328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); 331 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
332 (ip->i_df.if_flags & XFS_IFEXTENTS));
333 ASSERT(offset <= mp->m_maxioffset);
334
335 if (offset + count > mp->m_maxioffset)
336 count = mp->m_maxioffset - offset;
337 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
338 offset_fsb = XFS_B_TO_FSBT(mp, offset);
339 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
340 bmapi_flags, NULL, 0, imap, &nimaps, NULL);
341 xfs_iunlock(ip, XFS_ILOCK_SHARED);
342
343 if (error)
344 return -XFS_ERROR(error);
345
346 if (type == IO_DELALLOC &&
347 (!nimaps || isnullstartblock(imap->br_startblock))) {
348 error = xfs_iomap_write_allocate(ip, offset, count, imap);
349 if (!error)
350 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
351 return -XFS_ERROR(error);
352 }
353
354#ifdef DEBUG
355 if (type == IO_UNWRITTEN) {
356 ASSERT(nimaps);
357 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
358 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
359 }
360#endif
361 if (nimaps)
362 trace_xfs_map_blocks_found(ip, offset, count, type, imap);
363 return 0;
329} 364}
330 365
331STATIC int 366STATIC int
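
The rewritten xfs_map_blocks() above tries xfs_ilock_nowait() first and only
falls back to a blocking xfs_ilock() when the caller permits it; otherwise it
returns EAGAIN so non-blocking writeback can defer the page rather than stall.
A userspace analogue of that trylock-then-block pattern, with a pthread mutex
standing in for the inode lock:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;

static int map_blocks(int nonblocking)
{
        if (pthread_mutex_trylock(&ilock) != 0) {
                if (nonblocking)
                        return -EAGAIN;         /* caller retries later */
                pthread_mutex_lock(&ilock);     /* blocking fallback */
        }

        /* ... look up the block mapping while holding the lock ... */

        pthread_mutex_unlock(&ilock);
        return 0;
}

int main(void)
{
        printf("blocking lookup: %d\n", map_blocks(0));
        printf("nonblocking lookup: %d\n", map_blocks(1));
        return 0;
}
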
@@ -380,26 +415,18 @@ xfs_submit_ioend_bio(
380 415
381 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
382 WRITE_SYNC_PLUG : WRITE, bio); 417 WRITE_SYNC_PLUG : WRITE, bio);
383 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
384 bio_put(bio);
385} 418}
386 419
387STATIC struct bio * 420STATIC struct bio *
388xfs_alloc_ioend_bio( 421xfs_alloc_ioend_bio(
389 struct buffer_head *bh) 422 struct buffer_head *bh)
390{ 423{
391 struct bio *bio;
392 int nvecs = bio_get_nr_vecs(bh->b_bdev); 424 int nvecs = bio_get_nr_vecs(bh->b_bdev);
393 425 struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
394 do {
395 bio = bio_alloc(GFP_NOIO, nvecs);
396 nvecs >>= 1;
397 } while (!bio);
398 426
399 ASSERT(bio->bi_private == NULL); 427 ASSERT(bio->bi_private == NULL);
400 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 428 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
401 bio->bi_bdev = bh->b_bdev; 429 bio->bi_bdev = bh->b_bdev;
402 bio_get(bio);
403 return bio; 430 return bio;
404} 431}
405 432
@@ -470,9 +497,8 @@ xfs_submit_ioend(
470 /* Pass 1 - start writeback */ 497 /* Pass 1 - start writeback */
471 do { 498 do {
472 next = ioend->io_list; 499 next = ioend->io_list;
473 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 500 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
474 xfs_start_buffer_writeback(bh); 501 xfs_start_buffer_writeback(bh);
475 }
476 } while ((ioend = next) != NULL); 502 } while ((ioend = next) != NULL);
477 503
478 /* Pass 2 - submit I/O */ 504 /* Pass 2 - submit I/O */
@@ -600,117 +626,13 @@ xfs_map_at_offset(
600 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 626 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 627 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
602 628
603 lock_buffer(bh);
604 xfs_map_buffer(inode, bh, imap, offset); 629 xfs_map_buffer(inode, bh, imap, offset);
605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
606 set_buffer_mapped(bh); 630 set_buffer_mapped(bh);
607 clear_buffer_delay(bh); 631 clear_buffer_delay(bh);
608 clear_buffer_unwritten(bh); 632 clear_buffer_unwritten(bh);
609} 633}
610 634
611/* 635/*
612 * Look for a page at index that is suitable for clustering.
613 */
614STATIC unsigned int
615xfs_probe_page(
616 struct page *page,
617 unsigned int pg_offset)
618{
619 struct buffer_head *bh, *head;
620 int ret = 0;
621
622 if (PageWriteback(page))
623 return 0;
624 if (!PageDirty(page))
625 return 0;
626 if (!page->mapping)
627 return 0;
628 if (!page_has_buffers(page))
629 return 0;
630
631 bh = head = page_buffers(page);
632 do {
633 if (!buffer_uptodate(bh))
634 break;
635 if (!buffer_mapped(bh))
636 break;
637 ret += bh->b_size;
638 if (ret >= pg_offset)
639 break;
640 } while ((bh = bh->b_this_page) != head);
641
642 return ret;
643}
644
645STATIC size_t
646xfs_probe_cluster(
647 struct inode *inode,
648 struct page *startpage,
649 struct buffer_head *bh,
650 struct buffer_head *head)
651{
652 struct pagevec pvec;
653 pgoff_t tindex, tlast, tloff;
654 size_t total = 0;
655 int done = 0, i;
656
657 /* First sum forwards in this page */
658 do {
659 if (!buffer_uptodate(bh) || !buffer_mapped(bh))
660 return total;
661 total += bh->b_size;
662 } while ((bh = bh->b_this_page) != head);
663
664 /* if we reached the end of the page, sum forwards in following pages */
665 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
666 tindex = startpage->index + 1;
667
668 /* Prune this back to avoid pathological behavior */
669 tloff = min(tlast, startpage->index + 64);
670
671 pagevec_init(&pvec, 0);
672 while (!done && tindex <= tloff) {
673 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
674
675 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
676 break;
677
678 for (i = 0; i < pagevec_count(&pvec); i++) {
679 struct page *page = pvec.pages[i];
680 size_t pg_offset, pg_len = 0;
681
682 if (tindex == tlast) {
683 pg_offset =
684 i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
685 if (!pg_offset) {
686 done = 1;
687 break;
688 }
689 } else
690 pg_offset = PAGE_CACHE_SIZE;
691
692 if (page->index == tindex && trylock_page(page)) {
693 pg_len = xfs_probe_page(page, pg_offset);
694 unlock_page(page);
695 }
696
697 if (!pg_len) {
698 done = 1;
699 break;
700 }
701
702 total += pg_len;
703 tindex++;
704 }
705
706 pagevec_release(&pvec);
707 cond_resched();
708 }
709
710 return total;
711}
712
713/*
714 * Test if a given page is suitable for writing as part of an unwritten 636 * Test if a given page is suitable for writing as part of an unwritten
715 * or delayed allocate extent. 637 * or delayed allocate extent.
716 */ 638 */
@@ -731,9 +653,9 @@ xfs_is_delayed_page(
731 if (buffer_unwritten(bh)) 653 if (buffer_unwritten(bh))
732 acceptable = (type == IO_UNWRITTEN); 654 acceptable = (type == IO_UNWRITTEN);
733 else if (buffer_delay(bh)) 655 else if (buffer_delay(bh))
734 acceptable = (type == IO_DELAY); 656 acceptable = (type == IO_DELALLOC);
735 else if (buffer_dirty(bh) && buffer_mapped(bh)) 657 else if (buffer_dirty(bh) && buffer_mapped(bh))
736 acceptable = (type == IO_NEW); 658 acceptable = (type == IO_OVERWRITE);
737 else 659 else
738 break; 660 break;
739 } while ((bh = bh->b_this_page) != head); 661 } while ((bh = bh->b_this_page) != head);
@@ -758,8 +680,7 @@ xfs_convert_page(
758 loff_t tindex, 680 loff_t tindex,
759 struct xfs_bmbt_irec *imap, 681 struct xfs_bmbt_irec *imap,
760 xfs_ioend_t **ioendp, 682 xfs_ioend_t **ioendp,
761 struct writeback_control *wbc, 683 struct writeback_control *wbc)
762 int all_bh)
763{ 684{
764 struct buffer_head *bh, *head; 685 struct buffer_head *bh, *head;
765 xfs_off_t end_offset; 686 xfs_off_t end_offset;
@@ -814,37 +735,30 @@ xfs_convert_page(
814 continue; 735 continue;
815 } 736 }
816 737
817 if (buffer_unwritten(bh) || buffer_delay(bh)) { 738 if (buffer_unwritten(bh) || buffer_delay(bh) ||
739 buffer_mapped(bh)) {
818 if (buffer_unwritten(bh)) 740 if (buffer_unwritten(bh))
819 type = IO_UNWRITTEN; 741 type = IO_UNWRITTEN;
742 else if (buffer_delay(bh))
743 type = IO_DELALLOC;
820 else 744 else
821 type = IO_DELAY; 745 type = IO_OVERWRITE;
822 746
823 if (!xfs_imap_valid(inode, imap, offset)) { 747 if (!xfs_imap_valid(inode, imap, offset)) {
824 done = 1; 748 done = 1;
825 continue; 749 continue;
826 } 750 }
827 751
828 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 752 lock_buffer(bh);
829 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 753 if (type != IO_OVERWRITE)
830 754 xfs_map_at_offset(inode, bh, imap, offset);
831 xfs_map_at_offset(inode, bh, imap, offset);
832 xfs_add_to_ioend(inode, bh, offset, type, 755 xfs_add_to_ioend(inode, bh, offset, type,
833 ioendp, done); 756 ioendp, done);
834 757
835 page_dirty--; 758 page_dirty--;
836 count++; 759 count++;
837 } else { 760 } else {
838 type = IO_NEW; 761 done = 1;
839 if (buffer_mapped(bh) && all_bh) {
840 lock_buffer(bh);
841 xfs_add_to_ioend(inode, bh, offset,
842 type, ioendp, done);
843 count++;
844 page_dirty--;
845 } else {
846 done = 1;
847 }
848 } 762 }
849 } while (offset += len, (bh = bh->b_this_page) != head); 763 } while (offset += len, (bh = bh->b_this_page) != head);
850 764
@@ -876,7 +790,6 @@ xfs_cluster_write(
876 struct xfs_bmbt_irec *imap, 790 struct xfs_bmbt_irec *imap,
877 xfs_ioend_t **ioendp, 791 xfs_ioend_t **ioendp,
878 struct writeback_control *wbc, 792 struct writeback_control *wbc,
879 int all_bh,
880 pgoff_t tlast) 793 pgoff_t tlast)
881{ 794{
882 struct pagevec pvec; 795 struct pagevec pvec;
@@ -891,7 +804,7 @@ xfs_cluster_write(
891 804
892 for (i = 0; i < pagevec_count(&pvec); i++) { 805 for (i = 0; i < pagevec_count(&pvec); i++) {
893 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 806 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
894 imap, ioendp, wbc, all_bh); 807 imap, ioendp, wbc);
895 if (done) 808 if (done)
896 break; 809 break;
897 } 810 }
@@ -935,7 +848,7 @@ xfs_aops_discard_page(
935 struct buffer_head *bh, *head; 848 struct buffer_head *bh, *head;
936 loff_t offset = page_offset(page); 849 loff_t offset = page_offset(page);
937 850
938 if (!xfs_is_delayed_page(page, IO_DELAY)) 851 if (!xfs_is_delayed_page(page, IO_DELALLOC))
939 goto out_invalidate; 852 goto out_invalidate;
940 853
941 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 854 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1002,10 +915,10 @@ xfs_vm_writepage(
1002 unsigned int type; 915 unsigned int type;
1003 __uint64_t end_offset; 916 __uint64_t end_offset;
1004 pgoff_t end_index, last_index; 917 pgoff_t end_index, last_index;
1005 ssize_t size, len; 918 ssize_t len;
1006 int flags, err, imap_valid = 0, uptodate = 1; 919 int err, imap_valid = 0, uptodate = 1;
1007 int count = 0; 920 int count = 0;
1008 int all_bh = 0; 921 int nonblocking = 0;
1009 922
1010 trace_xfs_writepage(inode, page, 0); 923 trace_xfs_writepage(inode, page, 0);
1011 924
@@ -1056,10 +969,14 @@ xfs_vm_writepage(
1056 969
1057 bh = head = page_buffers(page); 970 bh = head = page_buffers(page);
1058 offset = page_offset(page); 971 offset = page_offset(page);
1059 flags = BMAPI_READ; 972 type = IO_OVERWRITE;
1060 type = IO_NEW; 973
974 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
975 nonblocking = 1;
1061 976
1062 do { 977 do {
978 int new_ioend = 0;
979
1063 if (offset >= end_offset) 980 if (offset >= end_offset)
1064 break; 981 break;
1065 if (!buffer_uptodate(bh)) 982 if (!buffer_uptodate(bh))
@@ -1076,90 +993,54 @@ xfs_vm_writepage(
1076 continue; 993 continue;
1077 } 994 }
1078 995
1079 if (imap_valid) 996 if (buffer_unwritten(bh)) {
1080 imap_valid = xfs_imap_valid(inode, &imap, offset); 997 if (type != IO_UNWRITTEN) {
1081
1082 if (buffer_unwritten(bh) || buffer_delay(bh)) {
1083 int new_ioend = 0;
1084
1085 /*
1086 * Make sure we don't use a read-only iomap
1087 */
1088 if (flags == BMAPI_READ)
1089 imap_valid = 0;
1090
1091 if (buffer_unwritten(bh)) {
1092 type = IO_UNWRITTEN; 998 type = IO_UNWRITTEN;
1093 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 999 imap_valid = 0;
1094 } else if (buffer_delay(bh)) {
1095 type = IO_DELAY;
1096 flags = BMAPI_ALLOCATE;
1097
1098 if (wbc->sync_mode == WB_SYNC_NONE)
1099 flags |= BMAPI_TRYLOCK;
1100 }
1101
1102 if (!imap_valid) {
1103 /*
1104 * If we didn't have a valid mapping then we
1105 * need to ensure that we put the new mapping
1106 * in a new ioend structure. This needs to be
1107 * done to ensure that the ioends correctly
1108 * reflect the block mappings at io completion
1109 * for unwritten extent conversion.
1110 */
1111 new_ioend = 1;
1112 err = xfs_map_blocks(inode, offset, len,
1113 &imap, flags);
1114 if (err)
1115 goto error;
1116 imap_valid = xfs_imap_valid(inode, &imap,
1117 offset);
1118 } 1000 }
1119 if (imap_valid) { 1001 } else if (buffer_delay(bh)) {
1120 xfs_map_at_offset(inode, bh, &imap, offset); 1002 if (type != IO_DELALLOC) {
1121 xfs_add_to_ioend(inode, bh, offset, type, 1003 type = IO_DELALLOC;
1122 &ioend, new_ioend); 1004 imap_valid = 0;
1123 count++;
1124 } 1005 }
1125 } else if (buffer_uptodate(bh)) { 1006 } else if (buffer_uptodate(bh)) {
1126 /* 1007 if (type != IO_OVERWRITE) {
1127 * we got here because the buffer is already mapped. 1008 type = IO_OVERWRITE;
1128 * That means it must already have extents allocated 1009 imap_valid = 0;
1129 * underneath it. Map the extent by reading it.
1130 */
1131 if (!imap_valid || flags != BMAPI_READ) {
1132 flags = BMAPI_READ;
1133 size = xfs_probe_cluster(inode, page, bh, head);
1134 err = xfs_map_blocks(inode, offset, size,
1135 &imap, flags);
1136 if (err)
1137 goto error;
1138 imap_valid = xfs_imap_valid(inode, &imap,
1139 offset);
1140 } 1010 }
1011 } else {
1012 if (PageUptodate(page)) {
1013 ASSERT(buffer_mapped(bh));
1014 imap_valid = 0;
1015 }
1016 continue;
1017 }
1141 1018
1019 if (imap_valid)
1020 imap_valid = xfs_imap_valid(inode, &imap, offset);
1021 if (!imap_valid) {
1142 /* 1022 /*
1143 * We set the type to IO_NEW in case we are doing a 1023 * If we didn't have a valid mapping then we need to
1144 * small write at EOF that is extending the file but 1024 * put the new mapping into a separate ioend structure.
1145 * without needing an allocation. We need to update the 1025 * This ensures non-contiguous extents always have
1146 * file size on I/O completion in this case so it is 1026 * separate ioends, which is particularly important
1147 * the same case as having just allocated a new extent 1027 * for unwritten extent conversion at I/O completion
1148 * that we are writing into for the first time. 1028 * time.
1149 */ 1029 */
1150 type = IO_NEW; 1030 new_ioend = 1;
1151 if (trylock_buffer(bh)) { 1031 err = xfs_map_blocks(inode, offset, &imap, type,
1152 if (imap_valid) 1032 nonblocking);
1153 all_bh = 1; 1033 if (err)
1154 xfs_add_to_ioend(inode, bh, offset, type, 1034 goto error;
1155 &ioend, !imap_valid); 1035 imap_valid = xfs_imap_valid(inode, &imap, offset);
1156 count++; 1036 }
1157 } else { 1037 if (imap_valid) {
1158 imap_valid = 0; 1038 lock_buffer(bh);
1159 } 1039 if (type != IO_OVERWRITE)
1160 } else if (PageUptodate(page)) { 1040 xfs_map_at_offset(inode, bh, &imap, offset);
1161 ASSERT(buffer_mapped(bh)); 1041 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1162 imap_valid = 0; 1042 new_ioend);
1043 count++;
1163 } 1044 }
1164 1045
1165 if (!iohead) 1046 if (!iohead)
@@ -1188,7 +1069,7 @@ xfs_vm_writepage(
1188 end_index = last_index; 1069 end_index = last_index;
1189 1070
1190 xfs_cluster_write(inode, page->index + 1, &imap, &ioend, 1071 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1191 wbc, all_bh, end_index); 1072 wbc, end_index);
1192 } 1073 }
1193 1074
1194 if (iohead) 1075 if (iohead)
@@ -1257,13 +1138,19 @@ __xfs_get_blocks(
1257 int create, 1138 int create,
1258 int direct) 1139 int direct)
1259{ 1140{
1260 int flags = create ? BMAPI_WRITE : BMAPI_READ; 1141 struct xfs_inode *ip = XFS_I(inode);
1142 struct xfs_mount *mp = ip->i_mount;
1143 xfs_fileoff_t offset_fsb, end_fsb;
1144 int error = 0;
1145 int lockmode = 0;
1261 struct xfs_bmbt_irec imap; 1146 struct xfs_bmbt_irec imap;
1147 int nimaps = 1;
1262 xfs_off_t offset; 1148 xfs_off_t offset;
1263 ssize_t size; 1149 ssize_t size;
1264 int nimap = 1;
1265 int new = 0; 1150 int new = 0;
1266 int error; 1151
1152 if (XFS_FORCED_SHUTDOWN(mp))
1153 return -XFS_ERROR(EIO);
1267 1154
1268 offset = (xfs_off_t)iblock << inode->i_blkbits; 1155 offset = (xfs_off_t)iblock << inode->i_blkbits;
1269 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1156 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1272,15 +1159,45 @@ __xfs_get_blocks(
1272 if (!create && direct && offset >= i_size_read(inode)) 1159 if (!create && direct && offset >= i_size_read(inode))
1273 return 0; 1160 return 0;
1274 1161
1275 if (direct && create) 1162 if (create) {
1276 flags |= BMAPI_DIRECT; 1163 lockmode = XFS_ILOCK_EXCL;
1164 xfs_ilock(ip, lockmode);
1165 } else {
1166 lockmode = xfs_ilock_map_shared(ip);
1167 }
1168
1169 ASSERT(offset <= mp->m_maxioffset);
1170 if (offset + size > mp->m_maxioffset)
1171 size = mp->m_maxioffset - offset;
1172 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1173 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1277 1174
1278 error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, 1175 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
1279 &new); 1176 XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL);
1280 if (error) 1177 if (error)
1281 return -error; 1178 goto out_unlock;
1282 if (nimap == 0) 1179
1283 return 0; 1180 if (create &&
1181 (!nimaps ||
1182 (imap.br_startblock == HOLESTARTBLOCK ||
1183 imap.br_startblock == DELAYSTARTBLOCK))) {
1184 if (direct) {
1185 error = xfs_iomap_write_direct(ip, offset, size,
1186 &imap, nimaps);
1187 } else {
1188 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1189 }
1190 if (error)
1191 goto out_unlock;
1192
1193 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1194 } else if (nimaps) {
1195 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1196 } else {
1197 trace_xfs_get_blocks_notfound(ip, offset, size);
1198 goto out_unlock;
1199 }
1200 xfs_iunlock(ip, lockmode);
1284 1201
1285 if (imap.br_startblock != HOLESTARTBLOCK && 1202 if (imap.br_startblock != HOLESTARTBLOCK &&
1286 imap.br_startblock != DELAYSTARTBLOCK) { 1203 imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1347,6 +1264,10 @@ __xfs_get_blocks(
1347 } 1264 }
1348 1265
1349 return 0; 1266 return 0;
1267
1268out_unlock:
1269 xfs_iunlock(ip, lockmode);
1270 return -error;
1350} 1271}
1351 1272
1352int 1273int
@@ -1434,7 +1355,7 @@ xfs_vm_direct_IO(
1434 ssize_t ret; 1355 ssize_t ret;
1435 1356
1436 if (rw & WRITE) { 1357 if (rw & WRITE) {
1437 iocb->private = xfs_alloc_ioend(inode, IO_NEW); 1358 iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
1438 1359
1439 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1360 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1440 offset, nr_segs, 1361 offset, nr_segs,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb6237a..71f721e1a71f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
23extern mempool_t *xfs_ioend_pool; 23extern mempool_t *xfs_ioend_pool;
24 24
25/* 25/*
26 * Types of I/O for bmap clustering and I/O completion tracking.
27 */
28enum {
29 IO_DIRECT = 0, /* special case for direct I/O ioends */
30 IO_DELALLOC, /* mapping covers delalloc region */
31 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
32 IO_OVERWRITE, /* mapping covers already allocated extent */
33};
34
35#define XFS_IO_TYPES \
36 { 0, "" }, \
37 { IO_DELALLOC, "delalloc" }, \
38 { IO_UNWRITTEN, "unwritten" }, \
39 { IO_OVERWRITE, "overwrite" }
40
41/*
26 * xfs_ioend struct manages large extent writes for XFS. 42 * xfs_ioend struct manages large extent writes for XFS.
27 * It can manage several multi-page bios at once. 43 * It can manage several multi-page bios at once.
28 */ 44 */
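
The XFS_IO_TYPES table pairs each ioend type with a printable name; lists of
this { value, "string" } shape are what the kernel's tracepoint helper
__print_symbolic() consumes, so trace output can show "delalloc" rather than a
raw enum value (that use is an assumption here; the diff only shows the table).
A userspace sketch of the lookup it enables:

#include <stdio.h>

enum { IO_DIRECT = 0, IO_DELALLOC, IO_UNWRITTEN, IO_OVERWRITE };

static const struct {
        int             type;
        const char      *name;
} io_types[] = {
        { IO_DIRECT,    "" },
        { IO_DELALLOC,  "delalloc" },
        { IO_UNWRITTEN, "unwritten" },
        { IO_OVERWRITE, "overwrite" },
};

static const char *io_type_name(int type)
{
        size_t i;

        for (i = 0; i < sizeof(io_types) / sizeof(io_types[0]); i++)
                if (io_types[i].type == type)
                        return io_types[i].name;
        return "?";
}

int main(void)
{
        printf("ioend type: %s\n", io_type_name(IO_UNWRITTEN));
        return 0;
}
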
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 4c5deb6e9e31..ac1c7e8378dd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -44,12 +44,7 @@
44 44
45static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
46STATIC int xfsbufd(void *); 46STATIC int xfsbufd(void *);
47STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 47STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
49static struct shrinker xfs_buf_shake = {
50 .shrink = xfsbufd_wakeup,
51 .seeks = DEFAULT_SEEKS,
52};
53 48
54static struct workqueue_struct *xfslogd_workqueue; 49static struct workqueue_struct *xfslogd_workqueue;
55struct workqueue_struct *xfsdatad_workqueue; 50struct workqueue_struct *xfsdatad_workqueue;
@@ -168,8 +163,79 @@ test_page_region(
168} 163}
169 164
170/* 165/*
171 * Internal xfs_buf_t object manipulation 166 * xfs_buf_lru_add - add a buffer to the LRU.
167 *
168 * The LRU takes a new reference to the buffer so that it will only be freed
169 * once the shrinker takes the buffer off the LRU.
172 */ 170 */
171STATIC void
172xfs_buf_lru_add(
173 struct xfs_buf *bp)
174{
175 struct xfs_buftarg *btp = bp->b_target;
176
177 spin_lock(&btp->bt_lru_lock);
178 if (list_empty(&bp->b_lru)) {
179 atomic_inc(&bp->b_hold);
180 list_add_tail(&bp->b_lru, &btp->bt_lru);
181 btp->bt_lru_nr++;
182 }
183 spin_unlock(&btp->bt_lru_lock);
184}
185
186/*
187 * xfs_buf_lru_del - remove a buffer from the LRU
188 *
189 * The unlocked check is safe here because it only occurs when there are no
190 * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is
191 * there to optimise the shrinker removing the buffer from the LRU and
192 * calling xfs_buf_free(), i.e. it removes an unnecessary round trip on the
193 * bt_lru_lock.
194 */
195STATIC void
196xfs_buf_lru_del(
197 struct xfs_buf *bp)
198{
199 struct xfs_buftarg *btp = bp->b_target;
200
201 if (list_empty(&bp->b_lru))
202 return;
203
204 spin_lock(&btp->bt_lru_lock);
205 if (!list_empty(&bp->b_lru)) {
206 list_del_init(&bp->b_lru);
207 btp->bt_lru_nr--;
208 }
209 spin_unlock(&btp->bt_lru_lock);
210}
211
212/*
213 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
214 * b_lru_ref count so that the buffer is freed immediately when the buffer
215 * reference count falls to zero. If the buffer is already on the LRU, we need
216 * to remove the reference that the LRU holds on the buffer.
217 *
218 * This prevents build-up of stale buffers on the LRU.
219 */
220void
221xfs_buf_stale(
222 struct xfs_buf *bp)
223{
224 bp->b_flags |= XBF_STALE;
225 atomic_set(&(bp)->b_lru_ref, 0);
226 if (!list_empty(&bp->b_lru)) {
227 struct xfs_buftarg *btp = bp->b_target;
228
229 spin_lock(&btp->bt_lru_lock);
230 if (!list_empty(&bp->b_lru)) {
231 list_del_init(&bp->b_lru);
232 btp->bt_lru_nr--;
233 atomic_dec(&bp->b_hold);
234 }
235 spin_unlock(&btp->bt_lru_lock);
236 }
237 ASSERT(atomic_read(&bp->b_hold) >= 1);
238}
173 239
174STATIC void 240STATIC void
175_xfs_buf_initialize( 241_xfs_buf_initialize(
@@ -186,7 +252,9 @@ _xfs_buf_initialize(
186 252
187 memset(bp, 0, sizeof(xfs_buf_t)); 253 memset(bp, 0, sizeof(xfs_buf_t));
188 atomic_set(&bp->b_hold, 1); 254 atomic_set(&bp->b_hold, 1);
255 atomic_set(&bp->b_lru_ref, 1);
189 init_completion(&bp->b_iowait); 256 init_completion(&bp->b_iowait);
257 INIT_LIST_HEAD(&bp->b_lru);
190 INIT_LIST_HEAD(&bp->b_list); 258 INIT_LIST_HEAD(&bp->b_list);
191 RB_CLEAR_NODE(&bp->b_rbnode); 259 RB_CLEAR_NODE(&bp->b_rbnode);
192 sema_init(&bp->b_sema, 0); /* held, no waiters */ 260 sema_init(&bp->b_sema, 0); /* held, no waiters */
@@ -262,6 +330,8 @@ xfs_buf_free(
262{ 330{
263 trace_xfs_buf_free(bp, _RET_IP_); 331 trace_xfs_buf_free(bp, _RET_IP_);
264 332
333 ASSERT(list_empty(&bp->b_lru));
334
265 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 335 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
266 uint i; 336 uint i;
267 337
@@ -337,7 +407,6 @@ _xfs_buf_lookup_pages(
337 __func__, gfp_mask); 407 __func__, gfp_mask);
338 408
339 XFS_STATS_INC(xb_page_retries); 409 XFS_STATS_INC(xb_page_retries);
340 xfsbufd_wakeup(NULL, 0, gfp_mask);
341 congestion_wait(BLK_RW_ASYNC, HZ/50); 410 congestion_wait(BLK_RW_ASYNC, HZ/50);
342 goto retry; 411 goto retry;
343 } 412 }
@@ -827,7 +896,7 @@ xfs_buf_rele(
827 trace_xfs_buf_rele(bp, _RET_IP_); 896 trace_xfs_buf_rele(bp, _RET_IP_);
828 897
829 if (!pag) { 898 if (!pag) {
830 ASSERT(!bp->b_relse); 899 ASSERT(list_empty(&bp->b_lru));
831 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); 900 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
832 if (atomic_dec_and_test(&bp->b_hold)) 901 if (atomic_dec_and_test(&bp->b_hold))
833 xfs_buf_free(bp); 902 xfs_buf_free(bp);
@@ -835,13 +904,15 @@ xfs_buf_rele(
835 } 904 }
836 905
837 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); 906 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
907
838 ASSERT(atomic_read(&bp->b_hold) > 0); 908 ASSERT(atomic_read(&bp->b_hold) > 0);
839 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { 909 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
840 if (bp->b_relse) { 910 if (!(bp->b_flags & XBF_STALE) &&
841 atomic_inc(&bp->b_hold); 911 atomic_read(&bp->b_lru_ref)) {
912 xfs_buf_lru_add(bp);
842 spin_unlock(&pag->pag_buf_lock); 913 spin_unlock(&pag->pag_buf_lock);
843 bp->b_relse(bp);
844 } else { 914 } else {
915 xfs_buf_lru_del(bp);
845 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 916 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
846 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); 917 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
847 spin_unlock(&pag->pag_buf_lock); 918 spin_unlock(&pag->pag_buf_lock);
@@ -1438,51 +1509,84 @@ xfs_buf_iomove(
1438 */ 1509 */
1439 1510
1440/* 1511/*
1441 * Wait for any bufs with callbacks that have been submitted but 1512 * Wait for any bufs with callbacks that have been submitted but have not yet
1442 * have not yet returned... walk the hash list for the target. 1513 * returned. These buffers will have an elevated hold count, so wait on those
1514 * while freeing all the buffers only held by the LRU.
1443 */ 1515 */
1444void 1516void
1445xfs_wait_buftarg( 1517xfs_wait_buftarg(
1446 struct xfs_buftarg *btp) 1518 struct xfs_buftarg *btp)
1447{ 1519{
1448 struct xfs_perag *pag; 1520 struct xfs_buf *bp;
1449 uint i;
1450 1521
1451 for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) { 1522restart:
1452 pag = xfs_perag_get(btp->bt_mount, i); 1523 spin_lock(&btp->bt_lru_lock);
1453 spin_lock(&pag->pag_buf_lock); 1524 while (!list_empty(&btp->bt_lru)) {
1454 while (rb_first(&pag->pag_buf_tree)) { 1525 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1455 spin_unlock(&pag->pag_buf_lock); 1526 if (atomic_read(&bp->b_hold) > 1) {
1527 spin_unlock(&btp->bt_lru_lock);
1456 delay(100); 1528 delay(100);
1457 spin_lock(&pag->pag_buf_lock); 1529 goto restart;
1458 } 1530 }
1459 spin_unlock(&pag->pag_buf_lock); 1531 /*
1460 xfs_perag_put(pag); 1532 * clear the LRU reference count so the buffer doesn't get
1533 * ignored in xfs_buf_rele().
1534 */
1535 atomic_set(&bp->b_lru_ref, 0);
1536 spin_unlock(&btp->bt_lru_lock);
1537 xfs_buf_rele(bp);
1538 spin_lock(&btp->bt_lru_lock);
1461 } 1539 }
1540 spin_unlock(&btp->bt_lru_lock);
1462} 1541}
1463 1542
1464/* 1543int
1465 * buftarg list for delwrite queue processing 1544xfs_buftarg_shrink(
1466 */ 1545 struct shrinker *shrink,
1467static LIST_HEAD(xfs_buftarg_list); 1546 int nr_to_scan,
1468static DEFINE_SPINLOCK(xfs_buftarg_lock); 1547 gfp_t mask)
1469
1470STATIC void
1471xfs_register_buftarg(
1472 xfs_buftarg_t *btp)
1473{ 1548{
1474 spin_lock(&xfs_buftarg_lock); 1549 struct xfs_buftarg *btp = container_of(shrink,
1475 list_add(&btp->bt_list, &xfs_buftarg_list); 1550 struct xfs_buftarg, bt_shrinker);
1476 spin_unlock(&xfs_buftarg_lock); 1551 struct xfs_buf *bp;
1477} 1552 LIST_HEAD(dispose);
1478 1553
1479STATIC void 1554 if (!nr_to_scan)
1480xfs_unregister_buftarg( 1555 return btp->bt_lru_nr;
1481 xfs_buftarg_t *btp) 1556
1482{ 1557 spin_lock(&btp->bt_lru_lock);
1483 spin_lock(&xfs_buftarg_lock); 1558 while (!list_empty(&btp->bt_lru)) {
1484 list_del(&btp->bt_list); 1559 if (nr_to_scan-- <= 0)
1485 spin_unlock(&xfs_buftarg_lock); 1560 break;
1561
1562 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1563
1564 /*
1565 * Decrement the b_lru_ref count unless the value is already
1566 * zero. If the value is already zero, we need to reclaim the
1567 * buffer, otherwise it gets another trip through the LRU.
1568 */
1569 if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1570 list_move_tail(&bp->b_lru, &btp->bt_lru);
1571 continue;
1572 }
1573
1574 /*
1575 * remove the buffer from the LRU now to avoid needing another
1576 * lock round trip inside xfs_buf_rele().
1577 */
1578 list_move(&bp->b_lru, &dispose);
1579 btp->bt_lru_nr--;
1580 }
1581 spin_unlock(&btp->bt_lru_lock);
1582
1583 while (!list_empty(&dispose)) {
1584 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1585 list_del_init(&bp->b_lru);
1586 xfs_buf_rele(bp);
1587 }
1588
1589 return btp->bt_lru_nr;
1486} 1590}
1487 1591
1488void 1592void
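
xfs_buftarg_shrink() implements the second-chance policy described in its
comments: each scan wears b_lru_ref down by one via atomic_add_unless(), and a
buffer is only moved to the dispose list once that count reaches zero;
otherwise it is rotated to the LRU tail for another lap. A small sketch of the
wear-down logic (single-threaded, so a plain int replaces the atomic):

#include <stdio.h>

/* Decrement *ref and return 1, unless *ref is already zero (a non-atomic
 * stand-in for atomic_add_unless(ref, -1, 0)). */
static int dec_unless_zero(int *ref)
{
        if (*ref == 0)
                return 0;
        (*ref)--;
        return 1;
}

int main(void)
{
        int lru_ref = 2;        /* e.g. set via xfs_buf_set_ref() */
        int scan;

        for (scan = 1; scan <= 3; scan++) {
                if (dec_unless_zero(&lru_ref))
                        printf("scan %d: rotate to LRU tail\n", scan);
                else
                        printf("scan %d: reclaim buffer\n", scan);
        }
        return 0;
}
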
@@ -1490,17 +1594,14 @@ xfs_free_buftarg(
1490 struct xfs_mount *mp, 1594 struct xfs_mount *mp,
1491 struct xfs_buftarg *btp) 1595 struct xfs_buftarg *btp)
1492{ 1596{
1597 unregister_shrinker(&btp->bt_shrinker);
1598
1493 xfs_flush_buftarg(btp, 1); 1599 xfs_flush_buftarg(btp, 1);
1494 if (mp->m_flags & XFS_MOUNT_BARRIER) 1600 if (mp->m_flags & XFS_MOUNT_BARRIER)
1495 xfs_blkdev_issue_flush(btp); 1601 xfs_blkdev_issue_flush(btp);
1496 iput(btp->bt_mapping->host); 1602 iput(btp->bt_mapping->host);
1497 1603
1498 /* Unregister the buftarg first so that we don't get a
1499 * wakeup finding a non-existent task
1500 */
1501 xfs_unregister_buftarg(btp);
1502 kthread_stop(btp->bt_task); 1604 kthread_stop(btp->bt_task);
1503
1504 kmem_free(btp); 1605 kmem_free(btp);
1505} 1606}
1506 1607
@@ -1597,20 +1698,13 @@ xfs_alloc_delwrite_queue(
1597 xfs_buftarg_t *btp, 1698 xfs_buftarg_t *btp,
1598 const char *fsname) 1699 const char *fsname)
1599{ 1700{
1600 int error = 0;
1601
1602 INIT_LIST_HEAD(&btp->bt_list);
1603 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1701 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1604 spin_lock_init(&btp->bt_delwrite_lock); 1702 spin_lock_init(&btp->bt_delwrite_lock);
1605 btp->bt_flags = 0; 1703 btp->bt_flags = 0;
1606 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); 1704 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1607 if (IS_ERR(btp->bt_task)) { 1705 if (IS_ERR(btp->bt_task))
1608 error = PTR_ERR(btp->bt_task); 1706 return PTR_ERR(btp->bt_task);
1609 goto out_error; 1707 return 0;
1610 }
1611 xfs_register_buftarg(btp);
1612out_error:
1613 return error;
1614} 1708}
1615 1709
1616xfs_buftarg_t * 1710xfs_buftarg_t *
@@ -1627,12 +1721,17 @@ xfs_alloc_buftarg(
1627 btp->bt_mount = mp; 1721 btp->bt_mount = mp;
1628 btp->bt_dev = bdev->bd_dev; 1722 btp->bt_dev = bdev->bd_dev;
1629 btp->bt_bdev = bdev; 1723 btp->bt_bdev = bdev;
1724 INIT_LIST_HEAD(&btp->bt_lru);
1725 spin_lock_init(&btp->bt_lru_lock);
1630 if (xfs_setsize_buftarg_early(btp, bdev)) 1726 if (xfs_setsize_buftarg_early(btp, bdev))
1631 goto error; 1727 goto error;
1632 if (xfs_mapping_buftarg(btp, bdev)) 1728 if (xfs_mapping_buftarg(btp, bdev))
1633 goto error; 1729 goto error;
1634 if (xfs_alloc_delwrite_queue(btp, fsname)) 1730 if (xfs_alloc_delwrite_queue(btp, fsname))
1635 goto error; 1731 goto error;
1732 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1733 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1734 register_shrinker(&btp->bt_shrinker);
1636 return btp; 1735 return btp;
1637 1736
1638error: 1737error:
@@ -1737,27 +1836,6 @@ xfs_buf_runall_queues(
1737 flush_workqueue(queue); 1836 flush_workqueue(queue);
1738} 1837}
1739 1838
1740STATIC int
1741xfsbufd_wakeup(
1742 struct shrinker *shrink,
1743 int priority,
1744 gfp_t mask)
1745{
1746 xfs_buftarg_t *btp;
1747
1748 spin_lock(&xfs_buftarg_lock);
1749 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1750 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1751 continue;
1752 if (list_empty(&btp->bt_delwrite_queue))
1753 continue;
1754 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1755 wake_up_process(btp->bt_task);
1756 }
1757 spin_unlock(&xfs_buftarg_lock);
1758 return 0;
1759}
1760
1761/* 1839/*
1762 * Move as many buffers as specified to the supplied list 1840 * Move as many buffers as specified to the supplied list
1763 * indicating if we skipped any buffers to prevent deadlocks. 1841 * indicating if we skipped any buffers to prevent deadlocks.
@@ -1952,7 +2030,6 @@ xfs_buf_init(void)
1952 if (!xfsconvertd_workqueue) 2030 if (!xfsconvertd_workqueue)
1953 goto out_destroy_xfsdatad_workqueue; 2031 goto out_destroy_xfsdatad_workqueue;
1954 2032
1955 register_shrinker(&xfs_buf_shake);
1956 return 0; 2033 return 0;
1957 2034
1958 out_destroy_xfsdatad_workqueue: 2035 out_destroy_xfsdatad_workqueue:
@@ -1968,7 +2045,6 @@ xfs_buf_init(void)
1968void 2045void
1969xfs_buf_terminate(void) 2046xfs_buf_terminate(void)
1970{ 2047{
1971 unregister_shrinker(&xfs_buf_shake);
1972 destroy_workqueue(xfsconvertd_workqueue); 2048 destroy_workqueue(xfsconvertd_workqueue);
1973 destroy_workqueue(xfsdatad_workqueue); 2049 destroy_workqueue(xfsdatad_workqueue);
1974 destroy_workqueue(xfslogd_workqueue); 2050 destroy_workqueue(xfslogd_workqueue);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 383a3f37cf98..cbe65950e524 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -128,10 +128,15 @@ typedef struct xfs_buftarg {
128 128
129 /* per device delwri queue */ 129 /* per device delwri queue */
130 struct task_struct *bt_task; 130 struct task_struct *bt_task;
131 struct list_head bt_list;
132 struct list_head bt_delwrite_queue; 131 struct list_head bt_delwrite_queue;
133 spinlock_t bt_delwrite_lock; 132 spinlock_t bt_delwrite_lock;
134 unsigned long bt_flags; 133 unsigned long bt_flags;
134
135 /* LRU control structures */
136 struct shrinker bt_shrinker;
137 struct list_head bt_lru;
138 spinlock_t bt_lru_lock;
139 unsigned int bt_lru_nr;
135} xfs_buftarg_t; 140} xfs_buftarg_t;
136 141
137/* 142/*
@@ -147,8 +152,6 @@ typedef struct xfs_buftarg {
147 152
148struct xfs_buf; 153struct xfs_buf;
149typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 154typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
150typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
151typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
152 155
153#define XB_PAGES 2 156#define XB_PAGES 2
154 157
@@ -164,9 +167,11 @@ typedef struct xfs_buf {
164 xfs_off_t b_file_offset; /* offset in file */ 167 xfs_off_t b_file_offset; /* offset in file */
165 size_t b_buffer_length;/* size of buffer in bytes */ 168 size_t b_buffer_length;/* size of buffer in bytes */
166 atomic_t b_hold; /* reference count */ 169 atomic_t b_hold; /* reference count */
170 atomic_t b_lru_ref; /* lru reclaim ref count */
167 xfs_buf_flags_t b_flags; /* status flags */ 171 xfs_buf_flags_t b_flags; /* status flags */
168 struct semaphore b_sema; /* semaphore for lockables */ 172 struct semaphore b_sema; /* semaphore for lockables */
169 173
174 struct list_head b_lru; /* lru list */
170 wait_queue_head_t b_waiters; /* unpin waiters */ 175 wait_queue_head_t b_waiters; /* unpin waiters */
171 struct list_head b_list; 176 struct list_head b_list;
172 struct xfs_perag *b_pag; /* contains rbtree root */ 177 struct xfs_perag *b_pag; /* contains rbtree root */
@@ -176,7 +181,6 @@ typedef struct xfs_buf {
176 void *b_addr; /* virtual address of buffer */ 181 void *b_addr; /* virtual address of buffer */
177 struct work_struct b_iodone_work; 182 struct work_struct b_iodone_work;
178 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 183 xfs_buf_iodone_t b_iodone; /* I/O completion function */
179 xfs_buf_relse_t b_relse; /* releasing function */
180 struct completion b_iowait; /* queue for I/O waiters */ 184 struct completion b_iowait; /* queue for I/O waiters */
181 void *b_fspriv; 185 void *b_fspriv;
182 void *b_fspriv2; 186 void *b_fspriv2;
@@ -264,7 +268,8 @@ extern void xfs_buf_terminate(void);
264#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 268#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
265 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 269 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
266 270
267#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) 271void xfs_buf_stale(struct xfs_buf *bp);
272#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
268#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 273#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
269#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 274#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
270#define XFS_BUF_SUPER_STALE(bp) do { \ 275#define XFS_BUF_SUPER_STALE(bp) do { \
@@ -315,7 +320,6 @@ extern void xfs_buf_terminate(void);
315#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 320#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
316#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 321#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
317#define XFS_BUF_SET_START(bp) do { } while (0) 322#define XFS_BUF_SET_START(bp) do { } while (0)
318#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
319 323
320#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) 324#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
321#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) 325#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
@@ -328,9 +332,15 @@ extern void xfs_buf_terminate(void);
328#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 332#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
329#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 333#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
330 334
331#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) 335static inline void
336xfs_buf_set_ref(
337 struct xfs_buf *bp,
338 int lru_ref)
339{
340 atomic_set(&bp->b_lru_ref, lru_ref);
341}
342#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
332#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 343#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
333#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
334 344
335#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) 345#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
336 346
@@ -346,8 +356,7 @@ extern void xfs_buf_terminate(void);
346 356
347static inline void xfs_buf_relse(xfs_buf_t *bp) 357static inline void xfs_buf_relse(xfs_buf_t *bp)
348{ 358{
349 if (!bp->b_relse) 359 xfs_buf_unlock(bp);
350 xfs_buf_unlock(bp);
351 xfs_buf_rele(bp); 360 xfs_buf_rele(bp);
352} 361}
353 362
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 000000000000..05201ae719e5
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_sb.h"
20#include "xfs_inum.h"
21#include "xfs_log.h"
22#include "xfs_ag.h"
23#include "xfs_mount.h"
24#include "xfs_quota.h"
25#include "xfs_trans.h"
26#include "xfs_alloc_btree.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
30#include "xfs_inode.h"
31#include "xfs_alloc.h"
32#include "xfs_error.h"
33#include "xfs_discard.h"
34#include "xfs_trace.h"
35
36STATIC int
37xfs_trim_extents(
38 struct xfs_mount *mp,
39 xfs_agnumber_t agno,
40 xfs_fsblock_t start,
41 xfs_fsblock_t len,
42 xfs_fsblock_t minlen,
43 __uint64_t *blocks_trimmed)
44{
45 struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
46 struct xfs_btree_cur *cur;
47 struct xfs_buf *agbp;
48 struct xfs_perag *pag;
49 int error;
50 int i;
51
52 pag = xfs_perag_get(mp, agno);
53
54 error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
55 if (error || !agbp)
56 goto out_put_perag;
57
58 cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
59
60 /*
61 * Force out the log. This means any transactions that might have freed
62 * space before we took the AGF buffer lock are now on disk, and the
63 * volatile disk cache is flushed.
64 */
65 xfs_log_force(mp, XFS_LOG_SYNC);
66
67 /*
68 * Look up the longest btree in the AGF and start with it.
69 */
70 error = xfs_alloc_lookup_le(cur, 0,
71 XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
72 if (error)
73 goto out_del_cursor;
74
75 /*
76 * Loop until we are done with all extents that are large
77 * enough to be worth discarding.
78 */
79 while (i) {
80 xfs_agblock_t fbno;
81 xfs_extlen_t flen;
82
83 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
84 if (error)
85 goto out_del_cursor;
86 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
87 ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
88
89 /*
90 * Too small? Give up.
91 */
92 if (flen < minlen) {
93 trace_xfs_discard_toosmall(mp, agno, fbno, flen);
94 goto out_del_cursor;
95 }
96
97 /*
98 * If the extent is entirely outside of the range we are
99 * supposed to discard skip it. Do not bother to trim
100 * down partially overlapping ranges for now.
101 */
102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
103 XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
104 trace_xfs_discard_exclude(mp, agno, fbno, flen);
105 goto next_extent;
106 }
107
108 /*
109 * If any blocks in the range are still busy, skip the
110 * discard and try again the next time.
111 */
112 if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
113 trace_xfs_discard_busy(mp, agno, fbno, flen);
114 goto next_extent;
115 }
116
117 trace_xfs_discard_extent(mp, agno, fbno, flen);
118 error = -blkdev_issue_discard(bdev,
119 XFS_AGB_TO_DADDR(mp, agno, fbno),
120 XFS_FSB_TO_BB(mp, flen),
121 GFP_NOFS, 0);
122 if (error)
123 goto out_del_cursor;
124 *blocks_trimmed += flen;
125
126next_extent:
127 error = xfs_btree_decrement(cur, 0, &i);
128 if (error)
129 goto out_del_cursor;
130 }
131
132out_del_cursor:
133 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
134 xfs_buf_relse(agbp);
135out_put_perag:
136 xfs_perag_put(pag);
137 return error;
138}
139
140int
141xfs_ioc_trim(
142 struct xfs_mount *mp,
143 struct fstrim_range __user *urange)
144{
145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
146 unsigned int granularity = q->limits.discard_granularity;
147 struct fstrim_range range;
148 xfs_fsblock_t start, len, minlen;
149 xfs_agnumber_t start_agno, end_agno, agno;
150 __uint64_t blocks_trimmed = 0;
151 int error, last_error = 0;
152
153 if (!capable(CAP_SYS_ADMIN))
154 return -XFS_ERROR(EPERM);
155 if (copy_from_user(&range, urange, sizeof(range)))
156 return -XFS_ERROR(EFAULT);
157
158 /*
159 * Truncating down the len isn't actually quite correct, but using
160 * XFS_B_TO_FSB would mean we trivially get overflows for values
161 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
162 * used by the fstrim application. In the end it really doesn't
163 * matter as trimming blocks is an advisory interface.
164 */
165 start = XFS_B_TO_FSBT(mp, range.start);
166 len = XFS_B_TO_FSBT(mp, range.len);
167 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
168
169 start_agno = XFS_FSB_TO_AGNO(mp, start);
170 if (start_agno >= mp->m_sb.sb_agcount)
171 return -XFS_ERROR(EINVAL);
172
173 end_agno = XFS_FSB_TO_AGNO(mp, start + len);
174 if (end_agno >= mp->m_sb.sb_agcount)
175 end_agno = mp->m_sb.sb_agcount - 1;
176
177 for (agno = start_agno; agno <= end_agno; agno++) {
178 error = -xfs_trim_extents(mp, agno, start, len, minlen,
179 &blocks_trimmed);
180 if (error)
181 last_error = error;
182 }
183
184 if (last_error)
185 return last_error;
186
187 range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
188 if (copy_to_user(urange, &range, sizeof(range)))
189 return -XFS_ERROR(EFAULT);
190 return 0;
191}
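
xfs_ioc_trim() is the kernel half of the FITRIM interface: userspace fills a
struct fstrim_range and issues the ioctl on any descriptor within the mounted
filesystem, and len comes back as the number of bytes trimmed. A minimal
caller in the style of fstrim(8) (assumes kernel headers that define FITRIM):

#include <fcntl.h>
#include <limits.h>
#include <linux/fs.h>           /* FITRIM, struct fstrim_range */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        struct fstrim_range range = {
                .start  = 0,
                .len    = ULLONG_MAX,   /* whole filesystem */
                .minlen = 0,            /* raised to the discard granularity */
        };
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || ioctl(fd, FITRIM, &range) != 0) {
                perror("FITRIM");
                return 1;
        }
        printf("trimmed %llu bytes\n", (unsigned long long)range.len);
        close(fd);
        return 0;
}
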
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 000000000000..e82b6dd3e127
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,8 @@
1#ifndef XFS_DISCARD_H
2#define XFS_DISCARD_H 1
3
4struct fstrim_range;
5
6extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
7
8#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74790ec..fc0114da7fdd 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
70 else 70 else
71 fileid_type = FILEID_INO32_GEN_PARENT; 71 fileid_type = FILEID_INO32_GEN_PARENT;
72 72
73 /* filesystem may contain 64bit inode numbers */ 73 /*
74 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) 74 * If the the filesystem may contain 64bit inode numbers, we need
75 * to use larger file handles that can represent them.
76 *
77 * While inodes that do not fit into 32 bits are only allocated when
78 * the mount options allow it, any large enough filesystem may already
79 * contain them, thus the slightly confusing-looking conditional below.
80 */
81 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
82 (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
75 fileid_type |= XFS_FILEID_TYPE_64FLAG; 83 fileid_type |= XFS_FILEID_TYPE_64FLAG;
76 84
77 /* 85 /*
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad422a165..a55c1b46b219 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -37,10 +37,45 @@
37#include "xfs_trace.h" 37#include "xfs_trace.h"
38 38
39#include <linux/dcache.h> 39#include <linux/dcache.h>
40#include <linux/falloc.h>
40 41
41static const struct vm_operations_struct xfs_file_vm_ops; 42static const struct vm_operations_struct xfs_file_vm_ops;
42 43
43/* 44/*
45 * Locking primitives for read and write IO paths to ensure we consistently use
46 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
47 */
48static inline void
49xfs_rw_ilock(
50 struct xfs_inode *ip,
51 int type)
52{
53 if (type & XFS_IOLOCK_EXCL)
54 mutex_lock(&VFS_I(ip)->i_mutex);
55 xfs_ilock(ip, type);
56}
57
58static inline void
59xfs_rw_iunlock(
60 struct xfs_inode *ip,
61 int type)
62{
63 xfs_iunlock(ip, type);
64 if (type & XFS_IOLOCK_EXCL)
65 mutex_unlock(&VFS_I(ip)->i_mutex);
66}
67
68static inline void
69xfs_rw_ilock_demote(
70 struct xfs_inode *ip,
71 int type)
72{
73 xfs_ilock_demote(ip, type);
74 if (type & XFS_IOLOCK_EXCL)
75 mutex_unlock(&VFS_I(ip)->i_mutex);
76}
77
78/*
44 * xfs_iozero 79 * xfs_iozero
45 * 80 *
46 * xfs_iozero clears the specified range of buffer supplied, 81 * xfs_iozero clears the specified range of buffer supplied,
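
The xfs_rw_ilock()/xfs_rw_iunlock() helpers above encode one ordering rule:
i_mutex is only taken for exclusive IO locking, always before the XFS inode
lock, and always released after it. A userspace sketch of that discipline
(pthread locks stand in for the kernel primitives; the demote case is omitted
since pthread rwlocks cannot be downgraded):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t i_iolock = PTHREAD_RWLOCK_INITIALIZER;

enum { IOLOCK_SHARED, IOLOCK_EXCL };

static void rw_ilock(int type)
{
        if (type == IOLOCK_EXCL) {
                pthread_mutex_lock(&i_mutex);           /* outer lock first */
                pthread_rwlock_wrlock(&i_iolock);
        } else {
                pthread_rwlock_rdlock(&i_iolock);
        }
}

static void rw_iunlock(int type)
{
        pthread_rwlock_unlock(&i_iolock);               /* inner lock first */
        if (type == IOLOCK_EXCL)
                pthread_mutex_unlock(&i_mutex);
}

int main(void)
{
        rw_ilock(IOLOCK_EXCL);
        printf("exclusive: flush and zero cached pages here\n");
        rw_iunlock(IOLOCK_EXCL);

        rw_ilock(IOLOCK_SHARED);
        printf("shared: issue the direct read\n");
        rw_iunlock(IOLOCK_SHARED);
        return 0;
}
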
@@ -262,22 +297,21 @@ xfs_file_aio_read(
262 if (XFS_FORCED_SHUTDOWN(mp)) 297 if (XFS_FORCED_SHUTDOWN(mp))
263 return -EIO; 298 return -EIO;
264 299
265 if (unlikely(ioflags & IO_ISDIRECT))
266 mutex_lock(&inode->i_mutex);
267 xfs_ilock(ip, XFS_IOLOCK_SHARED);
268
269 if (unlikely(ioflags & IO_ISDIRECT)) { 300 if (unlikely(ioflags & IO_ISDIRECT)) {
301 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
302
270 if (inode->i_mapping->nrpages) { 303 if (inode->i_mapping->nrpages) {
271 ret = -xfs_flushinval_pages(ip, 304 ret = -xfs_flushinval_pages(ip,
272 (iocb->ki_pos & PAGE_CACHE_MASK), 305 (iocb->ki_pos & PAGE_CACHE_MASK),
273 -1, FI_REMAPF_LOCKED); 306 -1, FI_REMAPF_LOCKED);
307 if (ret) {
308 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
309 return ret;
310 }
274 } 311 }
275 mutex_unlock(&inode->i_mutex); 312 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
276 if (ret) { 313 } else
277 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 314 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
278 return ret;
279 }
280 }
281 315
282 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); 316 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
283 317
@@ -285,7 +319,7 @@ xfs_file_aio_read(
285 if (ret > 0) 319 if (ret > 0)
286 XFS_STATS_ADD(xs_read_bytes, ret); 320 XFS_STATS_ADD(xs_read_bytes, ret);
287 321
288 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 322 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
289 return ret; 323 return ret;
290} 324}
291 325
@@ -309,7 +343,7 @@ xfs_file_splice_read(
309 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 343 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
310 return -EIO; 344 return -EIO;
311 345
312 xfs_ilock(ip, XFS_IOLOCK_SHARED); 346 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
313 347
314 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
315 349
@@ -317,10 +351,61 @@ xfs_file_splice_read(
317 if (ret > 0) 351 if (ret > 0)
318 XFS_STATS_ADD(xs_read_bytes, ret); 352 XFS_STATS_ADD(xs_read_bytes, ret);
319 353
320 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 354 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
321 return ret; 355 return ret;
322} 356}
323 357
358STATIC void
359xfs_aio_write_isize_update(
360 struct inode *inode,
361 loff_t *ppos,
362 ssize_t bytes_written)
363{
364 struct xfs_inode *ip = XFS_I(inode);
365 xfs_fsize_t isize = i_size_read(inode);
366
367 if (bytes_written > 0)
368 XFS_STATS_ADD(xs_write_bytes, bytes_written);
369
370 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
371 *ppos > isize))
372 *ppos = isize;
373
374 if (*ppos > ip->i_size) {
375 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
376 if (*ppos > ip->i_size)
377 ip->i_size = *ppos;
378 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
379 }
380}
381
382/*
383 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
384 * part of the I/O may have been written to disk before the error occurred. In
385 * this case the on-disk file size may have been adjusted beyond the in-memory
386 * file size and now needs to be truncated back.
387 */
388STATIC void
389xfs_aio_write_newsize_update(
390 struct xfs_inode *ip)
391{
392 if (ip->i_new_size) {
393 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
394 ip->i_new_size = 0;
395 if (ip->i_d.di_size > ip->i_size)
396 ip->i_d.di_size = ip->i_size;
397 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
398 }
399}
400
401/*
402 * xfs_file_splice_write() does not use xfs_rw_ilock() because
403 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
404 * could cause lock inversions between the aio_write path and the splice path
405 * if someone is doing concurrent splice(2) based writes and write(2) based
406 * writes to the same inode. The only real way to fix this is to re-implement
407 * the generic code here with correct locking orders.
408 */
324STATIC ssize_t 409STATIC ssize_t
325xfs_file_splice_write( 410xfs_file_splice_write(
326 struct pipe_inode_info *pipe, 411 struct pipe_inode_info *pipe,
@@ -331,7 +416,7 @@ xfs_file_splice_write(
331{ 416{
332 struct inode *inode = outfilp->f_mapping->host; 417 struct inode *inode = outfilp->f_mapping->host;
333 struct xfs_inode *ip = XFS_I(inode); 418 struct xfs_inode *ip = XFS_I(inode);
334 xfs_fsize_t isize, new_size; 419 xfs_fsize_t new_size;
335 int ioflags = 0; 420 int ioflags = 0;
336 ssize_t ret; 421 ssize_t ret;
337 422
@@ -355,27 +440,9 @@ xfs_file_splice_write(
355 trace_xfs_file_splice_write(ip, count, *ppos, ioflags); 440 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
356 441
357 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 442 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
358 if (ret > 0)
359 XFS_STATS_ADD(xs_write_bytes, ret);
360
361 isize = i_size_read(inode);
362 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
363 *ppos = isize;
364
365 if (*ppos > ip->i_size) {
366 xfs_ilock(ip, XFS_ILOCK_EXCL);
367 if (*ppos > ip->i_size)
368 ip->i_size = *ppos;
369 xfs_iunlock(ip, XFS_ILOCK_EXCL);
370 }
371 443
372 if (ip->i_new_size) { 444 xfs_aio_write_isize_update(inode, ppos, ret);
373 xfs_ilock(ip, XFS_ILOCK_EXCL); 445 xfs_aio_write_newsize_update(ip);
374 ip->i_new_size = 0;
375 if (ip->i_d.di_size > ip->i_size)
376 ip->i_d.di_size = ip->i_size;
377 xfs_iunlock(ip, XFS_ILOCK_EXCL);
378 }
379 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 446 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
380 return ret; 447 return ret;
381} 448}
@@ -562,247 +629,314 @@ out_lock:
562 return error; 629 return error;
563} 630}
564 631
632/*
633 * Common pre-write limit and setup checks.
634 *
635 * Returns with iolock held according to @iolock.
636 */
565STATIC ssize_t 637STATIC ssize_t
566xfs_file_aio_write( 638xfs_file_aio_write_checks(
567 struct kiocb *iocb, 639 struct file *file,
568 const struct iovec *iovp, 640 loff_t *pos,
569 unsigned long nr_segs, 641 size_t *count,
570 loff_t pos) 642 int *iolock)
571{ 643{
572 struct file *file = iocb->ki_filp; 644 struct inode *inode = file->f_mapping->host;
573 struct address_space *mapping = file->f_mapping;
574 struct inode *inode = mapping->host;
575 struct xfs_inode *ip = XFS_I(inode); 645 struct xfs_inode *ip = XFS_I(inode);
576 struct xfs_mount *mp = ip->i_mount; 646 xfs_fsize_t new_size;
577 ssize_t ret = 0, error = 0; 647 int error = 0;
578 int ioflags = 0;
579 xfs_fsize_t isize, new_size;
580 int iolock;
581 size_t ocount = 0, count;
582 int need_i_mutex;
583 648
584 XFS_STATS_INC(xs_write_calls); 649 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
650 if (error) {
651 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
652 *iolock = 0;
653 return error;
654 }
585 655
586 BUG_ON(iocb->ki_pos != pos); 656 new_size = *pos + *count;
657 if (new_size > ip->i_size)
658 ip->i_new_size = new_size;
587 659
588 if (unlikely(file->f_flags & O_DIRECT)) 660 if (likely(!(file->f_mode & FMODE_NOCMTIME)))
589 ioflags |= IO_ISDIRECT; 661 file_update_time(file);
590 if (file->f_mode & FMODE_NOCMTIME) 662
591 ioflags |= IO_INVIS; 663 /*
664 * If the offset is beyond the size of the file, we need to zero any
665 * blocks that fall between the existing EOF and the start of this
666 * write.
667 */
668 if (*pos > ip->i_size)
669 error = -xfs_zero_eof(ip, *pos, ip->i_size);
592 670
593 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); 671 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
594 if (error) 672 if (error)
595 return error; 673 return error;
596 674
597 count = ocount; 675 /*
598 if (count == 0) 676 * If we're writing the file then make sure to clear the setuid and
599 return 0; 677 * setgid bits if the process is not being run by root. This keeps
600 678 * people from modifying setuid and setgid binaries.
601 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); 679 */
680 return file_remove_suid(file);
602 681
603 if (XFS_FORCED_SHUTDOWN(mp)) 682}
604 return -EIO;
605 683
606relock: 684/*
607 if (ioflags & IO_ISDIRECT) { 685 * xfs_file_dio_aio_write - handle direct IO writes
608 iolock = XFS_IOLOCK_SHARED; 686 *
609 need_i_mutex = 0; 687 * Lock the inode appropriately to prepare for and issue a direct IO write.
 610 } else { 688 * By separating it from the buffered write path we remove all the tricky-to-follow
 611 iolock = XFS_IOLOCK_EXCL; 689 * locking changes and looping.
612 need_i_mutex = 1; 690 *
613 mutex_lock(&inode->i_mutex); 691 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
692 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
693 * pages are flushed out.
694 *
695 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
696 * allowing them to be done in parallel with reads and other direct IO writes.
697 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
698 * needs to do sub-block zeroing and that requires serialisation against other
699 * direct IOs to the same block. In this case we need to serialise the
700 * submission of the unaligned IOs so that we don't get racing block zeroing in
701 * the dio layer. To avoid the problem with aio, we also need to wait for
702 * outstanding IOs to complete so that unwritten extent conversion is completed
703 * before we try to map the overlapping block. This is currently implemented by
704 * hitting it with a big hammer (i.e. xfs_ioend_wait()).
705 *
 706 * Returns with the iolock held as indicated by @iolock and errors indicated by
707 * negative return values.
708 */
709STATIC ssize_t
710xfs_file_dio_aio_write(
711 struct kiocb *iocb,
712 const struct iovec *iovp,
713 unsigned long nr_segs,
714 loff_t pos,
715 size_t ocount,
716 int *iolock)
717{
718 struct file *file = iocb->ki_filp;
719 struct address_space *mapping = file->f_mapping;
720 struct inode *inode = mapping->host;
721 struct xfs_inode *ip = XFS_I(inode);
722 struct xfs_mount *mp = ip->i_mount;
723 ssize_t ret = 0;
724 size_t count = ocount;
725 int unaligned_io = 0;
726 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
727 mp->m_rtdev_targp : mp->m_ddev_targp;
728
729 *iolock = 0;
730 if ((pos & target->bt_smask) || (count & target->bt_smask))
731 return -XFS_ERROR(EINVAL);
732
733 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
734 unaligned_io = 1;
735
736 if (unaligned_io || mapping->nrpages || pos > ip->i_size)
737 *iolock = XFS_IOLOCK_EXCL;
738 else
739 *iolock = XFS_IOLOCK_SHARED;
740 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
741
742 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
743 if (ret)
744 return ret;
745
746 if (mapping->nrpages) {
747 WARN_ON(*iolock != XFS_IOLOCK_EXCL);
748 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
749 FI_REMAPF_LOCKED);
750 if (ret)
751 return ret;
614 } 752 }
615 753
616 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 754 /*
617 755 * If we are doing unaligned IO, wait for all other IO to drain,
618start: 756 * otherwise demote the lock if we had to flush cached pages
619 error = -generic_write_checks(file, &pos, &count, 757 */
620 S_ISBLK(inode->i_mode)); 758 if (unaligned_io)
621 if (error) { 759 xfs_ioend_wait(ip);
622 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 760 else if (*iolock == XFS_IOLOCK_EXCL) {
623 goto out_unlock_mutex; 761 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
762 *iolock = XFS_IOLOCK_SHARED;
624 } 763 }
625 764
626 if (ioflags & IO_ISDIRECT) { 765 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
627 xfs_buftarg_t *target = 766 ret = generic_file_direct_write(iocb, iovp,
628 XFS_IS_REALTIME_INODE(ip) ? 767 &nr_segs, pos, &iocb->ki_pos, count, ocount);
629 mp->m_rtdev_targp : mp->m_ddev_targp;
630 768
631 if ((pos & target->bt_smask) || (count & target->bt_smask)) { 769 /* No fallback to buffered IO on errors for XFS. */
632 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 770 ASSERT(ret < 0 || ret == count);
633 return XFS_ERROR(-EINVAL); 771 return ret;
634 } 772}
635 773
636 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { 774STATIC ssize_t
637 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 775xfs_file_buffered_aio_write(
638 iolock = XFS_IOLOCK_EXCL; 776 struct kiocb *iocb,
639 need_i_mutex = 1; 777 const struct iovec *iovp,
640 mutex_lock(&inode->i_mutex); 778 unsigned long nr_segs,
641 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 779 loff_t pos,
642 goto start; 780 size_t ocount,
643 } 781 int *iolock)
644 } 782{
783 struct file *file = iocb->ki_filp;
784 struct address_space *mapping = file->f_mapping;
785 struct inode *inode = mapping->host;
786 struct xfs_inode *ip = XFS_I(inode);
787 ssize_t ret;
788 int enospc = 0;
789 size_t count = ocount;
645 790
646 new_size = pos + count; 791 *iolock = XFS_IOLOCK_EXCL;
647 if (new_size > ip->i_size) 792 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
648 ip->i_new_size = new_size;
649 793
650 if (likely(!(ioflags & IO_INVIS))) 794 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
651 file_update_time(file); 795 if (ret)
796 return ret;
652 797
798 /* We can write back this queue in page reclaim */
799 current->backing_dev_info = mapping->backing_dev_info;
800
801write_retry:
802 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
803 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
804 pos, &iocb->ki_pos, count, ret);
653 /* 805 /*
 654 * If the offset is beyond the size of the file, we have a couple 806 * if we just got an ENOSPC, flush the inode now that we aren't holding any
655 * of things to do. First, if there is already space allocated 807 * page locks and retry *once*
656 * we need to either create holes or zero the disk or ...
657 *
658 * If there is a page where the previous size lands, we need
659 * to zero it out up to the new size.
660 */ 808 */
661 809 if (ret == -ENOSPC && !enospc) {
662 if (pos > ip->i_size) { 810 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
663 error = xfs_zero_eof(ip, pos, ip->i_size); 811 if (ret)
664 if (error) { 812 return ret;
665 xfs_iunlock(ip, XFS_ILOCK_EXCL); 813 enospc = 1;
666 goto out_unlock_internal; 814 goto write_retry;
667 }
668 } 815 }
669 xfs_iunlock(ip, XFS_ILOCK_EXCL); 816 current->backing_dev_info = NULL;
817 return ret;
818}
670 819
671 /* 820STATIC ssize_t
672 * If we're writing the file then make sure to clear the 821xfs_file_aio_write(
673 * setuid and setgid bits if the process is not being run 822 struct kiocb *iocb,
674 * by root. This keeps people from modifying setuid and 823 const struct iovec *iovp,
675 * setgid binaries. 824 unsigned long nr_segs,
676 */ 825 loff_t pos)
677 error = -file_remove_suid(file); 826{
678 if (unlikely(error)) 827 struct file *file = iocb->ki_filp;
679 goto out_unlock_internal; 828 struct address_space *mapping = file->f_mapping;
829 struct inode *inode = mapping->host;
830 struct xfs_inode *ip = XFS_I(inode);
831 ssize_t ret;
832 int iolock;
833 size_t ocount = 0;
680 834
681 /* We can write back this queue in page reclaim */ 835 XFS_STATS_INC(xs_write_calls);
682 current->backing_dev_info = mapping->backing_dev_info;
683 836
684 if ((ioflags & IO_ISDIRECT)) { 837 BUG_ON(iocb->ki_pos != pos);
685 if (mapping->nrpages) {
686 WARN_ON(need_i_mutex == 0);
687 error = xfs_flushinval_pages(ip,
688 (pos & PAGE_CACHE_MASK),
689 -1, FI_REMAPF_LOCKED);
690 if (error)
691 goto out_unlock_internal;
692 }
693 838
694 if (need_i_mutex) { 839 ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
695 /* demote the lock now the cached pages are gone */ 840 if (ret)
696 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 841 return ret;
697 mutex_unlock(&inode->i_mutex);
698 842
699 iolock = XFS_IOLOCK_SHARED; 843 if (ocount == 0)
700 need_i_mutex = 0; 844 return 0;
701 }
702 845
703 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); 846 xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
704 ret = generic_file_direct_write(iocb, iovp,
705 &nr_segs, pos, &iocb->ki_pos, count, ocount);
706 847
707 /* 848 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
708 * direct-io write to a hole: fall through to buffered I/O 849 return -EIO;
709 * for completing the rest of the request.
710 */
711 if (ret >= 0 && ret != count) {
712 XFS_STATS_ADD(xs_write_bytes, ret);
713 850
714 pos += ret; 851 if (unlikely(file->f_flags & O_DIRECT))
715 count -= ret; 852 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
853 ocount, &iolock);
854 else
855 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
856 ocount, &iolock);
716 857
717 ioflags &= ~IO_ISDIRECT; 858 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
718 xfs_iunlock(ip, iolock);
719 goto relock;
720 }
721 } else {
722 int enospc = 0;
723 ssize_t ret2 = 0;
724 859
725write_retry: 860 if (ret <= 0)
726 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); 861 goto out_unlock;
727 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
728 pos, &iocb->ki_pos, count, ret);
729 /*
730 * if we just got an ENOSPC, flush the inode now we
731 * aren't holding any page locks and retry *once*
732 */
733 if (ret2 == -ENOSPC && !enospc) {
734 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
735 if (error)
736 goto out_unlock_internal;
737 enospc = 1;
738 goto write_retry;
739 }
740 ret = ret2;
741 }
742 862
743 current->backing_dev_info = NULL; 863 /* Handle various SYNC-type writes */
864 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
865 loff_t end = pos + ret - 1;
866 int error, error2;
744 867
745 isize = i_size_read(inode); 868 xfs_rw_iunlock(ip, iolock);
746 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) 869 error = filemap_write_and_wait_range(mapping, pos, end);
747 iocb->ki_pos = isize; 870 xfs_rw_ilock(ip, iolock);
748 871
749 if (iocb->ki_pos > ip->i_size) { 872 error2 = -xfs_file_fsync(file,
750 xfs_ilock(ip, XFS_ILOCK_EXCL); 873 (file->f_flags & __O_SYNC) ? 0 : 1);
751 if (iocb->ki_pos > ip->i_size) 874 if (error)
752 ip->i_size = iocb->ki_pos; 875 ret = error;
753 xfs_iunlock(ip, XFS_ILOCK_EXCL); 876 else if (error2)
877 ret = error2;
754 } 878 }
755 879
756 error = -ret; 880out_unlock:
757 if (ret <= 0) 881 xfs_aio_write_newsize_update(ip);
758 goto out_unlock_internal; 882 xfs_rw_iunlock(ip, iolock);
883 return ret;
884}
759 885
760 XFS_STATS_ADD(xs_write_bytes, ret); 886STATIC long
887xfs_file_fallocate(
888 struct file *file,
889 int mode,
890 loff_t offset,
891 loff_t len)
892{
893 struct inode *inode = file->f_path.dentry->d_inode;
894 long error;
895 loff_t new_size = 0;
896 xfs_flock64_t bf;
897 xfs_inode_t *ip = XFS_I(inode);
898 int cmd = XFS_IOC_RESVSP;
761 899
762 /* Handle various SYNC-type writes */ 900 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
763 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 901 return -EOPNOTSUPP;
764 loff_t end = pos + ret - 1;
765 int error2;
766 902
767 xfs_iunlock(ip, iolock); 903 bf.l_whence = 0;
768 if (need_i_mutex) 904 bf.l_start = offset;
769 mutex_unlock(&inode->i_mutex); 905 bf.l_len = len;
770 906
771 error2 = filemap_write_and_wait_range(mapping, pos, end); 907 xfs_ilock(ip, XFS_IOLOCK_EXCL);
772 if (!error)
773 error = error2;
774 if (need_i_mutex)
775 mutex_lock(&inode->i_mutex);
776 xfs_ilock(ip, iolock);
777 908
778 error2 = -xfs_file_fsync(file, 909 if (mode & FALLOC_FL_PUNCH_HOLE)
779 (file->f_flags & __O_SYNC) ? 0 : 1); 910 cmd = XFS_IOC_UNRESVSP;
780 if (!error) 911
781 error = error2; 912 /* check the new inode size is valid before allocating */
913 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
914 offset + len > i_size_read(inode)) {
915 new_size = offset + len;
916 error = inode_newsize_ok(inode, new_size);
917 if (error)
918 goto out_unlock;
782 } 919 }
783 920
784 out_unlock_internal: 921 error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
785 if (ip->i_new_size) { 922 if (error)
786 xfs_ilock(ip, XFS_ILOCK_EXCL); 923 goto out_unlock;
787 ip->i_new_size = 0; 924
788 /* 925 /* Change file size if needed */
789 * If this was a direct or synchronous I/O that failed (such 926 if (new_size) {
790 * as ENOSPC) then part of the I/O may have been written to 927 struct iattr iattr;
791 * disk before the error occured. In this case the on-disk 928
792 * file size may have been adjusted beyond the in-memory file 929 iattr.ia_valid = ATTR_SIZE;
793 * size and now needs to be truncated back. 930 iattr.ia_size = new_size;
794 */ 931 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
795 if (ip->i_d.di_size > ip->i_size)
796 ip->i_d.di_size = ip->i_size;
797 xfs_iunlock(ip, XFS_ILOCK_EXCL);
798 } 932 }
799 xfs_iunlock(ip, iolock); 933
800 out_unlock_mutex: 934out_unlock:
801 if (need_i_mutex) 935 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
802 mutex_unlock(&inode->i_mutex); 936 return error;
803 return -error;
804} 937}
805 938
939
806STATIC int 940STATIC int
807xfs_file_open( 941xfs_file_open(
808 struct inode *inode, 942 struct inode *inode,
@@ -921,6 +1055,7 @@ const struct file_operations xfs_file_operations = {
921 .open = xfs_file_open, 1055 .open = xfs_file_open,
922 .release = xfs_file_release, 1056 .release = xfs_file_release,
923 .fsync = xfs_file_fsync, 1057 .fsync = xfs_file_fsync,
1058 .fallocate = xfs_file_fallocate,
924}; 1059};
925 1060
926const struct file_operations xfs_dir_file_operations = { 1061const struct file_operations xfs_dir_file_operations = {
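
The iolock policy described in the xfs_file_dio_aio_write() comment above boils down to a small decision: exclusive when sub-block alignment, cached pages or a file-extending write demand zeroing or flushing, shared otherwise. The following stand-alone C sketch restates that decision outside the kernel; all names here are hypothetical, and only the inputs mirror the real code (m_blockmask, mapping->nrpages, ip->i_size):

#include <stdbool.h>
#include <stddef.h>

enum iolock_mode { IOLOCK_SHARED, IOLOCK_EXCL };

/*
 * Illustrative only: exclusive iolock if the IO start or end is not
 * block aligned, if there are cached pages to flush, or if the write
 * extends the file; shared otherwise so direct IOs can run in parallel.
 */
static enum iolock_mode
dio_iolock_mode(long long pos, size_t count, unsigned long blockmask,
		unsigned long nrpages, long long isize, bool *unaligned)
{
	*unaligned = (pos & blockmask) || ((pos + count) & blockmask);

	if (*unaligned || nrpages > 0 || pos > isize)
		return IOLOCK_EXCL;
	return IOLOCK_SHARED;
}

As in the kernel code above, a caller that took the exclusive lock only to flush cached pages can demote back to shared afterwards, while an unaligned IO must stay serialised until outstanding IO has drained.
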
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index ad442d9e392e..b06ede1d0bed 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
39#include "xfs_dfrag.h" 39#include "xfs_dfrag.h"
40#include "xfs_fsops.h" 40#include "xfs_fsops.h"
41#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
42#include "xfs_discard.h"
42#include "xfs_quota.h" 43#include "xfs_quota.h"
43#include "xfs_inode_item.h" 44#include "xfs_inode_item.h"
44#include "xfs_export.h" 45#include "xfs_export.h"
@@ -1294,6 +1295,8 @@ xfs_file_ioctl(
1294 trace_xfs_file_ioctl(ip); 1295 trace_xfs_file_ioctl(ip);
1295 1296
1296 switch (cmd) { 1297 switch (cmd) {
1298 case FITRIM:
1299 return xfs_ioc_trim(mp, arg);
1297 case XFS_IOC_ALLOCSP: 1300 case XFS_IOC_ALLOCSP:
1298 case XFS_IOC_FREESP: 1301 case XFS_IOC_FREESP:
1299 case XFS_IOC_RESVSP: 1302 case XFS_IOC_RESVSP:
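
With FITRIM dispatched to xfs_ioc_trim() above, userspace can ask XFS to discard unused space on the underlying device. A hedged, minimal caller (the mount point path is an assumption) looks like this:

#include <fcntl.h>
#include <limits.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct fstrim_range range;
	int fd = open("/mnt", O_RDONLY);	/* hypothetical mount point */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&range, 0, sizeof(range));
	range.len = ULLONG_MAX;		/* trim the whole filesystem */
	range.minlen = 0;		/* let the fs choose a floor */

	if (ioctl(fd, FITRIM, &range) < 0)
		perror("FITRIM");
	else
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}

On return the kernel updates range.len to the number of bytes it actually discarded, which is what the printf reports.
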
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 94d5fd6a2973..bd5727852fd6 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -46,7 +46,6 @@
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/posix_acl.h> 47#include <linux/posix_acl.h>
48#include <linux/security.h> 48#include <linux/security.h>
49#include <linux/falloc.h>
50#include <linux/fiemap.h> 49#include <linux/fiemap.h>
51#include <linux/slab.h> 50#include <linux/slab.h>
52 51
@@ -505,58 +504,6 @@ xfs_vn_setattr(
505 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); 504 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
506} 505}
507 506
508STATIC long
509xfs_vn_fallocate(
510 struct inode *inode,
511 int mode,
512 loff_t offset,
513 loff_t len)
514{
515 long error;
516 loff_t new_size = 0;
517 xfs_flock64_t bf;
518 xfs_inode_t *ip = XFS_I(inode);
519
520 /* preallocation on directories not yet supported */
521 error = -ENODEV;
522 if (S_ISDIR(inode->i_mode))
523 goto out_error;
524
525 bf.l_whence = 0;
526 bf.l_start = offset;
527 bf.l_len = len;
528
529 xfs_ilock(ip, XFS_IOLOCK_EXCL);
530
531 /* check the new inode size is valid before allocating */
532 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
533 offset + len > i_size_read(inode)) {
534 new_size = offset + len;
535 error = inode_newsize_ok(inode, new_size);
536 if (error)
537 goto out_unlock;
538 }
539
540 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
541 0, XFS_ATTR_NOLOCK);
542 if (error)
543 goto out_unlock;
544
545 /* Change file size if needed */
546 if (new_size) {
547 struct iattr iattr;
548
549 iattr.ia_valid = ATTR_SIZE;
550 iattr.ia_size = new_size;
551 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
552 }
553
554out_unlock:
555 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
556out_error:
557 return error;
558}
559
560#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 507#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
561 508
562/* 509/*
@@ -650,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = {
650 .getxattr = generic_getxattr, 597 .getxattr = generic_getxattr,
651 .removexattr = generic_removexattr, 598 .removexattr = generic_removexattr,
652 .listxattr = xfs_vn_listxattr, 599 .listxattr = xfs_vn_listxattr,
653 .fallocate = xfs_vn_fallocate,
654 .fiemap = xfs_vn_fiemap, 600 .fiemap = xfs_vn_fiemap,
655}; 601};
656 602
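
The removal of xfs_vn_fallocate() above completes the move of preallocation to the file_operations-based xfs_file_fallocate() added earlier in this diff, which also accepts FALLOC_FL_PUNCH_HOLE. From userspace nothing but the feature set changes; a hedged example (the file name and sizes are assumptions, and hole punching needs a kernel and filesystem that support it):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>	/* FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* reserve 1 MiB of blocks without changing the file size */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
		perror("fallocate");
	/* punch a 64 KiB hole; PUNCH_HOLE must be paired with KEEP_SIZE */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      0, 64 << 10) < 0)
		perror("punch hole");
	close(fd);
	return 0;
}
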
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 214ddd71ff79..096494997747 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,7 +37,6 @@
37 37
38#include <kmem.h> 38#include <kmem.h>
39#include <mrlock.h> 39#include <mrlock.h>
40#include <sv.h>
41#include <time.h> 40#include <time.h>
42 41
43#include <support/debug.h> 42#include <support/debug.h>
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 064f964d4f3c..9731898083ae 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -606,7 +606,8 @@ xfs_blkdev_get(
606{ 606{
607 int error = 0; 607 int error = 0;
608 608
609 *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); 609 *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
610 mp);
610 if (IS_ERR(*bdevp)) { 611 if (IS_ERR(*bdevp)) {
611 error = PTR_ERR(*bdevp); 612 error = PTR_ERR(*bdevp);
612 printk("XFS: Invalid device [%s], error=%d\n", name, error); 613 printk("XFS: Invalid device [%s], error=%d\n", name, error);
@@ -620,7 +621,7 @@ xfs_blkdev_put(
620 struct block_device *bdev) 621 struct block_device *bdev)
621{ 622{
622 if (bdev) 623 if (bdev)
623 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 624 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
624} 625}
625 626
626/* 627/*
@@ -834,8 +835,11 @@ xfsaild_wakeup(
834 struct xfs_ail *ailp, 835 struct xfs_ail *ailp,
835 xfs_lsn_t threshold_lsn) 836 xfs_lsn_t threshold_lsn)
836{ 837{
837 ailp->xa_target = threshold_lsn; 838 /* only ever move the target forwards */
838 wake_up_process(ailp->xa_task); 839 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
840 ailp->xa_target = threshold_lsn;
841 wake_up_process(ailp->xa_task);
842 }
839} 843}
840 844
841STATIC int 845STATIC int
@@ -847,8 +851,17 @@ xfsaild(
847 long tout = 0; /* milliseconds */ 851 long tout = 0; /* milliseconds */
848 852
849 while (!kthread_should_stop()) { 853 while (!kthread_should_stop()) {
850 schedule_timeout_interruptible(tout ? 854 /*
851 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); 855 * for short sleeps indicating congestion, don't allow us to
856 * get woken early. Otherwise all we do is bang on the AIL lock
857 * without making progress.
858 */
859 if (tout && tout <= 20)
860 __set_current_state(TASK_KILLABLE);
861 else
862 __set_current_state(TASK_INTERRUPTIBLE);
863 schedule_timeout(tout ?
864 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
852 865
853 /* swsusp */ 866 /* swsusp */
854 try_to_freeze(); 867 try_to_freeze();
@@ -935,7 +948,7 @@ out_reclaim:
935 * Slab object creation initialisation for the XFS inode. 948 * Slab object creation initialisation for the XFS inode.
936 * This covers only the idempotent fields in the XFS inode; 949 * This covers only the idempotent fields in the XFS inode;
937 * all other fields need to be initialised on allocation 950 * all other fields need to be initialised on allocation
938 * from the slab. This avoids the need to repeatedly intialise 951 * from the slab. This avoids the need to repeatedly initialise
 939 * fields in the xfs inode that are left in the initialised state 952 * fields in the xfs inode that are left in the initialised state
940 * when freeing the inode. 953 * when freeing the inode.
941 */ 954 */
@@ -1118,6 +1131,8 @@ xfs_fs_evict_inode(
1118 */ 1131 */
1119 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 1132 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
1120 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 1133 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
1134 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
1135 &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
1121 1136
1122 xfs_inactive(ip); 1137 xfs_inactive(ip);
1123} 1138}
@@ -1399,7 +1414,7 @@ xfs_fs_freeze(
1399 1414
1400 xfs_save_resvblks(mp); 1415 xfs_save_resvblks(mp);
1401 xfs_quiesce_attr(mp); 1416 xfs_quiesce_attr(mp);
1402 return -xfs_fs_log_dummy(mp, SYNC_WAIT); 1417 return -xfs_fs_log_dummy(mp);
1403} 1418}
1404 1419
1405STATIC int 1420STATIC int
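
The xfs_blkdev_get()/xfs_blkdev_put() hunks above adopt the blkdev_get_by_path() interface, where the old open_bdev_exclusive() semantics are expressed by FMODE_EXCL plus a holder cookie. A hedged sketch of the pairing contract (the function names are hypothetical):

#include <linux/blkdev.h>
#include <linux/err.h>

/* FMODE_EXCL makes the claim exclusive, keyed on the holder pointer. */
static int example_open_data_dev(const char *path, void *holder,
				 struct block_device **bdevp)
{
	*bdevp = blkdev_get_by_path(path,
				    FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				    holder);
	if (IS_ERR(*bdevp))
		return PTR_ERR(*bdevp);
	return 0;
}

/* The put must use the same mode bits that the get was made with. */
static void example_close_data_dev(struct block_device *bdev)
{
	if (bdev)
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
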
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index afb0d7cfad1c..e22f0057d21f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
53{ 53{
54 struct inode *inode = VFS_I(ip); 54 struct inode *inode = VFS_I(ip);
55 55
56 ASSERT(rcu_read_lock_held());
57
58 /*
59 * check for stale RCU freed inode
60 *
61 * If the inode has been reallocated, it doesn't matter if it's not in
62 * the AG we are walking - we are walking for writeback, so if it
63 * passes all the "valid inode" checks and is dirty, then we'll write
64 * it back anyway. If it has been reallocated and still being
65 * initialised, the XFS_INEW check below will catch it.
66 */
67 spin_lock(&ip->i_flags_lock);
68 if (!ip->i_ino)
69 goto out_unlock_noent;
70
71 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
72 if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
73 goto out_unlock_noent;
74 spin_unlock(&ip->i_flags_lock);
75
56 /* nothing to sync during shutdown */ 76 /* nothing to sync during shutdown */
57 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 77 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
58 return EFSCORRUPTED; 78 return EFSCORRUPTED;
59 79
60 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
61 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
62 return ENOENT;
63
 64 /* If we can't grab the inode, it must be on its way to reclaim. */ 80 /* If we can't grab the inode, it must be on its way to reclaim. */
65 if (!igrab(inode)) 81 if (!igrab(inode))
66 return ENOENT; 82 return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
72 88
73 /* inode is valid */ 89 /* inode is valid */
74 return 0; 90 return 0;
91
92out_unlock_noent:
93 spin_unlock(&ip->i_flags_lock);
94 return ENOENT;
75} 95}
76 96
77STATIC int 97STATIC int
@@ -98,12 +118,12 @@ restart:
98 int error = 0; 118 int error = 0;
99 int i; 119 int i;
100 120
101 read_lock(&pag->pag_ici_lock); 121 rcu_read_lock();
102 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 122 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
103 (void **)batch, first_index, 123 (void **)batch, first_index,
104 XFS_LOOKUP_BATCH); 124 XFS_LOOKUP_BATCH);
105 if (!nr_found) { 125 if (!nr_found) {
106 read_unlock(&pag->pag_ici_lock); 126 rcu_read_unlock();
107 break; 127 break;
108 } 128 }
109 129
@@ -118,18 +138,26 @@ restart:
118 batch[i] = NULL; 138 batch[i] = NULL;
119 139
120 /* 140 /*
121 * Update the index for the next lookup. Catch overflows 141 * Update the index for the next lookup. Catch
122 * into the next AG range which can occur if we have inodes 142 * overflows into the next AG range which can occur if
123 * in the last block of the AG and we are currently 143 * we have inodes in the last block of the AG and we
124 * pointing to the last inode. 144 * are currently pointing to the last inode.
145 *
146 * Because we may see inodes that are from the wrong AG
147 * due to RCU freeing and reallocation, only update the
 148 * index if it lies in this AG. It was a race that led
149 * us to see this inode, so another lookup from the
150 * same index will not find it again.
125 */ 151 */
152 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
153 continue;
126 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 154 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
127 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 155 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
128 done = 1; 156 done = 1;
129 } 157 }
130 158
131 /* unlock now we've grabbed the inodes. */ 159 /* unlock now we've grabbed the inodes. */
132 read_unlock(&pag->pag_ici_lock); 160 rcu_read_unlock();
133 161
134 for (i = 0; i < nr_found; i++) { 162 for (i = 0; i < nr_found; i++) {
135 if (!batch[i]) 163 if (!batch[i])
@@ -334,7 +362,7 @@ xfs_quiesce_data(
334 362
335 /* mark the log as covered if needed */ 363 /* mark the log as covered if needed */
336 if (xfs_log_need_covered(mp)) 364 if (xfs_log_need_covered(mp))
337 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); 365 error2 = xfs_fs_log_dummy(mp);
338 366
339 /* flush data-only devices */ 367 /* flush data-only devices */
340 if (mp->m_rtdev_targp) 368 if (mp->m_rtdev_targp)
@@ -475,13 +503,14 @@ xfs_sync_worker(
475 int error; 503 int error;
476 504
477 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 505 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
478 xfs_log_force(mp, 0);
479 xfs_reclaim_inodes(mp, 0);
480 /* dgc: errors ignored here */ 506 /* dgc: errors ignored here */
481 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
482 if (mp->m_super->s_frozen == SB_UNFROZEN && 507 if (mp->m_super->s_frozen == SB_UNFROZEN &&
483 xfs_log_need_covered(mp)) 508 xfs_log_need_covered(mp))
484 error = xfs_fs_log_dummy(mp, 0); 509 error = xfs_fs_log_dummy(mp);
510 else
511 xfs_log_force(mp, 0);
512 xfs_reclaim_inodes(mp, 0);
513 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
485 } 514 }
486 mp->m_sync_seq++; 515 mp->m_sync_seq++;
487 wake_up(&mp->m_wait_single_sync_task); 516 wake_up(&mp->m_wait_single_sync_task);
@@ -592,12 +621,12 @@ xfs_inode_set_reclaim_tag(
592 struct xfs_perag *pag; 621 struct xfs_perag *pag;
593 622
594 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 623 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
595 write_lock(&pag->pag_ici_lock); 624 spin_lock(&pag->pag_ici_lock);
596 spin_lock(&ip->i_flags_lock); 625 spin_lock(&ip->i_flags_lock);
597 __xfs_inode_set_reclaim_tag(pag, ip); 626 __xfs_inode_set_reclaim_tag(pag, ip);
598 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 627 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
599 spin_unlock(&ip->i_flags_lock); 628 spin_unlock(&ip->i_flags_lock);
600 write_unlock(&pag->pag_ici_lock); 629 spin_unlock(&pag->pag_ici_lock);
601 xfs_perag_put(pag); 630 xfs_perag_put(pag);
602} 631}
603 632
@@ -639,9 +668,14 @@ xfs_reclaim_inode_grab(
639 struct xfs_inode *ip, 668 struct xfs_inode *ip,
640 int flags) 669 int flags)
641{ 670{
671 ASSERT(rcu_read_lock_held());
672
673 /* quick check for stale RCU freed inode */
674 if (!ip->i_ino)
675 return 1;
642 676
643 /* 677 /*
644 * do some unlocked checks first to avoid unnecceary lock traffic. 678 * do some unlocked checks first to avoid unnecessary lock traffic.
 645 * The first is a flush lock check, the second is an already-in-reclaim 679 * The first is a flush lock check, the second is an already-in-reclaim
646 * check. Only do these checks if we are not going to block on locks. 680 * check. Only do these checks if we are not going to block on locks.
647 */ 681 */
@@ -654,11 +688,16 @@ xfs_reclaim_inode_grab(
654 * The radix tree lock here protects a thread in xfs_iget from racing 688 * The radix tree lock here protects a thread in xfs_iget from racing
655 * with us starting reclaim on the inode. Once we have the 689 * with us starting reclaim on the inode. Once we have the
656 * XFS_IRECLAIM flag set it will not touch us. 690 * XFS_IRECLAIM flag set it will not touch us.
691 *
692 * Due to RCU lookup, we may find inodes that have been freed and only
693 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
 694 * aren't candidates for reclaim at all, so we must check that
695 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
657 */ 696 */
658 spin_lock(&ip->i_flags_lock); 697 spin_lock(&ip->i_flags_lock);
659 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); 698 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
660 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { 699 __xfs_iflags_test(ip, XFS_IRECLAIM)) {
661 /* ignore as it is already under reclaim */ 700 /* not a reclaim candidate. */
662 spin_unlock(&ip->i_flags_lock); 701 spin_unlock(&ip->i_flags_lock);
663 return 1; 702 return 1;
664 } 703 }
@@ -795,12 +834,12 @@ reclaim:
795 * added to the tree assert that it's been there before to catch 834 * added to the tree assert that it's been there before to catch
796 * problems with the inode life time early on. 835 * problems with the inode life time early on.
797 */ 836 */
798 write_lock(&pag->pag_ici_lock); 837 spin_lock(&pag->pag_ici_lock);
799 if (!radix_tree_delete(&pag->pag_ici_root, 838 if (!radix_tree_delete(&pag->pag_ici_root,
800 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 839 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
801 ASSERT(0); 840 ASSERT(0);
802 __xfs_inode_clear_reclaim(pag, ip); 841 __xfs_inode_clear_reclaim(pag, ip);
803 write_unlock(&pag->pag_ici_lock); 842 spin_unlock(&pag->pag_ici_lock);
804 843
805 /* 844 /*
806 * Here we do an (almost) spurious inode lock in order to coordinate 845 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -864,14 +903,14 @@ restart:
864 struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 903 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
865 int i; 904 int i;
866 905
867 write_lock(&pag->pag_ici_lock); 906 rcu_read_lock();
868 nr_found = radix_tree_gang_lookup_tag( 907 nr_found = radix_tree_gang_lookup_tag(
869 &pag->pag_ici_root, 908 &pag->pag_ici_root,
870 (void **)batch, first_index, 909 (void **)batch, first_index,
871 XFS_LOOKUP_BATCH, 910 XFS_LOOKUP_BATCH,
872 XFS_ICI_RECLAIM_TAG); 911 XFS_ICI_RECLAIM_TAG);
873 if (!nr_found) { 912 if (!nr_found) {
874 write_unlock(&pag->pag_ici_lock); 913 rcu_read_unlock();
875 break; 914 break;
876 } 915 }
877 916
@@ -891,14 +930,24 @@ restart:
891 * occur if we have inodes in the last block of 930 * occur if we have inodes in the last block of
892 * the AG and we are currently pointing to the 931 * the AG and we are currently pointing to the
893 * last inode. 932 * last inode.
933 *
934 * Because we may see inodes that are from the
935 * wrong AG due to RCU freeing and
936 * reallocation, only update the index if it
 937 * lies in this AG. It was a race that led us
938 * to see this inode, so another lookup from
939 * the same index will not find it again.
894 */ 940 */
941 if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
942 pag->pag_agno)
943 continue;
895 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 944 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
896 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 945 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
897 done = 1; 946 done = 1;
898 } 947 }
899 948
900 /* unlock now we've grabbed the inodes. */ 949 /* unlock now we've grabbed the inodes. */
901 write_unlock(&pag->pag_ici_lock); 950 rcu_read_unlock();
902 951
903 for (i = 0; i < nr_found; i++) { 952 for (i = 0; i < nr_found; i++) {
904 if (!batch[i]) 953 if (!batch[i])
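
Both walkers in this file now follow the same RCU lookup discipline spelled out in the comments above: take rcu_read_lock() around the gang lookup, revalidate each object under its own lock before using it, advance the cursor from data read inside the RCU section, and re-check that the object still belongs to the range being walked. A generic, hedged sketch of that shape (all names are hypothetical, not XFS symbols):

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>

#define LOOKUP_BATCH	32

struct obj {
	unsigned long	index;	/* 0 means freed/stale, as with ip->i_ino */
	spinlock_t	lock;
};

/* Revalidate under the object lock; RCU only guarantees the memory. */
static bool obj_grab(struct obj *obj)
{
	bool ok;

	spin_lock(&obj->lock);
	ok = obj->index != 0;	/* a real walker also takes a reference */
	spin_unlock(&obj->lock);
	return ok;
}

static void walk_all(struct radix_tree_root *root)
{
	unsigned long first_index = 0;
	struct obj *batch[LOOKUP_BATCH];
	int nr_found, i;

	do {
		rcu_read_lock();
		nr_found = radix_tree_gang_lookup(root, (void **)batch,
						  first_index, LOOKUP_BATCH);
		if (!nr_found) {
			rcu_read_unlock();
			break;
		}
		for (i = 0; i < nr_found; i++) {
			struct obj *obj = batch[i];

			/*
			 * Advance the cursor while still inside the RCU
			 * section, and only ever forwards: a freed and
			 * reallocated object may carry an index from
			 * outside the range being walked, which is what
			 * the AG checks above guard against.
			 */
			if (obj->index >= first_index)
				first_index = obj->index + 1;
			if (!obj_grab(obj))
				batch[i] = NULL;
		}
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			/* ... process the grabbed object, then release it ... */
		}
	} while (1);
}
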
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092d6ae4..ee3cee097e7e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include "xfs_error.h"
21 22
22static struct ctl_table_header *xfs_table_header; 23static struct ctl_table_header *xfs_table_header;
23 24
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
51 52
52 return ret; 53 return ret;
53} 54}
55
56STATIC int
57xfs_panic_mask_proc_handler(
58 ctl_table *ctl,
59 int write,
60 void __user *buffer,
61 size_t *lenp,
62 loff_t *ppos)
63{
64 int ret, *valp = ctl->data;
65
66 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
67 if (!ret && write) {
68 xfs_panic_mask = *valp;
69#ifdef DEBUG
70 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
71#endif
72 }
73 return ret;
74}
54#endif /* CONFIG_PROC_FS */ 75#endif /* CONFIG_PROC_FS */
55 76
56static ctl_table xfs_table[] = { 77static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
77 .data = &xfs_params.panic_mask.val, 98 .data = &xfs_params.panic_mask.val,
78 .maxlen = sizeof(int), 99 .maxlen = sizeof(int),
79 .mode = 0644, 100 .mode = 0644,
80 .proc_handler = proc_dointvec_minmax, 101 .proc_handler = xfs_panic_mask_proc_handler,
81 .extra1 = &xfs_params.panic_mask.min, 102 .extra1 = &xfs_params.panic_mask.min,
82 .extra2 = &xfs_params.panic_mask.max 103 .extra2 = &xfs_params.panic_mask.max
83 }, 104 },
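
The handler change above means a write to the panic_mask sysctl now lands in xfs_panic_mask_proc_handler(), which copies the value into the global xfs_panic_mask (and, on DEBUG kernels, forces the shutdown-corrupt and log-reservation tags on). The knob sits in procfs under the standard XFS sysctl directory; a hedged userspace example:

#include <stdio.h>

int main(void)
{
	/* path follows the fs/xfs sysctl table registration above */
	FILE *f = fopen("/proc/sys/fs/xfs/panic_mask", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "%d\n", 0);	/* 0 clears all XFS_PTAG_* panic bits */
	fclose(f);
	return 0;
}
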
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index acef2e98c594..2d0bcb479075 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
766 __field(int, curr_res) 766 __field(int, curr_res)
767 __field(int, unit_res) 767 __field(int, unit_res)
768 __field(unsigned int, flags) 768 __field(unsigned int, flags)
769 __field(void *, reserve_headq) 769 __field(int, reserveq)
770 __field(void *, write_headq) 770 __field(int, writeq)
771 __field(int, grant_reserve_cycle) 771 __field(int, grant_reserve_cycle)
772 __field(int, grant_reserve_bytes) 772 __field(int, grant_reserve_bytes)
773 __field(int, grant_write_cycle) 773 __field(int, grant_write_cycle)
@@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
784 __entry->curr_res = tic->t_curr_res; 784 __entry->curr_res = tic->t_curr_res;
785 __entry->unit_res = tic->t_unit_res; 785 __entry->unit_res = tic->t_unit_res;
786 __entry->flags = tic->t_flags; 786 __entry->flags = tic->t_flags;
787 __entry->reserve_headq = log->l_reserve_headq; 787 __entry->reserveq = list_empty(&log->l_reserveq);
788 __entry->write_headq = log->l_write_headq; 788 __entry->writeq = list_empty(&log->l_writeq);
789 __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; 789 xlog_crack_grant_head(&log->l_grant_reserve_head,
790 __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; 790 &__entry->grant_reserve_cycle,
791 __entry->grant_write_cycle = log->l_grant_write_cycle; 791 &__entry->grant_reserve_bytes);
792 __entry->grant_write_bytes = log->l_grant_write_bytes; 792 xlog_crack_grant_head(&log->l_grant_write_head,
793 &__entry->grant_write_cycle,
794 &__entry->grant_write_bytes);
793 __entry->curr_cycle = log->l_curr_cycle; 795 __entry->curr_cycle = log->l_curr_cycle;
794 __entry->curr_block = log->l_curr_block; 796 __entry->curr_block = log->l_curr_block;
795 __entry->tail_lsn = log->l_tail_lsn; 797 __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
796 ), 798 ),
797 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " 799 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
798 "t_unit_res %u t_flags %s reserve_headq 0x%p " 800 "t_unit_res %u t_flags %s reserveq %s "
799 "write_headq 0x%p grant_reserve_cycle %d " 801 "writeq %s grant_reserve_cycle %d "
800 "grant_reserve_bytes %d grant_write_cycle %d " 802 "grant_reserve_bytes %d grant_write_cycle %d "
801 "grant_write_bytes %d curr_cycle %d curr_block %d " 803 "grant_write_bytes %d curr_cycle %d curr_block %d "
802 "tail_cycle %d tail_block %d", 804 "tail_cycle %d tail_block %d",
@@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
807 __entry->curr_res, 809 __entry->curr_res,
808 __entry->unit_res, 810 __entry->unit_res,
809 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), 811 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
810 __entry->reserve_headq, 812 __entry->reserveq ? "empty" : "active",
811 __entry->write_headq, 813 __entry->writeq ? "empty" : "active",
812 __entry->grant_reserve_cycle, 814 __entry->grant_reserve_cycle,
813 __entry->grant_reserve_bytes, 815 __entry->grant_reserve_bytes,
814 __entry->grant_write_cycle, 816 __entry->grant_write_cycle,
@@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
835DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); 837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
836DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); 838DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); 839DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
840DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
838DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); 841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
839DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); 842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
840DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); 843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); 845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); 846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
844DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); 847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); 849DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); 850DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); 851DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
935DEFINE_PAGE_EVENT(xfs_releasepage); 939DEFINE_PAGE_EVENT(xfs_releasepage);
936DEFINE_PAGE_EVENT(xfs_invalidatepage); 940DEFINE_PAGE_EVENT(xfs_invalidatepage);
937 941
938DECLARE_EVENT_CLASS(xfs_iomap_class, 942DECLARE_EVENT_CLASS(xfs_imap_class,
939 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 943 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
940 int flags, struct xfs_bmbt_irec *irec), 944 int type, struct xfs_bmbt_irec *irec),
941 TP_ARGS(ip, offset, count, flags, irec), 945 TP_ARGS(ip, offset, count, type, irec),
942 TP_STRUCT__entry( 946 TP_STRUCT__entry(
943 __field(dev_t, dev) 947 __field(dev_t, dev)
944 __field(xfs_ino_t, ino) 948 __field(xfs_ino_t, ino)
@@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
946 __field(loff_t, new_size) 950 __field(loff_t, new_size)
947 __field(loff_t, offset) 951 __field(loff_t, offset)
948 __field(size_t, count) 952 __field(size_t, count)
949 __field(int, flags) 953 __field(int, type)
950 __field(xfs_fileoff_t, startoff) 954 __field(xfs_fileoff_t, startoff)
951 __field(xfs_fsblock_t, startblock) 955 __field(xfs_fsblock_t, startblock)
952 __field(xfs_filblks_t, blockcount) 956 __field(xfs_filblks_t, blockcount)
@@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
958 __entry->new_size = ip->i_new_size; 962 __entry->new_size = ip->i_new_size;
959 __entry->offset = offset; 963 __entry->offset = offset;
960 __entry->count = count; 964 __entry->count = count;
961 __entry->flags = flags; 965 __entry->type = type;
962 __entry->startoff = irec ? irec->br_startoff : 0; 966 __entry->startoff = irec ? irec->br_startoff : 0;
963 __entry->startblock = irec ? irec->br_startblock : 0; 967 __entry->startblock = irec ? irec->br_startblock : 0;
964 __entry->blockcount = irec ? irec->br_blockcount : 0; 968 __entry->blockcount = irec ? irec->br_blockcount : 0;
965 ), 969 ),
966 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 970 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
967 "offset 0x%llx count %zd flags %s " 971 "offset 0x%llx count %zd type %s "
968 "startoff 0x%llx startblock %lld blockcount 0x%llx", 972 "startoff 0x%llx startblock %lld blockcount 0x%llx",
969 MAJOR(__entry->dev), MINOR(__entry->dev), 973 MAJOR(__entry->dev), MINOR(__entry->dev),
970 __entry->ino, 974 __entry->ino,
@@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
972 __entry->new_size, 976 __entry->new_size,
973 __entry->offset, 977 __entry->offset,
974 __entry->count, 978 __entry->count,
975 __print_flags(__entry->flags, "|", BMAPI_FLAGS), 979 __print_symbolic(__entry->type, XFS_IO_TYPES),
976 __entry->startoff, 980 __entry->startoff,
977 (__int64_t)__entry->startblock, 981 (__int64_t)__entry->startblock,
978 __entry->blockcount) 982 __entry->blockcount)
979) 983)
980 984
981#define DEFINE_IOMAP_EVENT(name) \ 985#define DEFINE_IOMAP_EVENT(name) \
982DEFINE_EVENT(xfs_iomap_class, name, \ 986DEFINE_EVENT(xfs_imap_class, name, \
983 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 987 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
984 int flags, struct xfs_bmbt_irec *irec), \ 988 int type, struct xfs_bmbt_irec *irec), \
985 TP_ARGS(ip, offset, count, flags, irec)) 989 TP_ARGS(ip, offset, count, type, irec))
986DEFINE_IOMAP_EVENT(xfs_iomap_enter); 990DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
987DEFINE_IOMAP_EVENT(xfs_iomap_found); 991DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
988DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 992DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
993DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
989 994
990DECLARE_EVENT_CLASS(xfs_simple_io_class, 995DECLARE_EVENT_CLASS(xfs_simple_io_class,
991 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 996 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \
1022 TP_ARGS(ip, offset, count)) 1027 TP_ARGS(ip, offset, count))
1023DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); 1028DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
1024DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); 1029DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
1030DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
1025 1031
1026 1032
1027TRACE_EVENT(xfs_itruncate_start, 1033TRACE_EVENT(xfs_itruncate_start,
@@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
1420 TP_PROTO(struct xfs_alloc_arg *args), \ 1426 TP_PROTO(struct xfs_alloc_arg *args), \
1421 TP_ARGS(args)) 1427 TP_ARGS(args))
1422DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); 1428DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
1429DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
1423DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); 1430DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
1424DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); 1431DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
1425DEFINE_ALLOC_EVENT(xfs_alloc_near_first); 1432DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
@@ -1752,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1752DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); 1759DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1753DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); 1760DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1754 1761
1762DECLARE_EVENT_CLASS(xfs_discard_class,
1763 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1764 xfs_agblock_t agbno, xfs_extlen_t len),
1765 TP_ARGS(mp, agno, agbno, len),
1766 TP_STRUCT__entry(
1767 __field(dev_t, dev)
1768 __field(xfs_agnumber_t, agno)
1769 __field(xfs_agblock_t, agbno)
1770 __field(xfs_extlen_t, len)
1771 ),
1772 TP_fast_assign(
1773 __entry->dev = mp->m_super->s_dev;
1774 __entry->agno = agno;
1775 __entry->agbno = agbno;
1776 __entry->len = len;
1777 ),
 1778 TP_printk("dev %d:%d agno %u agbno %u len %u",
1779 MAJOR(__entry->dev), MINOR(__entry->dev),
1780 __entry->agno,
1781 __entry->agbno,
1782 __entry->len)
1783)
1784
1785#define DEFINE_DISCARD_EVENT(name) \
1786DEFINE_EVENT(xfs_discard_class, name, \
1787 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1788 xfs_agblock_t agbno, xfs_extlen_t len), \
1789 TP_ARGS(mp, agno, agbno, len))
1790DEFINE_DISCARD_EVENT(xfs_discard_extent);
1791DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
1792DEFINE_DISCARD_EVENT(xfs_discard_exclude);
1793DEFINE_DISCARD_EVENT(xfs_discard_busy);
1794
1755#endif /* _TRACE_XFS_H */ 1795#endif /* _TRACE_XFS_H */
1756 1796
1757#undef TRACE_INCLUDE_PATH 1797#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index faf8e1a83a12..d22aa3103106 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
149 ASSERT(list_empty(&dqp->q_freelist)); 149 ASSERT(list_empty(&dqp->q_freelist));
150 150
151 mutex_destroy(&dqp->q_qlock); 151 mutex_destroy(&dqp->q_qlock);
152 sv_destroy(&dqp->q_pinwait);
153 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); 152 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
154 153
155 atomic_dec(&xfs_Gqm->qm_totaldquots); 154 atomic_dec(&xfs_Gqm->qm_totaldquots);
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 975aa10e1a47..0df88897ef84 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -25,86 +25,78 @@
25#include "xfs_mount.h" 25#include "xfs_mount.h"
26#include "xfs_error.h" 26#include "xfs_error.h"
27 27
28static char message[1024]; /* keep it off the stack */
29static DEFINE_SPINLOCK(xfs_err_lock);
30
31/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
32#define XFS_MAX_ERR_LEVEL 7
33#define XFS_ERR_MASK ((1 << 3) - 1)
34static const char * const err_level[XFS_MAX_ERR_LEVEL+1] =
35 {KERN_EMERG, KERN_ALERT, KERN_CRIT,
36 KERN_ERR, KERN_WARNING, KERN_NOTICE,
37 KERN_INFO, KERN_DEBUG};
38
39void 28void
40cmn_err(register int level, char *fmt, ...) 29cmn_err(
30 const char *lvl,
31 const char *fmt,
32 ...)
41{ 33{
42 char *fp = fmt; 34 struct va_format vaf;
43 int len; 35 va_list args;
44 ulong flags; 36
45 va_list ap; 37 va_start(args, fmt);
46 38 vaf.fmt = fmt;
47 level &= XFS_ERR_MASK; 39 vaf.va = &args;
48 if (level > XFS_MAX_ERR_LEVEL) 40
49 level = XFS_MAX_ERR_LEVEL; 41 printk("%s%pV", lvl, &vaf);
50 spin_lock_irqsave(&xfs_err_lock,flags); 42 va_end(args);
51 va_start(ap, fmt); 43
52 if (*fmt == '!') fp++; 44 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
53 len = vsnprintf(message, sizeof(message), fp, ap);
54 if (len >= sizeof(message))
55 len = sizeof(message) - 1;
56 if (message[len-1] == '\n')
57 message[len-1] = 0;
58 printk("%s%s\n", err_level[level], message);
59 va_end(ap);
60 spin_unlock_irqrestore(&xfs_err_lock,flags);
61 BUG_ON(level == CE_PANIC);
62} 45}
63 46
64void 47void
65xfs_fs_vcmn_err( 48xfs_fs_cmn_err(
66 int level, 49 const char *lvl,
67 struct xfs_mount *mp, 50 struct xfs_mount *mp,
68 char *fmt, 51 const char *fmt,
69 va_list ap) 52 ...)
70{ 53{
71 unsigned long flags; 54 struct va_format vaf;
72 int len = 0; 55 va_list args;
73 56
74 level &= XFS_ERR_MASK; 57 va_start(args, fmt);
75 if (level > XFS_MAX_ERR_LEVEL) 58 vaf.fmt = fmt;
76 level = XFS_MAX_ERR_LEVEL; 59 vaf.va = &args;
77 60
78 spin_lock_irqsave(&xfs_err_lock,flags); 61 printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
62 va_end(args);
79 63
80 if (mp) { 64 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
81 len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname); 65}
66
67/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
68void
69xfs_cmn_err(
70 int panic_tag,
71 const char *lvl,
72 struct xfs_mount *mp,
73 const char *fmt,
74 ...)
75{
76 struct va_format vaf;
77 va_list args;
78 int do_panic = 0;
82 79
83 /* 80 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
84 * Skip the printk if we can't print anything useful 81 printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
85 * due to an over-long device name. 82 do_panic = 1;
86 */
87 if (len >= sizeof(message))
88 goto out;
89 } 83 }
90 84
91 len = vsnprintf(message + len, sizeof(message) - len, fmt, ap); 85 va_start(args, fmt);
92 if (len >= sizeof(message)) 86 vaf.fmt = fmt;
93 len = sizeof(message) - 1; 87 vaf.va = &args;
94 if (message[len-1] == '\n')
95 message[len-1] = 0;
96 88
97 printk("%s%s\n", err_level[level], message); 89 printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
98 out: 90 va_end(args);
99 spin_unlock_irqrestore(&xfs_err_lock,flags);
100 91
101 BUG_ON(level == CE_PANIC); 92 BUG_ON(do_panic);
102} 93}
103 94
104void 95void
105assfail(char *expr, char *file, int line) 96assfail(char *expr, char *file, int line)
106{ 97{
107 printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); 98 printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
99 file, line);
108 BUG(); 100 BUG();
109} 101}
110 102
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index d2d20462fd4f..05699f67d475 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -20,15 +20,22 @@
20 20
21#include <stdarg.h> 21#include <stdarg.h>
22 22
23#define CE_DEBUG 7 /* debug */ 23struct xfs_mount;
24#define CE_CONT 6 /* continuation */ 24
25#define CE_NOTE 5 /* notice */ 25#define CE_DEBUG KERN_DEBUG
26#define CE_WARN 4 /* warning */ 26#define CE_CONT KERN_INFO
27#define CE_ALERT 1 /* alert */ 27#define CE_NOTE KERN_NOTICE
28#define CE_PANIC 0 /* panic */ 28#define CE_WARN KERN_WARNING
29 29#define CE_ALERT KERN_ALERT
30extern void cmn_err(int, char *, ...) 30#define CE_PANIC KERN_EMERG
31 __attribute__ ((format (printf, 2, 3))); 31
32void cmn_err(const char *lvl, const char *fmt, ...)
33 __attribute__ ((format (printf, 2, 3)));
34void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
35 const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
36void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp,
37 const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
38
32extern void assfail(char *expr, char *f, int l); 39extern void assfail(char *expr, char *f, int l);
33 40
34#define ASSERT_ALWAYS(expr) \ 41#define ASSERT_ALWAYS(expr) \
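
The debug.c rewrite above leans on the kernel's %pV extension: a struct va_format couples a format string with its va_list, so a wrapper can forward caller-supplied varargs through a single printk() with a prefix, with no intermediate buffer and no lock. A hedged sketch of the idiom (the wrapper name is hypothetical):

#include <linux/kernel.h>

static void example_warn(const char *prefix, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* printk expands %pV by formatting vaf's fmt/va pair in place */
	printk(KERN_WARNING "%s: %pV", prefix, &vaf);
	va_end(args);
}

The same shape appears three times above (cmn_err, xfs_fs_cmn_err, xfs_cmn_err), differing only in the prefix printed and the BUG_ON policy applied afterwards.
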
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 63c7a1a6c022..58632cc17f2d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,7 @@ typedef struct xfs_perag {
227 227
228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
229 229
230 rwlock_t pag_ici_lock; /* incore inode lock */ 230 spinlock_t pag_ici_lock; /* incore inode cache lock */
231 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 231 struct radix_tree_root pag_ici_root; /* incore inode cache root */
232 int pag_ici_reclaimable; /* reclaimable inodes */ 232 int pag_ici_reclaimable; /* reclaimable inodes */
233 struct mutex pag_ici_reclaim_lock; /* serialisation point */ 233 struct mutex pag_ici_reclaim_lock; /* serialisation point */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 112abc439ca5..f3227984a9bf 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,10 +41,6 @@
41#define XFSA_FIXUP_BNO_OK 1 41#define XFSA_FIXUP_BNO_OK 1
42#define XFSA_FIXUP_CNT_OK 2 42#define XFSA_FIXUP_CNT_OK 2
43 43
44static int
45xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
46 xfs_agblock_t bno, xfs_extlen_t len);
47
48/* 44/*
49 * Prototypes for per-ag allocation routines 45 * Prototypes for per-ag allocation routines
50 */ 46 */
@@ -94,7 +90,7 @@ xfs_alloc_lookup_ge(
94 * Lookup the first record less than or equal to [bno, len] 90 * Lookup the first record less than or equal to [bno, len]
95 * in the btree given by cur. 91 * in the btree given by cur.
96 */ 92 */
97STATIC int /* error */ 93int /* error */
98xfs_alloc_lookup_le( 94xfs_alloc_lookup_le(
99 struct xfs_btree_cur *cur, /* btree cursor */ 95 struct xfs_btree_cur *cur, /* btree cursor */
100 xfs_agblock_t bno, /* starting block of extent */ 96 xfs_agblock_t bno, /* starting block of extent */
@@ -127,7 +123,7 @@ xfs_alloc_update(
127/* 123/*
128 * Get the data from the pointed-to record. 124 * Get the data from the pointed-to record.
129 */ 125 */
130STATIC int /* error */ 126int /* error */
131xfs_alloc_get_rec( 127xfs_alloc_get_rec(
132 struct xfs_btree_cur *cur, /* btree cursor */ 128 struct xfs_btree_cur *cur, /* btree cursor */
133 xfs_agblock_t *bno, /* output: starting block of extent */ 129 xfs_agblock_t *bno, /* output: starting block of extent */
@@ -577,61 +573,58 @@ xfs_alloc_ag_vextent_exact(
577 xfs_extlen_t rlen; /* length of returned extent */ 573 xfs_extlen_t rlen; /* length of returned extent */
578 574
579 ASSERT(args->alignment == 1); 575 ASSERT(args->alignment == 1);
576
580 /* 577 /*
581 * Allocate/initialize a cursor for the by-number freespace btree. 578 * Allocate/initialize a cursor for the by-number freespace btree.
582 */ 579 */
583 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 580 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
584 args->agno, XFS_BTNUM_BNO); 581 args->agno, XFS_BTNUM_BNO);
582
585 /* 583 /*
586 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 584 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
587 * Look for the closest free block <= bno, it must contain bno 585 * Look for the closest free block <= bno, it must contain bno
588 * if any free block does. 586 * if any free block does.
589 */ 587 */
590 if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) 588 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
589 if (error)
591 goto error0; 590 goto error0;
592 if (!i) { 591 if (!i)
593 /* 592 goto not_found;
594 * Didn't find it, return null. 593
595 */
596 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
597 args->agbno = NULLAGBLOCK;
598 return 0;
599 }
600 /* 594 /*
601 * Grab the freespace record. 595 * Grab the freespace record.
602 */ 596 */
603 if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) 597 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
598 if (error)
604 goto error0; 599 goto error0;
605 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 600 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
606 ASSERT(fbno <= args->agbno); 601 ASSERT(fbno <= args->agbno);
607 minend = args->agbno + args->minlen; 602 minend = args->agbno + args->minlen;
608 maxend = args->agbno + args->maxlen; 603 maxend = args->agbno + args->maxlen;
609 fend = fbno + flen; 604 fend = fbno + flen;
605
610 /* 606 /*
611 * Give up if the freespace isn't long enough for the minimum request. 607 * Give up if the freespace isn't long enough for the minimum request.
612 */ 608 */
613 if (fend < minend) { 609 if (fend < minend)
614 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 610 goto not_found;
615 args->agbno = NULLAGBLOCK; 611
616 return 0;
617 }
618 /* 612 /*
619 * End of extent will be smaller of the freespace end and the 613 * End of extent will be smaller of the freespace end and the
620 * maximal requested end. 614 * maximal requested end.
621 */ 615 *
622 end = XFS_AGBLOCK_MIN(fend, maxend);
623 /*
624 * Fix the length according to mod and prod if given. 616 * Fix the length according to mod and prod if given.
625 */ 617 */
618 end = XFS_AGBLOCK_MIN(fend, maxend);
626 args->len = end - args->agbno; 619 args->len = end - args->agbno;
627 xfs_alloc_fix_len(args); 620 xfs_alloc_fix_len(args);
628 if (!xfs_alloc_fix_minleft(args)) { 621 if (!xfs_alloc_fix_minleft(args))
629 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 622 goto not_found;
630 return 0; 623
631 }
632 rlen = args->len; 624 rlen = args->len;
633 ASSERT(args->agbno + rlen <= fend); 625 ASSERT(args->agbno + rlen <= fend);
634 end = args->agbno + rlen; 626 end = args->agbno + rlen;
627
635 /* 628 /*
636 * We are allocating agbno for rlen [agbno .. end] 629 * We are allocating agbno for rlen [agbno .. end]
637 * Allocate/initialize a cursor for the by-size btree. 630 * Allocate/initialize a cursor for the by-size btree.
@@ -640,16 +633,25 @@ xfs_alloc_ag_vextent_exact(
640 args->agno, XFS_BTNUM_CNT); 633 args->agno, XFS_BTNUM_CNT);
641 ASSERT(args->agbno + args->len <= 634 ASSERT(args->agbno + args->len <=
642 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 635 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
643 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 636 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
644 args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { 637 args->len, XFSA_FIXUP_BNO_OK);
638 if (error) {
645 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 639 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
646 goto error0; 640 goto error0;
647 } 641 }
642
648 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 643 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
649 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 644 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
650 645
651 trace_xfs_alloc_exact_done(args);
652 args->wasfromfl = 0; 646 args->wasfromfl = 0;
647 trace_xfs_alloc_exact_done(args);
648 return 0;
649
650not_found:
651 /* Didn't find it, return null. */
652 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
653 args->agbno = NULLAGBLOCK;
654 trace_xfs_alloc_exact_notfound(args);
653 return 0; 655 return 0;
654 656
655error0: 657error0:
@@ -659,6 +661,95 @@ error0:
659} 661}
660 662
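The hunk above collapses three copies of the cursor-teardown-and-return sequence into a single not_found label. A minimal userspace sketch of the same idiom, with hypothetical cursor helpers standing in for the btree API:

#include <stdlib.h>

struct cursor { int dummy; };

/* hypothetical stand-ins for the btree cursor API */
static struct cursor *cursor_init(void)
{
	return malloc(sizeof(struct cursor));
}

static void cursor_free(struct cursor *cur)
{
	free(cur);
}

static int cursor_seek(struct cursor *cur, unsigned key, int *found)
{
	(void)cur;
	*found = (key != 0);
	return 0;
}

/*
 * A single not_found label keeps the "release cursor, return null
 * extent" teardown in one place instead of three duplicated copies.
 */
static int lookup_exact(unsigned key, unsigned *out)
{
	struct cursor *cur = cursor_init();
	int found, error;

	error = cursor_seek(cur, key, &found);
	if (error)
		goto out_error;
	if (!found)
		goto not_found;

	*out = key;
	cursor_free(cur);
	return 0;

not_found:
	cursor_free(cur);
	*out = 0;
	return 0;

out_error:
	cursor_free(cur);
	return error;
}

int main(void)
{
	unsigned bno;

	return lookup_exact(42, &bno);
}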
661/* 663/*
664 * Search the btree in a given direction via the search cursor and compare
665 * the records found against the good extent we've already found.
666 */
667STATIC int
668xfs_alloc_find_best_extent(
669 struct xfs_alloc_arg *args, /* allocation argument structure */
670 struct xfs_btree_cur **gcur, /* good cursor */
671 struct xfs_btree_cur **scur, /* searching cursor */
672 xfs_agblock_t gdiff, /* difference for search comparison */
673 xfs_agblock_t *sbno, /* extent found by search */
674 xfs_extlen_t *slen,
675 xfs_extlen_t *slena, /* aligned length */
676 int dir) /* 0 = search right, 1 = search left */
677{
678 xfs_agblock_t bno;
679 xfs_agblock_t new;
680 xfs_agblock_t sdiff;
681 int error;
682 int i;
683
684 /* The good extent is perfect, no need to search. */
685 if (!gdiff)
686 goto out_use_good;
687
688 /*
689 * Look until we find a better one, run out of space, or run off the end.
690 */
691 do {
692 error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
693 if (error)
694 goto error0;
695 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
696 xfs_alloc_compute_aligned(*sbno, *slen, args->alignment,
697 args->minlen, &bno, slena);
698
699 /*
700 * The good extent is closer than this one.
701 */
702 if (!dir) {
703 if (bno >= args->agbno + gdiff)
704 goto out_use_good;
705 } else {
706 if (bno <= args->agbno - gdiff)
707 goto out_use_good;
708 }
709
710 /*
711 * Same distance, compare length and pick the best.
712 */
713 if (*slena >= args->minlen) {
714 args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
715 xfs_alloc_fix_len(args);
716
717 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
718 args->alignment, *sbno,
719 *slen, &new);
720
721 /*
722 * Choose closer size and invalidate other cursor.
723 */
724 if (sdiff < gdiff)
725 goto out_use_search;
726 goto out_use_good;
727 }
728
729 if (!dir)
730 error = xfs_btree_increment(*scur, 0, &i);
731 else
732 error = xfs_btree_decrement(*scur, 0, &i);
733 if (error)
734 goto error0;
735 } while (i);
736
737out_use_good:
738 xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
739 *scur = NULL;
740 return 0;
741
742out_use_search:
743 xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
744 *gcur = NULL;
745 return 0;
746
747error0:
748 /* caller invalidates cursors */
749 return error;
750}
751
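xfs_alloc_find_best_extent() above stops scanning as soon as the next candidate starts farther from the target block than the best difference already in hand. A simplified, compilable sketch of that pruning rule (hypothetical names; alignment and mod/prod fixup are reduced to a plain length check):

#include <stdio.h>

struct extent { unsigned bno; unsigned len; };

static unsigned dist(unsigned a, unsigned b)
{
	return a > b ? a - b : b - a;
}

/*
 * Walk candidates moving away from 'target'; once a candidate starts
 * at least 'gdiff' blocks away, nothing later can beat the good extent.
 */
static const struct extent *
find_best(const struct extent *c, int n, unsigned target,
	  unsigned gdiff, unsigned minlen)
{
	for (int i = 0; i < n; i++) {
		if (dist(c[i].bno, target) >= gdiff)
			break;			/* the good extent wins */
		if (c[i].len >= minlen)
			return &c[i];		/* closer and big enough */
	}
	return NULL;
}

int main(void)
{
	struct extent right[] = { { 103, 2 }, { 110, 8 }, { 140, 16 } };
	const struct extent *best = find_best(right, 3, 100, 25, 4);

	printf("best candidate: %u\n", best ? best->bno : 0);
	return 0;
}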
752/*
662 * Allocate a variable extent near bno in the allocation group agno. 753 * Allocate a variable extent near bno in the allocation group agno.
663 * Extent's length (returned in len) will be between minlen and maxlen, 754 * Extent's length (returned in len) will be between minlen and maxlen,
664 * and of the form k * prod + mod unless there's nothing that large. 755 * and of the form k * prod + mod unless there's nothing that large.
@@ -925,203 +1016,45 @@ xfs_alloc_ag_vextent_near(
925 } 1016 }
926 } 1017 }
927 } while (bno_cur_lt || bno_cur_gt); 1018 } while (bno_cur_lt || bno_cur_gt);
1019
928 /* 1020 /*
929 * Got both cursors still active, need to find better entry. 1021 * Got both cursors still active, need to find better entry.
930 */ 1022 */
931 if (bno_cur_lt && bno_cur_gt) { 1023 if (bno_cur_lt && bno_cur_gt) {
932 /*
933 * Left side is long enough, look for a right side entry.
934 */
935 if (ltlena >= args->minlen) { 1024 if (ltlena >= args->minlen) {
936 /* 1025 /*
937 * Fix up the length. 1026 * Left side is good, look for a right side entry.
938 */ 1027 */
939 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1028 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
940 xfs_alloc_fix_len(args); 1029 xfs_alloc_fix_len(args);
941 rlen = args->len; 1030 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
942 ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
943 args->alignment, ltbno, ltlen, &ltnew); 1031 args->alignment, ltbno, ltlen, &ltnew);
1032
1033 error = xfs_alloc_find_best_extent(args,
1034 &bno_cur_lt, &bno_cur_gt,
1035 ltdiff, &gtbno, &gtlen, &gtlena,
1036 0 /* search right */);
1037 } else {
1038 ASSERT(gtlena >= args->minlen);
1039
944 /* 1040 /*
945 * Not perfect. 1041 * Right side is good, look for a left side entry.
946 */
947 if (ltdiff) {
948 /*
949 * Look until we find a better one, run out of
950 * space, or run off the end.
951 */
952 while (bno_cur_lt && bno_cur_gt) {
953 if ((error = xfs_alloc_get_rec(
954 bno_cur_gt, &gtbno,
955 &gtlen, &i)))
956 goto error0;
957 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
958 xfs_alloc_compute_aligned(gtbno, gtlen,
959 args->alignment, args->minlen,
960 &gtbnoa, &gtlena);
961 /*
962 * The left one is clearly better.
963 */
964 if (gtbnoa >= args->agbno + ltdiff) {
965 xfs_btree_del_cursor(
966 bno_cur_gt,
967 XFS_BTREE_NOERROR);
968 bno_cur_gt = NULL;
969 break;
970 }
971 /*
972 * If we reach a big enough entry,
973 * compare the two and pick the best.
974 */
975 if (gtlena >= args->minlen) {
976 args->len =
977 XFS_EXTLEN_MIN(gtlena,
978 args->maxlen);
979 xfs_alloc_fix_len(args);
980 rlen = args->len;
981 gtdiff = xfs_alloc_compute_diff(
982 args->agbno, rlen,
983 args->alignment,
984 gtbno, gtlen, &gtnew);
985 /*
986 * Right side is better.
987 */
988 if (gtdiff < ltdiff) {
989 xfs_btree_del_cursor(
990 bno_cur_lt,
991 XFS_BTREE_NOERROR);
992 bno_cur_lt = NULL;
993 }
994 /*
995 * Left side is better.
996 */
997 else {
998 xfs_btree_del_cursor(
999 bno_cur_gt,
1000 XFS_BTREE_NOERROR);
1001 bno_cur_gt = NULL;
1002 }
1003 break;
1004 }
1005 /*
1006 * Fell off the right end.
1007 */
1008 if ((error = xfs_btree_increment(
1009 bno_cur_gt, 0, &i)))
1010 goto error0;
1011 if (!i) {
1012 xfs_btree_del_cursor(
1013 bno_cur_gt,
1014 XFS_BTREE_NOERROR);
1015 bno_cur_gt = NULL;
1016 break;
1017 }
1018 }
1019 }
1020 /*
1021 * The left side is perfect, trash the right side.
1022 */
1023 else {
1024 xfs_btree_del_cursor(bno_cur_gt,
1025 XFS_BTREE_NOERROR);
1026 bno_cur_gt = NULL;
1027 }
1028 }
1029 /*
1030 * It's the right side that was found first, look left.
1031 */
1032 else {
1033 /*
1034 * Fix up the length.
1035 */ 1042 */
1036 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1043 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1037 xfs_alloc_fix_len(args); 1044 xfs_alloc_fix_len(args);
1038 rlen = args->len; 1045 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1039 gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
1040 args->alignment, gtbno, gtlen, &gtnew); 1046 args->alignment, gtbno, gtlen, &gtnew);
1041 /* 1047
1042 * Right side entry isn't perfect. 1048 error = xfs_alloc_find_best_extent(args,
1043 */ 1049 &bno_cur_gt, &bno_cur_lt,
1044 if (gtdiff) { 1050 gtdiff, &ltbno, &ltlen, &ltlena,
1045 /* 1051 1 /* search left */);
1046 * Look until we find a better one, run out of
1047 * space, or run off the end.
1048 */
1049 while (bno_cur_lt && bno_cur_gt) {
1050 if ((error = xfs_alloc_get_rec(
1051 bno_cur_lt, &ltbno,
1052 &ltlen, &i)))
1053 goto error0;
1054 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1055 xfs_alloc_compute_aligned(ltbno, ltlen,
1056 args->alignment, args->minlen,
1057 &ltbnoa, &ltlena);
1058 /*
1059 * The right one is clearly better.
1060 */
1061 if (ltbnoa <= args->agbno - gtdiff) {
1062 xfs_btree_del_cursor(
1063 bno_cur_lt,
1064 XFS_BTREE_NOERROR);
1065 bno_cur_lt = NULL;
1066 break;
1067 }
1068 /*
1069 * If we reach a big enough entry,
1070 * compare the two and pick the best.
1071 */
1072 if (ltlena >= args->minlen) {
1073 args->len = XFS_EXTLEN_MIN(
1074 ltlena, args->maxlen);
1075 xfs_alloc_fix_len(args);
1076 rlen = args->len;
1077 ltdiff = xfs_alloc_compute_diff(
1078 args->agbno, rlen,
1079 args->alignment,
1080 ltbno, ltlen, &ltnew);
1081 /*
1082 * Left side is better.
1083 */
1084 if (ltdiff < gtdiff) {
1085 xfs_btree_del_cursor(
1086 bno_cur_gt,
1087 XFS_BTREE_NOERROR);
1088 bno_cur_gt = NULL;
1089 }
1090 /*
1091 * Right side is better.
1092 */
1093 else {
1094 xfs_btree_del_cursor(
1095 bno_cur_lt,
1096 XFS_BTREE_NOERROR);
1097 bno_cur_lt = NULL;
1098 }
1099 break;
1100 }
1101 /*
1102 * Fell off the left end.
1103 */
1104 if ((error = xfs_btree_decrement(
1105 bno_cur_lt, 0, &i)))
1106 goto error0;
1107 if (!i) {
1108 xfs_btree_del_cursor(bno_cur_lt,
1109 XFS_BTREE_NOERROR);
1110 bno_cur_lt = NULL;
1111 break;
1112 }
1113 }
1114 }
1115 /*
1116 * The right side is perfect, trash the left side.
1117 */
1118 else {
1119 xfs_btree_del_cursor(bno_cur_lt,
1120 XFS_BTREE_NOERROR);
1121 bno_cur_lt = NULL;
1122 }
1123 } 1052 }
1053
1054 if (error)
1055 goto error0;
1124 } 1056 }
1057
1125 /* 1058 /*
1126 * If we couldn't get anything, give up. 1059 * If we couldn't get anything, give up.
1127 */ 1060 */
@@ -1130,6 +1063,7 @@ xfs_alloc_ag_vextent_near(
1130 args->agbno = NULLAGBLOCK; 1063 args->agbno = NULLAGBLOCK;
1131 return 0; 1064 return 0;
1132 } 1065 }
1066
1133 /* 1067 /*
1134 * At this point we have selected a freespace entry, either to the 1068 * At this point we have selected a freespace entry, either to the
1135 * left or to the right. If it's on the right, copy all the 1069 * left or to the right. If it's on the right, copy all the
@@ -1146,6 +1080,7 @@ xfs_alloc_ag_vextent_near(
1146 j = 1; 1080 j = 1;
1147 } else 1081 } else
1148 j = 0; 1082 j = 0;
1083
1149 /* 1084 /*
1150 * Fix up the length and compute the useful address. 1085 * Fix up the length and compute the useful address.
1151 */ 1086 */
@@ -2676,7 +2611,7 @@ restart:
2676 * will require a synchronous transaction, but it can still be 2611 * will require a synchronous transaction, but it can still be
2677 * used to distinguish between a partial or exact match. 2612 * used to distinguish between a partial or exact match.
2678 */ 2613 */
2679static int 2614int
2680xfs_alloc_busy_search( 2615xfs_alloc_busy_search(
2681 struct xfs_mount *mp, 2616 struct xfs_mount *mp,
2682 xfs_agnumber_t agno, 2617 xfs_agnumber_t agno,
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 895009a97271..0ab56b32c7eb 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -19,6 +19,7 @@
19#define __XFS_ALLOC_H__ 19#define __XFS_ALLOC_H__
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct xfs_btree_cur;
22struct xfs_mount; 23struct xfs_mount;
23struct xfs_perag; 24struct xfs_perag;
24struct xfs_trans; 25struct xfs_trans;
@@ -118,16 +119,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
118 struct xfs_perag *pag); 119 struct xfs_perag *pag);
119 120
120#ifdef __KERNEL__ 121#ifdef __KERNEL__
121
122void 122void
123xfs_alloc_busy_insert(xfs_trans_t *tp, 123xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
124 xfs_agnumber_t agno, 124 xfs_agblock_t bno, xfs_extlen_t len);
125 xfs_agblock_t bno,
126 xfs_extlen_t len);
127 125
128void 126void
129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); 127xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
130 128
129int
130xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
131 xfs_agblock_t bno, xfs_extlen_t len);
131#endif /* __KERNEL__ */ 132#endif /* __KERNEL__ */
132 133
133/* 134/*
@@ -205,4 +206,18 @@ xfs_free_extent(
205 xfs_fsblock_t bno, /* starting block number of extent */ 206 xfs_fsblock_t bno, /* starting block number of extent */
206 xfs_extlen_t len); /* length of extent */ 207 xfs_extlen_t len); /* length of extent */
207 208
209int /* error */
210xfs_alloc_lookup_le(
211 struct xfs_btree_cur *cur, /* btree cursor */
212 xfs_agblock_t bno, /* starting block of extent */
213 xfs_extlen_t len, /* length of extent */
214 int *stat); /* success/failure */
215
216int /* error */
217xfs_alloc_get_rec(
218 struct xfs_btree_cur *cur, /* btree cursor */
219 xfs_agblock_t *bno, /* output: starting block of extent */
220 xfs_extlen_t *len, /* output: length of extent */
221 int *stat); /* output: success/failure */
222
208#endif /* __XFS_ALLOC_H__ */ 223#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a6cff8edcdb6..71e90dc2aeb1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
637 * It didn't all fit, so we have to sort everything on hashval. 637 * It didn't all fit, so we have to sort everything on hashval.
638 */ 638 */
639 sbsize = sf->hdr.count * sizeof(*sbuf); 639 sbsize = sf->hdr.count * sizeof(*sbuf);
640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); 640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
641 641
642 /* 642 /*
643 * Scan the attribute list for the rest of the entries, storing 643 * Scan the attribute list for the rest of the entries, storing
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2386 args.dp = context->dp; 2386 args.dp = context->dp;
2387 args.whichfork = XFS_ATTR_FORK; 2387 args.whichfork = XFS_ATTR_FORK;
2388 args.valuelen = valuelen; 2388 args.valuelen = valuelen;
2389 args.value = kmem_alloc(valuelen, KM_SLEEP); 2389 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk); 2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); 2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
2392 retval = xfs_attr_rmtval_get(&args); 2392 retval = xfs_attr_rmtval_get(&args);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 04f9cca8da7e..2f9e97c128a0 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
634 return error; 634 return error;
635 } 635 }
636 ASSERT(!bp || !XFS_BUF_GETERROR(bp)); 636 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
637 if (bp != NULL) { 637 if (bp)
638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); 638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
639 }
640 *bpp = bp; 639 *bpp = bp;
641 return 0; 640 return 0;
642} 641}
@@ -944,13 +943,13 @@ xfs_btree_set_refs(
944 switch (cur->bc_btnum) { 943 switch (cur->bc_btnum) {
945 case XFS_BTNUM_BNO: 944 case XFS_BTNUM_BNO:
946 case XFS_BTNUM_CNT: 945 case XFS_BTNUM_CNT:
947 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF); 946 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
948 break; 947 break;
949 case XFS_BTNUM_INO: 948 case XFS_BTNUM_INO:
950 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF); 949 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
951 break; 950 break;
952 case XFS_BTNUM_BMAP: 951 case XFS_BTNUM_BMAP:
953 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF); 952 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
954 break; 953 break;
955 default: 954 default:
956 ASSERT(0); 955 ASSERT(0);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2686d0d54c5b..98c6f73b6752 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -141,8 +141,7 @@ xfs_buf_item_log_check(
141#define xfs_buf_item_log_check(x) 141#define xfs_buf_item_log_check(x)
142#endif 142#endif
143 143
144STATIC void xfs_buf_error_relse(xfs_buf_t *bp); 144STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
145STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
146 145
147/* 146/*
148 * This returns the number of log iovecs needed to log the 147 * This returns the number of log iovecs needed to log the
@@ -450,7 +449,7 @@ xfs_buf_item_unpin(
450 * xfs_trans_ail_delete() drops the AIL lock. 449 * xfs_trans_ail_delete() drops the AIL lock.
451 */ 450 */
452 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 451 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
453 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 452 xfs_buf_do_callbacks(bp);
454 XFS_BUF_SET_FSPRIVATE(bp, NULL); 453 XFS_BUF_SET_FSPRIVATE(bp, NULL);
455 XFS_BUF_CLR_IODONE_FUNC(bp); 454 XFS_BUF_CLR_IODONE_FUNC(bp);
456 } else { 455 } else {
@@ -918,15 +917,26 @@ xfs_buf_attach_iodone(
918 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 917 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
919} 918}
920 919
920/*
921 * We can have many callbacks on a buffer. Running the callbacks individually
922 * can cause a lot of contention on the AIL lock, so we allow for a single
923 * callback to be able to scan the remaining lip->li_bio_list for other items
924 * of the same type and callback to be processed in the first call.
925 *
926 * As a result, the loop walking the callback list below will also modify the
927 * list. It removes the first item from the list and then runs the callback.
928 * The loop then restarts from the new head of the list. This allows the
929 * callback to scan and modify the list attached to the buffer and we don't
930 * have to care about maintaining a next item pointer.
931 */
921STATIC void 932STATIC void
922xfs_buf_do_callbacks( 933xfs_buf_do_callbacks(
923 xfs_buf_t *bp, 934 struct xfs_buf *bp)
924 xfs_log_item_t *lip)
925{ 935{
926 xfs_log_item_t *nlip; 936 struct xfs_log_item *lip;
927 937
928 while (lip != NULL) { 938 while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
929 nlip = lip->li_bio_list; 939 XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
930 ASSERT(lip->li_cb != NULL); 940 ASSERT(lip->li_cb != NULL);
931 /* 941 /*
932 * Clear the next pointer so we don't have any 942 * Clear the next pointer so we don't have any
@@ -936,7 +946,6 @@ xfs_buf_do_callbacks(
936 */ 946 */
937 lip->li_bio_list = NULL; 947 lip->li_bio_list = NULL;
938 lip->li_cb(bp, lip); 948 lip->li_cb(bp, lip);
939 lip = nlip;
940 } 949 }
941} 950}
942 951
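The restart-from-head walk described in the comment above can be sketched in isolation. A minimal userspace analogue (hypothetical item type; the real callback also receives the buffer and may splice further items off the list):

#include <stdio.h>
#include <stddef.h>

struct item {
	struct item *next;
	void (*cb)(struct item *it);
};

static void print_cb(struct item *it)
{
	printf("item %p\n", (void *)it);
}

/*
 * Detach the head, clear its link, run the callback, then restart from
 * whatever the head is now -- no next pointer needs to stay valid while
 * the callback modifies the list.
 */
static void run_callbacks(struct item **head)
{
	struct item *it;

	while ((it = *head) != NULL) {
		*head = it->next;
		it->next = NULL;
		it->cb(it);
	}
}

int main(void)
{
	struct item b = { NULL, print_cb };
	struct item a = { &b, print_cb };
	struct item *head = &a;

	run_callbacks(&head);
	return 0;
}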
@@ -949,128 +958,76 @@ xfs_buf_do_callbacks(
949 */ 958 */
950void 959void
951xfs_buf_iodone_callbacks( 960xfs_buf_iodone_callbacks(
952 xfs_buf_t *bp) 961 struct xfs_buf *bp)
953{ 962{
954 xfs_log_item_t *lip; 963 struct xfs_log_item *lip = bp->b_fspriv;
955 static ulong lasttime; 964 struct xfs_mount *mp = lip->li_mountp;
956 static xfs_buftarg_t *lasttarg; 965 static ulong lasttime;
957 xfs_mount_t *mp; 966 static xfs_buftarg_t *lasttarg;
958 967
959 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 968 if (likely(!XFS_BUF_GETERROR(bp)))
960 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 969 goto do_callbacks;
961 970
962 if (XFS_BUF_GETERROR(bp) != 0) { 971 /*
963 /* 972 * If we've already decided to shutdown the filesystem because of
964 * If we've already decided to shutdown the filesystem 973 * I/O errors, there's no point in giving this a retry.
965 * because of IO errors, there's no point in giving this 974 */
966 * a retry. 975 if (XFS_FORCED_SHUTDOWN(mp)) {
967 */ 976 XFS_BUF_SUPER_STALE(bp);
968 mp = lip->li_mountp; 977 trace_xfs_buf_item_iodone(bp, _RET_IP_);
969 if (XFS_FORCED_SHUTDOWN(mp)) { 978 goto do_callbacks;
970 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 979 }
971 XFS_BUF_SUPER_STALE(bp);
972 trace_xfs_buf_item_iodone(bp, _RET_IP_);
973 xfs_buf_do_callbacks(bp, lip);
974 XFS_BUF_SET_FSPRIVATE(bp, NULL);
975 XFS_BUF_CLR_IODONE_FUNC(bp);
976 xfs_buf_ioend(bp, 0);
977 return;
978 }
979 980
980 if ((XFS_BUF_TARGET(bp) != lasttarg) || 981 if (XFS_BUF_TARGET(bp) != lasttarg ||
981 (time_after(jiffies, (lasttime + 5*HZ)))) { 982 time_after(jiffies, (lasttime + 5*HZ))) {
982 lasttime = jiffies; 983 lasttime = jiffies;
983 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 984 cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
984 " block 0x%llx in %s", 985 " block 0x%llx in %s",
985 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 986 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
986 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 987 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
987 } 988 }
988 lasttarg = XFS_BUF_TARGET(bp); 989 lasttarg = XFS_BUF_TARGET(bp);
989 990
990 if (XFS_BUF_ISASYNC(bp)) { 991 /*
991 /* 992 * If the write was asynchronous then noone will be looking for the
992 * If the write was asynchronous then noone will be 993 * error. Clear the error state and write the buffer out again.
993 * looking for the error. Clear the error state 994 *
994 * and write the buffer out again delayed write. 995 * During sync or umount we'll write all pending buffers again
995 * 996 * synchronous, which will catch these errors if they keep hanging
996 * XXXsup This is OK, so long as we catch these 997 * around.
997 * before we start the umount; we don't want these 998 */
998 * DELWRI metadata bufs to be hanging around. 999 if (XFS_BUF_ISASYNC(bp)) {
999 */ 1000 XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */
1000 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ 1001
1001 1002 if (!XFS_BUF_ISSTALE(bp)) {
1002 if (!(XFS_BUF_ISSTALE(bp))) { 1003 XFS_BUF_DELAYWRITE(bp);
1003 XFS_BUF_DELAYWRITE(bp);
1004 XFS_BUF_DONE(bp);
1005 XFS_BUF_SET_START(bp);
1006 }
1007 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1008 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1009 xfs_buf_relse(bp);
1010 } else {
1011 /*
1012 * If the write of the buffer was not asynchronous,
1013 * then we want to make sure to return the error
1014 * to the caller of bwrite(). Because of this we
1015 * cannot clear the B_ERROR state at this point.
1016 * Instead we install a callback function that
1017 * will be called when the buffer is released, and
1018 * that routine will clear the error state and
1019 * set the buffer to be written out again after
1020 * some delay.
1021 */
1022 /* We actually overwrite the existing b-relse
1023 function at times, but we're gonna be shutting down
1024 anyway. */
1025 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1026 XFS_BUF_DONE(bp); 1004 XFS_BUF_DONE(bp);
1027 XFS_BUF_FINISH_IOWAIT(bp); 1005 XFS_BUF_SET_START(bp);
1028 } 1006 }
1007 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1008 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1009 xfs_buf_relse(bp);
1029 return; 1010 return;
1030 } 1011 }
1031 1012
1032 xfs_buf_do_callbacks(bp, lip); 1013 /*
1033 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1014 * If the write of the buffer was synchronous, we want to make
1034 XFS_BUF_CLR_IODONE_FUNC(bp); 1015 * sure to return the error to the caller of xfs_bwrite().
1035 xfs_buf_ioend(bp, 0); 1016 */
1036}
1037
1038/*
1039 * This is a callback routine attached to a buffer which gets an error
1040 * when being written out synchronously.
1041 */
1042STATIC void
1043xfs_buf_error_relse(
1044 xfs_buf_t *bp)
1045{
1046 xfs_log_item_t *lip;
1047 xfs_mount_t *mp;
1048
1049 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1050 mp = (xfs_mount_t *)lip->li_mountp;
1051 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1052
1053 XFS_BUF_STALE(bp); 1017 XFS_BUF_STALE(bp);
1054 XFS_BUF_DONE(bp); 1018 XFS_BUF_DONE(bp);
1055 XFS_BUF_UNDELAYWRITE(bp); 1019 XFS_BUF_UNDELAYWRITE(bp);
1056 XFS_BUF_ERROR(bp,0);
1057 1020
1058 trace_xfs_buf_error_relse(bp, _RET_IP_); 1021 trace_xfs_buf_error_relse(bp, _RET_IP_);
1022 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1059 1023
1060 if (! XFS_FORCED_SHUTDOWN(mp)) 1024do_callbacks:
1061 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1025 xfs_buf_do_callbacks(bp);
1062 /*
1063 * We have to unpin the pinned buffers so do the
1064 * callbacks.
1065 */
1066 xfs_buf_do_callbacks(bp, lip);
1067 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1026 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1068 XFS_BUF_CLR_IODONE_FUNC(bp); 1027 XFS_BUF_CLR_IODONE_FUNC(bp);
1069 XFS_BUF_SET_BRELSE_FUNC(bp,NULL); 1028 xfs_buf_ioend(bp, 0);
1070 xfs_buf_relse(bp);
1071} 1029}
1072 1030
1073
1074/* 1031/*
1075 * This is the iodone() function for buffers which have been 1032 * This is the iodone() function for buffers which have been
1076 * logged. It is called when they are eventually flushed out. 1033 * logged. It is called when they are eventually flushed out.
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 0e2ed43f16c7..b6ecd2061e7c 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
105 xfs_buf_log_format_t bli_format; /* in-log header */ 105 xfs_buf_log_format_t bli_format; /* in-log header */
106} xfs_buf_log_item_t; 106} xfs_buf_log_item_t;
107 107
108/*
109 * This structure is used during recovery to record the buf log
110 * items which have been canceled and should not be replayed.
111 */
112typedef struct xfs_buf_cancel {
113 xfs_daddr_t bc_blkno;
114 uint bc_len;
115 int bc_refcount;
116 struct xfs_buf_cancel *bc_next;
117} xfs_buf_cancel_t;
118
119void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); 108void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
120void xfs_buf_item_relse(struct xfs_buf *); 109void xfs_buf_item_relse(struct xfs_buf *);
121void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); 110void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index c78cc6a3d87c..4c7db74a05f7 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -152,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
152} 152}
153#endif /* DEBUG */ 153#endif /* DEBUG */
154 154
155
156void
157xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
158{
159 va_list ap;
160
161 va_start(ap, fmt);
162 xfs_fs_vcmn_err(level, mp, fmt, ap);
163 va_end(ap);
164}
165
166void
167xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
168{
169 va_list ap;
170
171#ifdef DEBUG
172 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
173#endif
174
175 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
176 && (level & CE_ALERT)) {
177 level &= ~CE_ALERT;
178 level |= CE_PANIC;
179 cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
180 }
181 va_start(ap, fmt);
182 xfs_fs_vcmn_err(level, mp, fmt, ap);
183 va_end(ap);
184}
185
186void 155void
187xfs_error_report( 156xfs_error_report(
188 const char *tag, 157 const char *tag,
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index f338847f80b8..10dce5475f02 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -136,8 +136,8 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
137 (rf)))) 137 (rf))))
138 138
139extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); 139extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
140extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); 140extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
141#else 141#else
142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) 142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
143#define xfs_errortag_add(tag, mp) (ENOSYS) 143#define xfs_errortag_add(tag, mp) (ENOSYS)
@@ -162,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
162 162
163struct xfs_mount; 163struct xfs_mount;
164 164
165extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
166 char *fmt, va_list ap)
167 __attribute__ ((format (printf, 3, 0)));
168extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
169 char *fmt, ...)
170 __attribute__ ((format (printf, 4, 5)));
171extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
172 __attribute__ ((format (printf, 3, 4)));
173
174extern void xfs_hex_dump(void *p, int length); 165extern void xfs_hex_dump(void *p, int length);
175 166
176#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ 167#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
177 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) 168 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
178 169
179#define xfs_fs_mount_cmn_err(f, fmt, args...) \ 170#define xfs_fs_mount_cmn_err(f, fmt, args...) \
180 ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) 171 do { \
172 if (!(f & XFS_MFSI_QUIET)) \
173 cmn_err(CE_WARN, "XFS: " fmt, ## args); \
174 } while (0)
181 175
182#endif /* __XFS_ERROR_H__ */ 176#endif /* __XFS_ERROR_H__ */
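The new do { } while (0) form of xfs_fs_mount_cmn_err is the standard idiom for making a multi-statement macro behave as a single statement. A small standalone demonstration of why the wrapper matters (hypothetical QUIET flag and messages):

#include <stdio.h>

#define QUIET	0x1

/*
 * Without the do { } while (0) wrapper, the if inside the macro would
 * steal the 'else' of any surrounding if/else statement.
 */
#define mount_warn(f, fmt, args...) \
	do { \
		if (!((f) & QUIET)) \
			fprintf(stderr, "XFS: " fmt "\n", ## args); \
	} while (0)

int main(void)
{
	int flags = 0;

	if (flags & QUIET)
		mount_warn(flags, "never printed");
	else
		mount_warn(flags, "mounting with flags %#x", flags);
	return 0;
}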
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a55e687bf562..75f2ef60e579 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -48,6 +48,28 @@ xfs_efi_item_free(
48} 48}
49 49
50/* 50/*
51 * Freeing the efi requires that we remove it from the AIL if it has already
52 * been placed there. However, the EFI may not yet have been placed in the AIL
53 * when called by xfs_efi_release() from EFD processing due to the ordering of
54 * committed vs unpin operations in bulk insert operations. Hence the
55 * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
56 * the EFI.
57 */
58STATIC void
59__xfs_efi_release(
60 struct xfs_efi_log_item *efip)
61{
62 struct xfs_ail *ailp = efip->efi_item.li_ailp;
63
64 if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
65 spin_lock(&ailp->xa_lock);
66 /* xfs_trans_ail_delete() drops the AIL lock. */
67 xfs_trans_ail_delete(ailp, &efip->efi_item);
68 xfs_efi_item_free(efip);
69 }
70}
71
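__xfs_efi_release() uses test_and_clear_bit() as a two-party handshake: the first caller clears XFS_EFI_COMMITTED, and whichever caller finds the bit already clear knows it is last and may free. A userspace analogue using C11 atomics, with atomic_exchange() standing in for test_and_clear_bit() and the AIL removal elided:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct intent {
	atomic_bool committed;		/* analogue of XFS_EFI_COMMITTED */
};

/*
 * Called once from the commit path and once from the release path, in
 * either order. atomic_exchange() returns the previous value, so
 * exactly one caller observes the flag already clear and frees.
 */
static void intent_release(struct intent *ip)
{
	if (!atomic_exchange(&ip->committed, false)) {
		printf("last caller, freeing\n");
		free(ip);
	}
}

int main(void)
{
	struct intent *ip = malloc(sizeof(*ip));

	atomic_init(&ip->committed, true);
	intent_release(ip);	/* clears the bit, does not free */
	intent_release(ip);	/* finds it clear: frees */
	return 0;
}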
72/*
51 * This returns the number of iovecs needed to log the given efi item. 73 * This returns the number of iovecs needed to log the given efi item.
52 * We only need 1 iovec for an efi item. It just logs the efi_log_format 74 * We only need 1 iovec for an efi item. It just logs the efi_log_format
53 * structure. 75 * structure.
@@ -74,7 +96,8 @@ xfs_efi_item_format(
74 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 96 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
75 uint size; 97 uint size;
76 98
77 ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); 99 ASSERT(atomic_read(&efip->efi_next_extent) ==
100 efip->efi_format.efi_nextents);
78 101
79 efip->efi_format.efi_type = XFS_LI_EFI; 102 efip->efi_format.efi_type = XFS_LI_EFI;
80 103
@@ -99,10 +122,12 @@ xfs_efi_item_pin(
99} 122}
100 123
101/* 124/*
102 * While EFIs cannot really be pinned, the unpin operation is the 125 * While EFIs cannot really be pinned, the unpin operation is the last place at
103 * last place at which the EFI is manipulated during a transaction. 126 * which the EFI is manipulated during a transaction. If we are being asked to
104 * Here we coordinate with xfs_efi_cancel() to determine who gets to 127 * remove the EFI it's because the transaction has been cancelled and by
105 * free the EFI. 128 * definition that means the EFI cannot be in the AIL so remove it from the
129 * transaction and free it. Otherwise coordinate with xfs_efi_release() (via
130 * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
106 */ 131 */
107STATIC void 132STATIC void
108xfs_efi_item_unpin( 133xfs_efi_item_unpin(
@@ -110,20 +135,14 @@ xfs_efi_item_unpin(
110 int remove) 135 int remove)
111{ 136{
112 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 137 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
113 struct xfs_ail *ailp = lip->li_ailp;
114
115 spin_lock(&ailp->xa_lock);
116 if (efip->efi_flags & XFS_EFI_CANCELED) {
117 if (remove)
118 xfs_trans_del_item(lip);
119 138
120 /* xfs_trans_ail_delete() drops the AIL lock. */ 139 if (remove) {
121 xfs_trans_ail_delete(ailp, lip); 140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
141 xfs_trans_del_item(lip);
122 xfs_efi_item_free(efip); 142 xfs_efi_item_free(efip);
123 } else { 143 return;
124 efip->efi_flags |= XFS_EFI_COMMITTED;
125 spin_unlock(&ailp->xa_lock);
126 } 144 }
145 __xfs_efi_release(efip);
127} 146}
128 147
129/* 148/*
@@ -152,16 +171,20 @@ xfs_efi_item_unlock(
152} 171}
153 172
154/* 173/*
155 * The EFI is logged only once and cannot be moved in the log, so 174 * The EFI is logged only once and cannot be moved in the log, so simply return
156 * simply return the lsn at which it's been logged. The canceled 175 * the lsn at which it's been logged. For bulk transaction committed
157 * flag is not paid any attention here. Checking for that is delayed 176 * processing, the EFI may be processed but not yet unpinned prior to the EFD
158 * until the EFI is unpinned. 177 * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
178 * when processing the EFD.
159 */ 179 */
160STATIC xfs_lsn_t 180STATIC xfs_lsn_t
161xfs_efi_item_committed( 181xfs_efi_item_committed(
162 struct xfs_log_item *lip, 182 struct xfs_log_item *lip,
163 xfs_lsn_t lsn) 183 xfs_lsn_t lsn)
164{ 184{
185 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
186
187 set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
165 return lsn; 188 return lsn;
166} 189}
167 190
@@ -230,6 +253,7 @@ xfs_efi_init(
230 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); 253 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
231 efip->efi_format.efi_nextents = nextents; 254 efip->efi_format.efi_nextents = nextents;
232 efip->efi_format.efi_id = (__psint_t)(void*)efip; 255 efip->efi_format.efi_id = (__psint_t)(void*)efip;
256 atomic_set(&efip->efi_next_extent, 0);
233 257
234 return efip; 258 return efip;
235} 259}
@@ -289,37 +313,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
289} 313}
290 314
291/* 315/*
292 * This is called by the efd item code below to release references to 316 * This is called by the efd item code below to release references to the given
293 * the given efi item. Each efd calls this with the number of 317 * efi item. Each efd calls this with the number of extents that it has
294 * extents that it has logged, and when the sum of these reaches 318 * logged, and when the sum of these reaches the total number of extents logged
295 * the total number of extents logged by this efi item we can free 319 * by this efi item we can free the efi item.
296 * the efi item.
297 *
298 * Freeing the efi item requires that we remove it from the AIL.
299 * We'll use the AIL lock to protect our counters as well as
300 * the removal from the AIL.
301 */ 320 */
302void 321void
303xfs_efi_release(xfs_efi_log_item_t *efip, 322xfs_efi_release(xfs_efi_log_item_t *efip,
304 uint nextents) 323 uint nextents)
305{ 324{
306 struct xfs_ail *ailp = efip->efi_item.li_ailp; 325 ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
307 int extents_left; 326 if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
308 327 __xfs_efi_release(efip);
309 ASSERT(efip->efi_next_extent > 0);
310 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
311
312 spin_lock(&ailp->xa_lock);
313 ASSERT(efip->efi_next_extent >= nextents);
314 efip->efi_next_extent -= nextents;
315 extents_left = efip->efi_next_extent;
316 if (extents_left == 0) {
317 /* xfs_trans_ail_delete() drops the AIL lock. */
318 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
319 xfs_efi_item_free(efip);
320 } else {
321 spin_unlock(&ailp->xa_lock);
322 }
323} 328}
324 329
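The reworked xfs_efi_release() replaces the AIL-lock-protected counter with an atomic: atomic_sub_and_test() is true only for the caller that takes efi_next_extent to zero, and only that caller tears the item down. A C11 sketch of the same last-reference rule (hypothetical names):

#include <stdatomic.h>
#include <stdlib.h>

struct efi {
	atomic_uint next_extent;	/* extents not yet accounted for */
};

static void efi_free(struct efi *efip)
{
	free(efip);
}

/*
 * atomic_fetch_sub() returns the value *before* the subtraction, so
 * comparing it with nextents detects the transition to zero -- the
 * same guarantee atomic_sub_and_test() provides in the kernel.
 */
static void efi_release(struct efi *efip, unsigned int nextents)
{
	if (atomic_fetch_sub(&efip->next_extent, nextents) == nextents)
		efi_free(efip);
}

int main(void)
{
	struct efi *efip = malloc(sizeof(*efip));

	atomic_init(&efip->next_extent, 8);
	efi_release(efip, 5);	/* three extents still outstanding */
	efi_release(efip, 3);	/* reaches zero: frees */
	return 0;
}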
325static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) 330static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0d22c56fdf64..375f68e42531 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
111#define XFS_EFI_MAX_FAST_EXTENTS 16 111#define XFS_EFI_MAX_FAST_EXTENTS 16
112 112
113/* 113/*
114 * Define EFI flags. 114 * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
115 */ 115 */
116#define XFS_EFI_RECOVERED 0x1 116#define XFS_EFI_RECOVERED 1
117#define XFS_EFI_COMMITTED 0x2 117#define XFS_EFI_COMMITTED 2
118#define XFS_EFI_CANCELED 0x4
119 118
120/* 119/*
121 * This is the "extent free intention" log item. It is used 120 * This is the "extent free intention" log item. It is used
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
125 */ 124 */
126typedef struct xfs_efi_log_item { 125typedef struct xfs_efi_log_item {
127 xfs_log_item_t efi_item; 126 xfs_log_item_t efi_item;
128 uint efi_flags; /* misc flags */ 127 atomic_t efi_next_extent;
129 uint efi_next_extent; 128 unsigned long efi_flags; /* misc flags */
130 xfs_efi_log_format_t efi_format; 129 xfs_efi_log_format_t efi_format;
131} xfs_efi_log_item_t; 130} xfs_efi_log_item_t;
132 131
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a7c116e814af..cec89dd5d7d2 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -374,6 +374,7 @@ xfs_growfs_data_private(
374 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 374 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
375 } else 375 } else
376 mp->m_maxicount = 0; 376 mp->m_maxicount = 0;
377 xfs_set_low_space_thresholds(mp);
377 378
378 /* update secondary superblocks. */ 379 /* update secondary superblocks. */
379 for (agno = 1; agno < nagcount; agno++) { 380 for (agno = 1; agno < nagcount; agno++) {
@@ -611,12 +612,13 @@ out:
611 * 612 *
612 * We cannot use an inode here for this - that will push dirty state back up 613 * We cannot use an inode here for this - that will push dirty state back up
613 * into the VFS and then periodic inode flushing will prevent log covering from 614 * into the VFS and then periodic inode flushing will prevent log covering from
614 * making progress. Hence we log a field in the superblock instead. 615 * making progress. Hence we log a field in the superblock instead and use a
616 * synchronous transaction to ensure the superblock is immediately unpinned
617 * and can be written back.
615 */ 618 */
616int 619int
617xfs_fs_log_dummy( 620xfs_fs_log_dummy(
618 xfs_mount_t *mp, 621 xfs_mount_t *mp)
619 int flags)
620{ 622{
621 xfs_trans_t *tp; 623 xfs_trans_t *tp;
622 int error; 624 int error;
@@ -631,8 +633,7 @@ xfs_fs_log_dummy(
631 633
632 /* log the UUID because it is an unchanging field */ 634 /* log the UUID because it is an unchanging field */
633 xfs_mod_sb(tp, XFS_SB_UUID); 635 xfs_mod_sb(tp, XFS_SB_UUID);
634 if (flags & SYNC_WAIT) 636 xfs_trans_set_sync(tp);
635 xfs_trans_set_sync(tp);
636 return xfs_trans_commit(tp, 0); 637 return xfs_trans_commit(tp, 0);
637} 638}
638 639
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index a786c5212c1e..1b6a98b66886 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); 28extern int xfs_fs_log_dummy(struct xfs_mount *mp);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index d7de5a3f7867..cb9b6d1469f7 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,6 +43,17 @@
43 43
44 44
45/* 45/*
46 * Define xfs inode iolock lockdep classes. We need to ensure that all active
47 * inodes are considered the same for lockdep purposes, including inodes that
48 * are recycled through the XFS_IRECLAIMABLE state. This is the only way to
49 * guarantee the locks are considered the same when there are multiple lock
50 * initialisation sites. Also, define a reclaimable inode class so it is
51 * obvious in lockdep reports which class the report is against.
52 */
53static struct lock_class_key xfs_iolock_active;
54struct lock_class_key xfs_iolock_reclaimable;
55
56/*
46 * Allocate and initialise an xfs_inode. 57 * Allocate and initialise an xfs_inode.
47 */ 58 */
48STATIC struct xfs_inode * 59STATIC struct xfs_inode *
@@ -69,8 +80,11 @@ xfs_inode_alloc(
69 ASSERT(atomic_read(&ip->i_pincount) == 0); 80 ASSERT(atomic_read(&ip->i_pincount) == 0);
70 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 81 ASSERT(!spin_is_locked(&ip->i_flags_lock));
71 ASSERT(completion_done(&ip->i_flush)); 82 ASSERT(completion_done(&ip->i_flush));
83 ASSERT(ip->i_ino == 0);
72 84
73 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 85 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
86 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
87 &xfs_iolock_active, "xfs_iolock_active");
74 88
75 /* initialise the xfs inode */ 89 /* initialise the xfs inode */
76 ip->i_ino = ino; 90 ip->i_ino = ino;
@@ -85,9 +99,6 @@ xfs_inode_alloc(
85 ip->i_size = 0; 99 ip->i_size = 0;
86 ip->i_new_size = 0; 100 ip->i_new_size = 0;
87 101
88 /* prevent anyone from using this yet */
89 VFS_I(ip)->i_state = I_NEW;
90
91 return ip; 102 return ip;
92} 103}
93 104
@@ -145,7 +156,18 @@ xfs_inode_free(
145 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 156 ASSERT(!spin_is_locked(&ip->i_flags_lock));
146 ASSERT(completion_done(&ip->i_flush)); 157 ASSERT(completion_done(&ip->i_flush));
147 158
148 call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback); 159 /*
160 * Because we use RCU freeing we need to ensure the inode always
161 * appears to be reclaimed with an invalid inode number when in the
162 * free state. The ip->i_flags_lock provides the barrier against lookup
163 * races.
164 */
165 spin_lock(&ip->i_flags_lock);
166 ip->i_flags = XFS_IRECLAIM;
167 ip->i_ino = 0;
168 spin_unlock(&ip->i_flags_lock);
169
170 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
149} 171}
150 172
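With lookups now running under rcu_read_lock(), a radix tree walk can return an inode already queued for freeing, or one that has been recycled. The scheme above therefore zeroes i_ino under i_flags_lock before the deferred free, and lookups recheck the number under the same lock. A userspace analogue (a mutex stands in for the spinlock; the RCU deferral itself is elided):

#include <pthread.h>
#include <stdbool.h>

struct inode {
	pthread_mutex_t	flags_lock;	/* stands in for ip->i_flags_lock */
	unsigned long	ino;		/* 0 means "being freed" */
};

/*
 * Free side: invalidate the identity under the lock *before* handing
 * the object to the deferred free, so concurrent lookups can tell.
 */
static void inode_free(struct inode *ip)
{
	pthread_mutex_lock(&ip->flags_lock);
	ip->ino = 0;
	pthread_mutex_unlock(&ip->flags_lock);
	/* a call_rcu()-style deferred free would happen here */
}

/*
 * Lookup side: the pointer from the index may be stale, so confirm the
 * inode number under the same lock before trusting the object.
 */
static bool inode_valid(struct inode *ip, unsigned long ino)
{
	bool ok;

	pthread_mutex_lock(&ip->flags_lock);
	ok = (ip->ino == ino);
	pthread_mutex_unlock(&ip->flags_lock);
	return ok;
}

int main(void)
{
	struct inode ip = { PTHREAD_MUTEX_INITIALIZER, 42 };

	if (inode_valid(&ip, 42))
		inode_free(&ip);
	return 0;
}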
151/* 173/*
@@ -155,14 +177,29 @@ static int
155xfs_iget_cache_hit( 177xfs_iget_cache_hit(
156 struct xfs_perag *pag, 178 struct xfs_perag *pag,
157 struct xfs_inode *ip, 179 struct xfs_inode *ip,
180 xfs_ino_t ino,
158 int flags, 181 int flags,
159 int lock_flags) __releases(pag->pag_ici_lock) 182 int lock_flags) __releases(RCU)
160{ 183{
161 struct inode *inode = VFS_I(ip); 184 struct inode *inode = VFS_I(ip);
162 struct xfs_mount *mp = ip->i_mount; 185 struct xfs_mount *mp = ip->i_mount;
163 int error; 186 int error;
164 187
188 /*
189 * check for re-use of an inode within an RCU grace period due to the
190 * radix tree nodes not being updated yet. We monitor for this by
191 * setting the inode number to zero before freeing the inode structure.
192 * If the inode has been reallocated and set up, then the inode number
193 * will not match, so check for that, too.
194 */
165 spin_lock(&ip->i_flags_lock); 195 spin_lock(&ip->i_flags_lock);
196 if (ip->i_ino != ino) {
197 trace_xfs_iget_skip(ip);
198 XFS_STATS_INC(xs_ig_frecycle);
199 error = EAGAIN;
200 goto out_error;
201 }
202
166 203
167 /* 204 /*
168 * If we are racing with another cache hit that is currently 205 * If we are racing with another cache hit that is currently
@@ -205,7 +242,7 @@ xfs_iget_cache_hit(
205 ip->i_flags |= XFS_IRECLAIM; 242 ip->i_flags |= XFS_IRECLAIM;
206 243
207 spin_unlock(&ip->i_flags_lock); 244 spin_unlock(&ip->i_flags_lock);
208 read_unlock(&pag->pag_ici_lock); 245 rcu_read_unlock();
209 246
210 error = -inode_init_always(mp->m_super, inode); 247 error = -inode_init_always(mp->m_super, inode);
211 if (error) { 248 if (error) {
@@ -213,7 +250,7 @@ xfs_iget_cache_hit(
213 * Re-initializing the inode failed, and we are in deep 250 * Re-initializing the inode failed, and we are in deep
214 * trouble. Try to re-add it to the reclaim list. 251 * trouble. Try to re-add it to the reclaim list.
215 */ 252 */
216 read_lock(&pag->pag_ici_lock); 253 rcu_read_lock();
217 spin_lock(&ip->i_flags_lock); 254 spin_lock(&ip->i_flags_lock);
218 255
219 ip->i_flags &= ~XFS_INEW; 256 ip->i_flags &= ~XFS_INEW;
@@ -223,14 +260,20 @@ xfs_iget_cache_hit(
223 goto out_error; 260 goto out_error;
224 } 261 }
225 262
226 write_lock(&pag->pag_ici_lock); 263 spin_lock(&pag->pag_ici_lock);
227 spin_lock(&ip->i_flags_lock); 264 spin_lock(&ip->i_flags_lock);
228 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); 265 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
229 ip->i_flags |= XFS_INEW; 266 ip->i_flags |= XFS_INEW;
230 __xfs_inode_clear_reclaim_tag(mp, pag, ip); 267 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
231 inode->i_state = I_NEW; 268 inode->i_state = I_NEW;
269
270 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
271 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
272 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
273 &xfs_iolock_active, "xfs_iolock_active");
274
232 spin_unlock(&ip->i_flags_lock); 275 spin_unlock(&ip->i_flags_lock);
233 write_unlock(&pag->pag_ici_lock); 276 spin_unlock(&pag->pag_ici_lock);
234 } else { 277 } else {
235 /* If the VFS inode is being torn down, pause and try again. */ 278 /* If the VFS inode is being torn down, pause and try again. */
236 if (!igrab(inode)) { 279 if (!igrab(inode)) {
@@ -241,7 +284,7 @@ xfs_iget_cache_hit(
241 284
242 /* We've got a live one. */ 285 /* We've got a live one. */
243 spin_unlock(&ip->i_flags_lock); 286 spin_unlock(&ip->i_flags_lock);
244 read_unlock(&pag->pag_ici_lock); 287 rcu_read_unlock();
245 trace_xfs_iget_hit(ip); 288 trace_xfs_iget_hit(ip);
246 } 289 }
247 290
@@ -255,7 +298,7 @@ xfs_iget_cache_hit(
255 298
256out_error: 299out_error:
257 spin_unlock(&ip->i_flags_lock); 300 spin_unlock(&ip->i_flags_lock);
258 read_unlock(&pag->pag_ici_lock); 301 rcu_read_unlock();
259 return error; 302 return error;
260} 303}
261 304
@@ -308,7 +351,7 @@ xfs_iget_cache_miss(
308 BUG(); 351 BUG();
309 } 352 }
310 353
311 write_lock(&pag->pag_ici_lock); 354 spin_lock(&pag->pag_ici_lock);
312 355
313 /* insert the new inode */ 356 /* insert the new inode */
314 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 357 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
@@ -323,14 +366,14 @@ xfs_iget_cache_miss(
323 ip->i_udquot = ip->i_gdquot = NULL; 366 ip->i_udquot = ip->i_gdquot = NULL;
324 xfs_iflags_set(ip, XFS_INEW); 367 xfs_iflags_set(ip, XFS_INEW);
325 368
326 write_unlock(&pag->pag_ici_lock); 369 spin_unlock(&pag->pag_ici_lock);
327 radix_tree_preload_end(); 370 radix_tree_preload_end();
328 371
329 *ipp = ip; 372 *ipp = ip;
330 return 0; 373 return 0;
331 374
332out_preload_end: 375out_preload_end:
333 write_unlock(&pag->pag_ici_lock); 376 spin_unlock(&pag->pag_ici_lock);
334 radix_tree_preload_end(); 377 radix_tree_preload_end();
335 if (lock_flags) 378 if (lock_flags)
336 xfs_iunlock(ip, lock_flags); 379 xfs_iunlock(ip, lock_flags);
@@ -377,7 +420,7 @@ xfs_iget(
377 xfs_agino_t agino; 420 xfs_agino_t agino;
378 421
379 /* reject inode numbers outside existing AGs */ 422 /* reject inode numbers outside existing AGs */
380 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) 423 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
381 return EINVAL; 424 return EINVAL;
382 425
383 /* get the perag structure and ensure that it's inode capable */ 426 /* get the perag structure and ensure that it's inode capable */
@@ -386,15 +429,15 @@ xfs_iget(
386 429
387again: 430again:
388 error = 0; 431 error = 0;
389 read_lock(&pag->pag_ici_lock); 432 rcu_read_lock();
390 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 433 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
391 434
392 if (ip) { 435 if (ip) {
393 error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); 436 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
394 if (error) 437 if (error)
395 goto out_error_or_again; 438 goto out_error_or_again;
396 } else { 439 } else {
397 read_unlock(&pag->pag_ici_lock); 440 rcu_read_unlock();
398 XFS_STATS_INC(xs_ig_missed); 441 XFS_STATS_INC(xs_ig_missed);
399 442
400 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 443 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 108c7a085f94..be7cf625421f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -887,7 +887,7 @@ xfs_iread(
887 * around for a while. This helps to keep recently accessed 887 * around for a while. This helps to keep recently accessed
888 * meta-data in-core longer. 888 * meta-data in-core longer.
889 */ 889 */
890 XFS_BUF_SET_REF(bp, XFS_INO_REF); 890 xfs_buf_set_ref(bp, XFS_INO_REF);
891 891
892 /* 892 /*
893 * Use xfs_trans_brelse() to release the buffer containing the 893 * Use xfs_trans_brelse() to release the buffer containing the
@@ -2000,17 +2000,33 @@ xfs_ifree_cluster(
2000 */ 2000 */
2001 for (i = 0; i < ninodes; i++) { 2001 for (i = 0; i < ninodes; i++) {
2002retry: 2002retry:
2003 read_lock(&pag->pag_ici_lock); 2003 rcu_read_lock();
2004 ip = radix_tree_lookup(&pag->pag_ici_root, 2004 ip = radix_tree_lookup(&pag->pag_ici_root,
2005 XFS_INO_TO_AGINO(mp, (inum + i))); 2005 XFS_INO_TO_AGINO(mp, (inum + i)));
2006 2006
2007 /* Inode not in memory or stale, nothing to do */ 2007 /* Inode not in memory, nothing to do */
2008 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 2008 if (!ip) {
2009 read_unlock(&pag->pag_ici_lock); 2009 rcu_read_unlock();
2010 continue; 2010 continue;
2011 } 2011 }
2012 2012
2013 /* 2013 /*
2014 * because this is an RCU protected lookup, we could
2015 * find a recently freed or even reallocated inode
2016 * during the lookup. We need to check under the
2017 * i_flags_lock for a valid inode here. Skip it if it
2018 * is not valid, the wrong inode or stale.
2019 */
2020 spin_lock(&ip->i_flags_lock);
2021 if (ip->i_ino != inum + i ||
2022 __xfs_iflags_test(ip, XFS_ISTALE)) {
2023 spin_unlock(&ip->i_flags_lock);
2024 rcu_read_unlock();
2025 continue;
2026 }
2027 spin_unlock(&ip->i_flags_lock);
2028
2029 /*
2014 * Don't try to lock/unlock the current inode, but we 2030 * Don't try to lock/unlock the current inode, but we
2015 * _cannot_ skip the other inodes that we did not find 2031 * _cannot_ skip the other inodes that we did not find
2016 * in the list attached to the buffer and are not 2032 * in the list attached to the buffer and are not
@@ -2019,11 +2035,11 @@ retry:
2019 */ 2035 */
2020 if (ip != free_ip && 2036 if (ip != free_ip &&
2021 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2037 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2022 read_unlock(&pag->pag_ici_lock); 2038 rcu_read_unlock();
2023 delay(1); 2039 delay(1);
2024 goto retry; 2040 goto retry;
2025 } 2041 }
2026 read_unlock(&pag->pag_ici_lock); 2042 rcu_read_unlock();
2027 2043
2028 xfs_iflock(ip); 2044 xfs_iflock(ip);
2029 xfs_iflags_set(ip, XFS_ISTALE); 2045 xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
2629 2645
2630 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2646 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2631 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2647 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2632 read_lock(&pag->pag_ici_lock); 2648 rcu_read_lock();
2633 /* really need a gang lookup range call here */ 2649 /* really need a gang lookup range call here */
2634 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2650 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2635 first_index, inodes_per_cluster); 2651 first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
2640 iq = ilist[i]; 2656 iq = ilist[i];
2641 if (iq == ip) 2657 if (iq == ip)
2642 continue; 2658 continue;
2643 /* if the inode lies outside this cluster, we're done. */ 2659
2644 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) 2660 /*
2645 break; 2661 * because this is an RCU protected lookup, we could find a
2662 * recently freed or even reallocated inode during the lookup.
2663 * We need to check under the i_flags_lock for a valid inode
2664 * here. Skip it if it is not valid or the wrong inode.
2665 */
2666 spin_lock(&iq->i_flags_lock);
2667 if (!iq->i_ino ||
2668 (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
2669 spin_unlock(&iq->i_flags_lock);
2670 continue;
2671 }
2672 spin_unlock(&iq->i_flags_lock);
2673
2646 /* 2674 /*
2647 * Do an un-protected check to see if the inode is dirty and 2675 * Do an un-protected check to see if the inode is dirty and
2648 * is a candidate for flushing. These checks will be repeated 2676 * is a candidate for flushing. These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
2692 } 2720 }
2693 2721
2694out_free: 2722out_free:
2695 read_unlock(&pag->pag_ici_lock); 2723 rcu_read_unlock();
2696 kmem_free(ilist); 2724 kmem_free(ilist);
2697out_put: 2725out_put:
2698 xfs_perag_put(pag); 2726 xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
2704 * Corruption detected in the clustering loop. Invalidate the 2732 * Corruption detected in the clustering loop. Invalidate the
2705 * inode buffer and shut down the filesystem. 2733 * inode buffer and shut down the filesystem.
2706 */ 2734 */
2707 read_unlock(&pag->pag_ici_lock); 2735 rcu_read_unlock();
2708 /* 2736 /*
2709 * Clean up the buffer. If it was B_DELWRI, just release it -- 2737 * Clean up the buffer. If it was B_DELWRI, just release it --
2710 * brelse can handle it with no problems. If not, shut down the 2738 * brelse can handle it with no problems. If not, shut down the
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fb2ca2e4cdc9..5c95fa8ec11d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
376/* 376/*
377 * In-core inode flags. 377 * In-core inode flags.
378 */ 378 */
379#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ 379#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
380#define XFS_ISTALE 0x0002 /* inode has been staled */ 380#define XFS_ISTALE 0x0002 /* inode has been staled */
381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ 381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
382#define XFS_INEW 0x0008 /* inode has just been allocated */ 382#define XFS_INEW 0x0008 /* inode has just been allocated */
383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ 383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ 384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
385#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
385 386
386/* 387/*
387 * Flags for inode locking. 388 * Flags for inode locking.
@@ -438,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
438#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) 439#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
439#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 440#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
440 441
442extern struct lock_class_key xfs_iolock_reclaimable;
443
441/* 444/*
442 * Flags for xfs_itruncate_start(). 445 * Flags for xfs_itruncate_start().
443 */ 446 */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7c8d30c453c3..fd4f398bd6f1 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -842,15 +842,64 @@ xfs_inode_item_destroy(
842 * flushed to disk. It is responsible for removing the inode item 842 * flushed to disk. It is responsible for removing the inode item
843 * from the AIL if it has not been re-logged, and unlocking the inode's 843 * from the AIL if it has not been re-logged, and unlocking the inode's
844 * flush lock. 844 * flush lock.
845 *
846 * To reduce AIL lock traffic as much as possible, we scan the buffer log item
847 * list for other inodes that will run this function. We remove them from the
848 * buffer list so we can process all the inode IO completions in one AIL lock
849 * traversal.
845 */ 850 */
846void 851void
847xfs_iflush_done( 852xfs_iflush_done(
848 struct xfs_buf *bp, 853 struct xfs_buf *bp,
849 struct xfs_log_item *lip) 854 struct xfs_log_item *lip)
850{ 855{
851 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 856 struct xfs_inode_log_item *iip;
852 xfs_inode_t *ip = iip->ili_inode; 857 struct xfs_log_item *blip;
858 struct xfs_log_item *next;
859 struct xfs_log_item *prev;
853 struct xfs_ail *ailp = lip->li_ailp; 860 struct xfs_ail *ailp = lip->li_ailp;
861 int need_ail = 0;
862
863 /*
864 * Scan the buffer IO completions for other inodes being completed and
865 * attach them to the current inode log item.
866 */
867 blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
868 prev = NULL;
869 while (blip != NULL) {
 870 if (blip->li_cb != xfs_iflush_done) { 871 prev = blip;
871 prev = blip;
872 blip = blip->li_bio_list;
873 continue;
874 }
875
876 /* remove from list */
877 next = blip->li_bio_list;
878 if (!prev) {
879 XFS_BUF_SET_FSPRIVATE(bp, next);
880 } else {
881 prev->li_bio_list = next;
882 }
883
884 /* add to current list */
885 blip->li_bio_list = lip->li_bio_list;
886 lip->li_bio_list = blip;
887
888 /*
889 * while we have the item, do the unlocked check for needing
890 * the AIL lock.
891 */
892 iip = INODE_ITEM(blip);
893 if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
894 need_ail++;
895
896 blip = next;
897 }
898
899 /* make sure we capture the state of the initial inode. */
900 iip = INODE_ITEM(lip);
901 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
902 need_ail++;
854 903
855 /* 904 /*
856 * We only want to pull the item from the AIL if it is 905 * We only want to pull the item from the AIL if it is
@@ -861,28 +910,37 @@ xfs_iflush_done(
861 * the lock since it's cheaper, and then we recheck while 910 * the lock since it's cheaper, and then we recheck while
862 * holding the lock before removing the inode from the AIL. 911 * holding the lock before removing the inode from the AIL.
863 */ 912 */
864 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { 913 if (need_ail) {
914 struct xfs_log_item *log_items[need_ail];
915 int i = 0;
865 spin_lock(&ailp->xa_lock); 916 spin_lock(&ailp->xa_lock);
866 if (lip->li_lsn == iip->ili_flush_lsn) { 917 for (blip = lip; blip; blip = blip->li_bio_list) {
867 /* xfs_trans_ail_delete() drops the AIL lock. */ 918 iip = INODE_ITEM(blip);
868 xfs_trans_ail_delete(ailp, lip); 919 if (iip->ili_logged &&
869 } else { 920 blip->li_lsn == iip->ili_flush_lsn) {
870 spin_unlock(&ailp->xa_lock); 921 log_items[i++] = blip;
922 }
923 ASSERT(i <= need_ail);
871 } 924 }
925 /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
926 xfs_trans_ail_delete_bulk(ailp, log_items, i);
872 } 927 }
873 928
874 iip->ili_logged = 0;
875 929
876 /* 930 /*
 877 * Clear the ili_last_fields bits now that we know that the 931 * clean up and unlock the flush lock now that we are done. We can clear the
878 * data corresponding to them is safely on disk. 932 * ili_last_fields bits now that we know that the data corresponding to
933 * them is safely on disk.
879 */ 934 */
880 iip->ili_last_fields = 0; 935 for (blip = lip; blip; blip = next) {
936 next = blip->li_bio_list;
937 blip->li_bio_list = NULL;
881 938
882 /* 939 iip = INODE_ITEM(blip);
883 * Release the inode's flush lock since we're done with it. 940 iip->ili_logged = 0;
884 */ 941 iip->ili_last_fields = 0;
885 xfs_ifunlock(ip); 942 xfs_ifunlock(iip->ili_inode);
943 }
886} 944}
887 945
888/* 946/*
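
The scan the new comment describes can be modelled in userspace: one pass down the buffer's singly linked completion list unlinks every item that belongs to this handler, collects it on a private list, and counts how many will need the AIL lock. All names below are invented for the sketch; only the shape of the remove-and-collect pass comes from the hunk above:

        #include <stdio.h>

        /* invented stand-in for struct xfs_log_item */
        struct item {
                int          mine;        /* models li_cb == xfs_iflush_done */
                int          logged;      /* models ili_logged && lsn match */
                struct item *next;        /* models li_bio_list */
        };

        /*
         * Unlink every item on *headp that is ours, push it onto a private
         * list, and count how many need the (expensive) AIL lock, all in
         * one pass, like the scan at the top of xfs_iflush_done().
         */
        static struct item *collect(struct item **headp, int *need_ail)
        {
                struct item *mine = NULL, *prev = NULL, *blip = *headp, *next;

                while (blip != NULL) {
                        if (!blip->mine) {              /* skip foreign items */
                                prev = blip;
                                blip = blip->next;
                                continue;
                        }
                        next = blip->next;              /* remove from buffer list */
                        if (!prev)
                                *headp = next;
                        else
                                prev->next = next;

                        blip->next = mine;              /* add to our list */
                        mine = blip;
                        if (blip->logged)               /* unlocked need-AIL check */
                                (*need_ail)++;
                        blip = next;
                }
                return mine;
        }

        int main(void)
        {
                struct item c = { 1, 1, NULL }, b = { 0, 0, &c }, a = { 1, 1, &b };
                struct item *head = &a;
                int need_ail = 0;

                collect(&head, &need_ail);
                printf("need_ail = %d, foreign item kept = %d\n",
                       need_ail, head == &b);
                return 0;
        }

With need_ail known, the real code sizes an on-stack array, rechecks each candidate under the AIL lock, and hands the survivors to xfs_trans_ail_delete_bulk() so the lock is taken once instead of once per inode.
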
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 20576146369f..55582bd66659 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,127 +47,8 @@
47 47
48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
49 << mp->m_writeio_log) 49 << mp->m_writeio_log)
50#define XFS_STRAT_WRITE_IMAPS 2
51#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 50#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
52 51
53STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
54 int, struct xfs_bmbt_irec *, int *);
55STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
56 struct xfs_bmbt_irec *, int *);
57STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
58 struct xfs_bmbt_irec *, int *);
59
60int
61xfs_iomap(
62 struct xfs_inode *ip,
63 xfs_off_t offset,
64 ssize_t count,
65 int flags,
66 struct xfs_bmbt_irec *imap,
67 int *nimaps,
68 int *new)
69{
70 struct xfs_mount *mp = ip->i_mount;
71 xfs_fileoff_t offset_fsb, end_fsb;
72 int error = 0;
73 int lockmode = 0;
74 int bmapi_flags = 0;
75
76 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
77
78 *new = 0;
79
80 if (XFS_FORCED_SHUTDOWN(mp))
81 return XFS_ERROR(EIO);
82
83 trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
84
85 switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
86 case BMAPI_READ:
87 lockmode = xfs_ilock_map_shared(ip);
88 bmapi_flags = XFS_BMAPI_ENTIRE;
89 break;
90 case BMAPI_WRITE:
91 lockmode = XFS_ILOCK_EXCL;
92 if (flags & BMAPI_IGNSTATE)
93 bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
94 xfs_ilock(ip, lockmode);
95 break;
96 case BMAPI_ALLOCATE:
97 lockmode = XFS_ILOCK_SHARED;
98 bmapi_flags = XFS_BMAPI_ENTIRE;
99
100 /* Attempt non-blocking lock */
101 if (flags & BMAPI_TRYLOCK) {
102 if (!xfs_ilock_nowait(ip, lockmode))
103 return XFS_ERROR(EAGAIN);
104 } else {
105 xfs_ilock(ip, lockmode);
106 }
107 break;
108 default:
109 BUG();
110 }
111
112 ASSERT(offset <= mp->m_maxioffset);
113 if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
114 count = mp->m_maxioffset - offset;
115 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
116 offset_fsb = XFS_B_TO_FSBT(mp, offset);
117
118 error = xfs_bmapi(NULL, ip, offset_fsb,
119 (xfs_filblks_t)(end_fsb - offset_fsb),
120 bmapi_flags, NULL, 0, imap,
121 nimaps, NULL);
122
123 if (error)
124 goto out;
125
126 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
127 case BMAPI_WRITE:
128 /* If we found an extent, return it */
129 if (*nimaps &&
130 (imap->br_startblock != HOLESTARTBLOCK) &&
131 (imap->br_startblock != DELAYSTARTBLOCK)) {
132 trace_xfs_iomap_found(ip, offset, count, flags, imap);
133 break;
134 }
135
136 if (flags & BMAPI_DIRECT) {
137 error = xfs_iomap_write_direct(ip, offset, count, flags,
138 imap, nimaps);
139 } else {
140 error = xfs_iomap_write_delay(ip, offset, count, flags,
141 imap, nimaps);
142 }
143 if (!error) {
144 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
145 }
146 *new = 1;
147 break;
148 case BMAPI_ALLOCATE:
149 /* If we found an extent, return it */
150 xfs_iunlock(ip, lockmode);
151 lockmode = 0;
152
153 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
154 trace_xfs_iomap_found(ip, offset, count, flags, imap);
155 break;
156 }
157
158 error = xfs_iomap_write_allocate(ip, offset, count,
159 imap, nimaps);
160 break;
161 }
162
163 ASSERT(*nimaps <= 1);
164
165out:
166 if (lockmode)
167 xfs_iunlock(ip, lockmode);
168 return XFS_ERROR(error);
169}
170
171STATIC int 52STATIC int
172xfs_iomap_eof_align_last_fsb( 53xfs_iomap_eof_align_last_fsb(
173 xfs_mount_t *mp, 54 xfs_mount_t *mp,
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
236 return EFSCORRUPTED; 117 return EFSCORRUPTED;
237} 118}
238 119
239STATIC int 120int
240xfs_iomap_write_direct( 121xfs_iomap_write_direct(
241 xfs_inode_t *ip, 122 xfs_inode_t *ip,
242 xfs_off_t offset, 123 xfs_off_t offset,
243 size_t count, 124 size_t count,
244 int flags,
245 xfs_bmbt_irec_t *imap, 125 xfs_bmbt_irec_t *imap,
246 int *nmaps) 126 int nmaps)
247{ 127{
248 xfs_mount_t *mp = ip->i_mount; 128 xfs_mount_t *mp = ip->i_mount;
249 xfs_fileoff_t offset_fsb; 129 xfs_fileoff_t offset_fsb;
@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
279 if (error) 159 if (error)
280 goto error_out; 160 goto error_out;
281 } else { 161 } else {
282 if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) 162 if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
283 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 163 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
284 imap->br_blockcount + 164 imap->br_blockcount +
285 imap->br_startoff); 165 imap->br_startoff);
@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
331 xfs_trans_ijoin(tp, ip); 211 xfs_trans_ijoin(tp, ip);
332 212
333 bmapi_flag = XFS_BMAPI_WRITE; 213 bmapi_flag = XFS_BMAPI_WRITE;
334 if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) 214 if (offset < ip->i_size || extsz)
335 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
336 216
337 /* 217 /*
@@ -370,7 +250,6 @@ xfs_iomap_write_direct(
370 goto error_out; 250 goto error_out;
371 } 251 }
372 252
373 *nmaps = 1;
374 return 0; 253 return 0;
375 254
376error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ 255error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -379,7 +258,6 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
379 258
380error1: /* Just cancel transaction */ 259error1: /* Just cancel transaction */
381 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 260 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
382 *nmaps = 0; /* nothing set-up here */
383 261
384error_out: 262error_out:
385 return XFS_ERROR(error); 263 return XFS_ERROR(error);
@@ -389,6 +267,9 @@ error_out:
389 * If the caller is doing a write at the end of the file, then extend the 267 * If the caller is doing a write at the end of the file, then extend the
390 * allocation out to the file system's write iosize. We clean up any extra 268 * allocation out to the file system's write iosize. We clean up any extra
391 * space left over when the file is closed in xfs_inactive(). 269 * space left over when the file is closed in xfs_inactive().
270 *
271 * If we find we already have delalloc preallocation beyond EOF, don't do more
 272 * preallocation as it is not needed.
392 */ 273 */
393STATIC int 274STATIC int
394xfs_iomap_eof_want_preallocate( 275xfs_iomap_eof_want_preallocate(
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
396 xfs_inode_t *ip, 277 xfs_inode_t *ip,
397 xfs_off_t offset, 278 xfs_off_t offset,
398 size_t count, 279 size_t count,
399 int ioflag,
400 xfs_bmbt_irec_t *imap, 280 xfs_bmbt_irec_t *imap,
401 int nimaps, 281 int nimaps,
402 int *prealloc) 282 int *prealloc)
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
405 xfs_filblks_t count_fsb; 285 xfs_filblks_t count_fsb;
406 xfs_fsblock_t firstblock; 286 xfs_fsblock_t firstblock;
407 int n, error, imaps; 287 int n, error, imaps;
288 int found_delalloc = 0;
408 289
409 *prealloc = 0; 290 *prealloc = 0;
410 if ((offset + count) <= ip->i_size) 291 if ((offset + count) <= ip->i_size)
@@ -429,20 +310,66 @@ xfs_iomap_eof_want_preallocate(
429 return 0; 310 return 0;
430 start_fsb += imap[n].br_blockcount; 311 start_fsb += imap[n].br_blockcount;
431 count_fsb -= imap[n].br_blockcount; 312 count_fsb -= imap[n].br_blockcount;
313
314 if (imap[n].br_startblock == DELAYSTARTBLOCK)
315 found_delalloc = 1;
432 } 316 }
433 } 317 }
434 *prealloc = 1; 318 if (!found_delalloc)
319 *prealloc = 1;
435 return 0; 320 return 0;
436} 321}
437 322
438STATIC int 323/*
324 * If we don't have a user specified preallocation size, dynamically increase
325 * the preallocation size as the size of the file grows. Cap the maximum size
326 * at a single extent or less if the filesystem is near full. The closer the
 327 * filesystem is to full, the smaller the maximum preallocation.
328 */
329STATIC xfs_fsblock_t
330xfs_iomap_prealloc_size(
331 struct xfs_mount *mp,
332 struct xfs_inode *ip)
333{
334 xfs_fsblock_t alloc_blocks = 0;
335
336 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
337 int shift = 0;
338 int64_t freesp;
339
340 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size);
341 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
342 rounddown_pow_of_two(alloc_blocks));
343
344 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
345 freesp = mp->m_sb.sb_fdblocks;
346 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
347 shift = 2;
348 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
349 shift++;
350 if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
351 shift++;
352 if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
353 shift++;
354 if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
355 shift++;
356 }
357 if (shift)
358 alloc_blocks >>= shift;
359 }
360
361 if (alloc_blocks < mp->m_writeio_blocks)
362 alloc_blocks = mp->m_writeio_blocks;
363
364 return alloc_blocks;
365}
366
367int
439xfs_iomap_write_delay( 368xfs_iomap_write_delay(
440 xfs_inode_t *ip, 369 xfs_inode_t *ip,
441 xfs_off_t offset, 370 xfs_off_t offset,
442 size_t count, 371 size_t count,
443 int ioflag, 372 xfs_bmbt_irec_t *ret_imap)
444 xfs_bmbt_irec_t *ret_imap,
445 int *nmaps)
446{ 373{
447 xfs_mount_t *mp = ip->i_mount; 374 xfs_mount_t *mp = ip->i_mount;
448 xfs_fileoff_t offset_fsb; 375 xfs_fileoff_t offset_fsb;
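
A worked model of xfs_iomap_prealloc_size(). The m_low_space[] thresholds are filled in elsewhere in this series (5, 4, 3, 2 and 1 percent of the filesystem, per the xfs_mount changes), so the numbers below are invented for the demonstration:

        #include <stdint.h>
        #include <stdio.h>

        #define MAXEXTLEN ((1 << 21) - 1)       /* max blocks in one extent */

        /* userspace stand-in for the kernel helper of the same name */
        static uint64_t rounddown_pow_of_two(uint64_t n)
        {
                uint64_t p = 1;

                while (p * 2 <= n)
                        p *= 2;
                return p;
        }

        /*
         * Start from the file size in blocks, cap at one extent, then halve
         * once per low-space threshold crossed; low[] holds the 5/4/3/2/1
         * percent marks, largest first.
         */
        static uint64_t prealloc_size(uint64_t isize_blocks, int64_t freesp,
                                      const int64_t low[5], uint64_t writeio_blocks)
        {
                uint64_t alloc_blocks = rounddown_pow_of_two(isize_blocks);
                int shift = 0;

                if (alloc_blocks > MAXEXTLEN)
                        alloc_blocks = MAXEXTLEN;

                if (freesp < low[0]) {          /* below 5 percent free */
                        shift = 2;
                        for (int i = 1; i < 5; i++)
                                if (freesp < low[i])
                                        shift++;
                }
                alloc_blocks >>= shift;

                if (alloc_blocks < writeio_blocks)
                        alloc_blocks = writeio_blocks;
                return alloc_blocks;
        }

        int main(void)
        {
                /* invented: 1M-block fs, 64k-block file, 16-block writeio */
                const int64_t low[5] = { 50000, 40000, 30000, 20000, 10000 };

                printf("plenty free: %llu blocks\n",
                       (unsigned long long)prealloc_size(65536, 600000, low, 16));
                printf("25k free:    %llu blocks\n",
                       (unsigned long long)prealloc_size(65536, 25000, low, 16));
                return 0;
        }

The first call preallocates the full 65536 blocks; the second, with free space below the 3 percent mark, shifts right by four and preallocates 4096.
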
@@ -469,16 +396,19 @@ xfs_iomap_write_delay(
469 extsz = xfs_get_extsz_hint(ip); 396 extsz = xfs_get_extsz_hint(ip);
470 offset_fsb = XFS_B_TO_FSBT(mp, offset); 397 offset_fsb = XFS_B_TO_FSBT(mp, offset);
471 398
399
472 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, 400 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
473 ioflag, imap, XFS_WRITE_IMAPS, &prealloc); 401 imap, XFS_WRITE_IMAPS, &prealloc);
474 if (error) 402 if (error)
475 return error; 403 return error;
476 404
477retry: 405retry:
478 if (prealloc) { 406 if (prealloc) {
407 xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
408
479 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); 409 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
480 ioalign = XFS_B_TO_FSBT(mp, aligned_offset); 410 ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
481 last_fsb = ioalign + mp->m_writeio_blocks; 411 last_fsb = ioalign + alloc_blocks;
482 } else { 412 } else {
483 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 413 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
484 } 414 }
@@ -496,22 +426,31 @@ retry:
496 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | 426 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
497 XFS_BMAPI_ENTIRE, &firstblock, 1, imap, 427 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
498 &nimaps, NULL); 428 &nimaps, NULL);
499 if (error && (error != ENOSPC)) 429 switch (error) {
430 case 0:
431 case ENOSPC:
432 case EDQUOT:
433 break;
434 default:
500 return XFS_ERROR(error); 435 return XFS_ERROR(error);
436 }
501 437
502 /* 438 /*
503 * If bmapi returned us nothing, and if we didn't get back EDQUOT, 439 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
 504 * then we must have run out of space - flush all other inodes with 440 * ENOSPC, flush all other inodes with delalloc blocks to free up
505 * delalloc blocks and retry without EOF preallocation. 441 * some of the excess reserved metadata space. For both cases, retry
442 * without EOF preallocation.
506 */ 443 */
507 if (nimaps == 0) { 444 if (nimaps == 0) {
508 trace_xfs_delalloc_enospc(ip, offset, count); 445 trace_xfs_delalloc_enospc(ip, offset, count);
509 if (flushed) 446 if (flushed)
510 return XFS_ERROR(ENOSPC); 447 return XFS_ERROR(error ? error : ENOSPC);
511 448
512 xfs_iunlock(ip, XFS_ILOCK_EXCL); 449 if (error == ENOSPC) {
513 xfs_flush_inodes(ip); 450 xfs_iunlock(ip, XFS_ILOCK_EXCL);
514 xfs_ilock(ip, XFS_ILOCK_EXCL); 451 xfs_flush_inodes(ip);
452 xfs_ilock(ip, XFS_ILOCK_EXCL);
453 }
515 454
516 flushed = 1; 455 flushed = 1;
517 error = 0; 456 error = 0;
@@ -523,8 +462,6 @@ retry:
523 return xfs_cmn_err_fsblock_zero(ip, &imap[0]); 462 return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
524 463
525 *ret_imap = imap[0]; 464 *ret_imap = imap[0];
526 *nmaps = 1;
527
528 return 0; 465 return 0;
529} 466}
530 467
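
The retry policy above (accept 0, ENOSPC and EDQUOT from the allocation, flush other delalloc inodes only for ENOSPC, and retry exactly once without EOF preallocation) reduces to a small state machine. A runnable model, with a fake allocator standing in for xfs_bmapi():

        #include <errno.h>
        #include <stdio.h>

        /* fake allocator: pretend the quota cannot cover the preallocation */
        static int try_alloc(int prealloc, int *nmaps)
        {
                if (prealloc) {
                        *nmaps = 0;
                        return EDQUOT;
                }
                *nmaps = 1;
                return 0;
        }

        /* control flow modelled on the retry loop in xfs_iomap_write_delay() */
        static int write_delay(void)
        {
                int flushed = 0, prealloc = 1;
                int error, nmaps;

        retry:
                error = try_alloc(prealloc, &nmaps);
                switch (error) {
                case 0:
                case ENOSPC:
                case EDQUOT:
                        break;                  /* fall through to the nmaps check */
                default:
                        return error;           /* anything else is fatal */
                }

                if (nmaps == 0) {
                        if (flushed)            /* second failure: give up */
                                return error ? error : ENOSPC;
                        if (error == ENOSPC)
                                puts("flushing other delalloc inodes");
                        flushed = 1;
                        error = 0;
                        prealloc = 0;           /* retry without EOF preallocation */
                        goto retry;
                }
                return 0;
        }

        int main(void)
        {
                printf("write_delay() = %d\n", write_delay());
                return 0;
        }
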
@@ -538,13 +475,12 @@ retry:
538 * We no longer bother to look at the incoming map - all we have to 475 * We no longer bother to look at the incoming map - all we have to
539 * guarantee is that whatever we allocate fills the required range. 476 * guarantee is that whatever we allocate fills the required range.
540 */ 477 */
541STATIC int 478int
542xfs_iomap_write_allocate( 479xfs_iomap_write_allocate(
543 xfs_inode_t *ip, 480 xfs_inode_t *ip,
544 xfs_off_t offset, 481 xfs_off_t offset,
545 size_t count, 482 size_t count,
546 xfs_bmbt_irec_t *imap, 483 xfs_bmbt_irec_t *imap)
547 int *retmap)
548{ 484{
549 xfs_mount_t *mp = ip->i_mount; 485 xfs_mount_t *mp = ip->i_mount;
550 xfs_fileoff_t offset_fsb, last_block; 486 xfs_fileoff_t offset_fsb, last_block;
@@ -557,8 +493,6 @@ xfs_iomap_write_allocate(
557 int error = 0; 493 int error = 0;
558 int nres; 494 int nres;
559 495
560 *retmap = 0;
561
562 /* 496 /*
563 * Make sure that the dquots are there. 497 * Make sure that the dquots are there.
564 */ 498 */
@@ -680,7 +614,6 @@ xfs_iomap_write_allocate(
680 if ((offset_fsb >= imap->br_startoff) && 614 if ((offset_fsb >= imap->br_startoff) &&
681 (offset_fsb < (imap->br_startoff + 615 (offset_fsb < (imap->br_startoff +
682 imap->br_blockcount))) { 616 imap->br_blockcount))) {
683 *retmap = 1;
684 XFS_STATS_INC(xs_xstrat_quick); 617 XFS_STATS_INC(xs_xstrat_quick);
685 return 0; 618 return 0;
686 } 619 }
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7748a430f50d..80615760959a 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,30 +18,15 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21/* base extent manipulation calls */
22#define BMAPI_READ (1 << 0) /* read extents */
23#define BMAPI_WRITE (1 << 1) /* create extents */
24#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */
25
26/* modifiers */
27#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */
28#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */
29#define BMAPI_MMA (1 << 6) /* allocate for mmap write */
30#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */
31
32#define BMAPI_FLAGS \
33 { BMAPI_READ, "READ" }, \
34 { BMAPI_WRITE, "WRITE" }, \
35 { BMAPI_ALLOCATE, "ALLOCATE" }, \
36 { BMAPI_IGNSTATE, "IGNSTATE" }, \
37 { BMAPI_DIRECT, "DIRECT" }, \
38 { BMAPI_TRYLOCK, "TRYLOCK" }
39
40struct xfs_inode; 21struct xfs_inode;
41struct xfs_bmbt_irec; 22struct xfs_bmbt_irec;
42 23
43extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 24extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
44 struct xfs_bmbt_irec *, int *, int *); 25 struct xfs_bmbt_irec *, int);
26extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
27 struct xfs_bmbt_irec *);
28extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
29 struct xfs_bmbt_irec *);
45extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 30extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
46 31
47#endif /* __XFS_IOMAP_H__*/ 32#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index cee4ab9f8a9e..ae6fef1ff563 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
47 xfs_buftarg_t *log_target, 47 xfs_buftarg_t *log_target,
48 xfs_daddr_t blk_offset, 48 xfs_daddr_t blk_offset,
49 int num_bblks); 49 int num_bblks);
50STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 50STATIC int xlog_space_left(struct log *log, atomic64_t *head);
51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
52STATIC void xlog_dealloc_log(xlog_t *log); 52STATIC void xlog_dealloc_log(xlog_t *log);
53 53
@@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
70/* local functions to manipulate grant head */ 70/* local functions to manipulate grant head */
71STATIC int xlog_grant_log_space(xlog_t *log, 71STATIC int xlog_grant_log_space(xlog_t *log,
72 xlog_ticket_t *xtic); 72 xlog_ticket_t *xtic);
73STATIC void xlog_grant_push_ail(xfs_mount_t *mp, 73STATIC void xlog_grant_push_ail(struct log *log,
74 int need_bytes); 74 int need_bytes);
75STATIC void xlog_regrant_reserve_log_space(xlog_t *log, 75STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
76 xlog_ticket_t *ticket); 76 xlog_ticket_t *ticket);
@@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
81 81
82#if defined(DEBUG) 82#if defined(DEBUG)
83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); 83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
84STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 84STATIC void xlog_verify_grant_tail(struct log *log);
85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
86 int count, boolean_t syncing); 86 int count, boolean_t syncing);
87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, 87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
88 xfs_lsn_t tail_lsn); 88 xfs_lsn_t tail_lsn);
89#else 89#else
90#define xlog_verify_dest_ptr(a,b) 90#define xlog_verify_dest_ptr(a,b)
91#define xlog_verify_grant_head(a,b) 91#define xlog_verify_grant_tail(a)
92#define xlog_verify_iclog(a,b,c,d) 92#define xlog_verify_iclog(a,b,c,d)
93#define xlog_verify_tail_lsn(a,b,c) 93#define xlog_verify_tail_lsn(a,b,c)
94#endif 94#endif
95 95
96STATIC int xlog_iclogs_empty(xlog_t *log); 96STATIC int xlog_iclogs_empty(xlog_t *log);
97 97
98
99static void 98static void
100xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) 99xlog_grant_sub_space(
100 struct log *log,
101 atomic64_t *head,
102 int bytes)
101{ 103{
102 if (*qp) { 104 int64_t head_val = atomic64_read(head);
103 tic->t_next = (*qp); 105 int64_t new, old;
104 tic->t_prev = (*qp)->t_prev;
105 (*qp)->t_prev->t_next = tic;
106 (*qp)->t_prev = tic;
107 } else {
108 tic->t_prev = tic->t_next = tic;
109 *qp = tic;
110 }
111 106
112 tic->t_flags |= XLOG_TIC_IN_Q; 107 do {
113} 108 int cycle, space;
114 109
115static void 110 xlog_crack_grant_head_val(head_val, &cycle, &space);
116xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
117{
118 if (tic == tic->t_next) {
119 *qp = NULL;
120 } else {
121 *qp = tic->t_next;
122 tic->t_next->t_prev = tic->t_prev;
123 tic->t_prev->t_next = tic->t_next;
124 }
125 111
126 tic->t_next = tic->t_prev = NULL; 112 space -= bytes;
127 tic->t_flags &= ~XLOG_TIC_IN_Q; 113 if (space < 0) {
114 space += log->l_logsize;
115 cycle--;
116 }
117
118 old = head_val;
119 new = xlog_assign_grant_head_val(cycle, space);
120 head_val = atomic64_cmpxchg(head, old, new);
121 } while (head_val != old);
128} 122}
129 123
130static void 124static void
131xlog_grant_sub_space(struct log *log, int bytes) 125xlog_grant_add_space(
126 struct log *log,
127 atomic64_t *head,
128 int bytes)
132{ 129{
133 log->l_grant_write_bytes -= bytes; 130 int64_t head_val = atomic64_read(head);
134 if (log->l_grant_write_bytes < 0) { 131 int64_t new, old;
135 log->l_grant_write_bytes += log->l_logsize;
136 log->l_grant_write_cycle--;
137 }
138
139 log->l_grant_reserve_bytes -= bytes;
140 if ((log)->l_grant_reserve_bytes < 0) {
141 log->l_grant_reserve_bytes += log->l_logsize;
142 log->l_grant_reserve_cycle--;
143 }
144 132
145} 133 do {
134 int tmp;
135 int cycle, space;
146 136
147static void 137 xlog_crack_grant_head_val(head_val, &cycle, &space);
148xlog_grant_add_space_write(struct log *log, int bytes)
149{
150 int tmp = log->l_logsize - log->l_grant_write_bytes;
151 if (tmp > bytes)
152 log->l_grant_write_bytes += bytes;
153 else {
154 log->l_grant_write_cycle++;
155 log->l_grant_write_bytes = bytes - tmp;
156 }
157}
158 138
159static void 139 tmp = log->l_logsize - space;
160xlog_grant_add_space_reserve(struct log *log, int bytes) 140 if (tmp > bytes)
161{ 141 space += bytes;
162 int tmp = log->l_logsize - log->l_grant_reserve_bytes; 142 else {
163 if (tmp > bytes) 143 space = bytes - tmp;
164 log->l_grant_reserve_bytes += bytes; 144 cycle++;
165 else { 145 }
166 log->l_grant_reserve_cycle++;
167 log->l_grant_reserve_bytes = bytes - tmp;
168 }
169}
170 146
171static inline void 147 old = head_val;
172xlog_grant_add_space(struct log *log, int bytes) 148 new = xlog_assign_grant_head_val(cycle, space);
173{ 149 head_val = atomic64_cmpxchg(head, old, new);
174 xlog_grant_add_space_write(log, bytes); 150 } while (head_val != old);
175 xlog_grant_add_space_reserve(log, bytes);
176} 151}
177 152
178static void 153static void
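
Both helpers above follow the same lock-free pattern: crack the packed 64-bit head, adjust (cycle, space) with wrap handling, and retry the cmpxchg until no one else raced. A userspace model using C11 atomics; the packing (cycle in the high 32 bits, byte count in the low 32) is assumed to match xlog_assign_grant_head_val()/xlog_crack_grant_head_val(), which are defined elsewhere in this series:

        #include <stdatomic.h>
        #include <stdint.h>
        #include <stdio.h>

        #define LOGSIZE (16 * 1024 * 1024)      /* invented 16MB log */

        /* assumed packing: cycle in the high 32 bits, bytes in the low 32 */
        static void crack(int64_t val, int *cycle, int *space)
        {
                *cycle = (int)(val >> 32);
                *space = (int)(val & 0xffffffff);
        }

        static int64_t assign(int cycle, int space)
        {
                return ((int64_t)cycle << 32) | (uint32_t)space;
        }

        /* lock-free analogue of xlog_grant_add_space(): loop until cmpxchg wins */
        static void grant_add_space(_Atomic int64_t *head, int bytes)
        {
                int64_t head_val = atomic_load(head);
                int64_t new;

                do {
                        int cycle, space, tmp;

                        crack(head_val, &cycle, &space);
                        tmp = LOGSIZE - space;
                        if (tmp > bytes) {
                                space += bytes;
                        } else {
                                space = bytes - tmp;    /* wrapped past the end */
                                cycle++;                /* so bump the cycle */
                        }
                        new = assign(cycle, space);
                        /* on failure, head_val is reloaded with the current value */
                } while (!atomic_compare_exchange_weak(head, &head_val, new));
        }

        int main(void)
        {
                _Atomic int64_t head = assign(1, LOGSIZE - 100);
                int cycle, space;

                grant_add_space(&head, 512);    /* 100 bytes left: forces a wrap */
                crack(atomic_load(&head), &cycle, &space);
                printf("cycle %d, space %d\n", cycle, space);  /* cycle 2, space 412 */
                return 0;
        }
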
@@ -355,7 +330,7 @@ xfs_log_reserve(
355 330
356 trace_xfs_log_reserve(log, internal_ticket); 331 trace_xfs_log_reserve(log, internal_ticket);
357 332
358 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 333 xlog_grant_push_ail(log, internal_ticket->t_unit_res);
359 retval = xlog_regrant_write_log_space(log, internal_ticket); 334 retval = xlog_regrant_write_log_space(log, internal_ticket);
360 } else { 335 } else {
361 /* may sleep if need to allocate more tickets */ 336 /* may sleep if need to allocate more tickets */
@@ -369,7 +344,7 @@ xfs_log_reserve(
369 344
370 trace_xfs_log_reserve(log, internal_ticket); 345 trace_xfs_log_reserve(log, internal_ticket);
371 346
372 xlog_grant_push_ail(mp, 347 xlog_grant_push_ail(log,
373 (internal_ticket->t_unit_res * 348 (internal_ticket->t_unit_res *
374 internal_ticket->t_cnt)); 349 internal_ticket->t_cnt));
375 retval = xlog_grant_log_space(log, internal_ticket); 350 retval = xlog_grant_log_space(log, internal_ticket);
@@ -402,7 +377,7 @@ xfs_log_mount(
402 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); 377 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
403 else { 378 else {
404 cmn_err(CE_NOTE, 379 cmn_err(CE_NOTE,
405 "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", 380 "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.",
406 mp->m_fsname); 381 mp->m_fsname);
407 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 382 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
408 } 383 }
@@ -584,8 +559,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
584 if (!(iclog->ic_state == XLOG_STATE_ACTIVE || 559 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
585 iclog->ic_state == XLOG_STATE_DIRTY)) { 560 iclog->ic_state == XLOG_STATE_DIRTY)) {
586 if (!XLOG_FORCED_SHUTDOWN(log)) { 561 if (!XLOG_FORCED_SHUTDOWN(log)) {
587 sv_wait(&iclog->ic_force_wait, PMEM, 562 xlog_wait(&iclog->ic_force_wait,
588 &log->l_icloglock, s); 563 &log->l_icloglock);
589 } else { 564 } else {
590 spin_unlock(&log->l_icloglock); 565 spin_unlock(&log->l_icloglock);
591 } 566 }
@@ -625,8 +600,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
625 || iclog->ic_state == XLOG_STATE_DIRTY 600 || iclog->ic_state == XLOG_STATE_DIRTY
626 || iclog->ic_state == XLOG_STATE_IOERROR) ) { 601 || iclog->ic_state == XLOG_STATE_IOERROR) ) {
627 602
628 sv_wait(&iclog->ic_force_wait, PMEM, 603 xlog_wait(&iclog->ic_force_wait,
629 &log->l_icloglock, s); 604 &log->l_icloglock);
630 } else { 605 } else {
631 spin_unlock(&log->l_icloglock); 606 spin_unlock(&log->l_icloglock);
632 } 607 }
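
sv_wait() both dropped the given lock and slept; its replacement xlog_wait() (a small helper added elsewhere in this series, presumably in xfs_log_priv.h) does the same with a plain wait_queue_head_t, and notably returns without retaking the lock. In userspace the closest analogue of the atomically-release-and-sleep contract is pthread_cond_wait(), which differs only in reacquiring the mutex before returning:

        #include <pthread.h>
        #include <stdio.h>
        #include <unistd.h>

        static pthread_mutex_t grant_lock = PTHREAD_MUTEX_INITIALIZER;
        static pthread_cond_t  t_wait     = PTHREAD_COND_INITIALIZER;
        static int free_bytes;                  /* protected by grant_lock */

        static void *waiter(void *arg)
        {
                pthread_mutex_lock(&grant_lock);
                while (free_bytes < 1024) {
                        /*
                         * Like xlog_wait(): atomically drop grant_lock and
                         * sleep.  Unlike the kernel helper, the mutex is
                         * reacquired before pthread_cond_wait() returns.
                         */
                        pthread_cond_wait(&t_wait, &grant_lock);
                }
                printf("waiter: got %d bytes\n", free_bytes);
                pthread_mutex_unlock(&grant_lock);
                return arg;
        }

        int main(void)
        {
                pthread_t tid;

                pthread_create(&tid, NULL, waiter, NULL);
                sleep(1);                       /* let the waiter block */

                pthread_mutex_lock(&grant_lock);
                free_bytes = 4096;              /* the log tail moved */
                pthread_mutex_unlock(&grant_lock);
                pthread_cond_signal(&t_wait);   /* like wake_up(&tic->t_wait) */

                pthread_join(tid, NULL);
                return 0;
        }

Compile with -pthread.
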
@@ -703,55 +678,46 @@ xfs_log_move_tail(xfs_mount_t *mp,
703{ 678{
704 xlog_ticket_t *tic; 679 xlog_ticket_t *tic;
705 xlog_t *log = mp->m_log; 680 xlog_t *log = mp->m_log;
706 int need_bytes, free_bytes, cycle, bytes; 681 int need_bytes, free_bytes;
707 682
708 if (XLOG_FORCED_SHUTDOWN(log)) 683 if (XLOG_FORCED_SHUTDOWN(log))
709 return; 684 return;
710 685
711 if (tail_lsn == 0) { 686 if (tail_lsn == 0)
712 /* needed since sync_lsn is 64 bits */ 687 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
713 spin_lock(&log->l_icloglock);
714 tail_lsn = log->l_last_sync_lsn;
715 spin_unlock(&log->l_icloglock);
716 }
717
718 spin_lock(&log->l_grant_lock);
719 688
720 /* Also an invalid lsn. 1 implies that we aren't passing in a valid 689 /* tail_lsn == 1 implies that we weren't passed a valid value. */
721 * tail_lsn. 690 if (tail_lsn != 1)
722 */ 691 atomic64_set(&log->l_tail_lsn, tail_lsn);
723 if (tail_lsn != 1) {
724 log->l_tail_lsn = tail_lsn;
725 }
726 692
727 if ((tic = log->l_write_headq)) { 693 if (!list_empty_careful(&log->l_writeq)) {
728#ifdef DEBUG 694#ifdef DEBUG
729 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 695 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
730 panic("Recovery problem"); 696 panic("Recovery problem");
731#endif 697#endif
732 cycle = log->l_grant_write_cycle; 698 spin_lock(&log->l_grant_write_lock);
733 bytes = log->l_grant_write_bytes; 699 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
734 free_bytes = xlog_space_left(log, cycle, bytes); 700 list_for_each_entry(tic, &log->l_writeq, t_queue) {
735 do {
736 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); 701 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
737 702
738 if (free_bytes < tic->t_unit_res && tail_lsn != 1) 703 if (free_bytes < tic->t_unit_res && tail_lsn != 1)
739 break; 704 break;
740 tail_lsn = 0; 705 tail_lsn = 0;
741 free_bytes -= tic->t_unit_res; 706 free_bytes -= tic->t_unit_res;
742 sv_signal(&tic->t_wait); 707 trace_xfs_log_regrant_write_wake_up(log, tic);
743 tic = tic->t_next; 708 wake_up(&tic->t_wait);
744 } while (tic != log->l_write_headq); 709 }
710 spin_unlock(&log->l_grant_write_lock);
745 } 711 }
746 if ((tic = log->l_reserve_headq)) { 712
713 if (!list_empty_careful(&log->l_reserveq)) {
747#ifdef DEBUG 714#ifdef DEBUG
748 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 715 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
749 panic("Recovery problem"); 716 panic("Recovery problem");
750#endif 717#endif
751 cycle = log->l_grant_reserve_cycle; 718 spin_lock(&log->l_grant_reserve_lock);
752 bytes = log->l_grant_reserve_bytes; 719 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
753 free_bytes = xlog_space_left(log, cycle, bytes); 720 list_for_each_entry(tic, &log->l_reserveq, t_queue) {
754 do {
755 if (tic->t_flags & XLOG_TIC_PERM_RESERV) 721 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
756 need_bytes = tic->t_unit_res*tic->t_cnt; 722 need_bytes = tic->t_unit_res*tic->t_cnt;
757 else 723 else
@@ -760,12 +726,12 @@ xfs_log_move_tail(xfs_mount_t *mp,
760 break; 726 break;
761 tail_lsn = 0; 727 tail_lsn = 0;
762 free_bytes -= need_bytes; 728 free_bytes -= need_bytes;
763 sv_signal(&tic->t_wait); 729 trace_xfs_log_grant_wake_up(log, tic);
764 tic = tic->t_next; 730 wake_up(&tic->t_wait);
765 } while (tic != log->l_reserve_headq); 731 }
732 spin_unlock(&log->l_grant_reserve_lock);
766 } 733 }
767 spin_unlock(&log->l_grant_lock); 734}
768} /* xfs_log_move_tail */
769 735
770/* 736/*
771 * Determine if we have a transaction that has gone to disk 737 * Determine if we have a transaction that has gone to disk
@@ -831,23 +797,19 @@ xfs_log_need_covered(xfs_mount_t *mp)
831 * We may be holding the log iclog lock upon entering this routine. 797 * We may be holding the log iclog lock upon entering this routine.
832 */ 798 */
833xfs_lsn_t 799xfs_lsn_t
834xlog_assign_tail_lsn(xfs_mount_t *mp) 800xlog_assign_tail_lsn(
801 struct xfs_mount *mp)
835{ 802{
836 xfs_lsn_t tail_lsn; 803 xfs_lsn_t tail_lsn;
837 xlog_t *log = mp->m_log; 804 struct log *log = mp->m_log;
838 805
839 tail_lsn = xfs_trans_ail_tail(mp->m_ail); 806 tail_lsn = xfs_trans_ail_tail(mp->m_ail);
840 spin_lock(&log->l_grant_lock); 807 if (!tail_lsn)
841 if (tail_lsn != 0) { 808 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
842 log->l_tail_lsn = tail_lsn;
843 } else {
844 tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
845 }
846 spin_unlock(&log->l_grant_lock);
847 809
810 atomic64_set(&log->l_tail_lsn, tail_lsn);
848 return tail_lsn; 811 return tail_lsn;
849} /* xlog_assign_tail_lsn */ 812}
850
851 813
852/* 814/*
853 * Return the space in the log between the tail and the head. The head 815 * Return the space in the log between the tail and the head. The head
@@ -864,21 +826,26 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
864 * result is that we return the size of the log as the amount of space left. 826 * result is that we return the size of the log as the amount of space left.
865 */ 827 */
866STATIC int 828STATIC int
867xlog_space_left(xlog_t *log, int cycle, int bytes) 829xlog_space_left(
868{ 830 struct log *log,
869 int free_bytes; 831 atomic64_t *head)
870 int tail_bytes; 832{
871 int tail_cycle; 833 int free_bytes;
872 834 int tail_bytes;
873 tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); 835 int tail_cycle;
874 tail_cycle = CYCLE_LSN(log->l_tail_lsn); 836 int head_cycle;
875 if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { 837 int head_bytes;
876 free_bytes = log->l_logsize - (bytes - tail_bytes); 838
877 } else if ((tail_cycle + 1) < cycle) { 839 xlog_crack_grant_head(head, &head_cycle, &head_bytes);
840 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
841 tail_bytes = BBTOB(tail_bytes);
842 if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
843 free_bytes = log->l_logsize - (head_bytes - tail_bytes);
844 else if (tail_cycle + 1 < head_cycle)
878 return 0; 845 return 0;
879 } else if (tail_cycle < cycle) { 846 else if (tail_cycle < head_cycle) {
880 ASSERT(tail_cycle == (cycle - 1)); 847 ASSERT(tail_cycle == (head_cycle - 1));
881 free_bytes = tail_bytes - bytes; 848 free_bytes = tail_bytes - head_bytes;
882 } else { 849 } else {
883 /* 850 /*
884 * The reservation head is behind the tail. 851 * The reservation head is behind the tail.
@@ -889,12 +856,12 @@ xlog_space_left(xlog_t *log, int cycle, int bytes)
889 "xlog_space_left: head behind tail\n" 856 "xlog_space_left: head behind tail\n"
890 " tail_cycle = %d, tail_bytes = %d\n" 857 " tail_cycle = %d, tail_bytes = %d\n"
891 " GH cycle = %d, GH bytes = %d", 858 " GH cycle = %d, GH bytes = %d",
892 tail_cycle, tail_bytes, cycle, bytes); 859 tail_cycle, tail_bytes, head_cycle, head_bytes);
893 ASSERT(0); 860 ASSERT(0);
894 free_bytes = log->l_logsize; 861 free_bytes = log->l_logsize;
895 } 862 }
896 return free_bytes; 863 return free_bytes;
897} /* xlog_space_left */ 864}
898 865
899 866
900/* 867/*
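
The three live cases of xlog_space_left(), head and tail in the same cycle, head one cycle ahead, and head having lapped the tail, are easy to check with concrete numbers (the fourth case, head behind tail, is the corruption path that trips the ASSERT). A runnable model with head and tail already cracked into (cycle, bytes):

        #include <stdio.h>

        #define LOGSIZE (8 * 1024 * 1024)       /* invented 8MB log */

        static int space_left(int tail_cycle, int tail_bytes,
                              int head_cycle, int head_bytes)
        {
                if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
                        return LOGSIZE - (head_bytes - tail_bytes);
                if (tail_cycle + 1 < head_cycle)
                        return 0;               /* head lapped the tail: full */
                /* head one cycle ahead: free space is the gap up to the tail */
                return tail_bytes - head_bytes;
        }

        int main(void)
        {
                printf("same cycle: %d\n", space_left(5, 1 << 20, 5, 3 << 20));
                printf("one ahead:  %d\n", space_left(5, 6 << 20, 6, 1 << 20));
                printf("lapped:     %d\n", space_left(5, 1 << 20, 7, 0));
                return 0;
        }
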
@@ -1047,12 +1014,16 @@ xlog_alloc_log(xfs_mount_t *mp,
1047 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1014 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1048 1015
1049 log->l_prev_block = -1; 1016 log->l_prev_block = -1;
1050 log->l_tail_lsn = xlog_assign_lsn(1, 0);
1051 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1017 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
1052 log->l_last_sync_lsn = log->l_tail_lsn; 1018 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
1019 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
1053 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ 1020 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
1054 log->l_grant_reserve_cycle = 1; 1021 xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
1055 log->l_grant_write_cycle = 1; 1022 xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
1023 INIT_LIST_HEAD(&log->l_reserveq);
1024 INIT_LIST_HEAD(&log->l_writeq);
1025 spin_lock_init(&log->l_grant_reserve_lock);
1026 spin_lock_init(&log->l_grant_write_lock);
1056 1027
1057 error = EFSCORRUPTED; 1028 error = EFSCORRUPTED;
1058 if (xfs_sb_version_hassector(&mp->m_sb)) { 1029 if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1094,8 +1065,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1094 log->l_xbuf = bp; 1065 log->l_xbuf = bp;
1095 1066
1096 spin_lock_init(&log->l_icloglock); 1067 spin_lock_init(&log->l_icloglock);
1097 spin_lock_init(&log->l_grant_lock); 1068 init_waitqueue_head(&log->l_flush_wait);
1098 sv_init(&log->l_flush_wait, 0, "flush_wait");
1099 1069
1100 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1070 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1101 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1071 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1151,8 +1121,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1151 1121
1152 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1122 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1153 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1123 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
1154 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1124 init_waitqueue_head(&iclog->ic_force_wait);
1155 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1125 init_waitqueue_head(&iclog->ic_write_wait);
1156 1126
1157 iclogp = &iclog->ic_next; 1127 iclogp = &iclog->ic_next;
1158 } 1128 }
@@ -1167,15 +1137,11 @@ xlog_alloc_log(xfs_mount_t *mp,
1167out_free_iclog: 1137out_free_iclog:
1168 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { 1138 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
1169 prev_iclog = iclog->ic_next; 1139 prev_iclog = iclog->ic_next;
1170 if (iclog->ic_bp) { 1140 if (iclog->ic_bp)
1171 sv_destroy(&iclog->ic_force_wait);
1172 sv_destroy(&iclog->ic_write_wait);
1173 xfs_buf_free(iclog->ic_bp); 1141 xfs_buf_free(iclog->ic_bp);
1174 }
1175 kmem_free(iclog); 1142 kmem_free(iclog);
1176 } 1143 }
1177 spinlock_destroy(&log->l_icloglock); 1144 spinlock_destroy(&log->l_icloglock);
1178 spinlock_destroy(&log->l_grant_lock);
1179 xfs_buf_free(log->l_xbuf); 1145 xfs_buf_free(log->l_xbuf);
1180out_free_log: 1146out_free_log:
1181 kmem_free(log); 1147 kmem_free(log);
@@ -1223,61 +1189,60 @@ xlog_commit_record(
1223 * water mark. In this manner, we would be creating a low water mark. 1189 * water mark. In this manner, we would be creating a low water mark.
1224 */ 1190 */
1225STATIC void 1191STATIC void
1226xlog_grant_push_ail(xfs_mount_t *mp, 1192xlog_grant_push_ail(
1227 int need_bytes) 1193 struct log *log,
1194 int need_bytes)
1228{ 1195{
1229 xlog_t *log = mp->m_log; /* pointer to the log */ 1196 xfs_lsn_t threshold_lsn = 0;
1230 xfs_lsn_t tail_lsn; /* lsn of the log tail */ 1197 xfs_lsn_t last_sync_lsn;
1231 xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ 1198 int free_blocks;
1232 int free_blocks; /* free blocks left to write to */ 1199 int free_bytes;
1233 int free_bytes; /* free bytes left to write to */ 1200 int threshold_block;
1234 int threshold_block; /* block in lsn we'd like to be at */ 1201 int threshold_cycle;
1235 int threshold_cycle; /* lsn cycle we'd like to be at */ 1202 int free_threshold;
1236 int free_threshold; 1203
1237 1204 ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1238 ASSERT(BTOBB(need_bytes) < log->l_logBBsize); 1205
1239 1206 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
1240 spin_lock(&log->l_grant_lock); 1207 free_blocks = BTOBBT(free_bytes);
1241 free_bytes = xlog_space_left(log, 1208
1242 log->l_grant_reserve_cycle, 1209 /*
1243 log->l_grant_reserve_bytes); 1210 * Set the threshold for the minimum number of free blocks in the
1244 tail_lsn = log->l_tail_lsn; 1211 * log to the maximum of what the caller needs, one quarter of the
1245 free_blocks = BTOBBT(free_bytes); 1212 * log, and 256 blocks.
1246 1213 */
1247 /* 1214 free_threshold = BTOBB(need_bytes);
1248 * Set the threshold for the minimum number of free blocks in the 1215 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
1249 * log to the maximum of what the caller needs, one quarter of the 1216 free_threshold = MAX(free_threshold, 256);
1250 * log, and 256 blocks. 1217 if (free_blocks >= free_threshold)
1251 */ 1218 return;
1252 free_threshold = BTOBB(need_bytes); 1219
1253 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); 1220 xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
1254 free_threshold = MAX(free_threshold, 256); 1221 &threshold_block);
1255 if (free_blocks < free_threshold) { 1222 threshold_block += free_threshold;
1256 threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
1257 threshold_cycle = CYCLE_LSN(tail_lsn);
1258 if (threshold_block >= log->l_logBBsize) { 1223 if (threshold_block >= log->l_logBBsize) {
1259 threshold_block -= log->l_logBBsize; 1224 threshold_block -= log->l_logBBsize;
1260 threshold_cycle += 1; 1225 threshold_cycle += 1;
1261 } 1226 }
1262 threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); 1227 threshold_lsn = xlog_assign_lsn(threshold_cycle,
1228 threshold_block);
1229 /*
1230 * Don't pass in an lsn greater than the lsn of the last
1231 * log record known to be on disk. Use a snapshot of the last sync lsn
1232 * so that it doesn't change between the compare and the set.
1233 */
1234 last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
1235 if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
1236 threshold_lsn = last_sync_lsn;
1263 1237
1264 /* Don't pass in an lsn greater than the lsn of the last 1238 /*
1265 * log record known to be on disk. 1239 * Get the transaction layer to kick the dirty buffers out to
1240 * disk asynchronously. No point in trying to do this if
1241 * the filesystem is shutting down.
1266 */ 1242 */
1267 if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) 1243 if (!XLOG_FORCED_SHUTDOWN(log))
1268 threshold_lsn = log->l_last_sync_lsn; 1244 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1269 } 1245}
1270 spin_unlock(&log->l_grant_lock);
1271
1272 /*
1273 * Get the transaction layer to kick the dirty buffers out to
1274 * disk asynchronously. No point in trying to do this if
1275 * the filesystem is shutting down.
1276 */
1277 if (threshold_lsn &&
1278 !XLOG_FORCED_SHUTDOWN(log))
1279 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1280} /* xlog_grant_push_ail */
1281 1246
1282/* 1247/*
1283 * The bdstrat callback function for log bufs. This gives us a central 1248 * The bdstrat callback function for log bufs. This gives us a central
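
With the grant lock gone, the push target above is computed from an unlocked snapshot of the tail. Plugging invented numbers into the threshold arithmetic shows the wrap into the next cycle:

        #include <stdio.h>

        #define BBSHIFT   9                     /* 512-byte basic blocks */
        #define BTOBB(b)  (((b) + (1 << BBSHIFT) - 1) >> BBSHIFT)
        #define MAX(a, b) ((a) > (b) ? (a) : (b))

        int main(void)
        {
                int logBBsize  = 32768;         /* invented: 16MB log in blocks */
                int need_bytes = 65536;         /* the reservation being taken */
                int tail_cycle = 4, tail_block = 30000;
                int free_threshold, threshold_cycle, threshold_block;

                /* max(what the caller needs, a quarter of the log, 256) */
                free_threshold = BTOBB(need_bytes);
                free_threshold = MAX(free_threshold, logBBsize >> 2);
                free_threshold = MAX(free_threshold, 256);

                /* push target = tail + threshold, wrapping into next cycle */
                threshold_cycle = tail_cycle;
                threshold_block = tail_block + free_threshold;
                if (threshold_block >= logBBsize) {
                        threshold_block -= logBBsize;
                        threshold_cycle++;
                }
                printf("threshold %d blocks, push to cycle %d block %d\n",
                       free_threshold, threshold_cycle, threshold_block);
                return 0;
        }

Here the quarter-of-the-log term wins (8192 blocks), and the target wraps to block 5424 of cycle 5; the real function then clamps that against l_last_sync_lsn before pushing the AIL.
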
@@ -1372,9 +1337,8 @@ xlog_sync(xlog_t *log,
1372 roundoff < BBTOB(1))); 1337 roundoff < BBTOB(1)));
1373 1338
1374 /* move grant heads by roundoff in sync */ 1339 /* move grant heads by roundoff in sync */
1375 spin_lock(&log->l_grant_lock); 1340 xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
1376 xlog_grant_add_space(log, roundoff); 1341 xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
1377 spin_unlock(&log->l_grant_lock);
1378 1342
1379 /* put cycle number in every block */ 1343 /* put cycle number in every block */
1380 xlog_pack_data(log, iclog, roundoff); 1344 xlog_pack_data(log, iclog, roundoff);
@@ -1489,15 +1453,12 @@ xlog_dealloc_log(xlog_t *log)
1489 1453
1490 iclog = log->l_iclog; 1454 iclog = log->l_iclog;
1491 for (i=0; i<log->l_iclog_bufs; i++) { 1455 for (i=0; i<log->l_iclog_bufs; i++) {
1492 sv_destroy(&iclog->ic_force_wait);
1493 sv_destroy(&iclog->ic_write_wait);
1494 xfs_buf_free(iclog->ic_bp); 1456 xfs_buf_free(iclog->ic_bp);
1495 next_iclog = iclog->ic_next; 1457 next_iclog = iclog->ic_next;
1496 kmem_free(iclog); 1458 kmem_free(iclog);
1497 iclog = next_iclog; 1459 iclog = next_iclog;
1498 } 1460 }
1499 spinlock_destroy(&log->l_icloglock); 1461 spinlock_destroy(&log->l_icloglock);
1500 spinlock_destroy(&log->l_grant_lock);
1501 1462
1502 xfs_buf_free(log->l_xbuf); 1463 xfs_buf_free(log->l_xbuf);
1503 log->l_mp->m_log = NULL; 1464 log->l_mp->m_log = NULL;
@@ -2232,7 +2193,7 @@ xlog_state_do_callback(
2232 lowest_lsn = xlog_get_lowest_lsn(log); 2193 lowest_lsn = xlog_get_lowest_lsn(log);
2233 if (lowest_lsn && 2194 if (lowest_lsn &&
2234 XFS_LSN_CMP(lowest_lsn, 2195 XFS_LSN_CMP(lowest_lsn,
2235 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { 2196 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2236 iclog = iclog->ic_next; 2197 iclog = iclog->ic_next;
2237 continue; /* Leave this iclog for 2198 continue; /* Leave this iclog for
2238 * another thread */ 2199 * another thread */
@@ -2240,23 +2201,21 @@ xlog_state_do_callback(
2240 2201
2241 iclog->ic_state = XLOG_STATE_CALLBACK; 2202 iclog->ic_state = XLOG_STATE_CALLBACK;
2242 2203
2243 spin_unlock(&log->l_icloglock);
2244 2204
2245 /* l_last_sync_lsn field protected by 2205 /*
2246 * l_grant_lock. Don't worry about iclog's lsn. 2206 * update the last_sync_lsn before we drop the
2247 * No one else can be here except us. 2207 * icloglock to ensure we are the only one that
2208 * can update it.
2248 */ 2209 */
2249 spin_lock(&log->l_grant_lock); 2210 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2250 ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, 2211 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2251 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2212 atomic64_set(&log->l_last_sync_lsn,
2252 log->l_last_sync_lsn = 2213 be64_to_cpu(iclog->ic_header.h_lsn));
2253 be64_to_cpu(iclog->ic_header.h_lsn);
2254 spin_unlock(&log->l_grant_lock);
2255 2214
2256 } else { 2215 } else
2257 spin_unlock(&log->l_icloglock);
2258 ioerrors++; 2216 ioerrors++;
2259 } 2217
2218 spin_unlock(&log->l_icloglock);
2260 2219
2261 /* 2220 /*
2262 * Keep processing entries in the callback list until 2221 * Keep processing entries in the callback list until
@@ -2297,7 +2256,7 @@ xlog_state_do_callback(
2297 xlog_state_clean_log(log); 2256 xlog_state_clean_log(log);
2298 2257
2299 /* wake up threads waiting in xfs_log_force() */ 2258 /* wake up threads waiting in xfs_log_force() */
2300 sv_broadcast(&iclog->ic_force_wait); 2259 wake_up_all(&iclog->ic_force_wait);
2301 2260
2302 iclog = iclog->ic_next; 2261 iclog = iclog->ic_next;
2303 } while (first_iclog != iclog); 2262 } while (first_iclog != iclog);
@@ -2344,7 +2303,7 @@ xlog_state_do_callback(
2344 spin_unlock(&log->l_icloglock); 2303 spin_unlock(&log->l_icloglock);
2345 2304
2346 if (wake) 2305 if (wake)
2347 sv_broadcast(&log->l_flush_wait); 2306 wake_up_all(&log->l_flush_wait);
2348} 2307}
2349 2308
2350 2309
@@ -2395,7 +2354,7 @@ xlog_state_done_syncing(
2395 * iclog buffer, we wake them all, one will get to do the 2354 * iclog buffer, we wake them all, one will get to do the
2396 * I/O, the others get to wait for the result. 2355 * I/O, the others get to wait for the result.
2397 */ 2356 */
2398 sv_broadcast(&iclog->ic_write_wait); 2357 wake_up_all(&iclog->ic_write_wait);
2399 spin_unlock(&log->l_icloglock); 2358 spin_unlock(&log->l_icloglock);
2400 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ 2359 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
2401} /* xlog_state_done_syncing */ 2360} /* xlog_state_done_syncing */
@@ -2444,7 +2403,7 @@ restart:
2444 XFS_STATS_INC(xs_log_noiclogs); 2403 XFS_STATS_INC(xs_log_noiclogs);
2445 2404
2446 /* Wait for log writes to have flushed */ 2405 /* Wait for log writes to have flushed */
2447 sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); 2406 xlog_wait(&log->l_flush_wait, &log->l_icloglock);
2448 goto restart; 2407 goto restart;
2449 } 2408 }
2450 2409
@@ -2527,6 +2486,18 @@ restart:
2527 * 2486 *
2528 * Once a ticket gets put onto the reserveq, it will only return after 2487 * Once a ticket gets put onto the reserveq, it will only return after
2529 * the needed reservation is satisfied. 2488 * the needed reservation is satisfied.
2489 *
2490 * This function is structured so that it has a lock free fast path. This is
2491 * necessary because every new transaction reservation will come through this
2492 * path. Hence any lock will be globally hot if we take it unconditionally on
2493 * every pass.
2494 *
2495 * As tickets are only ever moved on and off the reserveq under the
2496 * l_grant_reserve_lock, we only need to take that lock if we are going
2497 * to add the ticket to the queue and sleep. We can avoid taking the lock if the
2498 * ticket was never added to the reserveq because the t_queue list head will be
2499 * empty and we hold the only reference to it so it can safely be checked
2500 * unlocked.
2530 */ 2501 */
2531STATIC int 2502STATIC int
2532xlog_grant_log_space(xlog_t *log, 2503xlog_grant_log_space(xlog_t *log,
@@ -2534,24 +2505,27 @@ xlog_grant_log_space(xlog_t *log,
2534{ 2505{
2535 int free_bytes; 2506 int free_bytes;
2536 int need_bytes; 2507 int need_bytes;
2537#ifdef DEBUG
2538 xfs_lsn_t tail_lsn;
2539#endif
2540
2541 2508
2542#ifdef DEBUG 2509#ifdef DEBUG
2543 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 2510 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2544 panic("grant Recovery problem"); 2511 panic("grant Recovery problem");
2545#endif 2512#endif
2546 2513
2547 /* Is there space or do we need to sleep? */
2548 spin_lock(&log->l_grant_lock);
2549
2550 trace_xfs_log_grant_enter(log, tic); 2514 trace_xfs_log_grant_enter(log, tic);
2551 2515
2516 need_bytes = tic->t_unit_res;
2517 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2518 need_bytes *= tic->t_ocnt;
2519
2552 /* something is already sleeping; insert new transaction at end */ 2520 /* something is already sleeping; insert new transaction at end */
2553 if (log->l_reserve_headq) { 2521 if (!list_empty_careful(&log->l_reserveq)) {
2554 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2522 spin_lock(&log->l_grant_reserve_lock);
2523 /* recheck the queue now we are locked */
2524 if (list_empty(&log->l_reserveq)) {
2525 spin_unlock(&log->l_grant_reserve_lock);
2526 goto redo;
2527 }
2528 list_add_tail(&tic->t_queue, &log->l_reserveq);
2555 2529
2556 trace_xfs_log_grant_sleep1(log, tic); 2530 trace_xfs_log_grant_sleep1(log, tic);
2557 2531
@@ -2563,72 +2537,57 @@ xlog_grant_log_space(xlog_t *log,
2563 goto error_return; 2537 goto error_return;
2564 2538
2565 XFS_STATS_INC(xs_sleep_logspace); 2539 XFS_STATS_INC(xs_sleep_logspace);
2566 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); 2540 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2541
2567 /* 2542 /*
2568 * If we got an error, and the filesystem is shutting down, 2543 * If we got an error, and the filesystem is shutting down,
2569 * we'll catch it down below. So just continue... 2544 * we'll catch it down below. So just continue...
2570 */ 2545 */
2571 trace_xfs_log_grant_wake1(log, tic); 2546 trace_xfs_log_grant_wake1(log, tic);
2572 spin_lock(&log->l_grant_lock);
2573 } 2547 }
2574 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2575 need_bytes = tic->t_unit_res*tic->t_ocnt;
2576 else
2577 need_bytes = tic->t_unit_res;
2578 2548
2579redo: 2549redo:
2580 if (XLOG_FORCED_SHUTDOWN(log)) 2550 if (XLOG_FORCED_SHUTDOWN(log))
2581 goto error_return; 2551 goto error_return_unlocked;
2582 2552
2583 free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, 2553 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
2584 log->l_grant_reserve_bytes);
2585 if (free_bytes < need_bytes) { 2554 if (free_bytes < need_bytes) {
2586 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2555 spin_lock(&log->l_grant_reserve_lock);
2587 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2556 if (list_empty(&tic->t_queue))
2557 list_add_tail(&tic->t_queue, &log->l_reserveq);
2588 2558
2589 trace_xfs_log_grant_sleep2(log, tic); 2559 trace_xfs_log_grant_sleep2(log, tic);
2590 2560
2591 spin_unlock(&log->l_grant_lock);
2592 xlog_grant_push_ail(log->l_mp, need_bytes);
2593 spin_lock(&log->l_grant_lock);
2594
2595 XFS_STATS_INC(xs_sleep_logspace);
2596 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2597
2598 spin_lock(&log->l_grant_lock);
2599 if (XLOG_FORCED_SHUTDOWN(log)) 2561 if (XLOG_FORCED_SHUTDOWN(log))
2600 goto error_return; 2562 goto error_return;
2601 2563
2602 trace_xfs_log_grant_wake2(log, tic); 2564 xlog_grant_push_ail(log, need_bytes);
2565
2566 XFS_STATS_INC(xs_sleep_logspace);
2567 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2603 2568
2569 trace_xfs_log_grant_wake2(log, tic);
2604 goto redo; 2570 goto redo;
2605 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2571 }
2606 xlog_del_ticketq(&log->l_reserve_headq, tic);
2607 2572
2608 /* we've got enough space */ 2573 if (!list_empty(&tic->t_queue)) {
2609 xlog_grant_add_space(log, need_bytes); 2574 spin_lock(&log->l_grant_reserve_lock);
2610#ifdef DEBUG 2575 list_del_init(&tic->t_queue);
2611 tail_lsn = log->l_tail_lsn; 2576 spin_unlock(&log->l_grant_reserve_lock);
2612 /*
2613 * Check to make sure the grant write head didn't just over lap the
2614 * tail. If the cycles are the same, we can't be overlapping.
2615 * Otherwise, make sure that the cycles differ by exactly one and
2616 * check the byte count.
2617 */
2618 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2619 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2620 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2621 } 2577 }
2622#endif 2578
2579 /* we've got enough space */
2580 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
2581 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2623 trace_xfs_log_grant_exit(log, tic); 2582 trace_xfs_log_grant_exit(log, tic);
2624 xlog_verify_grant_head(log, 1); 2583 xlog_verify_grant_tail(log);
2625 spin_unlock(&log->l_grant_lock);
2626 return 0; 2584 return 0;
2627 2585
2628 error_return: 2586error_return_unlocked:
2629 if (tic->t_flags & XLOG_TIC_IN_Q) 2587 spin_lock(&log->l_grant_reserve_lock);
2630 xlog_del_ticketq(&log->l_reserve_headq, tic); 2588error_return:
2631 2589 list_del_init(&tic->t_queue);
2590 spin_unlock(&log->l_grant_reserve_lock);
2632 trace_xfs_log_grant_error(log, tic); 2591 trace_xfs_log_grant_error(log, tic);
2633 2592
2634 /* 2593 /*
@@ -2638,7 +2597,6 @@ redo:
2638 */ 2597 */
2639 tic->t_curr_res = 0; 2598 tic->t_curr_res = 0;
2640 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2599 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2641 spin_unlock(&log->l_grant_lock);
2642 return XFS_ERROR(EIO); 2600 return XFS_ERROR(EIO);
2643} /* xlog_grant_log_space */ 2601} /* xlog_grant_log_space */
2644 2602
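
The fast path described in the comment before xlog_grant_log_space() is the classic unlocked-check-then-locked-recheck idiom. A userspace model, where the racy unlocked peek plays the role of list_empty_careful():

        #include <pthread.h>
        #include <stdio.h>

        struct waitq {
                pthread_mutex_t lock;           /* models l_grant_reserve_lock */
                int             nr_waiters;     /* models the l_reserveq list */
        };

        /* unlocked peek, like list_empty_careful() on the reserve queue */
        static int queue_empty_careful(struct waitq *q)
        {
                return q->nr_waiters == 0;      /* racy by design; rechecked below */
        }

        static void reserve(struct waitq *q)
        {
                if (!queue_empty_careful(q)) {
                        pthread_mutex_lock(&q->lock);
                        /* recheck now that we are locked */
                        if (q->nr_waiters == 0) {
                                pthread_mutex_unlock(&q->lock);
                                goto fast;      /* queue drained while we raced */
                        }
                        q->nr_waiters++;        /* join the queue and sleep */
                        pthread_mutex_unlock(&q->lock);
                        printf("slow path: queued behind other waiters\n");
                        return;
                }
        fast:
                printf("fast path: no waiters, lock never taken\n");
        }

        int main(void)
        {
                struct waitq q = { PTHREAD_MUTEX_INITIALIZER, 0 };

                reserve(&q);                    /* empty queue: fast path */
                q.nr_waiters = 1;
                reserve(&q);                    /* waiters present: lock + recheck */
                return 0;
        }
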
@@ -2646,17 +2604,14 @@ redo:
2646/* 2604/*
2647 * Replenish the byte reservation required by moving the grant write head. 2605 * Replenish the byte reservation required by moving the grant write head.
2648 * 2606 *
2649 * 2607 * Similar to xlog_grant_log_space, the function is structured to have a lock
2608 * free fast path.
2650 */ 2609 */
2651STATIC int 2610STATIC int
2652xlog_regrant_write_log_space(xlog_t *log, 2611xlog_regrant_write_log_space(xlog_t *log,
2653 xlog_ticket_t *tic) 2612 xlog_ticket_t *tic)
2654{ 2613{
2655 int free_bytes, need_bytes; 2614 int free_bytes, need_bytes;
2656 xlog_ticket_t *ntic;
2657#ifdef DEBUG
2658 xfs_lsn_t tail_lsn;
2659#endif
2660 2615
2661 tic->t_curr_res = tic->t_unit_res; 2616 tic->t_curr_res = tic->t_unit_res;
2662 xlog_tic_reset_res(tic); 2617 xlog_tic_reset_res(tic);
@@ -2669,12 +2624,9 @@ xlog_regrant_write_log_space(xlog_t *log,
2669 panic("regrant Recovery problem"); 2624 panic("regrant Recovery problem");
2670#endif 2625#endif
2671 2626
2672 spin_lock(&log->l_grant_lock);
2673
2674 trace_xfs_log_regrant_write_enter(log, tic); 2627 trace_xfs_log_regrant_write_enter(log, tic);
2675
2676 if (XLOG_FORCED_SHUTDOWN(log)) 2628 if (XLOG_FORCED_SHUTDOWN(log))
2677 goto error_return; 2629 goto error_return_unlocked;
2678 2630
2679 /* If there are other waiters on the queue then give them a 2631 /* If there are other waiters on the queue then give them a
2680 * chance at logspace before us. Wake up the first waiters, 2632 * chance at logspace before us. Wake up the first waiters,
@@ -2683,92 +2635,76 @@ xlog_regrant_write_log_space(xlog_t *log,
2683 * this transaction. 2635 * this transaction.
2684 */ 2636 */
2685 need_bytes = tic->t_unit_res; 2637 need_bytes = tic->t_unit_res;
2686 if ((ntic = log->l_write_headq)) { 2638 if (!list_empty_careful(&log->l_writeq)) {
2687 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2639 struct xlog_ticket *ntic;
2688 log->l_grant_write_bytes); 2640
2689 do { 2641 spin_lock(&log->l_grant_write_lock);
2642 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2643 list_for_each_entry(ntic, &log->l_writeq, t_queue) {
2690 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); 2644 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
2691 2645
2692 if (free_bytes < ntic->t_unit_res) 2646 if (free_bytes < ntic->t_unit_res)
2693 break; 2647 break;
2694 free_bytes -= ntic->t_unit_res; 2648 free_bytes -= ntic->t_unit_res;
2695 sv_signal(&ntic->t_wait); 2649 wake_up(&ntic->t_wait);
2696 ntic = ntic->t_next; 2650 }
2697 } while (ntic != log->l_write_headq);
2698
2699 if (ntic != log->l_write_headq) {
2700 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2701 xlog_ins_ticketq(&log->l_write_headq, tic);
2702 2651
2652 if (ntic != list_first_entry(&log->l_writeq,
2653 struct xlog_ticket, t_queue)) {
2654 if (list_empty(&tic->t_queue))
2655 list_add_tail(&tic->t_queue, &log->l_writeq);
2703 trace_xfs_log_regrant_write_sleep1(log, tic); 2656 trace_xfs_log_regrant_write_sleep1(log, tic);
2704 2657
2705 spin_unlock(&log->l_grant_lock); 2658 xlog_grant_push_ail(log, need_bytes);
2706 xlog_grant_push_ail(log->l_mp, need_bytes);
2707 spin_lock(&log->l_grant_lock);
2708 2659
2709 XFS_STATS_INC(xs_sleep_logspace); 2660 XFS_STATS_INC(xs_sleep_logspace);
2710 sv_wait(&tic->t_wait, PINOD|PLTWAIT, 2661 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2711 &log->l_grant_lock, s);
2712
2713 /* If we're shutting down, this tic is already
2714 * off the queue */
2715 spin_lock(&log->l_grant_lock);
2716 if (XLOG_FORCED_SHUTDOWN(log))
2717 goto error_return;
2718
2719 trace_xfs_log_regrant_write_wake1(log, tic); 2662 trace_xfs_log_regrant_write_wake1(log, tic);
2720 } 2663 } else
2664 spin_unlock(&log->l_grant_write_lock);
2721 } 2665 }
2722 2666
2723redo: 2667redo:
2724 if (XLOG_FORCED_SHUTDOWN(log)) 2668 if (XLOG_FORCED_SHUTDOWN(log))
2725 goto error_return; 2669 goto error_return_unlocked;
2726 2670
2727 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2671 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2728 log->l_grant_write_bytes);
2729 if (free_bytes < need_bytes) { 2672 if (free_bytes < need_bytes) {
2730 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2673 spin_lock(&log->l_grant_write_lock);
2731 xlog_ins_ticketq(&log->l_write_headq, tic); 2674 if (list_empty(&tic->t_queue))
2732 spin_unlock(&log->l_grant_lock); 2675 list_add_tail(&tic->t_queue, &log->l_writeq);
2733 xlog_grant_push_ail(log->l_mp, need_bytes);
2734 spin_lock(&log->l_grant_lock);
2735
2736 XFS_STATS_INC(xs_sleep_logspace);
2737 trace_xfs_log_regrant_write_sleep2(log, tic);
2738
2739 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2740 2676
2741 /* If we're shutting down, this tic is already off the queue */
2742 spin_lock(&log->l_grant_lock);
2743 if (XLOG_FORCED_SHUTDOWN(log)) 2677 if (XLOG_FORCED_SHUTDOWN(log))
2744 goto error_return; 2678 goto error_return;
2745 2679
2680 xlog_grant_push_ail(log, need_bytes);
2681
2682 XFS_STATS_INC(xs_sleep_logspace);
2683 trace_xfs_log_regrant_write_sleep2(log, tic);
2684 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2685
2746 trace_xfs_log_regrant_write_wake2(log, tic); 2686 trace_xfs_log_regrant_write_wake2(log, tic);
2747 goto redo; 2687 goto redo;
2748 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2688 }
2749 xlog_del_ticketq(&log->l_write_headq, tic);
2750 2689
2751 /* we've got enough space */ 2690 if (!list_empty(&tic->t_queue)) {
2752 xlog_grant_add_space_write(log, need_bytes); 2691 spin_lock(&log->l_grant_write_lock);
2753#ifdef DEBUG 2692 list_del_init(&tic->t_queue);
2754 tail_lsn = log->l_tail_lsn; 2693 spin_unlock(&log->l_grant_write_lock);
2755 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2756 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2757 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2758 } 2694 }
2759#endif
2760 2695
2696 /* we've got enough space */
2697 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2761 trace_xfs_log_regrant_write_exit(log, tic); 2698 trace_xfs_log_regrant_write_exit(log, tic);
2762 2699 xlog_verify_grant_tail(log);
2763 xlog_verify_grant_head(log, 1);
2764 spin_unlock(&log->l_grant_lock);
2765 return 0; 2700 return 0;
2766 2701
2767 2702
2703 error_return_unlocked:
2704 spin_lock(&log->l_grant_write_lock);
2768 error_return: 2705 error_return:
2769 if (tic->t_flags & XLOG_TIC_IN_Q) 2706 list_del_init(&tic->t_queue);
2770 xlog_del_ticketq(&log->l_reserve_headq, tic); 2707 spin_unlock(&log->l_grant_write_lock);
2771
2772 trace_xfs_log_regrant_write_error(log, tic); 2708 trace_xfs_log_regrant_write_error(log, tic);
2773 2709
2774 /* 2710 /*
@@ -2778,7 +2714,6 @@ redo:
2778 */ 2714 */
2779 tic->t_curr_res = 0; 2715 tic->t_curr_res = 0;
2780 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2716 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2781 spin_unlock(&log->l_grant_lock);
2782 return XFS_ERROR(EIO); 2717 return XFS_ERROR(EIO);
2783} /* xlog_regrant_write_log_space */ 2718} /* xlog_regrant_write_log_space */
2784 2719
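A compact user-space sketch of the wake-up policy implemented by the list_for_each_entry() loop in xlog_regrant_write_log_space() above (illustrative values and names only, not part of the patch): queued tickets are woken in FIFO order for as long as the remaining free space covers each waiter's t_unit_res.

	/* Sketch: wake queued write reservations in FIFO order while the
	 * remaining free space still covers each waiter's unit reservation. */
	#include <stdio.h>

	int main(void)
	{
		int unit_res[] = { 300, 500, 400 };	/* queued tickets */
		int free_bytes = 900;
		unsigned i;

		for (i = 0; i < sizeof(unit_res) / sizeof(unit_res[0]); i++) {
			if (free_bytes < unit_res[i])
				break;		/* first ticket that doesn't fit */
			free_bytes -= unit_res[i];
			printf("wake ticket %u (%d bytes)\n", i, unit_res[i]);
		}
		return 0;	/* wakes tickets 0 and 1; 400 > 100 left over */
	}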
@@ -2799,27 +2734,24 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2799 if (ticket->t_cnt > 0) 2734 if (ticket->t_cnt > 0)
2800 ticket->t_cnt--; 2735 ticket->t_cnt--;
2801 2736
2802 spin_lock(&log->l_grant_lock); 2737 xlog_grant_sub_space(log, &log->l_grant_reserve_head,
2803 xlog_grant_sub_space(log, ticket->t_curr_res); 2738 ticket->t_curr_res);
2739 xlog_grant_sub_space(log, &log->l_grant_write_head,
2740 ticket->t_curr_res);
2804 ticket->t_curr_res = ticket->t_unit_res; 2741 ticket->t_curr_res = ticket->t_unit_res;
2805 xlog_tic_reset_res(ticket); 2742 xlog_tic_reset_res(ticket);
2806 2743
2807 trace_xfs_log_regrant_reserve_sub(log, ticket); 2744 trace_xfs_log_regrant_reserve_sub(log, ticket);
2808 2745
2809 xlog_verify_grant_head(log, 1);
2810
2811 /* just return if we still have some of the pre-reserved space */ 2746 /* just return if we still have some of the pre-reserved space */
2812 if (ticket->t_cnt > 0) { 2747 if (ticket->t_cnt > 0)
2813 spin_unlock(&log->l_grant_lock);
2814 return; 2748 return;
2815 }
2816 2749
2817 xlog_grant_add_space_reserve(log, ticket->t_unit_res); 2750 xlog_grant_add_space(log, &log->l_grant_reserve_head,
2751 ticket->t_unit_res);
2818 2752
2819 trace_xfs_log_regrant_reserve_exit(log, ticket); 2753 trace_xfs_log_regrant_reserve_exit(log, ticket);
2820 2754
2821 xlog_verify_grant_head(log, 0);
2822 spin_unlock(&log->l_grant_lock);
2823 ticket->t_curr_res = ticket->t_unit_res; 2755 ticket->t_curr_res = ticket->t_unit_res;
2824 xlog_tic_reset_res(ticket); 2756 xlog_tic_reset_res(ticket);
2825} /* xlog_regrant_reserve_log_space */ 2757} /* xlog_regrant_reserve_log_space */
@@ -2843,28 +2775,29 @@ STATIC void
2843xlog_ungrant_log_space(xlog_t *log, 2775xlog_ungrant_log_space(xlog_t *log,
2844 xlog_ticket_t *ticket) 2776 xlog_ticket_t *ticket)
2845{ 2777{
2778 int bytes;
2779
2846 if (ticket->t_cnt > 0) 2780 if (ticket->t_cnt > 0)
2847 ticket->t_cnt--; 2781 ticket->t_cnt--;
2848 2782
2849 spin_lock(&log->l_grant_lock);
2850 trace_xfs_log_ungrant_enter(log, ticket); 2783 trace_xfs_log_ungrant_enter(log, ticket);
2851
2852 xlog_grant_sub_space(log, ticket->t_curr_res);
2853
2854 trace_xfs_log_ungrant_sub(log, ticket); 2784 trace_xfs_log_ungrant_sub(log, ticket);
2855 2785
2856 /* If this is a permanent reservation ticket, we may be able to free 2786 /*
2787 * If this is a permanent reservation ticket, we may be able to free
2857 * up more space based on the remaining count. 2788 * up more space based on the remaining count.
2858 */ 2789 */
2790 bytes = ticket->t_curr_res;
2859 if (ticket->t_cnt > 0) { 2791 if (ticket->t_cnt > 0) {
2860 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); 2792 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
2861 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); 2793 bytes += ticket->t_unit_res*ticket->t_cnt;
2862 } 2794 }
2863 2795
2796 xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
2797 xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
2798
2864 trace_xfs_log_ungrant_exit(log, ticket); 2799 trace_xfs_log_ungrant_exit(log, ticket);
2865 2800
2866 xlog_verify_grant_head(log, 1);
2867 spin_unlock(&log->l_grant_lock);
2868 xfs_log_move_tail(log->l_mp, 1); 2801 xfs_log_move_tail(log->l_mp, 1);
2869} /* xlog_ungrant_log_space */ 2802} /* xlog_ungrant_log_space */
2870 2803
@@ -2901,11 +2834,11 @@ xlog_state_release_iclog(
2901 2834
2902 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { 2835 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
2903 /* update tail before writing to iclog */ 2836 /* update tail before writing to iclog */
2904 xlog_assign_tail_lsn(log->l_mp); 2837 xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
2905 sync++; 2838 sync++;
2906 iclog->ic_state = XLOG_STATE_SYNCING; 2839 iclog->ic_state = XLOG_STATE_SYNCING;
2907 iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); 2840 iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
2908 xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); 2841 xlog_verify_tail_lsn(log, iclog, tail_lsn);
2909 /* cycle incremented when incrementing curr_block */ 2842 /* cycle incremented when incrementing curr_block */
2910 } 2843 }
2911 spin_unlock(&log->l_icloglock); 2844 spin_unlock(&log->l_icloglock);
@@ -3088,7 +3021,7 @@ maybe_sleep:
3088 return XFS_ERROR(EIO); 3021 return XFS_ERROR(EIO);
3089 } 3022 }
3090 XFS_STATS_INC(xs_log_force_sleep); 3023 XFS_STATS_INC(xs_log_force_sleep);
3091 sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); 3024 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3092 /* 3025 /*
3093 * No need to grab the log lock here since we're 3026 * No need to grab the log lock here since we're
3094 * only deciding whether or not to return EIO 3027 * only deciding whether or not to return EIO
@@ -3206,8 +3139,8 @@ try_again:
3206 3139
3207 XFS_STATS_INC(xs_log_force_sleep); 3140 XFS_STATS_INC(xs_log_force_sleep);
3208 3141
3209 sv_wait(&iclog->ic_prev->ic_write_wait, 3142 xlog_wait(&iclog->ic_prev->ic_write_wait,
3210 PSWP, &log->l_icloglock, s); 3143 &log->l_icloglock);
3211 if (log_flushed) 3144 if (log_flushed)
3212 *log_flushed = 1; 3145 *log_flushed = 1;
3213 already_slept = 1; 3146 already_slept = 1;
@@ -3235,7 +3168,7 @@ try_again:
3235 return XFS_ERROR(EIO); 3168 return XFS_ERROR(EIO);
3236 } 3169 }
3237 XFS_STATS_INC(xs_log_force_sleep); 3170 XFS_STATS_INC(xs_log_force_sleep);
3238 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); 3171 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3239 /* 3172 /*
3240 * No need to grab the log lock here since we're 3173 * No need to grab the log lock here since we're
3241 * only deciding whether or not to return EIO 3174 * only deciding whether or not to return EIO
@@ -3310,10 +3243,8 @@ xfs_log_ticket_put(
3310 xlog_ticket_t *ticket) 3243 xlog_ticket_t *ticket)
3311{ 3244{
3312 ASSERT(atomic_read(&ticket->t_ref) > 0); 3245 ASSERT(atomic_read(&ticket->t_ref) > 0);
3313 if (atomic_dec_and_test(&ticket->t_ref)) { 3246 if (atomic_dec_and_test(&ticket->t_ref))
3314 sv_destroy(&ticket->t_wait);
3315 kmem_zone_free(xfs_log_ticket_zone, ticket); 3247 kmem_zone_free(xfs_log_ticket_zone, ticket);
3316 }
3317} 3248}
3318 3249
3319xlog_ticket_t * 3250xlog_ticket_t *
@@ -3435,6 +3366,7 @@ xlog_ticket_alloc(
3435 } 3366 }
3436 3367
3437 atomic_set(&tic->t_ref, 1); 3368 atomic_set(&tic->t_ref, 1);
3369 INIT_LIST_HEAD(&tic->t_queue);
3438 tic->t_unit_res = unit_bytes; 3370 tic->t_unit_res = unit_bytes;
3439 tic->t_curr_res = unit_bytes; 3371 tic->t_curr_res = unit_bytes;
3440 tic->t_cnt = cnt; 3372 tic->t_cnt = cnt;
@@ -3445,7 +3377,7 @@ xlog_ticket_alloc(
3445 tic->t_trans_type = 0; 3377 tic->t_trans_type = 0;
3446 if (xflags & XFS_LOG_PERM_RESERV) 3378 if (xflags & XFS_LOG_PERM_RESERV)
3447 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3379 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3448 sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); 3380 init_waitqueue_head(&tic->t_wait);
3449 3381
3450 xlog_tic_reset_res(tic); 3382 xlog_tic_reset_res(tic);
3451 3383
@@ -3484,18 +3416,25 @@ xlog_verify_dest_ptr(
3484} 3416}
3485 3417
3486STATIC void 3418STATIC void
3487xlog_verify_grant_head(xlog_t *log, int equals) 3419xlog_verify_grant_tail(
3420 struct log *log)
3488{ 3421{
3489 if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { 3422 int tail_cycle, tail_blocks;
3490 if (equals) 3423 int cycle, space;
3491 ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); 3424
3492 else 3425 /*
3493 ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); 3426 * Check to make sure the grant write head didn't just overlap the
3494 } else { 3427 * tail. If the cycles are the same, we can't be overlapping.
3495 ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); 3428 * Otherwise, make sure that the cycles differ by exactly one and
3496 ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); 3429 * check the byte count.
3497 } 3430 */
3498} /* xlog_verify_grant_head */ 3431 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
3432 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3433 if (tail_cycle != cycle) {
3434 ASSERT(cycle - 1 == tail_cycle);
3435 ASSERT(space <= BBTOB(tail_blocks));
3436 }
3437}
3499 3438
3500/* check if it will fit */ 3439/* check if it will fit */
3501STATIC void 3440STATIC void
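The invariant xlog_verify_grant_tail() asserts above can be illustrated with a small standalone sketch (made-up cycle/block values; BBTOB is assumed to be the usual 512-byte basic-block-to-byte conversion):

	/* Sketch: the write head may wrap at most one cycle past the tail,
	 * and then only up to the tail's byte offset. */
	#include <assert.h>

	#define BBTOB(bbs)	((long)(bbs) << 9)	/* basic blocks -> bytes */

	static void verify_grant_tail(int cycle, int space,
				      int tail_cycle, int tail_blocks)
	{
		if (tail_cycle != cycle) {
			assert(cycle - 1 == tail_cycle);	/* wrapped once */
			assert(space <= BBTOB(tail_blocks));	/* not past tail */
		}
	}

	int main(void)
	{
		verify_grant_tail(5, 4096, 5, 100);	/* same cycle: no check */
		verify_grant_tail(6, 4096, 5, 100);	/* 4096 <= 51200: OK */
		return 0;
	}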
@@ -3716,12 +3655,10 @@ xfs_log_force_umount(
3716 xlog_cil_force(log); 3655 xlog_cil_force(log);
3717 3656
3718 /* 3657 /*
3719 * We must hold both the GRANT lock and the LOG lock, 3658 * mark the filesystem and the log as in a shutdown state and wake
3720 * before we mark the filesystem SHUTDOWN and wake 3659 * everybody up to tell them the bad news.
3721 * everybody up to tell the bad news.
3722 */ 3660 */
3723 spin_lock(&log->l_icloglock); 3661 spin_lock(&log->l_icloglock);
3724 spin_lock(&log->l_grant_lock);
3725 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3662 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3726 if (mp->m_sb_bp) 3663 if (mp->m_sb_bp)
3727 XFS_BUF_DONE(mp->m_sb_bp); 3664 XFS_BUF_DONE(mp->m_sb_bp);
@@ -3742,27 +3679,21 @@ xfs_log_force_umount(
3742 spin_unlock(&log->l_icloglock); 3679 spin_unlock(&log->l_icloglock);
3743 3680
3744 /* 3681 /*
3745 * We don't want anybody waiting for log reservations 3682 * We don't want anybody waiting for log reservations after this. That
3746 * after this. That means we have to wake up everybody 3683 * means we have to wake up everybody queued up on reserveq as well as
3747 * queued up on reserve_headq as well as write_headq. 3684 * writeq. In addition, we make sure in xlog_{re}grant_log_space that
3748 * In addition, we make sure in xlog_{re}grant_log_space 3685 * we don't enqueue anything once the SHUTDOWN flag is set, and this
3749 * that we don't enqueue anything once the SHUTDOWN flag 3686 * action is protected by the grant locks.
3750 * is set, and this action is protected by the GRANTLOCK.
3751 */ 3687 */
3752 if ((tic = log->l_reserve_headq)) { 3688 spin_lock(&log->l_grant_reserve_lock);
3753 do { 3689 list_for_each_entry(tic, &log->l_reserveq, t_queue)
3754 sv_signal(&tic->t_wait); 3690 wake_up(&tic->t_wait);
3755 tic = tic->t_next; 3691 spin_unlock(&log->l_grant_reserve_lock);
3756 } while (tic != log->l_reserve_headq); 3692
3757 } 3693 spin_lock(&log->l_grant_write_lock);
3758 3694 list_for_each_entry(tic, &log->l_writeq, t_queue)
3759 if ((tic = log->l_write_headq)) { 3695 wake_up(&tic->t_wait);
3760 do { 3696 spin_unlock(&log->l_grant_write_lock);
3761 sv_signal(&tic->t_wait);
3762 tic = tic->t_next;
3763 } while (tic != log->l_write_headq);
3764 }
3765 spin_unlock(&log->l_grant_lock);
3766 3697
3767 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3698 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3768 ASSERT(!logerror); 3699 ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 23d6ceb5e97b..9dc8125d04e5 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -61,7 +61,7 @@ xlog_cil_init(
61 INIT_LIST_HEAD(&cil->xc_committing); 61 INIT_LIST_HEAD(&cil->xc_committing);
62 spin_lock_init(&cil->xc_cil_lock); 62 spin_lock_init(&cil->xc_cil_lock);
63 init_rwsem(&cil->xc_ctx_lock); 63 init_rwsem(&cil->xc_ctx_lock);
64 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); 64 init_waitqueue_head(&cil->xc_commit_wait);
65 65
66 INIT_LIST_HEAD(&ctx->committing); 66 INIT_LIST_HEAD(&ctx->committing);
67 INIT_LIST_HEAD(&ctx->busy_extents); 67 INIT_LIST_HEAD(&ctx->busy_extents);
@@ -361,15 +361,10 @@ xlog_cil_committed(
361 int abort) 361 int abort)
362{ 362{
363 struct xfs_cil_ctx *ctx = args; 363 struct xfs_cil_ctx *ctx = args;
364 struct xfs_log_vec *lv;
365 int abortflag = abort ? XFS_LI_ABORTED : 0;
366 struct xfs_busy_extent *busyp, *n; 364 struct xfs_busy_extent *busyp, *n;
367 365
368 /* unpin all the log items */ 366 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
369 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { 367 ctx->start_lsn, abort);
370 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
371 abortflag);
372 }
373 368
374 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) 369 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
375 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); 370 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
@@ -568,7 +563,7 @@ restart:
568 * It is still being pushed! Wait for the push to 563 * It is still being pushed! Wait for the push to
569 * complete, then start again from the beginning. 564 * complete, then start again from the beginning.
570 */ 565 */
571 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 566 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
572 goto restart; 567 goto restart;
573 } 568 }
574 } 569 }
@@ -592,7 +587,7 @@ restart:
592 */ 587 */
593 spin_lock(&cil->xc_cil_lock); 588 spin_lock(&cil->xc_cil_lock);
594 ctx->commit_lsn = commit_lsn; 589 ctx->commit_lsn = commit_lsn;
595 sv_broadcast(&cil->xc_commit_wait); 590 wake_up_all(&cil->xc_commit_wait);
596 spin_unlock(&cil->xc_cil_lock); 591 spin_unlock(&cil->xc_cil_lock);
597 592
598 /* release the hounds! */ 593 /* release the hounds! */
@@ -757,7 +752,7 @@ restart:
757 * It is still being pushed! Wait for the push to 752 * It is still being pushed! Wait for the push to
758 * complete, then start again from the beginning. 753 * complete, then start again from the beginning.
759 */ 754 */
760 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 755 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
761 goto restart; 756 goto restart;
762 } 757 }
763 if (ctx->sequence != sequence) 758 if (ctx->sequence != sequence)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index edcdfe01617f..d5f8be8f4bf6 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -21,7 +21,6 @@
21struct xfs_buf; 21struct xfs_buf;
22struct log; 22struct log;
23struct xlog_ticket; 23struct xlog_ticket;
24struct xfs_buf_cancel;
25struct xfs_mount; 24struct xfs_mount;
26 25
27/* 26/*
@@ -54,7 +53,6 @@ struct xfs_mount;
54 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ 53 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
55 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) 54 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
56 55
57
58static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) 56static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
59{ 57{
60 return ((xfs_lsn_t)cycle << 32) | block; 58 return ((xfs_lsn_t)cycle << 32) | block;
@@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i)
133 */ 131 */
134#define XLOG_TIC_INITED 0x1 /* has been initialized */ 132#define XLOG_TIC_INITED 0x1 /* has been initialized */
135#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ 133#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
136#define XLOG_TIC_IN_Q 0x4
137 134
138#define XLOG_TIC_FLAGS \ 135#define XLOG_TIC_FLAGS \
139 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ 136 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
140 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ 137 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
141 { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" }
142 138
143#endif /* __KERNEL__ */ 139#endif /* __KERNEL__ */
144 140
@@ -244,9 +240,8 @@ typedef struct xlog_res {
244} xlog_res_t; 240} xlog_res_t;
245 241
246typedef struct xlog_ticket { 242typedef struct xlog_ticket {
247 sv_t t_wait; /* ticket wait queue : 20 */ 243 wait_queue_head_t t_wait; /* ticket wait queue */
248 struct xlog_ticket *t_next; /* :4|8 */ 244 struct list_head t_queue; /* reserve/write queue */
249 struct xlog_ticket *t_prev; /* :4|8 */
250 xlog_tid_t t_tid; /* transaction identifier : 4 */ 245 xlog_tid_t t_tid; /* transaction identifier : 4 */
251 atomic_t t_ref; /* ticket reference count : 4 */ 246 atomic_t t_ref; /* ticket reference count : 4 */
252 int t_curr_res; /* current reservation in bytes : 4 */ 247 int t_curr_res; /* current reservation in bytes : 4 */
@@ -353,8 +348,8 @@ typedef union xlog_in_core2 {
353 * and move everything else out to subsequent cachelines. 348 * and move everything else out to subsequent cachelines.
354 */ 349 */
355typedef struct xlog_in_core { 350typedef struct xlog_in_core {
356 sv_t ic_force_wait; 351 wait_queue_head_t ic_force_wait;
357 sv_t ic_write_wait; 352 wait_queue_head_t ic_write_wait;
358 struct xlog_in_core *ic_next; 353 struct xlog_in_core *ic_next;
359 struct xlog_in_core *ic_prev; 354 struct xlog_in_core *ic_prev;
360 struct xfs_buf *ic_bp; 355 struct xfs_buf *ic_bp;
@@ -421,7 +416,7 @@ struct xfs_cil {
421 struct xfs_cil_ctx *xc_ctx; 416 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock; 417 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing; 418 struct list_head xc_committing;
424 sv_t xc_commit_wait; 419 wait_queue_head_t xc_commit_wait;
425 xfs_lsn_t xc_current_sequence; 420 xfs_lsn_t xc_current_sequence;
426}; 421};
427 422
@@ -491,7 +486,7 @@ typedef struct log {
491 struct xfs_buftarg *l_targ; /* buftarg of log */ 486 struct xfs_buftarg *l_targ; /* buftarg of log */
492 uint l_flags; 487 uint l_flags;
493 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 488 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
494 struct xfs_buf_cancel **l_buf_cancel_table; 489 struct list_head *l_buf_cancel_table;
495 int l_iclog_hsize; /* size of iclog header */ 490 int l_iclog_hsize; /* size of iclog header */
496 int l_iclog_heads; /* # of iclog header sectors */ 491 int l_iclog_heads; /* # of iclog header sectors */
497 uint l_sectBBsize; /* sector size in BBs (2^n) */ 492 uint l_sectBBsize; /* sector size in BBs (2^n) */
@@ -503,29 +498,40 @@ typedef struct log {
503 int l_logBBsize; /* size of log in BB chunks */ 498 int l_logBBsize; /* size of log in BB chunks */
504 499
505 /* The following block of fields are changed while holding icloglock */ 500 /* The following block of fields are changed while holding icloglock */
506 sv_t l_flush_wait ____cacheline_aligned_in_smp; 501 wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp;
507 /* waiting for iclog flush */ 502 /* waiting for iclog flush */
508 int l_covered_state;/* state of "covering disk 503 int l_covered_state;/* state of "covering disk
509 * log entries" */ 504 * log entries" */
510 xlog_in_core_t *l_iclog; /* head log queue */ 505 xlog_in_core_t *l_iclog; /* head log queue */
511 spinlock_t l_icloglock; /* grab to change iclog state */ 506 spinlock_t l_icloglock; /* grab to change iclog state */
512 xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
513 * buffers */
514 xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
515 int l_curr_cycle; /* Cycle number of log writes */ 507 int l_curr_cycle; /* Cycle number of log writes */
516 int l_prev_cycle; /* Cycle number before last 508 int l_prev_cycle; /* Cycle number before last
517 * block increment */ 509 * block increment */
518 int l_curr_block; /* current logical log block */ 510 int l_curr_block; /* current logical log block */
519 int l_prev_block; /* previous logical log block */ 511 int l_prev_block; /* previous logical log block */
520 512
521 /* The following block of fields are changed while holding grant_lock */ 513 /*
522 spinlock_t l_grant_lock ____cacheline_aligned_in_smp; 514 * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
523 xlog_ticket_t *l_reserve_headq; 515 * read without needing to hold specific locks. To avoid operations
524 xlog_ticket_t *l_write_headq; 516 * contending with other hot objects, place each of them on a separate
525 int l_grant_reserve_cycle; 517 * cacheline.
526 int l_grant_reserve_bytes; 518 */
527 int l_grant_write_cycle; 519 /* lsn of last LR on disk */
528 int l_grant_write_bytes; 520 atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
 521 /* lsn of 1st LR with unflushed buffers */
522 atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
523
524 /*
 525 * ticket grant locks, queues and accounting have their own cachelines
526 * as these are quite hot and can be operated on concurrently.
527 */
528 spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
529 struct list_head l_reserveq;
530 atomic64_t l_grant_reserve_head;
531
532 spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
533 struct list_head l_writeq;
534 atomic64_t l_grant_write_head;
529 535
530 /* The following field are used for debugging; need to hold icloglock */ 536 /* The following field are used for debugging; need to hold icloglock */
531#ifdef DEBUG 537#ifdef DEBUG
@@ -534,6 +540,9 @@ typedef struct log {
534 540
535} xlog_t; 541} xlog_t;
536 542
543#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
544 ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
545
537#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 546#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
538 547
539/* common routines */ 548/* common routines */
@@ -562,6 +571,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
562 xlog_in_core_t **commit_iclog, uint flags); 571 xlog_in_core_t **commit_iclog, uint flags);
563 572
564/* 573/*
574 * When we crack an atomic LSN, we sample it first so that the value will not
575 * change while we are cracking it into the component values. This means we
576 * will always get consistent component values to work from. This should always
 577 * be used to sample and crack LSNs that are stored and updated in atomic
578 * variables.
579 */
580static inline void
581xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
582{
583 xfs_lsn_t val = atomic64_read(lsn);
584
585 *cycle = CYCLE_LSN(val);
586 *block = BLOCK_LSN(val);
587}
588
589/*
590 * Calculate and assign a value to an atomic LSN variable from component pieces.
591 */
592static inline void
593xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
594{
595 atomic64_set(lsn, xlog_assign_lsn(cycle, block));
596}
597
598/*
599 * When we crack the grant head, we sample it first so that the value will not
600 * change while we are cracking it into the component values. This means we
601 * will always get consistent component values to work from.
602 */
603static inline void
604xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
605{
606 *cycle = val >> 32;
607 *space = val & 0xffffffff;
608}
609
610static inline void
611xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
612{
613 xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
614}
615
616static inline int64_t
617xlog_assign_grant_head_val(int cycle, int space)
618{
619 return ((int64_t)cycle << 32) | space;
620}
621
622static inline void
623xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
624{
625 atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
626}
627
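As a sanity check on the encoding the helpers above implement, a hedged round-trip sketch in plain user-space C (the shift/mask arithmetic mirrors xlog_assign_grant_head_val() and xlog_crack_grant_head_val()):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		int cycle = 7, space = 123456;

		/* assign: cycle in the high 32 bits, byte count in the low 32 */
		int64_t head = ((int64_t)cycle << 32) | space;

		/* crack: recover the components from the sampled 64-bit value */
		assert((int)(head >> 32) == cycle);
		assert((int)(head & 0xffffffff) == space);
		return 0;
	}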
628/*
565 * Committed Item List interfaces 629 * Committed Item List interfaces
566 */ 630 */
567int xlog_cil_init(struct log *log); 631int xlog_cil_init(struct log *log);
@@ -585,6 +649,21 @@ xlog_cil_force(struct log *log)
585 */ 649 */
586#define XLOG_UNMOUNT_REC_TYPE (-1U) 650#define XLOG_UNMOUNT_REC_TYPE (-1U)
587 651
652/*
653 * Wrapper function for waiting on a wait queue serialised against wakeups
654 * by a spinlock. This matches the semantics of all the wait queues used in the
655 * log code.
656 */
657static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
658{
659 DECLARE_WAITQUEUE(wait, current);
660
661 add_wait_queue_exclusive(wq, &wait);
662 __set_current_state(TASK_UNINTERRUPTIBLE);
663 spin_unlock(lock);
664 schedule();
665 remove_wait_queue(wq, &wait);
666}
588#endif /* __KERNEL__ */ 667#endif /* __KERNEL__ */
589 668
590#endif /* __XFS_LOG_PRIV_H__ */ 669#endif /* __XFS_LOG_PRIV_H__ */
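For readers unfamiliar with the pattern, a user-space analogue of the xlog_wait()/wake_up() pairing introduced above (pthread-based, hypothetical names): pthread_cond_wait() drops the lock and sleeps atomically, mirroring xlog_wait()'s unlock-then-schedule under the wait queue; callers still recheck their condition after waking, just as the goto redo loops in xfs_log.c do.

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t grant_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  grant_wait = PTHREAD_COND_INITIALIZER;
	static int space_available;	/* stands in for free log space */

	static void *reserver(void *arg)
	{
		pthread_mutex_lock(&grant_lock);
		while (!space_available)	/* recheck after every wakeup */
			pthread_cond_wait(&grant_wait, &grant_lock);
		pthread_mutex_unlock(&grant_lock);
		puts("reservation granted");
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, reserver, NULL);

		pthread_mutex_lock(&grant_lock);
		space_available = 1;		/* e.g. the log tail moved */
		pthread_cond_signal(&grant_wait); /* like wake_up(&tic->t_wait) */
		pthread_mutex_unlock(&grant_lock);

		pthread_join(t, NULL);
		return 0;
	}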
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 966d3f97458c..aa0ebb776903 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -53,6 +53,17 @@ STATIC void xlog_recover_check_summary(xlog_t *);
53#endif 53#endif
54 54
55/* 55/*
56 * This structure is used during recovery to record the buf log items which
57 * have been canceled and should not be replayed.
58 */
59struct xfs_buf_cancel {
60 xfs_daddr_t bc_blkno;
61 uint bc_len;
62 int bc_refcount;
63 struct list_head bc_list;
64};
65
66/*
56 * Sector aligned buffer routines for buffer create/read/write/access 67 * Sector aligned buffer routines for buffer create/read/write/access
57 */ 68 */
58 69
@@ -925,12 +936,12 @@ xlog_find_tail(
925 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 936 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
926 if (found == 2) 937 if (found == 2)
927 log->l_curr_cycle++; 938 log->l_curr_cycle++;
928 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); 939 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
929 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); 940 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
930 log->l_grant_reserve_cycle = log->l_curr_cycle; 941 xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
931 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); 942 BBTOB(log->l_curr_block));
932 log->l_grant_write_cycle = log->l_curr_cycle; 943 xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
933 log->l_grant_write_bytes = BBTOB(log->l_curr_block); 944 BBTOB(log->l_curr_block));
934 945
935 /* 946 /*
936 * Look for unmount record. If we find it, then we know there 947 * Look for unmount record. If we find it, then we know there
@@ -960,7 +971,7 @@ xlog_find_tail(
960 } 971 }
961 after_umount_blk = (i + hblks + (int) 972 after_umount_blk = (i + hblks + (int)
962 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 973 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
963 tail_lsn = log->l_tail_lsn; 974 tail_lsn = atomic64_read(&log->l_tail_lsn);
964 if (*head_blk == after_umount_blk && 975 if (*head_blk == after_umount_blk &&
965 be32_to_cpu(rhead->h_num_logops) == 1) { 976 be32_to_cpu(rhead->h_num_logops) == 1) {
966 umount_data_blk = (i + hblks) % log->l_logBBsize; 977 umount_data_blk = (i + hblks) % log->l_logBBsize;
@@ -975,12 +986,10 @@ xlog_find_tail(
975 * log records will point recovery to after the 986 * log records will point recovery to after the
976 * current unmount record. 987 * current unmount record.
977 */ 988 */
978 log->l_tail_lsn = 989 xlog_assign_atomic_lsn(&log->l_tail_lsn,
979 xlog_assign_lsn(log->l_curr_cycle, 990 log->l_curr_cycle, after_umount_blk);
980 after_umount_blk); 991 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
981 log->l_last_sync_lsn = 992 log->l_curr_cycle, after_umount_blk);
982 xlog_assign_lsn(log->l_curr_cycle,
983 after_umount_blk);
984 *tail_blk = after_umount_blk; 993 *tail_blk = after_umount_blk;
985 994
986 /* 995 /*
@@ -1605,82 +1614,45 @@ xlog_recover_reorder_trans(
1605 * record in the table to tell us how many times we expect to see this 1614 * record in the table to tell us how many times we expect to see this
1606 * record during the second pass. 1615 * record during the second pass.
1607 */ 1616 */
1608STATIC void 1617STATIC int
1609xlog_recover_do_buffer_pass1( 1618xlog_recover_buffer_pass1(
1610 xlog_t *log, 1619 struct log *log,
1611 xfs_buf_log_format_t *buf_f) 1620 xlog_recover_item_t *item)
1612{ 1621{
1613 xfs_buf_cancel_t *bcp; 1622 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1614 xfs_buf_cancel_t *nextp; 1623 struct list_head *bucket;
1615 xfs_buf_cancel_t *prevp; 1624 struct xfs_buf_cancel *bcp;
1616 xfs_buf_cancel_t **bucket;
1617 xfs_daddr_t blkno = 0;
1618 uint len = 0;
1619 ushort flags = 0;
1620
1621 switch (buf_f->blf_type) {
1622 case XFS_LI_BUF:
1623 blkno = buf_f->blf_blkno;
1624 len = buf_f->blf_len;
1625 flags = buf_f->blf_flags;
1626 break;
1627 }
1628 1625
1629 /* 1626 /*
1630 * If this isn't a cancel buffer item, then just return. 1627 * If this isn't a cancel buffer item, then just return.
1631 */ 1628 */
1632 if (!(flags & XFS_BLF_CANCEL)) { 1629 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1633 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1630 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1634 return; 1631 return 0;
1635 }
1636
1637 /*
1638 * Insert an xfs_buf_cancel record into the hash table of
1639 * them. If there is already an identical record, bump
1640 * its reference count.
1641 */
1642 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1643 XLOG_BC_TABLE_SIZE];
1644 /*
1645 * If the hash bucket is empty then just insert a new record into
1646 * the bucket.
1647 */
1648 if (*bucket == NULL) {
1649 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1650 KM_SLEEP);
1651 bcp->bc_blkno = blkno;
1652 bcp->bc_len = len;
1653 bcp->bc_refcount = 1;
1654 bcp->bc_next = NULL;
1655 *bucket = bcp;
1656 return;
1657 } 1632 }
1658 1633
1659 /* 1634 /*
1660 * The hash bucket is not empty, so search for duplicates of our 1635 * Insert an xfs_buf_cancel record into the hash table of cancel records.
1661 * record. If we find one then just bump its refcount. If not 1636 * If there is already an identical record, bump its reference count.
1662 * then add us at the end of the list.
1663 */ 1637 */
1664 prevp = NULL; 1638 bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1665 nextp = *bucket; 1639 list_for_each_entry(bcp, bucket, bc_list) {
1666 while (nextp != NULL) { 1640 if (bcp->bc_blkno == buf_f->blf_blkno &&
1667 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1641 bcp->bc_len == buf_f->blf_len) {
1668 nextp->bc_refcount++; 1642 bcp->bc_refcount++;
1669 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); 1643 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1670 return; 1644 return 0;
1671 } 1645 }
1672 prevp = nextp; 1646 }
1673 nextp = nextp->bc_next; 1647
1674 } 1648 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1675 ASSERT(prevp != NULL); 1649 bcp->bc_blkno = buf_f->blf_blkno;
1676 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1650 bcp->bc_len = buf_f->blf_len;
1677 KM_SLEEP);
1678 bcp->bc_blkno = blkno;
1679 bcp->bc_len = len;
1680 bcp->bc_refcount = 1; 1651 bcp->bc_refcount = 1;
1681 bcp->bc_next = NULL; 1652 list_add_tail(&bcp->bc_list, bucket);
1682 prevp->bc_next = bcp; 1653
1683 trace_xfs_log_recover_buf_cancel_add(log, buf_f); 1654 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1655 return 0;
1684} 1656}
1685 1657
1686/* 1658/*
@@ -1698,14 +1670,13 @@ xlog_recover_do_buffer_pass1(
1698 */ 1670 */
1699STATIC int 1671STATIC int
1700xlog_check_buffer_cancelled( 1672xlog_check_buffer_cancelled(
1701 xlog_t *log, 1673 struct log *log,
1702 xfs_daddr_t blkno, 1674 xfs_daddr_t blkno,
1703 uint len, 1675 uint len,
1704 ushort flags) 1676 ushort flags)
1705{ 1677{
1706 xfs_buf_cancel_t *bcp; 1678 struct list_head *bucket;
1707 xfs_buf_cancel_t *prevp; 1679 struct xfs_buf_cancel *bcp;
1708 xfs_buf_cancel_t **bucket;
1709 1680
1710 if (log->l_buf_cancel_table == NULL) { 1681 if (log->l_buf_cancel_table == NULL) {
1711 /* 1682 /*
@@ -1716,128 +1687,70 @@ xlog_check_buffer_cancelled(
1716 return 0; 1687 return 0;
1717 } 1688 }
1718 1689
1719 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1720 XLOG_BC_TABLE_SIZE];
1721 bcp = *bucket;
1722 if (bcp == NULL) {
1723 /*
1724 * There is no corresponding entry in the table built
1725 * in pass one, so this buffer has not been cancelled.
1726 */
1727 ASSERT(!(flags & XFS_BLF_CANCEL));
1728 return 0;
1729 }
1730
1731 /* 1690 /*
1732 * Search for an entry in the buffer cancel table that 1691 * Search for an entry in the cancel table that matches our buffer.
1733 * matches our buffer.
1734 */ 1692 */
1735 prevp = NULL; 1693 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1736 while (bcp != NULL) { 1694 list_for_each_entry(bcp, bucket, bc_list) {
1737 if (bcp->bc_blkno == blkno && bcp->bc_len == len) { 1695 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1738 /* 1696 goto found;
1739 * We've got a match, so return 1 so that the
1740 * recovery of this buffer is cancelled.
1741 * If this buffer is actually a buffer cancel
1742 * log item, then decrement the refcount on the
1743 * one in the table and remove it if this is the
1744 * last reference.
1745 */
1746 if (flags & XFS_BLF_CANCEL) {
1747 bcp->bc_refcount--;
1748 if (bcp->bc_refcount == 0) {
1749 if (prevp == NULL) {
1750 *bucket = bcp->bc_next;
1751 } else {
1752 prevp->bc_next = bcp->bc_next;
1753 }
1754 kmem_free(bcp);
1755 }
1756 }
1757 return 1;
1758 }
1759 prevp = bcp;
1760 bcp = bcp->bc_next;
1761 } 1697 }
1698
1762 /* 1699 /*
1763 * We didn't find a corresponding entry in the table, so 1700 * We didn't find a corresponding entry in the table, so return 0 so
1764 * return 0 so that the buffer is NOT cancelled. 1701 * that the buffer is NOT cancelled.
1765 */ 1702 */
1766 ASSERT(!(flags & XFS_BLF_CANCEL)); 1703 ASSERT(!(flags & XFS_BLF_CANCEL));
1767 return 0; 1704 return 0;
1768}
1769 1705
1770STATIC int 1706found:
1771xlog_recover_do_buffer_pass2( 1707 /*
1772 xlog_t *log, 1708 * We've go a match, so return 1 so that the recovery of this buffer
1773 xfs_buf_log_format_t *buf_f) 1709 * is cancelled. If this buffer is actually a buffer cancel log
1774{ 1710 * item, then decrement the refcount on the one in the table and
1775 xfs_daddr_t blkno = 0; 1711 * remove it if this is the last reference.
1776 ushort flags = 0; 1712 */
1777 uint len = 0; 1713 if (flags & XFS_BLF_CANCEL) {
1778 1714 if (--bcp->bc_refcount == 0) {
1779 switch (buf_f->blf_type) { 1715 list_del(&bcp->bc_list);
1780 case XFS_LI_BUF: 1716 kmem_free(bcp);
1781 blkno = buf_f->blf_blkno; 1717 }
1782 flags = buf_f->blf_flags;
1783 len = buf_f->blf_len;
1784 break;
1785 } 1718 }
1786 1719 return 1;
1787 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1788} 1720}
1789 1721
1790/* 1722/*
1791 * Perform recovery for a buffer full of inodes. In these buffers, 1723 * Perform recovery for a buffer full of inodes. In these buffers, the only
1792 * the only data which should be recovered is that which corresponds 1724 * data which should be recovered is that which corresponds to the
1793 * to the di_next_unlinked pointers in the on disk inode structures. 1725 * di_next_unlinked pointers in the on disk inode structures. The rest of the
1794 * The rest of the data for the inodes is always logged through the 1726 * data for the inodes is always logged through the inodes themselves rather
1795 * inodes themselves rather than the inode buffer and is recovered 1727 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1796 * in xlog_recover_do_inode_trans().
1797 * 1728 *
1798 * The only time when buffers full of inodes are fully recovered is 1729 * The only time when buffers full of inodes are fully recovered is when the
1799 * when the buffer is full of newly allocated inodes. In this case 1730 * buffer is full of newly allocated inodes. In this case the buffer will
1800 * the buffer will not be marked as an inode buffer and so will be 1731 * not be marked as an inode buffer and so will be sent to
1801 * sent to xlog_recover_do_reg_buffer() below during recovery. 1732 * xlog_recover_do_reg_buffer() below during recovery.
1802 */ 1733 */
1803STATIC int 1734STATIC int
1804xlog_recover_do_inode_buffer( 1735xlog_recover_do_inode_buffer(
1805 xfs_mount_t *mp, 1736 struct xfs_mount *mp,
1806 xlog_recover_item_t *item, 1737 xlog_recover_item_t *item,
1807 xfs_buf_t *bp, 1738 struct xfs_buf *bp,
1808 xfs_buf_log_format_t *buf_f) 1739 xfs_buf_log_format_t *buf_f)
1809{ 1740{
1810 int i; 1741 int i;
1811 int item_index; 1742 int item_index = 0;
1812 int bit; 1743 int bit = 0;
1813 int nbits; 1744 int nbits = 0;
1814 int reg_buf_offset; 1745 int reg_buf_offset = 0;
1815 int reg_buf_bytes; 1746 int reg_buf_bytes = 0;
1816 int next_unlinked_offset; 1747 int next_unlinked_offset;
1817 int inodes_per_buf; 1748 int inodes_per_buf;
1818 xfs_agino_t *logged_nextp; 1749 xfs_agino_t *logged_nextp;
1819 xfs_agino_t *buffer_nextp; 1750 xfs_agino_t *buffer_nextp;
1820 unsigned int *data_map = NULL;
1821 unsigned int map_size = 0;
1822 1751
1823 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 1752 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1824 1753
1825 switch (buf_f->blf_type) {
1826 case XFS_LI_BUF:
1827 data_map = buf_f->blf_data_map;
1828 map_size = buf_f->blf_map_size;
1829 break;
1830 }
1831 /*
1832 * Set the variables corresponding to the current region to
1833 * 0 so that we'll initialize them on the first pass through
1834 * the loop.
1835 */
1836 reg_buf_offset = 0;
1837 reg_buf_bytes = 0;
1838 bit = 0;
1839 nbits = 0;
1840 item_index = 0;
1841 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; 1754 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1842 for (i = 0; i < inodes_per_buf; i++) { 1755 for (i = 0; i < inodes_per_buf; i++) {
1843 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 1756 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1852,18 +1765,18 @@ xlog_recover_do_inode_buffer(
1852 * the current di_next_unlinked field. 1765 * the current di_next_unlinked field.
1853 */ 1766 */
1854 bit += nbits; 1767 bit += nbits;
1855 bit = xfs_next_bit(data_map, map_size, bit); 1768 bit = xfs_next_bit(buf_f->blf_data_map,
1769 buf_f->blf_map_size, bit);
1856 1770
1857 /* 1771 /*
1858 * If there are no more logged regions in the 1772 * If there are no more logged regions in the
1859 * buffer, then we're done. 1773 * buffer, then we're done.
1860 */ 1774 */
1861 if (bit == -1) { 1775 if (bit == -1)
1862 return 0; 1776 return 0;
1863 }
1864 1777
1865 nbits = xfs_contig_bits(data_map, map_size, 1778 nbits = xfs_contig_bits(buf_f->blf_data_map,
1866 bit); 1779 buf_f->blf_map_size, bit);
1867 ASSERT(nbits > 0); 1780 ASSERT(nbits > 0);
1868 reg_buf_offset = bit << XFS_BLF_SHIFT; 1781 reg_buf_offset = bit << XFS_BLF_SHIFT;
1869 reg_buf_bytes = nbits << XFS_BLF_SHIFT; 1782 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
@@ -1875,9 +1788,8 @@ xlog_recover_do_inode_buffer(
1875 * di_next_unlinked field, then move on to the next 1788 * di_next_unlinked field, then move on to the next
1876 * di_next_unlinked field. 1789 * di_next_unlinked field.
1877 */ 1790 */
1878 if (next_unlinked_offset < reg_buf_offset) { 1791 if (next_unlinked_offset < reg_buf_offset)
1879 continue; 1792 continue;
1880 }
1881 1793
1882 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1794 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1883 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 1795 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
@@ -1913,36 +1825,29 @@ xlog_recover_do_inode_buffer(
1913 * given buffer. The bitmap in the buf log format structure indicates 1825 * given buffer. The bitmap in the buf log format structure indicates
1914 * where to place the logged data. 1826 * where to place the logged data.
1915 */ 1827 */
1916/*ARGSUSED*/
1917STATIC void 1828STATIC void
1918xlog_recover_do_reg_buffer( 1829xlog_recover_do_reg_buffer(
1919 struct xfs_mount *mp, 1830 struct xfs_mount *mp,
1920 xlog_recover_item_t *item, 1831 xlog_recover_item_t *item,
1921 xfs_buf_t *bp, 1832 struct xfs_buf *bp,
1922 xfs_buf_log_format_t *buf_f) 1833 xfs_buf_log_format_t *buf_f)
1923{ 1834{
1924 int i; 1835 int i;
1925 int bit; 1836 int bit;
1926 int nbits; 1837 int nbits;
1927 unsigned int *data_map = NULL;
1928 unsigned int map_size = 0;
1929 int error; 1838 int error;
1930 1839
1931 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 1840 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1932 1841
1933 switch (buf_f->blf_type) {
1934 case XFS_LI_BUF:
1935 data_map = buf_f->blf_data_map;
1936 map_size = buf_f->blf_map_size;
1937 break;
1938 }
1939 bit = 0; 1842 bit = 0;
1940 i = 1; /* 0 is the buf format structure */ 1843 i = 1; /* 0 is the buf format structure */
1941 while (1) { 1844 while (1) {
1942 bit = xfs_next_bit(data_map, map_size, bit); 1845 bit = xfs_next_bit(buf_f->blf_data_map,
1846 buf_f->blf_map_size, bit);
1943 if (bit == -1) 1847 if (bit == -1)
1944 break; 1848 break;
1945 nbits = xfs_contig_bits(data_map, map_size, bit); 1849 nbits = xfs_contig_bits(buf_f->blf_data_map,
1850 buf_f->blf_map_size, bit);
1946 ASSERT(nbits > 0); 1851 ASSERT(nbits > 0);
1947 ASSERT(item->ri_buf[i].i_addr != NULL); 1852 ASSERT(item->ri_buf[i].i_addr != NULL);
1948 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 1853 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
@@ -2176,77 +2081,46 @@ xlog_recover_do_dquot_buffer(
2176 * for more details on the implementation of the table of cancel records. 2081 * for more details on the implementation of the table of cancel records.
2177 */ 2082 */
2178STATIC int 2083STATIC int
2179xlog_recover_do_buffer_trans( 2084xlog_recover_buffer_pass2(
2180 xlog_t *log, 2085 xlog_t *log,
2181 xlog_recover_item_t *item, 2086 xlog_recover_item_t *item)
2182 int pass)
2183{ 2087{
2184 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2088 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2185 xfs_mount_t *mp; 2089 xfs_mount_t *mp = log->l_mp;
2186 xfs_buf_t *bp; 2090 xfs_buf_t *bp;
2187 int error; 2091 int error;
2188 int cancel;
2189 xfs_daddr_t blkno;
2190 int len;
2191 ushort flags;
2192 uint buf_flags; 2092 uint buf_flags;
2193 2093
2194 if (pass == XLOG_RECOVER_PASS1) { 2094 /*
2195 /* 2095 * In this pass we only want to recover all the buffers which have
2196 * In this pass we're only looking for buf items 2096 * not been cancelled and are not cancellation buffers themselves.
2197 * with the XFS_BLF_CANCEL bit set. 2097 */
2198 */ 2098 if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2199 xlog_recover_do_buffer_pass1(log, buf_f); 2099 buf_f->blf_len, buf_f->blf_flags)) {
2100 trace_xfs_log_recover_buf_cancel(log, buf_f);
2200 return 0; 2101 return 0;
2201 } else {
2202 /*
2203 * In this pass we want to recover all the buffers
2204 * which have not been cancelled and are not
2205 * cancellation buffers themselves. The routine
2206 * we call here will tell us whether or not to
2207 * continue with the replay of this buffer.
2208 */
2209 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2210 if (cancel) {
2211 trace_xfs_log_recover_buf_cancel(log, buf_f);
2212 return 0;
2213 }
2214 } 2102 }
2103
2215 trace_xfs_log_recover_buf_recover(log, buf_f); 2104 trace_xfs_log_recover_buf_recover(log, buf_f);
2216 switch (buf_f->blf_type) {
2217 case XFS_LI_BUF:
2218 blkno = buf_f->blf_blkno;
2219 len = buf_f->blf_len;
2220 flags = buf_f->blf_flags;
2221 break;
2222 default:
2223 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2224 "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2225 buf_f->blf_type, log->l_mp->m_logname ?
2226 log->l_mp->m_logname : "internal");
2227 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2228 XFS_ERRLEVEL_LOW, log->l_mp);
2229 return XFS_ERROR(EFSCORRUPTED);
2230 }
2231 2105
2232 mp = log->l_mp;
2233 buf_flags = XBF_LOCK; 2106 buf_flags = XBF_LOCK;
2234 if (!(flags & XFS_BLF_INODE_BUF)) 2107 if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
2235 buf_flags |= XBF_MAPPED; 2108 buf_flags |= XBF_MAPPED;
2236 2109
2237 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2110 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2111 buf_flags);
2238 if (XFS_BUF_ISERROR(bp)) { 2112 if (XFS_BUF_ISERROR(bp)) {
2239 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, 2113 xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
2240 bp, blkno); 2114 bp, buf_f->blf_blkno);
2241 error = XFS_BUF_GETERROR(bp); 2115 error = XFS_BUF_GETERROR(bp);
2242 xfs_buf_relse(bp); 2116 xfs_buf_relse(bp);
2243 return error; 2117 return error;
2244 } 2118 }
2245 2119
2246 error = 0; 2120 error = 0;
2247 if (flags & XFS_BLF_INODE_BUF) { 2121 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2248 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2122 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2249 } else if (flags & 2123 } else if (buf_f->blf_flags &
2250 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2124 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2251 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2125 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2252 } else { 2126 } else {
@@ -2286,16 +2160,14 @@ xlog_recover_do_buffer_trans(
2286} 2160}
2287 2161
2288STATIC int 2162STATIC int
2289xlog_recover_do_inode_trans( 2163xlog_recover_inode_pass2(
2290 xlog_t *log, 2164 xlog_t *log,
2291 xlog_recover_item_t *item, 2165 xlog_recover_item_t *item)
2292 int pass)
2293{ 2166{
2294 xfs_inode_log_format_t *in_f; 2167 xfs_inode_log_format_t *in_f;
2295 xfs_mount_t *mp; 2168 xfs_mount_t *mp = log->l_mp;
2296 xfs_buf_t *bp; 2169 xfs_buf_t *bp;
2297 xfs_dinode_t *dip; 2170 xfs_dinode_t *dip;
2298 xfs_ino_t ino;
2299 int len; 2171 int len;
2300 xfs_caddr_t src; 2172 xfs_caddr_t src;
2301 xfs_caddr_t dest; 2173 xfs_caddr_t dest;
@@ -2305,10 +2177,6 @@ xlog_recover_do_inode_trans(
2305 xfs_icdinode_t *dicp; 2177 xfs_icdinode_t *dicp;
2306 int need_free = 0; 2178 int need_free = 0;
2307 2179
2308 if (pass == XLOG_RECOVER_PASS1) {
2309 return 0;
2310 }
2311
2312 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2180 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2313 in_f = item->ri_buf[0].i_addr; 2181 in_f = item->ri_buf[0].i_addr;
2314 } else { 2182 } else {
@@ -2318,8 +2186,6 @@ xlog_recover_do_inode_trans(
2318 if (error) 2186 if (error)
2319 goto error; 2187 goto error;
2320 } 2188 }
2321 ino = in_f->ilf_ino;
2322 mp = log->l_mp;
2323 2189
2324 /* 2190 /*
2325 * Inode buffers can be freed, look out for it, 2191 * Inode buffers can be freed, look out for it,
@@ -2354,8 +2220,8 @@ xlog_recover_do_inode_trans(
2354 xfs_buf_relse(bp); 2220 xfs_buf_relse(bp);
2355 xfs_fs_cmn_err(CE_ALERT, mp, 2221 xfs_fs_cmn_err(CE_ALERT, mp,
2356 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2222 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2357 dip, bp, ino); 2223 dip, bp, in_f->ilf_ino);
2358 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", 2224 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2359 XFS_ERRLEVEL_LOW, mp); 2225 XFS_ERRLEVEL_LOW, mp);
2360 error = EFSCORRUPTED; 2226 error = EFSCORRUPTED;
2361 goto error; 2227 goto error;
@@ -2365,8 +2231,8 @@ xlog_recover_do_inode_trans(
2365 xfs_buf_relse(bp); 2231 xfs_buf_relse(bp);
2366 xfs_fs_cmn_err(CE_ALERT, mp, 2232 xfs_fs_cmn_err(CE_ALERT, mp,
2367 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2233 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2368 item, ino); 2234 item, in_f->ilf_ino);
2369 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", 2235 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2370 XFS_ERRLEVEL_LOW, mp); 2236 XFS_ERRLEVEL_LOW, mp);
2371 error = EFSCORRUPTED; 2237 error = EFSCORRUPTED;
2372 goto error; 2238 goto error;
@@ -2394,12 +2260,12 @@ xlog_recover_do_inode_trans(
2394 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { 2260 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2395 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2261 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2396 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2262 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2397 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", 2263 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2398 XFS_ERRLEVEL_LOW, mp, dicp); 2264 XFS_ERRLEVEL_LOW, mp, dicp);
2399 xfs_buf_relse(bp); 2265 xfs_buf_relse(bp);
2400 xfs_fs_cmn_err(CE_ALERT, mp, 2266 xfs_fs_cmn_err(CE_ALERT, mp,
2401 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2267 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2402 item, dip, bp, ino); 2268 item, dip, bp, in_f->ilf_ino);
2403 error = EFSCORRUPTED; 2269 error = EFSCORRUPTED;
2404 goto error; 2270 goto error;
2405 } 2271 }
@@ -2407,40 +2273,40 @@ xlog_recover_do_inode_trans(
2407 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2273 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2408 (dicp->di_format != XFS_DINODE_FMT_BTREE) && 2274 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2409 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2275 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2410 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", 2276 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2411 XFS_ERRLEVEL_LOW, mp, dicp); 2277 XFS_ERRLEVEL_LOW, mp, dicp);
2412 xfs_buf_relse(bp); 2278 xfs_buf_relse(bp);
2413 xfs_fs_cmn_err(CE_ALERT, mp, 2279 xfs_fs_cmn_err(CE_ALERT, mp,
2414 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2280 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2415 item, dip, bp, ino); 2281 item, dip, bp, in_f->ilf_ino);
2416 error = EFSCORRUPTED; 2282 error = EFSCORRUPTED;
2417 goto error; 2283 goto error;
2418 } 2284 }
2419 } 2285 }
2420 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2286 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2421 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", 2287 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2422 XFS_ERRLEVEL_LOW, mp, dicp); 2288 XFS_ERRLEVEL_LOW, mp, dicp);
2423 xfs_buf_relse(bp); 2289 xfs_buf_relse(bp);
2424 xfs_fs_cmn_err(CE_ALERT, mp, 2290 xfs_fs_cmn_err(CE_ALERT, mp,
2425 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2291 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2426 item, dip, bp, ino, 2292 item, dip, bp, in_f->ilf_ino,
2427 dicp->di_nextents + dicp->di_anextents, 2293 dicp->di_nextents + dicp->di_anextents,
2428 dicp->di_nblocks); 2294 dicp->di_nblocks);
2429 error = EFSCORRUPTED; 2295 error = EFSCORRUPTED;
2430 goto error; 2296 goto error;
2431 } 2297 }
2432 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2298 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2433 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", 2299 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2434 XFS_ERRLEVEL_LOW, mp, dicp); 2300 XFS_ERRLEVEL_LOW, mp, dicp);
2435 xfs_buf_relse(bp); 2301 xfs_buf_relse(bp);
2436 xfs_fs_cmn_err(CE_ALERT, mp, 2302 xfs_fs_cmn_err(CE_ALERT, mp,
2437 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2303 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2438 item, dip, bp, ino, dicp->di_forkoff); 2304 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2439 error = EFSCORRUPTED; 2305 error = EFSCORRUPTED;
2440 goto error; 2306 goto error;
2441 } 2307 }
2442 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { 2308 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2443 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2309 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2444 XFS_ERRLEVEL_LOW, mp, dicp); 2310 XFS_ERRLEVEL_LOW, mp, dicp);
2445 xfs_buf_relse(bp); 2311 xfs_buf_relse(bp);
2446 xfs_fs_cmn_err(CE_ALERT, mp, 2312 xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2532,7 +2398,7 @@ xlog_recover_do_inode_trans(
2532 break; 2398 break;
2533 2399
2534 default: 2400 default:
2535 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); 2401 xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag");
2536 ASSERT(0); 2402 ASSERT(0);
2537 xfs_buf_relse(bp); 2403 xfs_buf_relse(bp);
2538 error = EIO; 2404 error = EIO;
@@ -2556,18 +2422,11 @@ error:
2556 * of that type. 2422 * of that type.
2557 */ 2423 */
2558STATIC int 2424STATIC int
2559xlog_recover_do_quotaoff_trans( 2425xlog_recover_quotaoff_pass1(
2560 xlog_t *log, 2426 xlog_t *log,
2561 xlog_recover_item_t *item, 2427 xlog_recover_item_t *item)
2562 int pass)
2563{ 2428{
2564 xfs_qoff_logformat_t *qoff_f; 2429 xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
2565
2566 if (pass == XLOG_RECOVER_PASS2) {
2567 return (0);
2568 }
2569
2570 qoff_f = item->ri_buf[0].i_addr;
2571 ASSERT(qoff_f); 2430 ASSERT(qoff_f);
2572 2431
2573 /* 2432 /*
@@ -2588,22 +2447,17 @@ xlog_recover_do_quotaoff_trans(
2588 * Recover a dquot record 2447 * Recover a dquot record
2589 */ 2448 */
2590STATIC int 2449STATIC int
2591xlog_recover_do_dquot_trans( 2450xlog_recover_dquot_pass2(
2592 xlog_t *log, 2451 xlog_t *log,
2593 xlog_recover_item_t *item, 2452 xlog_recover_item_t *item)
2594 int pass)
2595{ 2453{
2596 xfs_mount_t *mp; 2454 xfs_mount_t *mp = log->l_mp;
2597 xfs_buf_t *bp; 2455 xfs_buf_t *bp;
2598 struct xfs_disk_dquot *ddq, *recddq; 2456 struct xfs_disk_dquot *ddq, *recddq;
2599 int error; 2457 int error;
2600 xfs_dq_logformat_t *dq_f; 2458 xfs_dq_logformat_t *dq_f;
2601 uint type; 2459 uint type;
2602 2460
2603 if (pass == XLOG_RECOVER_PASS1) {
2604 return 0;
2605 }
2606 mp = log->l_mp;
2607 2461
2608 /* 2462 /*
2609 * Filesystems are required to send in quota flags at mount time. 2463 * Filesystems are required to send in quota flags at mount time.
@@ -2647,7 +2501,7 @@ xlog_recover_do_dquot_trans(
2647 if ((error = xfs_qm_dqcheck(recddq, 2501 if ((error = xfs_qm_dqcheck(recddq,
2648 dq_f->qlf_id, 2502 dq_f->qlf_id,
2649 0, XFS_QMOPT_DOWARN, 2503 0, XFS_QMOPT_DOWARN,
2650 "xlog_recover_do_dquot_trans (log copy)"))) { 2504 "xlog_recover_dquot_pass2 (log copy)"))) {
2651 return XFS_ERROR(EIO); 2505 return XFS_ERROR(EIO);
2652 } 2506 }
2653 ASSERT(dq_f->qlf_len == 1); 2507 ASSERT(dq_f->qlf_len == 1);
@@ -2670,7 +2524,7 @@ xlog_recover_do_dquot_trans(
2670 * minimal initialization then. 2524 * minimal initialization then.
2671 */ 2525 */
2672 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2526 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2673 "xlog_recover_do_dquot_trans")) { 2527 "xlog_recover_dquot_pass2")) {
2674 xfs_buf_relse(bp); 2528 xfs_buf_relse(bp);
2675 return XFS_ERROR(EIO); 2529 return XFS_ERROR(EIO);
2676 } 2530 }
@@ -2693,38 +2547,31 @@ xlog_recover_do_dquot_trans(
2693 * LSN. 2547 * LSN.
2694 */ 2548 */
2695STATIC int 2549STATIC int
2696xlog_recover_do_efi_trans( 2550xlog_recover_efi_pass2(
2697 xlog_t *log, 2551 xlog_t *log,
2698 xlog_recover_item_t *item, 2552 xlog_recover_item_t *item,
2699 xfs_lsn_t lsn, 2553 xfs_lsn_t lsn)
2700 int pass)
2701{ 2554{
2702 int error; 2555 int error;
2703 xfs_mount_t *mp; 2556 xfs_mount_t *mp = log->l_mp;
2704 xfs_efi_log_item_t *efip; 2557 xfs_efi_log_item_t *efip;
2705 xfs_efi_log_format_t *efi_formatp; 2558 xfs_efi_log_format_t *efi_formatp;
2706 2559
2707 if (pass == XLOG_RECOVER_PASS1) {
2708 return 0;
2709 }
2710
2711 efi_formatp = item->ri_buf[0].i_addr; 2560 efi_formatp = item->ri_buf[0].i_addr;
2712 2561
2713 mp = log->l_mp;
2714 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2562 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2715 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), 2563 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2716 &(efip->efi_format)))) { 2564 &(efip->efi_format)))) {
2717 xfs_efi_item_free(efip); 2565 xfs_efi_item_free(efip);
2718 return error; 2566 return error;
2719 } 2567 }
2720 efip->efi_next_extent = efi_formatp->efi_nextents; 2568 atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2721 efip->efi_flags |= XFS_EFI_COMMITTED;
2722 2569
2723 spin_lock(&log->l_ailp->xa_lock); 2570 spin_lock(&log->l_ailp->xa_lock);
2724 /* 2571 /*
2725 * xfs_trans_ail_update() drops the AIL lock. 2572 * xfs_trans_ail_update() drops the AIL lock.
2726 */ 2573 */
2727 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); 2574 xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2728 return 0; 2575 return 0;
2729} 2576}
2730 2577
@@ -2737,11 +2584,10 @@ xlog_recover_do_efi_trans(
2737 * efd format structure. If we find it, we remove the efi from the 2584 * efd format structure. If we find it, we remove the efi from the
2738 * AIL and free it. 2585 * AIL and free it.
2739 */ 2586 */
2740STATIC void 2587STATIC int
2741xlog_recover_do_efd_trans( 2588xlog_recover_efd_pass2(
2742 xlog_t *log, 2589 xlog_t *log,
2743 xlog_recover_item_t *item, 2590 xlog_recover_item_t *item)
2744 int pass)
2745{ 2591{
2746 xfs_efd_log_format_t *efd_formatp; 2592 xfs_efd_log_format_t *efd_formatp;
2747 xfs_efi_log_item_t *efip = NULL; 2593 xfs_efi_log_item_t *efip = NULL;
@@ -2750,10 +2596,6 @@ xlog_recover_do_efd_trans(
2750 struct xfs_ail_cursor cur; 2596 struct xfs_ail_cursor cur;
2751 struct xfs_ail *ailp = log->l_ailp; 2597 struct xfs_ail *ailp = log->l_ailp;
2752 2598
2753 if (pass == XLOG_RECOVER_PASS1) {
2754 return;
2755 }
2756
2757 efd_formatp = item->ri_buf[0].i_addr; 2599 efd_formatp = item->ri_buf[0].i_addr;
2758 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2600 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2759 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2601 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
@@ -2785,62 +2627,6 @@ xlog_recover_do_efd_trans(
2785 } 2627 }
2786 xfs_trans_ail_cursor_done(ailp, &cur); 2628 xfs_trans_ail_cursor_done(ailp, &cur);
2787 spin_unlock(&ailp->xa_lock); 2629 spin_unlock(&ailp->xa_lock);
2788}
2789
2790/*
2791 * Perform the transaction
2792 *
2793 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2794 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2795 */
2796STATIC int
2797xlog_recover_do_trans(
2798 xlog_t *log,
2799 xlog_recover_t *trans,
2800 int pass)
2801{
2802 int error = 0;
2803 xlog_recover_item_t *item;
2804
2805 error = xlog_recover_reorder_trans(log, trans, pass);
2806 if (error)
2807 return error;
2808
2809 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2810 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2811 switch (ITEM_TYPE(item)) {
2812 case XFS_LI_BUF:
2813 error = xlog_recover_do_buffer_trans(log, item, pass);
2814 break;
2815 case XFS_LI_INODE:
2816 error = xlog_recover_do_inode_trans(log, item, pass);
2817 break;
2818 case XFS_LI_EFI:
2819 error = xlog_recover_do_efi_trans(log, item,
2820 trans->r_lsn, pass);
2821 break;
2822 case XFS_LI_EFD:
2823 xlog_recover_do_efd_trans(log, item, pass);
2824 error = 0;
2825 break;
2826 case XFS_LI_DQUOT:
2827 error = xlog_recover_do_dquot_trans(log, item, pass);
2828 break;
2829 case XFS_LI_QUOTAOFF:
2830 error = xlog_recover_do_quotaoff_trans(log, item,
2831 pass);
2832 break;
2833 default:
2834 xlog_warn(
2835 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2836 ASSERT(0);
2837 error = XFS_ERROR(EIO);
2838 break;
2839 }
2840
2841 if (error)
2842 return error;
2843 }
2844 2630
2845 return 0; 2631 return 0;
2846} 2632}
@@ -2852,7 +2638,7 @@ xlog_recover_do_trans(
2852 */ 2638 */
2853STATIC void 2639STATIC void
2854xlog_recover_free_trans( 2640xlog_recover_free_trans(
2855 xlog_recover_t *trans) 2641 struct xlog_recover *trans)
2856{ 2642{
2857 xlog_recover_item_t *item, *n; 2643 xlog_recover_item_t *item, *n;
2858 int i; 2644 int i;
@@ -2871,17 +2657,95 @@ xlog_recover_free_trans(
2871} 2657}
2872 2658
2873STATIC int 2659STATIC int
2660xlog_recover_commit_pass1(
2661 struct log *log,
2662 struct xlog_recover *trans,
2663 xlog_recover_item_t *item)
2664{
2665 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
2666
2667 switch (ITEM_TYPE(item)) {
2668 case XFS_LI_BUF:
2669 return xlog_recover_buffer_pass1(log, item);
2670 case XFS_LI_QUOTAOFF:
2671 return xlog_recover_quotaoff_pass1(log, item);
2672 case XFS_LI_INODE:
2673 case XFS_LI_EFI:
2674 case XFS_LI_EFD:
2675 case XFS_LI_DQUOT:
2676 /* nothing to do in pass 1 */
2677 return 0;
2678 default:
2679 xlog_warn(
2680 "XFS: invalid item type (%d) xlog_recover_commit_pass1",
2681 ITEM_TYPE(item));
2682 ASSERT(0);
2683 return XFS_ERROR(EIO);
2684 }
2685}
2686
2687STATIC int
2688xlog_recover_commit_pass2(
2689 struct log *log,
2690 struct xlog_recover *trans,
2691 xlog_recover_item_t *item)
2692{
2693 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2694
2695 switch (ITEM_TYPE(item)) {
2696 case XFS_LI_BUF:
2697 return xlog_recover_buffer_pass2(log, item);
2698 case XFS_LI_INODE:
2699 return xlog_recover_inode_pass2(log, item);
2700 case XFS_LI_EFI:
2701 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2702 case XFS_LI_EFD:
2703 return xlog_recover_efd_pass2(log, item);
2704 case XFS_LI_DQUOT:
2705 return xlog_recover_dquot_pass2(log, item);
2706 case XFS_LI_QUOTAOFF:
2707 /* nothing to do in pass2 */
2708 return 0;
2709 default:
2710 xlog_warn(
2711 "XFS: invalid item type (%d) xlog_recover_commit_pass2",
2712 ITEM_TYPE(item));
2713 ASSERT(0);
2714 return XFS_ERROR(EIO);
2715 }
2716}
2717
2718/*
2719 * Perform the transaction.
2720 *
2721 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2722 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2723 */
2724STATIC int
2874xlog_recover_commit_trans( 2725xlog_recover_commit_trans(
2875 xlog_t *log, 2726 struct log *log,
2876 xlog_recover_t *trans, 2727 struct xlog_recover *trans,
2877 int pass) 2728 int pass)
2878{ 2729{
2879 int error; 2730 int error = 0;
2731 xlog_recover_item_t *item;
2880 2732
2881 hlist_del(&trans->r_list); 2733 hlist_del(&trans->r_list);
2882 if ((error = xlog_recover_do_trans(log, trans, pass))) 2734
2735 error = xlog_recover_reorder_trans(log, trans, pass);
2736 if (error)
2883 return error; 2737 return error;
2884 xlog_recover_free_trans(trans); /* no error */ 2738
2739 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2740 if (pass == XLOG_RECOVER_PASS1)
2741 error = xlog_recover_commit_pass1(log, trans, item);
2742 else
2743 error = xlog_recover_commit_pass2(log, trans, item);
2744 if (error)
2745 return error;
2746 }
2747
2748 xlog_recover_free_trans(trans);
2885 return 0; 2749 return 0;
2886} 2750}
2887 2751
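The hunks above split each xlog_recover_do_*_trans() function, which used to take a pass argument and bail out early for the wrong pass, into dedicated *_pass1()/*_pass2() handlers behind two thin dispatchers. A minimal standalone sketch of the same refactoring pattern, using hypothetical item types and handlers rather than the XFS ones:

    #include <stdio.h>

    enum item_type { LI_BUF, LI_QUOTAOFF, LI_INODE, LI_DQUOT };

    struct item { enum item_type type; };

    /* pass 1 only handles the types that influence later recovery decisions */
    static int commit_pass1(struct item *it)
    {
        switch (it->type) {
        case LI_BUF:
        case LI_QUOTAOFF:
            printf("pass1: examine item type %d\n", it->type);
            return 0;
        default:
            return 0;   /* nothing to do in pass 1 */
        }
    }

    /* pass 2 replays everything */
    static int commit_pass2(struct item *it)
    {
        printf("pass2: replay item type %d\n", it->type);
        return 0;
    }

    static int commit_item(struct item *it, int pass)
    {
        return pass == 1 ? commit_pass1(it) : commit_pass2(it);
    }

    int main(void)
    {
        struct item it = { LI_BUF };
        return commit_item(&it, 1) || commit_item(&it, 2);
    }

Each per-pass handler now sees only the work it actually does, and the pass check lives in exactly one place instead of at the top of every function.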
@@ -3011,7 +2875,7 @@ xlog_recover_process_efi(
3011 xfs_extent_t *extp; 2875 xfs_extent_t *extp;
3012 xfs_fsblock_t startblock_fsb; 2876 xfs_fsblock_t startblock_fsb;
3013 2877
3014 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); 2878 ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
3015 2879
3016 /* 2880 /*
3017 * First check the validity of the extents described by the 2881 * First check the validity of the extents described by the
@@ -3050,7 +2914,7 @@ xlog_recover_process_efi(
3050 extp->ext_len); 2914 extp->ext_len);
3051 } 2915 }
3052 2916
3053 efip->efi_flags |= XFS_EFI_RECOVERED; 2917 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3054 error = xfs_trans_commit(tp, 0); 2918 error = xfs_trans_commit(tp, 0);
3055 return error; 2919 return error;
3056 2920
@@ -3107,7 +2971,7 @@ xlog_recover_process_efis(
3107 * Skip EFIs that we've already processed. 2971 * Skip EFIs that we've already processed.
3108 */ 2972 */
3109 efip = (xfs_efi_log_item_t *)lip; 2973 efip = (xfs_efi_log_item_t *)lip;
3110 if (efip->efi_flags & XFS_EFI_RECOVERED) { 2974 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
3111 lip = xfs_trans_ail_cursor_next(ailp, &cur); 2975 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3112 continue; 2976 continue;
3113 } 2977 }
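The EFI flag checks above also change representation: XFS_EFI_RECOVERED becomes a bit number used with the kernel's atomic bitops instead of a mask OR-ed into efi_flags. A sketch of the two styles side by side (kernel context assumed for set_bit()/test_bit(); the flag values here are illustrative, not the XFS definitions):

    #include <linux/bitops.h>

    #define EFI_RECOVERED_MASK  0x01    /* mask style: flag is a bit mask */
    #define EFI_RECOVERED_BIT   0       /* bitops style: flag is a bit number */

    static void flag_styles(void)
    {
        unsigned int mask_flags = 0;
        unsigned long bit_flags = 0;

        /* plain read-modify-write: not safe against concurrent updaters */
        mask_flags |= EFI_RECOVERED_MASK;
        if (mask_flags & EFI_RECOVERED_MASK)
            ;

        /* atomic bitops: safe without holding an external lock */
        set_bit(EFI_RECOVERED_BIT, &bit_flags);
        if (test_bit(EFI_RECOVERED_BIT, &bit_flags))
            ;
    }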
@@ -3724,7 +3588,7 @@ xlog_do_log_recovery(
3724 xfs_daddr_t head_blk, 3588 xfs_daddr_t head_blk,
3725 xfs_daddr_t tail_blk) 3589 xfs_daddr_t tail_blk)
3726{ 3590{
3727 int error; 3591 int error, i;
3728 3592
3729 ASSERT(head_blk != tail_blk); 3593 ASSERT(head_blk != tail_blk);
3730 3594
@@ -3732,10 +3596,12 @@ xlog_do_log_recovery(
3732 * First do a pass to find all of the cancelled buf log items. 3596 * First do a pass to find all of the cancelled buf log items.
3733 * Store them in the buf_cancel_table for use in the second pass. 3597 * Store them in the buf_cancel_table for use in the second pass.
3734 */ 3598 */
3735 log->l_buf_cancel_table = 3599 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3736 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * 3600 sizeof(struct list_head),
3737 sizeof(xfs_buf_cancel_t*),
3738 KM_SLEEP); 3601 KM_SLEEP);
3602 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3603 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3604
3739 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3605 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3740 XLOG_RECOVER_PASS1); 3606 XLOG_RECOVER_PASS1);
3741 if (error != 0) { 3607 if (error != 0) {
@@ -3754,7 +3620,7 @@ xlog_do_log_recovery(
3754 int i; 3620 int i;
3755 3621
3756 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 3622 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3757 ASSERT(log->l_buf_cancel_table[i] == NULL); 3623 ASSERT(list_empty(&log->l_buf_cancel_table[i]));
3758 } 3624 }
3759#endif /* DEBUG */ 3625#endif /* DEBUG */
3760 3626
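xlog_do_log_recovery() now allocates the cancel table as an array of list_heads, and unlike the old NULL-pointer buckets, each head must be explicitly initialised to point at itself before use. A small userspace sketch of that bucket initialisation, with a hypothetical table size and a hand-rolled list_head:

    #include <stdlib.h>

    struct list_head { struct list_head *next, *prev; };

    static void INIT_LIST_HEAD(struct list_head *head)
    {
        head->next = head;
        head->prev = head;  /* an empty list head points at itself */
    }

    #define BC_TABLE_SIZE   64

    struct list_head *alloc_cancel_table(void)
    {
        struct list_head *table = calloc(BC_TABLE_SIZE, sizeof(*table));
        int i;

        if (!table)
            return NULL;
        for (i = 0; i < BC_TABLE_SIZE; i++)
            INIT_LIST_HEAD(&table[i]);  /* zeroed memory is NOT a valid list head */
        return table;
    }

This is also why the DEBUG check further down becomes list_empty() on every bucket rather than a NULL comparison.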
@@ -3934,7 +3800,7 @@ xlog_recover_finish(
3934 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3800 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3935 } else { 3801 } else {
3936 cmn_err(CE_DEBUG, 3802 cmn_err(CE_DEBUG,
3937 "!Ending clean XFS mount for filesystem: %s\n", 3803 "Ending clean XFS mount for filesystem: %s\n",
3938 log->l_mp->m_fsname); 3804 log->l_mp->m_fsname);
3939 } 3805 }
3940 return 0; 3806 return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 19e9dfa1c254..d447aef84bc3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -472,7 +472,7 @@ xfs_initialize_perag(
472 goto out_unwind; 472 goto out_unwind;
473 pag->pag_agno = index; 473 pag->pag_agno = index;
474 pag->pag_mount = mp; 474 pag->pag_mount = mp;
475 rwlock_init(&pag->pag_ici_lock); 475 spin_lock_init(&pag->pag_ici_lock);
476 mutex_init(&pag->pag_ici_reclaim_lock); 476 mutex_init(&pag->pag_ici_reclaim_lock);
477 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); 477 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
478 spin_lock_init(&pag->pag_buf_lock); 478 spin_lock_init(&pag->pag_buf_lock);
@@ -975,6 +975,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
975} 975}
976 976
977/* 977/*
978 * precalculate the low space thresholds for dynamic speculative preallocation.
979 */
980void
981xfs_set_low_space_thresholds(
982 struct xfs_mount *mp)
983{
984 int i;
985
986 for (i = 0; i < XFS_LOWSP_MAX; i++) {
987 __uint64_t space = mp->m_sb.sb_dblocks;
988
989 do_div(space, 100);
990 mp->m_low_space[i] = space * (i + 1);
991 }
992}
993
994
995/*
978 * Set whether we're using inode alignment. 996 * Set whether we're using inode alignment.
979 */ 997 */
980STATIC void 998STATIC void
@@ -1196,6 +1214,9 @@ xfs_mountfs(
1196 */ 1214 */
1197 xfs_set_rw_sizes(mp); 1215 xfs_set_rw_sizes(mp);
1198 1216
1217 /* set the low space thresholds for dynamic preallocation */
1218 xfs_set_low_space_thresholds(mp);
1219
1199 /* 1220 /*
1200 * Set the inode cluster size. 1221 * Set the inode cluster size.
1201 * This may still be overridden by the file system 1222 * This may still be overridden by the file system
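The new xfs_set_low_space_thresholds() fills m_low_space[] with 1% through 5% of sb_dblocks, dividing first so the multiplication cannot overflow; do_div() is used because plain 64-bit division is unavailable in 32-bit kernel code. The same arithmetic in a standalone sketch (ordinary division stands in for do_div(), and the block count is made up):

    #include <stdint.h>
    #include <stdio.h>

    #define LOWSP_MAX   5

    int main(void)
    {
        uint64_t dblocks = 26214400;    /* e.g. a 100GB fs with 4k blocks */
        uint64_t low_space[LOWSP_MAX];
        int i;

        for (i = 0; i < LOWSP_MAX; i++) {
            uint64_t space = dblocks / 100;     /* 1% of the data blocks */
            low_space[i] = space * (i + 1);     /* thresholds at 1%..5% */
            printf("threshold %d%%: %llu blocks\n", i + 1,
                   (unsigned long long)low_space[i]);
        }
        return 0;
    }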
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5861b4980740..a62e8971539d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -103,6 +103,16 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
103 xfs_mod_incore_sb(mp, field, delta, rsvd) 103 xfs_mod_incore_sb(mp, field, delta, rsvd)
104#endif 104#endif
105 105
106/* dynamic preallocation free space thresholds, 5% down to 1% */
107enum {
108 XFS_LOWSP_1_PCNT = 0,
109 XFS_LOWSP_2_PCNT,
110 XFS_LOWSP_3_PCNT,
111 XFS_LOWSP_4_PCNT,
112 XFS_LOWSP_5_PCNT,
113 XFS_LOWSP_MAX,
114};
115
106typedef struct xfs_mount { 116typedef struct xfs_mount {
107 struct super_block *m_super; 117 struct super_block *m_super;
108 xfs_tid_t m_tid; /* next unused tid for fs */ 118 xfs_tid_t m_tid; /* next unused tid for fs */
@@ -202,6 +212,8 @@ typedef struct xfs_mount {
202 __int64_t m_update_flags; /* sb flags we need to update 212 __int64_t m_update_flags; /* sb flags we need to update
203 on the next remount,rw */ 213 on the next remount,rw */
204 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 214 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
215 int64_t m_low_space[XFS_LOWSP_MAX];
216 /* low free space thresholds */
205} xfs_mount_t; 217} xfs_mount_t;
206 218
207/* 219/*
@@ -379,6 +391,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
379 391
380extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 392extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
381 393
394extern void xfs_set_low_space_thresholds(struct xfs_mount *);
395
382#endif /* __KERNEL__ */ 396#endif /* __KERNEL__ */
383 397
384extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 398extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f6d956b7711e..33dbc4e0ad62 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1137,7 +1137,7 @@ out_undo_fdblocks:
1137 if (blkdelta) 1137 if (blkdelta)
1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); 1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
1139out: 1139out:
1140 ASSERT(error = 0); 1140 ASSERT(error == 0);
1141 return; 1141 return;
1142} 1142}
1143 1143
@@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs(
1350 * they could be immediately flushed and we'd have to race with the flusher 1350 * they could be immediately flushed and we'd have to race with the flusher
1351 * trying to pull the item from the AIL as we add it. 1351 * trying to pull the item from the AIL as we add it.
1352 */ 1352 */
1353void 1353static void
1354xfs_trans_item_committed( 1354xfs_trans_item_committed(
1355 struct xfs_log_item *lip, 1355 struct xfs_log_item *lip,
1356 xfs_lsn_t commit_lsn, 1356 xfs_lsn_t commit_lsn,
@@ -1425,6 +1425,83 @@ xfs_trans_committed(
1425 xfs_trans_free(tp); 1425 xfs_trans_free(tp);
1426} 1426}
1427 1427
1428static inline void
1429xfs_log_item_batch_insert(
1430 struct xfs_ail *ailp,
1431 struct xfs_log_item **log_items,
1432 int nr_items,
1433 xfs_lsn_t commit_lsn)
1434{
1435 int i;
1436
1437 spin_lock(&ailp->xa_lock);
1438 /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
1439 xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
1440
1441 for (i = 0; i < nr_items; i++)
1442 IOP_UNPIN(log_items[i], 0);
1443}
1444
1445/*
1446 * Bulk operation version of xfs_trans_committed that takes a log vector of
1447 * items to insert into the AIL. This uses bulk AIL insertion techniques to
1448 * minimise lock traffic.
1449 */
1450void
1451xfs_trans_committed_bulk(
1452 struct xfs_ail *ailp,
1453 struct xfs_log_vec *log_vector,
1454 xfs_lsn_t commit_lsn,
1455 int aborted)
1456{
1457#define LOG_ITEM_BATCH_SIZE 32
1458 struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
1459 struct xfs_log_vec *lv;
1460 int i = 0;
1461
1462 /* unpin all the log items */
1463 for (lv = log_vector; lv; lv = lv->lv_next) {
1464 struct xfs_log_item *lip = lv->lv_item;
1465 xfs_lsn_t item_lsn;
1466
1467 if (aborted)
1468 lip->li_flags |= XFS_LI_ABORTED;
1469 item_lsn = IOP_COMMITTED(lip, commit_lsn);
1470
1471 /* item_lsn of -1 means the item was freed */
1472 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
1473 continue;
1474
1475 if (item_lsn != commit_lsn) {
1476
1477 /*
1478 * Not a bulk update option due to unusual item_lsn.
1479 * Push into AIL immediately, rechecking the lsn once
1480 * we have the ail lock. Then unpin the item.
1481 */
1482 spin_lock(&ailp->xa_lock);
1483 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
1484 xfs_trans_ail_update(ailp, lip, item_lsn);
1485 else
1486 spin_unlock(&ailp->xa_lock);
1487 IOP_UNPIN(lip, 0);
1488 continue;
1489 }
1490
1491 /* Item is a candidate for bulk AIL insert. */
1492 log_items[i++] = lv->lv_item;
1493 if (i >= LOG_ITEM_BATCH_SIZE) {
1494 xfs_log_item_batch_insert(ailp, log_items,
1495 LOG_ITEM_BATCH_SIZE, commit_lsn);
1496 i = 0;
1497 }
1498 }
1499
1500 /* make sure we insert the remainder! */
1501 if (i)
1502 xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
1503}
1504
1428/* 1505/*
1429 * Called from the trans_commit code when we notice that 1506 * Called from the trans_commit code when we notice that
1430 * the filesystem is in the middle of a forced shutdown. 1507 * the filesystem is in the middle of a forced shutdown.
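xfs_trans_committed_bulk() above gathers items into a fixed LOG_ITEM_BATCH_SIZE array and pushes each full batch into the AIL under a single lock hold, with a final flush for the remainder. The accumulate-and-flush skeleton on its own, with hypothetical types and a stubbed insert:

    #include <stdio.h>

    #define BATCH_SIZE 32

    static void batch_insert(int *items, int nr)
    {
        /* one lock round-trip covers the whole batch */
        printf("inserting %d items under one lock hold\n", nr);
    }

    static void commit_bulk(int *items, int nr_items)
    {
        int batch[BATCH_SIZE];
        int i = 0, j;

        for (j = 0; j < nr_items; j++) {
            batch[i++] = items[j];
            if (i >= BATCH_SIZE) {
                batch_insert(batch, BATCH_SIZE);
                i = 0;
            }
        }
        if (i)              /* don't forget the partial last batch */
            batch_insert(batch, i);
    }

Items with an unusual item_lsn bypass the batch and are inserted immediately, so the common case stays on the cheap bulk path.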
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 246286b77a86..c2042b736b81 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
294#define XFS_ALLOC_BTREE_REF 2 294#define XFS_ALLOC_BTREE_REF 2
295#define XFS_BMAP_BTREE_REF 2 295#define XFS_BMAP_BTREE_REF 2
296#define XFS_DIR_BTREE_REF 2 296#define XFS_DIR_BTREE_REF 2
297#define XFS_INO_REF 2
297#define XFS_ATTR_BTREE_REF 1 298#define XFS_ATTR_BTREE_REF 1
298#define XFS_INO_REF 1
299#define XFS_DQUOT_REF 1 299#define XFS_DQUOT_REF 1
300 300
301#ifdef __KERNEL__ 301#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index dc9069568ff7..c5bbbc45db91 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,8 +28,8 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); 31STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t);
32STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); 32STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *); 33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *); 34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 35
@@ -449,129 +449,152 @@ xfs_trans_unlocked_item(
449 xfs_log_move_tail(ailp->xa_mount, 1); 449 xfs_log_move_tail(ailp->xa_mount, 1);
450} /* xfs_trans_unlocked_item */ 450} /* xfs_trans_unlocked_item */
451 451
452
453/* 452/*
454 * Update the position of the item in the AIL with the new 453 * xfs_trans_ail_update_bulk - bulk AIL insertion operation.
455 * lsn. If it is not yet in the AIL, add it. Otherwise, move 454 *
456 * it to its new position by removing it and re-adding 455 * @xfs_trans_ail_update_bulk takes an array of log items that all need to be
456 * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
457 * be added. Otherwise, it will be repositioned by removing it and re-adding
458 * it to the AIL. If we move the first item in the AIL, update the log tail to
459 * match the new minimum LSN in the AIL.
457 * 460 *
458 * Wakeup anyone with an lsn less than the item's lsn. If the item 461 * This function takes the AIL lock once to execute the update operations on
459 * we move in the AIL is the minimum one, update the tail lsn in the 462 * all the items in the array, so the AIL lock only needs to be taken once
460 * log manager. 463 * for the whole batch. Once we have the AIL lock, we need to check each log
464 * item LSN to confirm it needs to be moved forward in the AIL.
461 * 465 *
462 * This function must be called with the AIL lock held. The lock 466 * To optimise the insert operation, we delete all the items from the AIL in
463 * is dropped before returning. 467 * the first pass, moving them into a temporary list, then splice the temporary
468 * list into the correct position in the AIL. This avoids needing to do an
469 * insert operation on every item.
470 *
471 * This function must be called with the AIL lock held. The lock is dropped
472 * before returning.
464 */ 473 */
465void 474void
466xfs_trans_ail_update( 475xfs_trans_ail_update_bulk(
467 struct xfs_ail *ailp, 476 struct xfs_ail *ailp,
468 xfs_log_item_t *lip, 477 struct xfs_log_item **log_items,
469 xfs_lsn_t lsn) __releases(ailp->xa_lock) 478 int nr_items,
479 xfs_lsn_t lsn) __releases(ailp->xa_lock)
470{ 480{
471 xfs_log_item_t *dlip = NULL; 481 xfs_log_item_t *mlip;
472 xfs_log_item_t *mlip; /* ptr to minimum lip */
473 xfs_lsn_t tail_lsn; 482 xfs_lsn_t tail_lsn;
483 int mlip_changed = 0;
484 int i;
485 LIST_HEAD(tmp);
474 486
475 mlip = xfs_ail_min(ailp); 487 mlip = xfs_ail_min(ailp);
476 488
477 if (lip->li_flags & XFS_LI_IN_AIL) { 489 for (i = 0; i < nr_items; i++) {
478 dlip = xfs_ail_delete(ailp, lip); 490 struct xfs_log_item *lip = log_items[i];
479 ASSERT(dlip == lip); 491 if (lip->li_flags & XFS_LI_IN_AIL) {
480 xfs_trans_ail_cursor_clear(ailp, dlip); 492 /* check if we really need to move the item */
481 } else { 493 if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
482 lip->li_flags |= XFS_LI_IN_AIL; 494 continue;
495
496 xfs_ail_delete(ailp, lip);
497 if (mlip == lip)
498 mlip_changed = 1;
499 } else {
500 lip->li_flags |= XFS_LI_IN_AIL;
501 }
502 lip->li_lsn = lsn;
503 list_add(&lip->li_ail, &tmp);
483 } 504 }
484 505
485 lip->li_lsn = lsn; 506 xfs_ail_splice(ailp, &tmp, lsn);
486 xfs_ail_insert(ailp, lip);
487 507
488 if (mlip == dlip) { 508 if (!mlip_changed) {
489 mlip = xfs_ail_min(ailp);
490 /*
491 * It is not safe to access mlip after the AIL lock is
492 * dropped, so we must get a copy of li_lsn before we do
493 * so. This is especially important on 32-bit platforms
494 * where accessing and updating 64-bit values like li_lsn
495 * is not atomic.
496 */
497 tail_lsn = mlip->li_lsn;
498 spin_unlock(&ailp->xa_lock);
499 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
500 } else {
501 spin_unlock(&ailp->xa_lock); 509 spin_unlock(&ailp->xa_lock);
510 return;
502 } 511 }
503 512
504 513 /*
505} /* xfs_trans_update_ail */ 514 * It is not safe to access mlip after the AIL lock is dropped, so we
515 * must get a copy of li_lsn before we do so. This is especially
516 * important on 32-bit platforms where accessing and updating 64-bit
517 * values like li_lsn is not atomic.
518 */
519 mlip = xfs_ail_min(ailp);
520 tail_lsn = mlip->li_lsn;
521 spin_unlock(&ailp->xa_lock);
522 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
523}
506 524
507/* 525/*
508 * Delete the given item from the AIL. It must already be in 526 * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
509 * the AIL.
510 * 527 *
511 * Wakeup anyone with an lsn less than item's lsn. If the item 528 * @xfs_trans_ail_delete_bulk takes an array of log items that all need to be
512 * we delete in the AIL is the minimum one, update the tail lsn in the 529 * removed from the AIL. The caller is already holding the AIL lock, and has done
513 * log manager. 530 * all the checks necessary to ensure the items passed in via @log_items are
531 * ready for deletion. This includes checking that the items are in the AIL.
514 * 532 *
515 * Clear the IN_AIL flag from the item, reset its lsn to 0, and 533 * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
516 * bump the AIL's generation count to indicate that the tree 534 * flag from the item and reset the item's lsn to 0. If we remove the first
517 * has changed. 535 * item in the AIL, update the log tail to match the new minimum LSN in the
536 * AIL.
518 * 537 *
519 * This function must be called with the AIL lock held. The lock 538 * This function will not drop the AIL lock until all items are removed from
520 * is dropped before returning. 539 * the AIL to minimise the amount of lock traffic on the AIL. This does not
540 * greatly increase the AIL hold time, but does significantly reduce the amount
541 * of traffic on the lock, especially during IO completion.
542 *
543 * This function must be called with the AIL lock held. The lock is dropped
544 * before returning.
521 */ 545 */
522void 546void
523xfs_trans_ail_delete( 547xfs_trans_ail_delete_bulk(
524 struct xfs_ail *ailp, 548 struct xfs_ail *ailp,
525 xfs_log_item_t *lip) __releases(ailp->xa_lock) 549 struct xfs_log_item **log_items,
550 int nr_items) __releases(ailp->xa_lock)
526{ 551{
527 xfs_log_item_t *dlip;
528 xfs_log_item_t *mlip; 552 xfs_log_item_t *mlip;
529 xfs_lsn_t tail_lsn; 553 xfs_lsn_t tail_lsn;
554 int mlip_changed = 0;
555 int i;
530 556
531 if (lip->li_flags & XFS_LI_IN_AIL) { 557 mlip = xfs_ail_min(ailp);
532 mlip = xfs_ail_min(ailp);
533 dlip = xfs_ail_delete(ailp, lip);
534 ASSERT(dlip == lip);
535 xfs_trans_ail_cursor_clear(ailp, dlip);
536
537 558
538 lip->li_flags &= ~XFS_LI_IN_AIL; 559 for (i = 0; i < nr_items; i++) {
539 lip->li_lsn = 0; 560 struct xfs_log_item *lip = log_items[i];
561 if (!(lip->li_flags & XFS_LI_IN_AIL)) {
562 struct xfs_mount *mp = ailp->xa_mount;
540 563
541 if (mlip == dlip) {
542 mlip = xfs_ail_min(ailp);
543 /*
544 * It is not safe to access mlip after the AIL lock
545 * is dropped, so we must get a copy of li_lsn
546 * before we do so. This is especially important
547 * on 32-bit platforms where accessing and updating
548 * 64-bit values like li_lsn is not atomic.
549 */
550 tail_lsn = mlip ? mlip->li_lsn : 0;
551 spin_unlock(&ailp->xa_lock);
552 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
553 } else {
554 spin_unlock(&ailp->xa_lock); 564 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) {
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 return;
555 } 572 }
573
574 xfs_ail_delete(ailp, lip);
575 lip->li_flags &= ~XFS_LI_IN_AIL;
576 lip->li_lsn = 0;
577 if (mlip == lip)
578 mlip_changed = 1;
556 } 579 }
557 else {
558 /*
559 * If the file system is not being shutdown, we are in
560 * serious trouble if we get to this stage.
561 */
562 struct xfs_mount *mp = ailp->xa_mount;
563 580
581 if (!mlip_changed) {
564 spin_unlock(&ailp->xa_lock); 582 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) { 583 return;
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 } 584 }
572}
573
574 585
586 /*
587 * It is not safe to access mlip after the AIL lock is dropped, so we
588 * must get a copy of li_lsn before we do so. This is especially
589 * important on 32-bit platforms where accessing and updating 64-bit
590 * values like li_lsn is not atomic. It is possible we've emptied the
591 * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
592 */
593 mlip = xfs_ail_min(ailp);
594 tail_lsn = mlip ? mlip->li_lsn : 0;
595 spin_unlock(&ailp->xa_lock);
596 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
597}
575 598
576/* 599/*
577 * The active item list (AIL) is a doubly linked list of log 600 * The active item list (AIL) is a doubly linked list of log
@@ -623,16 +646,13 @@ xfs_trans_ail_destroy(
623} 646}
624 647
625/* 648/*
626 * Insert the given log item into the AIL. 649 * splice the log item list into the AIL at the given LSN.
627 * We almost always insert at the end of the list, so on inserts
628 * we search from the end of the list to find where the
629 * new item belongs.
630 */ 650 */
631STATIC void 651STATIC void
632xfs_ail_insert( 652xfs_ail_splice(
633 struct xfs_ail *ailp, 653 struct xfs_ail *ailp,
634 xfs_log_item_t *lip) 654 struct list_head *list,
635/* ARGSUSED */ 655 xfs_lsn_t lsn)
636{ 656{
637 xfs_log_item_t *next_lip; 657 xfs_log_item_t *next_lip;
638 658
@@ -640,39 +660,33 @@ xfs_ail_insert(
640 * If the list is empty, just insert the item. 660 * If the list is empty, just insert the item.
641 */ 661 */
642 if (list_empty(&ailp->xa_ail)) { 662 if (list_empty(&ailp->xa_ail)) {
643 list_add(&lip->li_ail, &ailp->xa_ail); 663 list_splice(list, &ailp->xa_ail);
644 return; 664 return;
645 } 665 }
646 666
647 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { 667 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
648 if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0) 668 if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
649 break; 669 break;
650 } 670 }
651 671
652 ASSERT((&next_lip->li_ail == &ailp->xa_ail) || 672 ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
653 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); 673 (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0));
654
655 list_add(&lip->li_ail, &next_lip->li_ail);
656 674
657 xfs_ail_check(ailp, lip); 675 list_splice_init(list, &next_lip->li_ail);
658 return; 676 return;
659} 677}
660 678
661/* 679/*
662 * Delete the given item from the AIL. Return a pointer to the item. 680 * Delete the given item from the AIL. Return a pointer to the item.
663 */ 681 */
664/*ARGSUSED*/ 682STATIC void
665STATIC xfs_log_item_t *
666xfs_ail_delete( 683xfs_ail_delete(
667 struct xfs_ail *ailp, 684 struct xfs_ail *ailp,
668 xfs_log_item_t *lip) 685 xfs_log_item_t *lip)
669/* ARGSUSED */
670{ 686{
671 xfs_ail_check(ailp, lip); 687 xfs_ail_check(ailp, lip);
672
673 list_del(&lip->li_ail); 688 list_del(&lip->li_ail);
674 689 xfs_trans_ail_cursor_clear(ailp, lip);
675 return lip;
676} 690}
677 691
678/* 692/*
@@ -682,7 +696,6 @@ xfs_ail_delete(
682STATIC xfs_log_item_t * 696STATIC xfs_log_item_t *
683xfs_ail_min( 697xfs_ail_min(
684 struct xfs_ail *ailp) 698 struct xfs_ail *ailp)
685/* ARGSUSED */
686{ 699{
687 if (list_empty(&ailp->xa_ail)) 700 if (list_empty(&ailp->xa_ail))
688 return NULL; 701 return NULL;
@@ -699,7 +712,6 @@ STATIC xfs_log_item_t *
699xfs_ail_next( 712xfs_ail_next(
700 struct xfs_ail *ailp, 713 struct xfs_ail *ailp,
701 xfs_log_item_t *lip) 714 xfs_log_item_t *lip)
702/* ARGSUSED */
703{ 715{
704 if (lip->li_ail.next == &ailp->xa_ail) 716 if (lip->li_ail.next == &ailp->xa_ail)
705 return NULL; 717 return NULL;
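The comment in xfs_trans_ail_update_bulk() describes the key trick: unlink every item onto a temporary list first, then splice that list into the AIL at the target LSN in one operation, instead of doing an ordered insert per item. A minimal sketch of the two list primitives involved, in the style of the kernel's list.h (hand-rolled here so it stands alone):

    struct list_head { struct list_head *next, *prev; };

    static void list_del_entry(struct list_head *e)
    {
        e->prev->next = e->next;
        e->next->prev = e->prev;
    }

    static void list_add_tail_entry(struct list_head *e, struct list_head *head)
    {
        e->prev = head->prev;
        e->next = head;
        head->prev->next = e;
        head->prev = e;
    }

    /* move every entry on @list to just after @pos, leaving @list empty */
    static void list_splice_after(struct list_head *list, struct list_head *pos)
    {
        if (list->next == list)
            return;         /* nothing to splice */
        list->next->prev = pos;
        list->prev->next = pos->next;
        pos->next->prev = list->prev;
        pos->next = list->next;
        list->next = list->prev = list;
    }

With these primitives the bulk update does one list_del_entry() plus one list_add_tail_entry() per item to build the temporary list, one reverse walk to find the splice point by LSN, and a single list_splice_after(), so the AIL itself is re-linked only once.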
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f783d5e9fa70..f7590f5badea 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
69 tp->t_flags |= XFS_TRANS_DIRTY; 69 tp->t_flags |= XFS_TRANS_DIRTY;
70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; 70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
71 71
72 next_extent = efip->efi_next_extent; 72 /*
73 * atomic_inc_return gives us the value after the increment;
74 * we want to use it as an array index so we need to subtract 1 from
75 * it.
76 */
77 next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
73 ASSERT(next_extent < efip->efi_format.efi_nextents); 78 ASSERT(next_extent < efip->efi_format.efi_nextents);
74 extp = &(efip->efi_format.efi_extents[next_extent]); 79 extp = &(efip->efi_format.efi_extents[next_extent]);
75 extp->ext_start = start_block; 80 extp->ext_start = start_block;
76 extp->ext_len = ext_len; 81 extp->ext_len = ext_len;
77 efip->efi_next_extent++;
78} 82}
79 83
80 84
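The extent-logging change above replaces a plain increment of efi_next_extent with atomic_inc_return(), which returns the post-increment value; subtracting 1 recovers the slot index that was just reserved. The same reservation pattern with C11 atomics (the array and sizes are made up for illustration):

    #include <stdatomic.h>

    #define MAX_EXTENTS 16

    static atomic_int next_extent;      /* count of reserved slots */
    static int extents[MAX_EXTENTS];

    static void log_extent(int value)
    {
        /* atomic_fetch_add returns the value *before* the increment,
         * which is exactly the index of the slot we just reserved;
         * the kernel's atomic_inc_return() returns the value *after*,
         * hence the "- 1" in the patch above. */
        int idx = atomic_fetch_add(&next_extent, 1);
        extents[idx] = value;
    }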
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 62da86c90de5..35162c238fa3 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -22,15 +22,17 @@ struct xfs_log_item;
22struct xfs_log_item_desc; 22struct xfs_log_item_desc;
23struct xfs_mount; 23struct xfs_mount;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_ail;
26struct xfs_log_vec;
25 27
26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 28void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
27void xfs_trans_del_item(struct xfs_log_item *); 29void xfs_trans_del_item(struct xfs_log_item *);
28void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, 30void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
29 int flags); 31 int flags);
30void xfs_trans_item_committed(struct xfs_log_item *lip,
31 xfs_lsn_t commit_lsn, int aborted);
32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); 32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
33 33
34void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
35 xfs_lsn_t commit_lsn, int aborted);
34/* 36/*
35 * AIL traversal cursor. 37 * AIL traversal cursor.
36 * 38 *
@@ -73,12 +75,29 @@ struct xfs_ail {
73/* 75/*
74 * From xfs_trans_ail.c 76 * From xfs_trans_ail.c
75 */ 77 */
76void xfs_trans_ail_update(struct xfs_ail *ailp, 78void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
77 struct xfs_log_item *lip, xfs_lsn_t lsn) 79 struct xfs_log_item **log_items, int nr_items,
78 __releases(ailp->xa_lock); 80 xfs_lsn_t lsn) __releases(ailp->xa_lock);
79void xfs_trans_ail_delete(struct xfs_ail *ailp, 81static inline void
80 struct xfs_log_item *lip) 82xfs_trans_ail_update(
81 __releases(ailp->xa_lock); 83 struct xfs_ail *ailp,
84 struct xfs_log_item *lip,
85 xfs_lsn_t lsn) __releases(ailp->xa_lock)
86{
87 xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
88}
89
90void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
91 struct xfs_log_item **log_items, int nr_items)
92 __releases(ailp->xa_lock);
93static inline void
94xfs_trans_ail_delete(
95 struct xfs_ail *ailp,
96 xfs_log_item_t *lip) __releases(ailp->xa_lock)
97{
98 xfs_trans_ail_delete_bulk(ailp, &lip, 1);
99}
100
82void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); 101void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
83void xfs_trans_unlocked_item(struct xfs_ail *, 102void xfs_trans_unlocked_item(struct xfs_ail *,
84 xfs_log_item_t *); 103 xfs_log_item_t *);
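The header now exposes only the bulk operations and recasts the old single-item calls as inline wrappers that pass a one-element array, so existing callers compile unchanged while new callers can batch. The idiom in general form, with hypothetical names:

    struct ail;
    struct item;

    /* the bulk primitive does the real work on an array of items ... */
    void update_bulk(struct ail *ailp, struct item **items, int nr, long lsn);

    /* ... and the legacy single-item API becomes a one-line shim */
    static inline void update_one(struct ail *ailp, struct item *item, long lsn)
    {
        update_bulk(ailp, &item, 1, lsn);   /* &item acts as a 1-element array */
    }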
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 8e4a63c4151a..d8e6f8cd6f0c 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -964,29 +964,48 @@ xfs_release(
964 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); 964 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
965 } 965 }
966 966
967 if (ip->i_d.di_nlink != 0) { 967 if (ip->i_d.di_nlink == 0)
968 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && 968 return 0;
969 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
970 ip->i_delayed_blks > 0)) &&
971 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
972 (!(ip->i_d.di_flags &
973 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
974 969
975 /* 970 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
976 * If we can't get the iolock just skip truncating 971 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
977 * the blocks past EOF because we could deadlock 972 ip->i_delayed_blks > 0)) &&
978 * with the mmap_sem otherwise. We'll get another 973 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
979 * chance to drop them once the last reference to 974 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
980 * the inode is dropped, so we'll never leak blocks
981 * permanently.
982 */
983 error = xfs_free_eofblocks(mp, ip,
984 XFS_FREE_EOF_TRYLOCK);
985 if (error)
986 return error;
987 }
988 }
989 975
976 /*
977 * If we can't get the iolock just skip truncating the blocks
978 * past EOF because we could deadlock with the mmap_sem
979 * otherwise. We'll get another chance to drop them once the
980 * last reference to the inode is dropped, so we'll never leak
981 * blocks permanently.
982 *
983 * Further, check if the inode is being opened, written and
984 * closed frequently and we have delayed allocation blocks
985 * outstanding (e.g. streaming writes from the NFS server),
986 * truncating the blocks past EOF will cause fragmentation to
987 * occur.
988 *
989 * In this case don't do the truncation, either, but we have to
990 * be careful how we detect this case. Blocks beyond EOF show
991 * up as i_delayed_blks even when the inode is clean, so we
992 * need to truncate them away first before checking for a dirty
993 * release. Hence on the first dirty close we will still remove
994 * the speculative allocation, but after that we will leave it
995 * in place.
996 */
997 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
998 return 0;
999
1000 error = xfs_free_eofblocks(mp, ip,
1001 XFS_FREE_EOF_TRYLOCK);
1002 if (error)
1003 return error;
1004
1005 /* delalloc blocks after truncation means it really is dirty */
1006 if (ip->i_delayed_blks)
1007 xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1008 }
990 return 0; 1009 return 0;
991} 1010}
992 1011
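The xfs_release() rework above adds a sticky per-inode flag: the first close of a dirty inode still trims speculative preallocation beyond EOF, but if delayed-allocation blocks reappear afterwards, the flag suppresses further truncation so that NFS-style open-write-close loops do not fragment the file. A condensed sketch of just the flag logic, with hypothetical types and a stubbed trim helper:

    #define IDIRTY_RELEASE  (1UL << 0)

    struct inode_state {
        unsigned long   flags;
        long            delayed_blks;   /* delayed-allocation blocks outstanding */
    };

    /* stand-in for the trylock-and-truncate helper; returns 0 on success */
    static int free_eofblocks_trylock(struct inode_state *ip)
    {
        (void)ip;
        return 0;
    }

    static int release_eofblocks(struct inode_state *ip)
    {
        /* a previous dirty close already trimmed once: leave the
         * speculative preallocation alone from now on */
        if (ip->flags & IDIRTY_RELEASE)
            return 0;

        if (free_eofblocks_trylock(ip))
            return -1;      /* lock contention: try again on a later close */

        /* delalloc blocks remaining after the trim mean the file is being
         * actively rewritten, so mark it and stop trimming on future closes */
        if (ip->delayed_blks)
            ip->flags |= IDIRTY_RELEASE;
        return 0;
    }

Trimming once before setting the flag matters, because blocks beyond EOF show up as delayed blocks even on a clean inode; only delalloc that survives the first trim indicates a genuinely dirty release pattern.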