Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/Kconfig | 5
-rw-r--r--  fs/9p/Makefile | 1
-rw-r--r--  fs/9p/acl.c | 4
-rw-r--r--  fs/9p/v9fs.h | 42
-rw-r--r--  fs/9p/vfs_inode.c | 871
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 824
-rw-r--r--  fs/9p/xattr.c | 2
-rw-r--r--  fs/cifs/cache.c | 16
-rw-r--r--  fs/cifs/cifs_debug.c | 22
-rw-r--r--  fs/cifs/cifs_spnego.c | 10
-rw-r--r--  fs/cifs/cifsencrypt.c | 6
-rw-r--r--  fs/cifs/cifsfs.c | 17
-rw-r--r--  fs/cifs/cifsglob.h | 9
-rw-r--r--  fs/cifs/cifssmb.c | 5
-rw-r--r--  fs/cifs/connect.c | 462
-rw-r--r--  fs/cifs/dir.c | 6
-rw-r--r--  fs/cifs/file.c | 233
-rw-r--r--  fs/cifs/inode.c | 6
-rw-r--r--  fs/cifs/readdir.c | 1
-rw-r--r--  fs/cifs/sess.c | 135
-rw-r--r--  fs/cifs/transport.c | 2
-rw-r--r--  fs/dlm/lowcomms.c | 63
-rw-r--r--  fs/ext2/dir.c | 19
-rw-r--r--  fs/ext2/namei.c | 2
-rw-r--r--  fs/ext2/super.c | 25
-rw-r--r--  fs/ext2/xattr.c | 10
-rw-r--r--  fs/ext3/balloc.c | 266
-rw-r--r--  fs/ext3/dir.c | 15
-rw-r--r--  fs/ext3/inode.c | 6
-rw-r--r--  fs/ext3/ioctl.c | 22
-rw-r--r--  fs/ext3/namei.c | 138
-rw-r--r--  fs/ext3/resize.c | 65
-rw-r--r--  fs/ext3/super.c | 64
-rw-r--r--  fs/ext3/xattr.c | 2
-rw-r--r--  fs/ext4/balloc.c | 3
-rw-r--r--  fs/ext4/dir.c | 56
-rw-r--r--  fs/ext4/ext4.h | 93
-rw-r--r--  fs/ext4/ext4_extents.h | 8
-rw-r--r--  fs/ext4/ext4_jbd2.h | 2
-rw-r--r--  fs/ext4/extents.c | 88
-rw-r--r--  fs/ext4/file.c | 22
-rw-r--r--  fs/ext4/fsync.c | 4
-rw-r--r--  fs/ext4/ialloc.c | 2
-rw-r--r--  fs/ext4/inode.c | 74
-rw-r--r--  fs/ext4/mballoc.c | 55
-rw-r--r--  fs/ext4/migrate.c | 2
-rw-r--r--  fs/ext4/namei.c | 69
-rw-r--r--  fs/ext4/page-io.c | 7
-rw-r--r--  fs/ext4/resize.c | 64
-rw-r--r--  fs/ext4/super.c | 288
-rw-r--r--  fs/ext4/xattr.c | 28
-rw-r--r--  fs/fuse/dev.c | 156
-rw-r--r--  fs/fuse/dir.c | 53
-rw-r--r--  fs/fuse/file.c | 66
-rw-r--r--  fs/fuse/fuse_i.h | 27
-rw-r--r--  fs/fuse/inode.c | 30
-rw-r--r--  fs/gfs2/incore.h | 1
-rw-r--r--  fs/jbd2/journal.c | 34
-rw-r--r--  fs/jbd2/recovery.c | 2
-rw-r--r--  fs/jbd2/transaction.c | 6
-rw-r--r--  fs/lockd/Makefile | 6
-rw-r--r--  fs/lockd/clnt4xdr.c | 605
-rw-r--r--  fs/lockd/clntlock.c | 4
-rw-r--r--  fs/lockd/clntproc.c | 18
-rw-r--r--  fs/lockd/clntxdr.c | 627
-rw-r--r--  fs/lockd/host.c | 409
-rw-r--r--  fs/lockd/mon.c | 110
-rw-r--r--  fs/lockd/svc4proc.c | 20
-rw-r--r--  fs/lockd/svclock.c | 34
-rw-r--r--  fs/lockd/svcproc.c | 28
-rw-r--r--  fs/lockd/xdr.c | 287
-rw-r--r--  fs/lockd/xdr4.c | 255
-rw-r--r--  fs/mbcache.c | 12
-rw-r--r--  fs/namei.c | 8
-rw-r--r--  fs/nfs/callback.c | 83
-rw-r--r--  fs/nfs/callback.h | 59
-rw-r--r--  fs/nfs/callback_proc.c | 326
-rw-r--r--  fs/nfs/callback_xdr.c | 143
-rw-r--r--  fs/nfs/client.c | 302
-rw-r--r--  fs/nfs/delegation.c | 362
-rw-r--r--  fs/nfs/delegation.h | 1
-rw-r--r--  fs/nfs/dir.c | 72
-rw-r--r--  fs/nfs/idmap.c | 2
-rw-r--r--  fs/nfs/inode.c | 3
-rw-r--r--  fs/nfs/internal.h | 19
-rw-r--r--  fs/nfs/mount_clnt.c | 83
-rw-r--r--  fs/nfs/nfs2xdr.c | 1294
-rw-r--r--  fs/nfs/nfs3xdr.c | 2817
-rw-r--r--  fs/nfs/nfs4_fs.h | 13
-rw-r--r--  fs/nfs/nfs4filelayout.c | 6
-rw-r--r--  fs/nfs/nfs4proc.c | 188
-rw-r--r--  fs/nfs/nfs4renewd.c | 11
-rw-r--r--  fs/nfs/nfs4state.c | 293
-rw-r--r--  fs/nfs/nfs4xdr.c | 1426
-rw-r--r--  fs/nfs/pagelist.c | 7
-rw-r--r--  fs/nfs/pnfs.c | 524
-rw-r--r--  fs/nfs/pnfs.h | 76
-rw-r--r--  fs/nfs/proc.c | 5
-rw-r--r--  fs/nfs/super.c | 18
-rw-r--r--  fs/nfs/unlink.c | 2
-rw-r--r--  fs/nfsd/nfs4callback.c | 690
-rw-r--r--  fs/nilfs2/bmap.c | 47
-rw-r--r--  fs/nilfs2/btnode.c | 3
-rw-r--r--  fs/nilfs2/dir.c | 3
-rw-r--r--  fs/nilfs2/file.c | 1
-rw-r--r--  fs/nilfs2/ifile.c | 11
-rw-r--r--  fs/nilfs2/inode.c | 180
-rw-r--r--  fs/nilfs2/ioctl.c | 12
-rw-r--r--  fs/nilfs2/mdt.c | 32
-rw-r--r--  fs/nilfs2/namei.c | 1
-rw-r--r--  fs/nilfs2/nilfs.h | 13
-rw-r--r--  fs/nilfs2/page.c | 86
-rw-r--r--  fs/nilfs2/page.h | 3
-rw-r--r--  fs/nilfs2/recovery.c | 2
-rw-r--r--  fs/nilfs2/sb.h | 8
-rw-r--r--  fs/nilfs2/segment.c | 43
-rw-r--r--  fs/nilfs2/super.c | 30
-rw-r--r--  fs/nilfs2/the_nilfs.c | 6
-rw-r--r--  fs/nilfs2/the_nilfs.h | 3
-rw-r--r--  fs/ocfs2/Kconfig | 2
-rw-r--r--  fs/ocfs2/alloc.c | 77
-rw-r--r--  fs/ocfs2/alloc.h | 4
-rw-r--r--  fs/ocfs2/aops.c | 59
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 246
-rw-r--r--  fs/ocfs2/cluster/netdebug.c | 286
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 145
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h | 33
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 76
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 86
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 200
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.h | 5
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 10
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 3
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 132
-rw-r--r--  fs/ocfs2/namei.c | 5
-rw-r--r--  fs/ocfs2/ocfs2.h | 5
-rw-r--r--  fs/quota/dquot.c | 18
-rw-r--r--  fs/quota/quota_tree.c | 9
-rw-r--r--  fs/sysfs/group.c | 10
-rw-r--r--  fs/sysfs/inode.c | 1
-rw-r--r--  fs/sysfs/sysfs.h | 1
-rw-r--r--  fs/udf/Kconfig | 1
-rw-r--r--  fs/udf/balloc.c | 3
-rw-r--r--  fs/udf/dir.c | 5
-rw-r--r--  fs/udf/file.c | 11
-rw-r--r--  fs/udf/ialloc.c | 21
-rw-r--r--  fs/udf/inode.c | 51
-rw-r--r--  fs/udf/namei.c | 107
-rw-r--r--  fs/udf/partition.c | 27
-rw-r--r--  fs/udf/super.c | 67
-rw-r--r--  fs/udf/symlink.c | 12
-rw-r--r--  fs/udf/udf_i.h | 13
-rw-r--r--  fs/udf/udf_sb.h | 22
-rw-r--r--  fs/udf/udfdecl.h | 4
-rw-r--r--  fs/xfs/linux-2.6/sv.h | 59
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 425
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.h | 16
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 235
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 22
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c | 12
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 22
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 92
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 59
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 1
-rw-r--r--  fs/xfs/xfs_ag.h | 2
-rw-r--r--  fs/xfs/xfs_alloc.c | 351
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 4
-rw-r--r--  fs/xfs/xfs_btree.c | 9
-rw-r--r--  fs/xfs/xfs_buf_item.c | 32
-rw-r--r--  fs/xfs/xfs_buf_item.h | 11
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 97
-rw-r--r--  fs/xfs/xfs_extfree_item.h | 11
-rw-r--r--  fs/xfs/xfs_fsops.c | 1
-rw-r--r--  fs/xfs/xfs_iget.c | 79
-rw-r--r--  fs/xfs/xfs_inode.c | 54
-rw-r--r--  fs/xfs/xfs_inode.h | 15
-rw-r--r--  fs/xfs/xfs_inode_item.c | 90
-rw-r--r--  fs/xfs/xfs_iomap.c | 233
-rw-r--r--  fs/xfs/xfs_iomap.h | 27
-rw-r--r--  fs/xfs/xfs_log.c | 739
-rw-r--r--  fs/xfs/xfs_log_cil.c | 17
-rw-r--r--  fs/xfs/xfs_log_priv.h | 127
-rw-r--r--  fs/xfs/xfs_log_recover.c | 620
-rw-r--r--  fs/xfs/xfs_mount.c | 23
-rw-r--r--  fs/xfs/xfs_mount.h | 14
-rw-r--r--  fs/xfs/xfs_trans.c | 79
-rw-r--r--  fs/xfs/xfs_trans.h | 2
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 232
-rw-r--r--  fs/xfs/xfs_trans_extfree.c | 8
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 35
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 61
192 files changed, 13426 insertions(+), 8596 deletions(-)
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 7e0511476797..814ac4e213a8 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -9,6 +9,8 @@ config 9P_FS
 
 	  If unsure, say N.
 
+if 9P_FS
+
 config 9P_FSCACHE
 	bool "Enable 9P client caching support (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
@@ -20,7 +22,6 @@ config 9P_FSCACHE
 
 config 9P_FS_POSIX_ACL
 	bool "9P POSIX Access Control Lists"
-	depends on 9P_FS
 	select FS_POSIX_ACL
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
@@ -30,3 +31,5 @@ config 9P_FS_POSIX_ACL
 	  Linux website <http://acl.bestbits.at/>.
 
 	  If you don't know what Access Control Lists are, say N
+
+endif
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index f8ba37effd1b..ab8c12780634 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_9P_FS) := 9p.o
 9p-objs := \
 	vfs_super.o \
 	vfs_inode.o \
+	vfs_inode_dotl.o \
 	vfs_addr.o \
 	vfs_file.o \
 	vfs_dir.o \
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 6e58c4ca1e6e..02a2cf616318 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -28,7 +28,7 @@ static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
 {
 	ssize_t size;
 	void *value = NULL;
-	struct posix_acl *acl = NULL;;
+	struct posix_acl *acl = NULL;
 
 	size = v9fs_fid_xattr_get(fid, name, NULL, 0);
 	if (size > 0) {
@@ -365,7 +365,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 	case ACL_TYPE_DEFAULT:
 		name = POSIX_ACL_XATTR_DEFAULT;
 		if (!S_ISDIR(inode->i_mode)) {
-			retval = -EINVAL;
+			retval = acl ? -EINVAL : 0;
 			goto err_out;
 		}
 		break;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index cb6396855e2d..c4b5d8864f0d 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -113,9 +113,27 @@ struct v9fs_session_info {
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 								char *);
-void v9fs_session_close(struct v9fs_session_info *v9ses);
-void v9fs_session_cancel(struct v9fs_session_info *v9ses);
-void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
+extern void v9fs_session_close(struct v9fs_session_info *v9ses);
+extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
+extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+			struct nameidata *nameidata);
+extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry);
+extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			void *p);
+extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses,
+				struct p9_fid *fid,
+				struct super_block *sb);
+
+extern const struct inode_operations v9fs_dir_inode_operations_dotl;
+extern const struct inode_operations v9fs_file_inode_operations_dotl;
+extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
+extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses,
+					struct p9_fid *fid,
+					struct super_block *sb);
 
 /* other default globals */
 #define V9FS_PORT	564
@@ -138,3 +156,21 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
 {
 	return v9ses->flags & V9FS_PROTO_2000L;
 }
+
+/**
+ * v9fs_inode_from_fid - Helper routine to populate an inode by
+ * issuing a attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+		    struct super_block *sb)
+{
+	if (v9fs_proto_dotl(v9ses))
+		return v9fs_inode_dotl(v9ses, fid, sb);
+	else
+		return v9fs_inode(v9ses, fid, sb);
+}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 59782981b225..5076eeb95502 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -49,15 +49,8 @@
 
 static const struct inode_operations v9fs_dir_inode_operations;
 static const struct inode_operations v9fs_dir_inode_operations_dotu;
-static const struct inode_operations v9fs_dir_inode_operations_dotl;
 static const struct inode_operations v9fs_file_inode_operations;
-static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
-static const struct inode_operations v9fs_symlink_inode_operations_dotl;
-
-static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
-		    dev_t rdev);
 
 /**
  * unixmode2p9mode - convert unix mode bits to plan 9
@@ -251,41 +244,6 @@ void v9fs_destroy_inode(struct inode *inode)
 #endif
 
 /**
- * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
- * new file system object. This checks the S_ISGID to determine the owning
- * group of the new file system object.
- */
-
-static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
-{
-	BUG_ON(dir_inode == NULL);
-
-	if (dir_inode->i_mode & S_ISGID) {
-		/* set_gid bit is set.*/
-		return dir_inode->i_gid;
-	}
-	return current_fsgid();
-}
-
-/**
- * v9fs_dentry_from_dir_inode - helper function to get the dentry from
- * dir inode.
- *
- */
-
-static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
-{
-	struct dentry *dentry;
-
-	spin_lock(&inode->i_lock);
-	/* Directory should have only one entry. */
-	BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
-	dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
-	spin_unlock(&inode->i_lock);
-	return dentry;
-}
-
-/**
  * v9fs_get_inode - helper function to setup an inode
  * @sb: superblock
  * @mode: mode to setup inode with
@@ -454,7 +412,7 @@ void v9fs_evict_inode(struct inode *inode)
 #endif
 }
 
-static struct inode *
+struct inode *
 v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 	   struct super_block *sb)
 {
@@ -489,60 +447,6 @@ error:
 	return ERR_PTR(err);
 }
 
-static struct inode *
-v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-		struct super_block *sb)
-{
-	struct inode *ret = NULL;
-	int err;
-	struct p9_stat_dotl *st;
-
-	st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
-	if (IS_ERR(st))
-		return ERR_CAST(st);
-
-	ret = v9fs_get_inode(sb, st->st_mode);
-	if (IS_ERR(ret)) {
-		err = PTR_ERR(ret);
-		goto error;
-	}
-
-	v9fs_stat2inode_dotl(st, ret);
-	ret->i_ino = v9fs_qid2ino(&st->qid);
-#ifdef CONFIG_9P_FSCACHE
-	v9fs_vcookie_set_qid(ret, &st->qid);
-	v9fs_cache_inode_get_cookie(ret);
-#endif
-	err = v9fs_get_acl(ret, fid);
-	if (err) {
-		iput(ret);
-		goto error;
-	}
-	kfree(st);
-	return ret;
-error:
-	kfree(st);
-	return ERR_PTR(err);
-}
-
-/**
- * v9fs_inode_from_fid - Helper routine to populate an inode by
- * issuing a attribute request
- * @v9ses: session information
- * @fid: fid to issue attribute request for
- * @sb: superblock on which to create inode
- *
- */
-static inline struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-		    struct super_block *sb)
-{
-	if (v9fs_proto_dotl(v9ses))
-		return v9fs_inode_dotl(v9ses, fid, sb);
-	else
-		return v9fs_inode(v9ses, fid, sb);
-}
-
 /**
  * v9fs_remove - helper function to remove files and directories
  * @dir: directory inode that is being deleted
@@ -633,12 +537,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
 		goto error;
 	}
-
-	if (v9ses->cache)
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-	else
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-
 	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
 	if (err < 0)
@@ -657,144 +555,6 @@ error:
 }
 
 /**
- * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
- * @dir: directory inode that is being created
- * @dentry: dentry that is being deleted
- * @mode: create permissions
- * @nd: path information
- *
- */
-
-static int
-v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
-		struct nameidata *nd)
-{
-	int err = 0;
-	char *name = NULL;
-	gid_t gid;
-	int flags;
-	mode_t mode;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid = NULL;
-	struct p9_fid *dfid, *ofid;
-	struct file *filp;
-	struct p9_qid qid;
-	struct inode *inode;
-	struct posix_acl *pacl = NULL, *dacl = NULL;
-
-	v9ses = v9fs_inode2v9ses(dir);
-	if (nd && nd->flags & LOOKUP_OPEN)
-		flags = nd->intent.open.flags - 1;
-	else {
-		/*
-		 * create call without LOOKUP_OPEN is due
-		 * to mknod of regular files. So use mknod
-		 * operation.
-		 */
-		return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
-	}
-
-	name = (char *) dentry->d_name.name;
-	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
-			"mode:0x%x\n", name, flags, omode);
-
-	dfid = v9fs_fid_lookup(dentry->d_parent);
-	if (IS_ERR(dfid)) {
-		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-		return err;
-	}
-
-	/* clone a fid to use for creation */
-	ofid = p9_client_walk(dfid, 0, NULL, 1);
-	if (IS_ERR(ofid)) {
-		err = PTR_ERR(ofid);
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-		return err;
-	}
-
-	gid = v9fs_get_fsgid_for_create(dir);
-
-	mode = omode;
-	/* Update mode based on ACL value */
-	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
-	if (err) {
-		P9_DPRINTK(P9_DEBUG_VFS,
-			"Failed to get acl values in creat %d\n", err);
-		goto error;
-	}
-	err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
-	if (err < 0) {
-		P9_DPRINTK(P9_DEBUG_VFS,
-			"p9_client_open_dotl failed in creat %d\n",
-			err);
-		goto error;
-	}
-	/* instantiate inode and assign the unopened fid to the dentry */
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE ||
-			(nd && nd->flags & LOOKUP_OPEN)) {
-		fid = p9_client_walk(dfid, 1, &name, 1);
-		if (IS_ERR(fid)) {
-			err = PTR_ERR(fid);
-			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-				err);
-			fid = NULL;
-			goto error;
-		}
-
-		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-				err);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-		d_instantiate(dentry, inode);
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
-		/* The fid would get clunked via a dput */
-		fid = NULL;
-	} else {
-		/*
-		 * Not in cached mode. No need to populate
-		 * inode with stat. We need to get an inode
-		 * so that we can set the acl with dentry
-		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-		d_instantiate(dentry, inode);
-	}
-	/* Now set the ACL based on the default value */
-	v9fs_set_create_acl(dentry, dacl, pacl);
-
-	/* if we are opening a file, assign the open fid to the file */
-	if (nd && nd->flags & LOOKUP_OPEN) {
-		filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
-		if (IS_ERR(filp)) {
-			p9_client_clunk(ofid);
-			return PTR_ERR(filp);
-		}
-		filp->private_data = ofid;
-	} else
-		p9_client_clunk(ofid);
-
-	return 0;
-
-error:
-	if (ofid)
-		p9_client_clunk(ofid);
-	if (fid)
-		p9_client_clunk(fid);
-	return err;
-}
-
-/**
  * v9fs_vfs_create - VFS hook to create files
  * @dir: directory inode that is being created
  * @dentry: dentry that is being deleted
@@ -884,107 +644,6 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return err;
 }
 
-
-/**
- * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
- * @dir: inode that is being unlinked
- * @dentry: dentry that is being unlinked
- * @mode: mode for new directory
- *
- */
-
-static int v9fs_vfs_mkdir_dotl(struct inode *dir,
-			struct dentry *dentry, int omode)
-{
-	int err;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid = NULL, *dfid = NULL;
-	gid_t gid;
-	char *name;
-	mode_t mode;
-	struct inode *inode;
-	struct p9_qid qid;
-	struct dentry *dir_dentry;
-	struct posix_acl *dacl = NULL, *pacl = NULL;
-
-	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
-	err = 0;
-	v9ses = v9fs_inode2v9ses(dir);
-
-	omode |= S_IFDIR;
-	if (dir->i_mode & S_ISGID)
-		omode |= S_ISGID;
-
-	dir_dentry = v9fs_dentry_from_dir_inode(dir);
-	dfid = v9fs_fid_lookup(dir_dentry);
-	if (IS_ERR(dfid)) {
-		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-		dfid = NULL;
-		goto error;
-	}
-
-	gid = v9fs_get_fsgid_for_create(dir);
-	mode = omode;
-	/* Update mode based on ACL value */
-	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
-	if (err) {
-		P9_DPRINTK(P9_DEBUG_VFS,
-			"Failed to get acl values in mkdir %d\n", err);
-		goto error;
-	}
-	name = (char *) dentry->d_name.name;
-	err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
-	if (err < 0)
-		goto error;
-
-	/* instantiate inode and assign the unopened fid to the dentry */
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		fid = p9_client_walk(dfid, 1, &name, 1);
-		if (IS_ERR(fid)) {
-			err = PTR_ERR(fid);
-			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-				err);
-			fid = NULL;
-			goto error;
-		}
-
-		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-				err);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-		d_instantiate(dentry, inode);
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
-		fid = NULL;
-	} else {
-		/*
-		 * Not in cached mode. No need to populate
-		 * inode with stat. We need to get an inode
-		 * so that we can set the acl with dentry
-		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-		d_instantiate(dentry, inode);
-	}
-	/* Now set the ACL based on the default value */
-	v9fs_set_create_acl(dentry, dacl, pacl);
-
-error:
-	if (fid)
-		p9_client_clunk(fid);
-	return err;
-}
-
 /**
  * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
  * @dir: inode that is being walked from
@@ -993,7 +652,7 @@ error:
  *
  */
 
-static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 				      struct nameidata *nameidata)
 {
 	struct super_block *sb;
@@ -1063,7 +722,7 @@ error:
  *
  */
 
-static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
+int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
 {
 	return v9fs_remove(i, d, 0);
 }
@@ -1075,7 +734,7 @@ static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
  *
  */
 
-static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
+int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
 {
 	return v9fs_remove(i, d, 1);
 }
@@ -1089,7 +748,7 @@ static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
  *
  */
 
-static int
+int
 v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		struct inode *new_dir, struct dentry *new_dentry)
 {
@@ -1196,42 +855,6 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	return 0;
 }
 
-static int
-v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
-		 struct kstat *stat)
-{
-	int err;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid;
-	struct p9_stat_dotl *st;
-
-	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
-	err = -EPERM;
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
-		return simple_getattr(mnt, dentry, stat);
-
-	fid = v9fs_fid_lookup(dentry);
-	if (IS_ERR(fid))
-		return PTR_ERR(fid);
-
-	/* Ask for all the fields in stat structure. Server will return
-	 * whatever it supports
-	 */
-
-	st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
-	if (IS_ERR(st))
-		return PTR_ERR(st);
-
-	v9fs_stat2inode_dotl(st, dentry->d_inode);
-	generic_fillattr(dentry->d_inode, stat);
-	/* Change block size to what the server returned */
-	stat->blksize = st->st_blksize;
-
-	kfree(st);
-	return 0;
-}
-
 /**
  * v9fs_vfs_setattr - set file metadata
  * @dentry: file whose metadata to set
@@ -1291,64 +914,6 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 }
 
 /**
- * v9fs_vfs_setattr_dotl - set file metadata
- * @dentry: file whose metadata to set
- * @iattr: metadata assignment structure
- *
- */
-
-int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
-{
-	int retval;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid;
-	struct p9_iattr_dotl p9attr;
-
-	P9_DPRINTK(P9_DEBUG_VFS, "\n");
-
-	retval = inode_change_ok(dentry->d_inode, iattr);
-	if (retval)
-		return retval;
-
-	p9attr.valid = iattr->ia_valid;
-	p9attr.mode = iattr->ia_mode;
-	p9attr.uid = iattr->ia_uid;
-	p9attr.gid = iattr->ia_gid;
-	p9attr.size = iattr->ia_size;
-	p9attr.atime_sec = iattr->ia_atime.tv_sec;
-	p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
-	p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
-	p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
-
-	retval = -EPERM;
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
-	fid = v9fs_fid_lookup(dentry);
-	if (IS_ERR(fid))
-		return PTR_ERR(fid);
-
-	retval = p9_client_setattr(fid, &p9attr);
-	if (retval < 0)
-		return retval;
-
-	if ((iattr->ia_valid & ATTR_SIZE) &&
-	    iattr->ia_size != i_size_read(dentry->d_inode)) {
-		retval = vmtruncate(dentry->d_inode, iattr->ia_size);
-		if (retval)
-			return retval;
-	}
-
-	setattr_copy(dentry->d_inode, iattr);
-	mark_inode_dirty(dentry->d_inode);
-	if (iattr->ia_valid & ATTR_MODE) {
-		/* We also want to update ACL when we update mode bits */
-		retval = v9fs_acl_chmod(dentry);
-		if (retval < 0)
-			return retval;
-	}
-	return 0;
-}
-
-/**
  * v9fs_stat2inode - populate an inode structure with mistat info
  * @stat: Plan 9 metadata (mistat) structure
  * @inode: inode to populate
@@ -1426,77 +991,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 }
 
 /**
- * v9fs_stat2inode_dotl - populate an inode structure with stat info
- * @stat: stat structure
- * @inode: inode to populate
- * @sb: superblock of filesystem
- *
- */
-
-void
-v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
-{
-
-	if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
-		inode->i_atime.tv_sec = stat->st_atime_sec;
-		inode->i_atime.tv_nsec = stat->st_atime_nsec;
-		inode->i_mtime.tv_sec = stat->st_mtime_sec;
-		inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
-		inode->i_ctime.tv_sec = stat->st_ctime_sec;
-		inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
-		inode->i_uid = stat->st_uid;
-		inode->i_gid = stat->st_gid;
-		inode->i_nlink = stat->st_nlink;
-		inode->i_mode = stat->st_mode;
-		inode->i_rdev = new_decode_dev(stat->st_rdev);
-
-		if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
-			init_special_inode(inode, inode->i_mode, inode->i_rdev);
-
-		i_size_write(inode, stat->st_size);
-		inode->i_blocks = stat->st_blocks;
-	} else {
-		if (stat->st_result_mask & P9_STATS_ATIME) {
-			inode->i_atime.tv_sec = stat->st_atime_sec;
-			inode->i_atime.tv_nsec = stat->st_atime_nsec;
-		}
-		if (stat->st_result_mask & P9_STATS_MTIME) {
-			inode->i_mtime.tv_sec = stat->st_mtime_sec;
-			inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
-		}
-		if (stat->st_result_mask & P9_STATS_CTIME) {
-			inode->i_ctime.tv_sec = stat->st_ctime_sec;
-			inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
-		}
-		if (stat->st_result_mask & P9_STATS_UID)
-			inode->i_uid = stat->st_uid;
-		if (stat->st_result_mask & P9_STATS_GID)
-			inode->i_gid = stat->st_gid;
-		if (stat->st_result_mask & P9_STATS_NLINK)
-			inode->i_nlink = stat->st_nlink;
-		if (stat->st_result_mask & P9_STATS_MODE) {
-			inode->i_mode = stat->st_mode;
-			if ((S_ISBLK(inode->i_mode)) ||
-						(S_ISCHR(inode->i_mode)))
-				init_special_inode(inode, inode->i_mode,
-								inode->i_rdev);
-		}
-		if (stat->st_result_mask & P9_STATS_RDEV)
-			inode->i_rdev = new_decode_dev(stat->st_rdev);
-		if (stat->st_result_mask & P9_STATS_SIZE)
-			i_size_write(inode, stat->st_size);
-		if (stat->st_result_mask & P9_STATS_BLOCKS)
-			inode->i_blocks = stat->st_blocks;
-	}
-	if (stat->st_result_mask & P9_STATS_GEN)
-		inode->i_generation = stat->st_gen;
-
-	/* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
-	 * because the inode structure does not have fields for them.
-	 */
-}
-
-/**
  * v9fs_qid2ino - convert qid into inode number
  * @qid: qid to hash
  *
@@ -1602,7 +1096,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
  *
  */
 
-static void
+void
 v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
 {
 	char *s = nd_get_link(nd);
@@ -1646,94 +1140,6 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 }
 
 /**
- * v9fs_vfs_symlink_dotl - helper function to create symlinks
- * @dir: directory inode containing symlink
- * @dentry: dentry for symlink
- * @symname: symlink data
- *
- * See Also: 9P2000.L RFC for more information
- *
- */
-
-static int
-v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
-		const char *symname)
-{
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *dfid;
-	struct p9_fid *fid = NULL;
-	struct inode *inode;
-	struct p9_qid qid;
-	char *name;
-	int err;
-	gid_t gid;
-
-	name = (char *) dentry->d_name.name;
-	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
-			dir->i_ino, name, symname);
-	v9ses = v9fs_inode2v9ses(dir);
-
-	dfid = v9fs_fid_lookup(dentry->d_parent);
-	if (IS_ERR(dfid)) {
-		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-		return err;
-	}
-
-	gid = v9fs_get_fsgid_for_create(dir);
-
-	/* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
-	err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
-
-	if (err < 0) {
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
-		goto error;
-	}
-
-	if (v9ses->cache) {
-		/* Now walk from the parent so we can get an unopened fid. */
-		fid = p9_client_walk(dfid, 1, &name, 1);
-		if (IS_ERR(fid)) {
-			err = PTR_ERR(fid);
-			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-				err);
-			fid = NULL;
-			goto error;
-		}
-
-		/* instantiate inode and assign the unopened fid to dentry */
-		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-				err);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-		d_instantiate(dentry, inode);
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
-		fid = NULL;
-	} else {
-		/* Not in cached mode. No need to populate inode with stat */
-		inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-		d_instantiate(dentry, inode);
-	}
-
-error:
-	if (fid)
-		p9_client_clunk(fid);
-
-	return err;
-}
-
-/**
  * v9fs_vfs_symlink - helper function to create symlinks
  * @dir: directory inode containing symlink
  * @dentry: dentry for symlink
@@ -1792,77 +1198,6 @@ clunk_fid:
 }
 
 /**
- * v9fs_vfs_link_dotl - create a hardlink for dotl
- * @old_dentry: dentry for file to link to
- * @dir: inode destination for new link
- * @dentry: dentry for link
- *
- */
-
-static int
-v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
-		struct dentry *dentry)
-{
-	int err;
-	struct p9_fid *dfid, *oldfid;
-	char *name;
-	struct v9fs_session_info *v9ses;
-	struct dentry *dir_dentry;
-
-	P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
-			dir->i_ino, old_dentry->d_name.name,
-			dentry->d_name.name);
-
-	v9ses = v9fs_inode2v9ses(dir);
-	dir_dentry = v9fs_dentry_from_dir_inode(dir);
-	dfid = v9fs_fid_lookup(dir_dentry);
-	if (IS_ERR(dfid))
-		return PTR_ERR(dfid);
-
-	oldfid = v9fs_fid_lookup(old_dentry);
-	if (IS_ERR(oldfid))
-		return PTR_ERR(oldfid);
-
-	name = (char *) dentry->d_name.name;
-
-	err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
-
-	if (err < 0) {
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
-		return err;
-	}
-
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		/* Get the latest stat info from server. */
-		struct p9_fid *fid;
-		struct p9_stat_dotl *st;
-
-		fid = v9fs_fid_lookup(old_dentry);
-		if (IS_ERR(fid))
-			return PTR_ERR(fid);
-
-		st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
-		if (IS_ERR(st))
-			return PTR_ERR(st);
-
-		v9fs_stat2inode_dotl(st, old_dentry->d_inode);
-
-		kfree(st);
-	} else {
-		/* Caching disabled. No need to get upto date stat info.
-		 * This dentry will be released immediately. So, just hold the
-		 * inode
-		 */
-		ihold(old_dentry->d_inode);
-	}
-
-	d_set_d_op(dentry, old_dentry->d_op);
-	d_instantiate(dentry, old_dentry->d_inode);
-
-	return err;
-}
-
-/**
  * v9fs_vfs_mknod - create a special file
  * @dir: inode destination for new link
  * @dentry: dentry for file
@@ -1907,160 +1242,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	return retval;
 }
 
-/**
- * v9fs_vfs_mknod_dotl - create a special file
- * @dir: inode destination for new link
- * @dentry: dentry for file
- * @mode: mode for creation
- * @rdev: device associated with special file
- *
- */
-static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
-		dev_t rdev)
-{
-	int err;
-	char *name;
-	mode_t mode;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid = NULL, *dfid = NULL;
-	struct inode *inode;
-	gid_t gid;
-	struct p9_qid qid;
-	struct dentry *dir_dentry;
-	struct posix_acl *dacl = NULL, *pacl = NULL;
-
-	P9_DPRINTK(P9_DEBUG_VFS,
-		" %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
-		dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
-
-	if (!new_valid_dev(rdev))
-		return -EINVAL;
-
-	v9ses = v9fs_inode2v9ses(dir);
-	dir_dentry = v9fs_dentry_from_dir_inode(dir);
-	dfid = v9fs_fid_lookup(dir_dentry);
-	if (IS_ERR(dfid)) {
-		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-		dfid = NULL;
-		goto error;
-	}
-
-	gid = v9fs_get_fsgid_for_create(dir);
-	mode = omode;
-	/* Update mode based on ACL value */
-	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
-	if (err) {
-		P9_DPRINTK(P9_DEBUG_VFS,
-			"Failed to get acl values in mknod %d\n", err);
-		goto error;
-	}
-	name = (char *) dentry->d_name.name;
-
-	err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
-	if (err < 0)
-		goto error;
-
-	/* instantiate inode and assign the unopened fid to the dentry */
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		fid = p9_client_walk(dfid, 1, &name, 1);
-		if (IS_ERR(fid)) {
-			err = PTR_ERR(fid);
-			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-				err);
-			fid = NULL;
-			goto error;
-		}
-
-		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-				err);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
-		d_instantiate(dentry, inode);
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
-		fid = NULL;
-	} else {
-		/*
-		 * Not in cached mode. No need to populate inode with stat.
-		 * socket syscall returns a fd, so we need instantiate
-		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			goto error;
-		}
-		d_set_d_op(dentry, &v9fs_dentry_operations);
-		d_instantiate(dentry, inode);
-	}
-	/* Now set the ACL based on the default value */
-	v9fs_set_create_acl(dentry, dacl, pacl);
-error:
-	if (fid)
-		p9_client_clunk(fid);
-	return err;
-}
-
-static int
-v9fs_vfs_readlink_dotl(struct dentry *dentry, char *buffer, int buflen)
-{
-	int retval;
-	struct p9_fid *fid;
-	char *target = NULL;
-
-	P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
-	retval = -EPERM;
-	fid = v9fs_fid_lookup(dentry);
-	if (IS_ERR(fid))
-		return PTR_ERR(fid);
-
-	retval = p9_client_readlink(fid, &target);
-	if (retval < 0)
-		return retval;
-
-	strncpy(buffer, target, buflen);
-	P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s\n", dentry->d_name.name, buffer);
-
-	retval = strnlen(buffer, buflen);
-	return retval;
-}
-
-/**
- * v9fs_vfs_follow_link_dotl - follow a symlink path
- * @dentry: dentry for symlink
- * @nd: nameidata
- *
- */
-
-static void *
-v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
-{
-	int len = 0;
-	char *link = __getname();
-
-	P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
-
-	if (!link)
-		link = ERR_PTR(-ENOMEM);
-	else {
-		len = v9fs_vfs_readlink_dotl(dentry, link, PATH_MAX);
-		if (len < 0) {
-			__putname(link);
-			link = ERR_PTR(len);
-		} else
-			link[min(len, PATH_MAX-1)] = 0;
-	}
-	nd_set_link(nd, link);
-
-	return NULL;
-}
-
 static const struct inode_operations v9fs_dir_inode_operations_dotu = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
@@ -2075,25 +1256,6 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
 	.setattr = v9fs_vfs_setattr,
 };
 
-static const struct inode_operations v9fs_dir_inode_operations_dotl = {
-	.create = v9fs_vfs_create_dotl,
-	.lookup = v9fs_vfs_lookup,
-	.link = v9fs_vfs_link_dotl,
-	.symlink = v9fs_vfs_symlink_dotl,
-	.unlink = v9fs_vfs_unlink,
-	.mkdir = v9fs_vfs_mkdir_dotl,
-	.rmdir = v9fs_vfs_rmdir,
-	.mknod = v9fs_vfs_mknod_dotl,
-	.rename = v9fs_vfs_rename,
-	.getattr = v9fs_vfs_getattr_dotl,
-	.setattr = v9fs_vfs_setattr_dotl,
-	.setxattr = generic_setxattr,
-	.getxattr = generic_getxattr,
-	.removexattr = generic_removexattr,
-	.listxattr = v9fs_listxattr,
-	.check_acl = v9fs_check_acl,
-};
-
 static const struct inode_operations v9fs_dir_inode_operations = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
@@ -2111,16 +1273,6 @@ static const struct inode_operations v9fs_file_inode_operations = {
 	.setattr = v9fs_vfs_setattr,
 };
 
-static const struct inode_operations v9fs_file_inode_operations_dotl = {
-	.getattr = v9fs_vfs_getattr_dotl,
-	.setattr = v9fs_vfs_setattr_dotl,
-	.setxattr = generic_setxattr,
-	.getxattr = generic_getxattr,
-	.removexattr = generic_removexattr,
-	.listxattr = v9fs_listxattr,
-	.check_acl = v9fs_check_acl,
-};
-
 static const struct inode_operations v9fs_symlink_inode_operations = {
 	.readlink = generic_readlink,
 	.follow_link = v9fs_vfs_follow_link,
@@ -2129,14 +1281,3 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
 	.setattr = v9fs_vfs_setattr,
 };
 
-static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
-	.readlink = v9fs_vfs_readlink_dotl,
-	.follow_link = v9fs_vfs_follow_link_dotl,
-	.put_link = v9fs_vfs_put_link,
-	.getattr = v9fs_vfs_getattr_dotl,
-	.setattr = v9fs_vfs_setattr_dotl,
-	.setxattr = generic_setxattr,
-	.getxattr = generic_getxattr,
-	.removexattr = generic_removexattr,
-	.listxattr = v9fs_listxattr,
-};
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
new file mode 100644
index 000000000000..fe3ffa9aace4
--- /dev/null
+++ b/fs/9p/vfs_inode_dotl.c
@@ -0,0 +1,824 @@
1/*
2 * linux/fs/9p/vfs_inode_dotl.c
3 *
4 * This file contains vfs inode ops for the 9P2000.L protocol.
5 *
6 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
7 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to:
20 * Free Software Foundation
21 * 51 Franklin Street, Fifth Floor
22 * Boston, MA 02111-1301 USA
23 *
24 */
25
26#include <linux/module.h>
27#include <linux/errno.h>
28#include <linux/fs.h>
29#include <linux/file.h>
30#include <linux/pagemap.h>
31#include <linux/stat.h>
32#include <linux/string.h>
33#include <linux/inet.h>
34#include <linux/namei.h>
35#include <linux/idr.h>
36#include <linux/sched.h>
37#include <linux/slab.h>
38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
40#include <net/9p/9p.h>
41#include <net/9p/client.h>
42
43#include "v9fs.h"
44#include "v9fs_vfs.h"
45#include "fid.h"
46#include "cache.h"
47#include "xattr.h"
48#include "acl.h"
49
50static int
51v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
52 dev_t rdev);
53
54/**
55 * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
56 * new file system object. This checks the S_ISGID to determine the owning
57 * group of the new file system object.
58 */
59
60static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
61{
62 BUG_ON(dir_inode == NULL);
63
64 if (dir_inode->i_mode & S_ISGID) {
65 /* set_gid bit is set.*/
66 return dir_inode->i_gid;
67 }
68 return current_fsgid();
69}
70
71/**
72 * v9fs_dentry_from_dir_inode - helper function to get the dentry from
73 * dir inode.
74 *
75 */
76
77static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
78{
79 struct dentry *dentry;
80
81 spin_lock(&inode->i_lock);
82 /* Directory should have only one entry. */
83 BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
84 dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
85 spin_unlock(&inode->i_lock);
86 return dentry;
87}
88
89struct inode *
90v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
91 struct super_block *sb)
92{
93 struct inode *ret = NULL;
94 int err;
95 struct p9_stat_dotl *st;
96
97 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
98 if (IS_ERR(st))
99 return ERR_CAST(st);
100
101 ret = v9fs_get_inode(sb, st->st_mode);
102 if (IS_ERR(ret)) {
103 err = PTR_ERR(ret);
104 goto error;
105 }
106
107 v9fs_stat2inode_dotl(st, ret);
108 ret->i_ino = v9fs_qid2ino(&st->qid);
109#ifdef CONFIG_9P_FSCACHE
110 v9fs_vcookie_set_qid(ret, &st->qid);
111 v9fs_cache_inode_get_cookie(ret);
112#endif
113 err = v9fs_get_acl(ret, fid);
114 if (err) {
115 iput(ret);
116 goto error;
117 }
118 kfree(st);
119 return ret;
120error:
121 kfree(st);
122 return ERR_PTR(err);
123}
124
125/**
126 * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
127 * @dir: directory inode that is being created
128 * @dentry: dentry that is being deleted
129 * @mode: create permissions
130 * @nd: path information
131 *
132 */
133
134static int
135v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
136 struct nameidata *nd)
137{
138 int err = 0;
139 char *name = NULL;
140 gid_t gid;
141 int flags;
142 mode_t mode;
143 struct v9fs_session_info *v9ses;
144 struct p9_fid *fid = NULL;
145 struct p9_fid *dfid, *ofid;
146 struct file *filp;
147 struct p9_qid qid;
148 struct inode *inode;
149 struct posix_acl *pacl = NULL, *dacl = NULL;
150
151 v9ses = v9fs_inode2v9ses(dir);
152 if (nd && nd->flags & LOOKUP_OPEN)
153 flags = nd->intent.open.flags - 1;
154 else {
155 /*
156 * create call without LOOKUP_OPEN is due
157 * to mknod of regular files. So use mknod
158 * operation.
159 */
160 return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
161 }
162
163 name = (char *) dentry->d_name.name;
164 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
165 "mode:0x%x\n", name, flags, omode);
166
167 dfid = v9fs_fid_lookup(dentry->d_parent);
168 if (IS_ERR(dfid)) {
169 err = PTR_ERR(dfid);
170 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
171 return err;
172 }
173
174 /* clone a fid to use for creation */
175 ofid = p9_client_walk(dfid, 0, NULL, 1);
176 if (IS_ERR(ofid)) {
177 err = PTR_ERR(ofid);
178 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
179 return err;
180 }
181
182 gid = v9fs_get_fsgid_for_create(dir);
183
184 mode = omode;
185 /* Update mode based on ACL value */
186 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
187 if (err) {
188 P9_DPRINTK(P9_DEBUG_VFS,
189 "Failed to get acl values in creat %d\n", err);
190 goto error;
191 }
192 err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
193 if (err < 0) {
194 P9_DPRINTK(P9_DEBUG_VFS,
195 "p9_client_open_dotl failed in creat %d\n",
196 err);
197 goto error;
198 }
199
200 /* instantiate inode and assign the unopened fid to the dentry */
201 fid = p9_client_walk(dfid, 1, &name, 1);
202 if (IS_ERR(fid)) {
203 err = PTR_ERR(fid);
204 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
205 fid = NULL;
206 goto error;
207 }
208 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
209 if (IS_ERR(inode)) {
210 err = PTR_ERR(inode);
211 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
212 goto error;
213 }
214 d_instantiate(dentry, inode);
215 err = v9fs_fid_add(dentry, fid);
216 if (err < 0)
217 goto error;
218
219 /* Now set the ACL based on the default value */
220 v9fs_set_create_acl(dentry, dacl, pacl);
221
222 /* Since we are opening a file, assign the open fid to the file */
223 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
224 if (IS_ERR(filp)) {
225 p9_client_clunk(ofid);
226 return PTR_ERR(filp);
227 }
228 filp->private_data = ofid;
229 return 0;
230
231error:
232 if (ofid)
233 p9_client_clunk(ofid);
234 if (fid)
235 p9_client_clunk(fid);
236 return err;
237}
238
239/**
240 * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
241 * @dir: inode that is being unlinked
242 * @dentry: dentry that is being unlinked
243 * @mode: mode for new directory
244 *
245 */
246
247static int v9fs_vfs_mkdir_dotl(struct inode *dir,
248 struct dentry *dentry, int omode)
249{
250 int err;
251 struct v9fs_session_info *v9ses;
252 struct p9_fid *fid = NULL, *dfid = NULL;
253 gid_t gid;
254 char *name;
255 mode_t mode;
256 struct inode *inode;
257 struct p9_qid qid;
258 struct dentry *dir_dentry;
259 struct posix_acl *dacl = NULL, *pacl = NULL;
260
261 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
262 err = 0;
263 v9ses = v9fs_inode2v9ses(dir);
264
265 omode |= S_IFDIR;
266 if (dir->i_mode & S_ISGID)
267 omode |= S_ISGID;
268
269 dir_dentry = v9fs_dentry_from_dir_inode(dir);
270 dfid = v9fs_fid_lookup(dir_dentry);
271 if (IS_ERR(dfid)) {
272 err = PTR_ERR(dfid);
273 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
274 dfid = NULL;
275 goto error;
276 }
277
278 gid = v9fs_get_fsgid_for_create(dir);
279 mode = omode;
280 /* Update mode based on ACL value */
281 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
282 if (err) {
283 P9_DPRINTK(P9_DEBUG_VFS,
284 "Failed to get acl values in mkdir %d\n", err);
285 goto error;
286 }
287 name = (char *) dentry->d_name.name;
288 err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
289 if (err < 0)
290 goto error;
291
292 /* instantiate inode and assign the unopened fid to the dentry */
293 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
294 fid = p9_client_walk(dfid, 1, &name, 1);
295 if (IS_ERR(fid)) {
296 err = PTR_ERR(fid);
297 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
298 err);
299 fid = NULL;
300 goto error;
301 }
302
303 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
304 if (IS_ERR(inode)) {
305 err = PTR_ERR(inode);
306 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
307 err);
308 goto error;
309 }
310 d_instantiate(dentry, inode);
311 err = v9fs_fid_add(dentry, fid);
312 if (err < 0)
313 goto error;
314 fid = NULL;
315 } else {
316 /*
317 * Not in cached mode. No need to populate
318 * inode with stat. We need to get an inode
319 * so that we can set the acl with dentry
320 */
321 inode = v9fs_get_inode(dir->i_sb, mode);
322 if (IS_ERR(inode)) {
323 err = PTR_ERR(inode);
324 goto error;
325 }
326 d_instantiate(dentry, inode);
327 }
328 /* Now set the ACL based on the default value */
329 v9fs_set_create_acl(dentry, dacl, pacl);
330
331error:
332 if (fid)
333 p9_client_clunk(fid);
334 return err;
335}
336
337static int
338v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
339 struct kstat *stat)
340{
341 int err;
342 struct v9fs_session_info *v9ses;
343 struct p9_fid *fid;
344 struct p9_stat_dotl *st;
345
346 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
347 err = -EPERM;
348 v9ses = v9fs_inode2v9ses(dentry->d_inode);
349 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
350 return simple_getattr(mnt, dentry, stat);
351
352 fid = v9fs_fid_lookup(dentry);
353 if (IS_ERR(fid))
354 return PTR_ERR(fid);
355
356 /* Ask for all the fields in stat structure. Server will return
357 * whatever it supports
358 */
359
360 st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
361 if (IS_ERR(st))
362 return PTR_ERR(st);
363
364 v9fs_stat2inode_dotl(st, dentry->d_inode);
365 generic_fillattr(dentry->d_inode, stat);
366 /* Change block size to what the server returned */
367 stat->blksize = st->st_blksize;
368
369 kfree(st);
370 return 0;
371}
372
373/**
374 * v9fs_vfs_setattr_dotl - set file metadata
375 * @dentry: file whose metadata to set
376 * @iattr: metadata assignment structure
377 *
378 */
379
380int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
381{
382 int retval;
383 struct v9fs_session_info *v9ses;
384 struct p9_fid *fid;
385 struct p9_iattr_dotl p9attr;
386
387 P9_DPRINTK(P9_DEBUG_VFS, "\n");
388
389 retval = inode_change_ok(dentry->d_inode, iattr);
390 if (retval)
391 return retval;
392
393 p9attr.valid = iattr->ia_valid;
394 p9attr.mode = iattr->ia_mode;
395 p9attr.uid = iattr->ia_uid;
396 p9attr.gid = iattr->ia_gid;
397 p9attr.size = iattr->ia_size;
398 p9attr.atime_sec = iattr->ia_atime.tv_sec;
399 p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
400 p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
401 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
402
403 retval = -EPERM;
404 v9ses = v9fs_inode2v9ses(dentry->d_inode);
405 fid = v9fs_fid_lookup(dentry);
406 if (IS_ERR(fid))
407 return PTR_ERR(fid);
408
409 retval = p9_client_setattr(fid, &p9attr);
410 if (retval < 0)
411 return retval;
412
413 if ((iattr->ia_valid & ATTR_SIZE) &&
414 iattr->ia_size != i_size_read(dentry->d_inode)) {
415 retval = vmtruncate(dentry->d_inode, iattr->ia_size);
416 if (retval)
417 return retval;
418 }
419
420 setattr_copy(dentry->d_inode, iattr);
421 mark_inode_dirty(dentry->d_inode);
422 if (iattr->ia_valid & ATTR_MODE) {
423 /* We also want to update ACL when we update mode bits */
424 retval = v9fs_acl_chmod(dentry);
425 if (retval < 0)
426 return retval;
427 }
428 return 0;
429}
430
431/**
432 * v9fs_stat2inode_dotl - populate an inode structure with stat info
433 * @stat: stat structure
434 * @inode: inode to populate
435 * @sb: superblock of filesystem
436 *
437 */
438
void
v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
{
	if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
		inode->i_atime.tv_sec = stat->st_atime_sec;
		inode->i_atime.tv_nsec = stat->st_atime_nsec;
		inode->i_mtime.tv_sec = stat->st_mtime_sec;
		inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
		inode->i_ctime.tv_sec = stat->st_ctime_sec;
		inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
		inode->i_uid = stat->st_uid;
		inode->i_gid = stat->st_gid;
		inode->i_nlink = stat->st_nlink;
		inode->i_mode = stat->st_mode;
		inode->i_rdev = new_decode_dev(stat->st_rdev);

		if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
			init_special_inode(inode, inode->i_mode, inode->i_rdev);

		i_size_write(inode, stat->st_size);
		inode->i_blocks = stat->st_blocks;
	} else {
		if (stat->st_result_mask & P9_STATS_ATIME) {
			inode->i_atime.tv_sec = stat->st_atime_sec;
			inode->i_atime.tv_nsec = stat->st_atime_nsec;
		}
		if (stat->st_result_mask & P9_STATS_MTIME) {
			inode->i_mtime.tv_sec = stat->st_mtime_sec;
			inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
		}
		if (stat->st_result_mask & P9_STATS_CTIME) {
			inode->i_ctime.tv_sec = stat->st_ctime_sec;
			inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
		}
		if (stat->st_result_mask & P9_STATS_UID)
			inode->i_uid = stat->st_uid;
		if (stat->st_result_mask & P9_STATS_GID)
			inode->i_gid = stat->st_gid;
		if (stat->st_result_mask & P9_STATS_NLINK)
			inode->i_nlink = stat->st_nlink;
		if (stat->st_result_mask & P9_STATS_MODE) {
			inode->i_mode = stat->st_mode;
			if ((S_ISBLK(inode->i_mode)) ||
						(S_ISCHR(inode->i_mode)))
				init_special_inode(inode, inode->i_mode,
							inode->i_rdev);
		}
		if (stat->st_result_mask & P9_STATS_RDEV)
			inode->i_rdev = new_decode_dev(stat->st_rdev);
		if (stat->st_result_mask & P9_STATS_SIZE)
			i_size_write(inode, stat->st_size);
		if (stat->st_result_mask & P9_STATS_BLOCKS)
			inode->i_blocks = stat->st_blocks;
	}
	if (stat->st_result_mask & P9_STATS_GEN)
		inode->i_generation = stat->st_gen;

	/*
	 * Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
	 * because the inode structure does not have fields for them.
	 */
}

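/**
 * v9fs_vfs_symlink_dotl - create a symbolic link
 * @dir: inode destination for new symlink
 * @dentry: dentry for symlink
 * @symname: target of the symlink
 *
 */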
static int
v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
		const char *symname)
{
	struct v9fs_session_info *v9ses;
	struct p9_fid *dfid;
	struct p9_fid *fid = NULL;
	struct inode *inode;
	struct p9_qid qid;
	char *name;
	int err;
	gid_t gid;

	name = (char *) dentry->d_name.name;
	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
			dir->i_ino, name, symname);
	v9ses = v9fs_inode2v9ses(dir);

	dfid = v9fs_fid_lookup(dentry->d_parent);
	if (IS_ERR(dfid)) {
		err = PTR_ERR(dfid);
		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
		return err;
	}

	gid = v9fs_get_fsgid_for_create(dir);

	/* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
	err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);

	if (err < 0) {
		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
		goto error;
	}

	if (v9ses->cache) {
		/* Now walk from the parent so we can get an unopened fid. */
		fid = p9_client_walk(dfid, 1, &name, 1);
		if (IS_ERR(fid)) {
			err = PTR_ERR(fid);
			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
					err);
			fid = NULL;
			goto error;
		}

		/* instantiate inode and assign the unopened fid to dentry */
		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
					err);
			goto error;
		}
		d_instantiate(dentry, inode);
		err = v9fs_fid_add(dentry, fid);
		if (err < 0)
			goto error;
		fid = NULL;
	} else {
		/* Not in cached mode. No need to populate inode with stat */
		inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			goto error;
		}
		d_instantiate(dentry, inode);
	}

error:
	if (fid)
		p9_client_clunk(fid);

	return err;
}

/**
 * v9fs_vfs_link_dotl - create a hardlink for dotl
 * @old_dentry: dentry for file to link to
 * @dir: inode destination for new link
 * @dentry: dentry for link
 *
 */

static int
v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
		struct dentry *dentry)
{
	int err;
	struct p9_fid *dfid, *oldfid;
	char *name;
	struct v9fs_session_info *v9ses;
	struct dentry *dir_dentry;

	P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
			dir->i_ino, old_dentry->d_name.name,
			dentry->d_name.name);

	v9ses = v9fs_inode2v9ses(dir);
	dir_dentry = v9fs_dentry_from_dir_inode(dir);
	dfid = v9fs_fid_lookup(dir_dentry);
	if (IS_ERR(dfid))
		return PTR_ERR(dfid);

	oldfid = v9fs_fid_lookup(old_dentry);
	if (IS_ERR(oldfid))
		return PTR_ERR(oldfid);

	name = (char *) dentry->d_name.name;

	err = p9_client_link(dfid, oldfid, name);

	if (err < 0) {
		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
		return err;
	}

	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
		/* Get the latest stat info from server. */
		struct p9_fid *fid;
		struct p9_stat_dotl *st;

		fid = v9fs_fid_lookup(old_dentry);
		if (IS_ERR(fid))
			return PTR_ERR(fid);

		st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
		if (IS_ERR(st))
			return PTR_ERR(st);

		v9fs_stat2inode_dotl(st, old_dentry->d_inode);

		kfree(st);
	} else {
		/*
		 * Caching disabled. No need to get up-to-date stat info.
		 * This dentry will be released immediately, so just hold
		 * the inode.
		 */
		ihold(old_dentry->d_inode);
	}
	d_instantiate(dentry, old_dentry->d_inode);

	return err;
}

/**
 * v9fs_vfs_mknod_dotl - create a special file
 * @dir: inode destination for new special file
 * @dentry: dentry for file
 * @omode: mode for creation
 * @rdev: device associated with special file
 *
 */
static int
v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
		dev_t rdev)
{
	int err;
	char *name;
	mode_t mode;
	struct v9fs_session_info *v9ses;
	struct p9_fid *fid = NULL, *dfid = NULL;
	struct inode *inode;
	gid_t gid;
	struct p9_qid qid;
	struct dentry *dir_dentry;
	struct posix_acl *dacl = NULL, *pacl = NULL;

	P9_DPRINTK(P9_DEBUG_VFS,
		" %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
		dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));

	if (!new_valid_dev(rdev))
		return -EINVAL;

	v9ses = v9fs_inode2v9ses(dir);
	dir_dentry = v9fs_dentry_from_dir_inode(dir);
	dfid = v9fs_fid_lookup(dir_dentry);
	if (IS_ERR(dfid)) {
		err = PTR_ERR(dfid);
		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
		dfid = NULL;
		goto error;
	}

	gid = v9fs_get_fsgid_for_create(dir);
	mode = omode;
	/* Update mode based on ACL value */
	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
	if (err) {
		P9_DPRINTK(P9_DEBUG_VFS,
			"Failed to get acl values in mknod %d\n", err);
		goto error;
	}
	name = (char *) dentry->d_name.name;

	err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
	if (err < 0)
		goto error;

	/* instantiate inode and assign the unopened fid to the dentry */
	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
		fid = p9_client_walk(dfid, 1, &name, 1);
		if (IS_ERR(fid)) {
			err = PTR_ERR(fid);
			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
				err);
			fid = NULL;
			goto error;
		}

		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
				err);
			goto error;
		}
		d_instantiate(dentry, inode);
		err = v9fs_fid_add(dentry, fid);
		if (err < 0)
			goto error;
		fid = NULL;
	} else {
		/*
		 * Not in cached mode. No need to populate the inode with
		 * stat info; the socket syscall returns an fd, so we still
		 * need to instantiate the dentry.
		 */
		inode = v9fs_get_inode(dir->i_sb, mode);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
			goto error;
		}
		d_instantiate(dentry, inode);
	}
	/* Now set the ACL based on the default value */
	v9fs_set_create_acl(dentry, dacl, pacl);
error:
	if (fid)
		p9_client_clunk(fid);
	return err;
}

/**
 * v9fs_vfs_follow_link_dotl - follow a symlink path
 * @dentry: dentry for symlink
 * @nd: nameidata
 *
 */

static void *
v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
{
	int retval;
	struct p9_fid *fid;
	char *link = __getname();
	char *target;

	P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);

	if (!link) {
		link = ERR_PTR(-ENOMEM);
		goto ndset;
	}
	fid = v9fs_fid_lookup(dentry);
	if (IS_ERR(fid)) {
		__putname(link);
		link = ERR_PTR(PTR_ERR(fid));
		goto ndset;
	}
	retval = p9_client_readlink(fid, &target);
	if (!retval) {
		strcpy(link, target);
		kfree(target);
		goto ndset;
	}
	__putname(link);
	link = ERR_PTR(retval);
ndset:
	nd_set_link(nd, link);
	return NULL;
}

const struct inode_operations v9fs_dir_inode_operations_dotl = {
	.create = v9fs_vfs_create_dotl,
	.lookup = v9fs_vfs_lookup,
	.link = v9fs_vfs_link_dotl,
	.symlink = v9fs_vfs_symlink_dotl,
	.unlink = v9fs_vfs_unlink,
	.mkdir = v9fs_vfs_mkdir_dotl,
	.rmdir = v9fs_vfs_rmdir,
	.mknod = v9fs_vfs_mknod_dotl,
	.rename = v9fs_vfs_rename,
	.getattr = v9fs_vfs_getattr_dotl,
	.setattr = v9fs_vfs_setattr_dotl,
	.setxattr = generic_setxattr,
	.getxattr = generic_getxattr,
	.removexattr = generic_removexattr,
	.listxattr = v9fs_listxattr,
	.check_acl = v9fs_check_acl,
};

const struct inode_operations v9fs_file_inode_operations_dotl = {
	.getattr = v9fs_vfs_getattr_dotl,
	.setattr = v9fs_vfs_setattr_dotl,
	.setxattr = generic_setxattr,
	.getxattr = generic_getxattr,
	.removexattr = generic_removexattr,
	.listxattr = v9fs_listxattr,
	.check_acl = v9fs_check_acl,
};

const struct inode_operations v9fs_symlink_inode_operations_dotl = {
	.readlink = generic_readlink,
	.follow_link = v9fs_vfs_follow_link_dotl,
	.put_link = v9fs_vfs_put_link,
	.getattr = v9fs_vfs_getattr_dotl,
	.setattr = v9fs_vfs_setattr_dotl,
	.setxattr = generic_setxattr,
	.getxattr = generic_getxattr,
	.removexattr = generic_removexattr,
	.listxattr = v9fs_listxattr,
};
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 43ec7df84336..d288773871b3 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -133,7 +133,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
 				"p9_client_xattrcreate failed %d\n", retval);
 		goto error;
 	}
-	msize = fid->clnt->msize;;
+	msize = fid->clnt->msize;
 	while (value_len) {
 		if (value_len > (msize - P9_IOHDRSZ))
 			write_count = msize - P9_IOHDRSZ;
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 224d7bbd1fcc..e654dfd092c3 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -64,7 +64,9 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
 			   void *buffer, uint16_t maxbuf)
 {
 	const struct TCP_Server_Info *server = cookie_netfs_data;
-	const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr;
+	const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr;
+	const struct sockaddr_in *addr = (struct sockaddr_in *) sa;
+	const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa;
 	struct cifs_server_key *key = buffer;
 	uint16_t key_len = sizeof(struct cifs_server_key);
 
@@ -76,16 +78,16 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
 	 */
 	switch (sa->sa_family) {
 	case AF_INET:
-		key->family = server->addr.sockAddr.sin_family;
-		key->port = server->addr.sockAddr.sin_port;
-		key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr;
+		key->family = sa->sa_family;
+		key->port = addr->sin_port;
+		key->addr[0].ipv4_addr = addr->sin_addr;
 		key_len += sizeof(key->addr[0].ipv4_addr);
 		break;
 
 	case AF_INET6:
-		key->family = server->addr.sockAddr6.sin6_family;
-		key->port = server->addr.sockAddr6.sin6_port;
-		key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr;
+		key->family = sa->sa_family;
+		key->port = addr6->sin6_port;
+		key->addr[0].ipv6_addr = addr6->sin6_addr;
 		key_len += sizeof(key->addr[0].ipv6_addr);
 		break;
 
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 103ab8b605b0..ede98300a8cd 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -119,29 +119,27 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 		    "Display Internal CIFS Data Structures for Debugging\n"
 		    "---------------------------------------------------\n");
 	seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
-	seq_printf(m, "Features: ");
+	seq_printf(m, "Features:");
 #ifdef CONFIG_CIFS_DFS_UPCALL
-	seq_printf(m, "dfs");
-	seq_putc(m, ' ');
+	seq_printf(m, " dfs");
 #endif
 #ifdef CONFIG_CIFS_FSCACHE
-	seq_printf(m, "fscache");
-	seq_putc(m, ' ');
+	seq_printf(m, " fscache");
 #endif
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-	seq_printf(m, "lanman");
-	seq_putc(m, ' ');
+	seq_printf(m, " lanman");
 #endif
 #ifdef CONFIG_CIFS_POSIX
-	seq_printf(m, "posix");
-	seq_putc(m, ' ');
+	seq_printf(m, " posix");
 #endif
 #ifdef CONFIG_CIFS_UPCALL
-	seq_printf(m, "spnego");
-	seq_putc(m, ' ');
+	seq_printf(m, " spnego");
 #endif
 #ifdef CONFIG_CIFS_XATTR
-	seq_printf(m, "xattr");
+	seq_printf(m, " xattr");
+#endif
+#ifdef CONFIG_CIFS_ACL
+	seq_printf(m, " acl");
 #endif
 	seq_putc(m, '\n');
 	seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 87044906cd1f..4dfba8283165 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -98,6 +98,8 @@ struct key *
 cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 {
 	struct TCP_Server_Info *server = sesInfo->server;
+	struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
+	struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
 	char *description, *dp;
 	size_t desc_len;
 	struct key *spnego_key;
@@ -127,10 +129,10 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 	dp = description + strlen(description);
 
 	/* add the server address */
-	if (server->addr.sockAddr.sin_family == AF_INET)
-		sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr);
-	else if (server->addr.sockAddr.sin_family == AF_INET6)
-		sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr);
+	if (server->dstaddr.ss_family == AF_INET)
+		sprintf(dp, "ip4=%pI4", &sa->sin_addr);
+	else if (server->dstaddr.ss_family == AF_INET6)
+		sprintf(dp, "ip6=%pI6", &sa6->sin6_addr);
 	else
 		goto out;
 
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index f856732161ab..66f3d50d0676 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -72,6 +72,7 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
 	return 0;
 }
 
+/* must be called with server->srv_mutex held */
 int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 		  __u32 *pexpected_response_sequence_number)
 {
@@ -84,14 +85,12 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 	if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
 		return rc;
 
-	spin_lock(&GlobalMid_Lock);
 	cifs_pdu->Signature.Sequence.SequenceNumber =
 			cpu_to_le32(server->sequence_number);
 	cifs_pdu->Signature.Sequence.Reserved = 0;
 
 	*pexpected_response_sequence_number = server->sequence_number++;
 	server->sequence_number++;
-	spin_unlock(&GlobalMid_Lock);
 
 	rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
 	if (rc)
@@ -149,6 +148,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 	return rc;
 }
 
+/* must be called with server->srv_mutex held */
 int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 		   __u32 *pexpected_response_sequence_number)
 {
@@ -162,14 +162,12 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 	if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
 		return rc;
 
-	spin_lock(&GlobalMid_Lock);
 	cifs_pdu->Signature.Sequence.SequenceNumber =
 			cpu_to_le32(server->sequence_number);
 	cifs_pdu->Signature.Sequence.Reserved = 0;
 
 	*pexpected_response_sequence_number = server->sequence_number++;
 	server->sequence_number++;
-	spin_unlock(&GlobalMid_Lock);
 
 	rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
 	if (rc)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8e21e0fe65d5..5e7075d5f139 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -329,6 +329,8 @@ cifs_alloc_inode(struct super_block *sb)
 	cifs_inode->invalid_mapping = false;
 	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
 	cifs_inode->server_eof = 0;
+	cifs_inode->uniqueid = 0;
+	cifs_inode->createtime = 0;
 
 	/* Can not set i_flags here - they get immediately overwritten
 	   to zero by the VFS */
@@ -361,18 +363,19 @@ cifs_evict_inode(struct inode *inode)
 static void
 cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
 {
+	struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
+	struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
+
 	seq_printf(s, ",addr=");
 
-	switch (server->addr.sockAddr.sin_family) {
+	switch (server->dstaddr.ss_family) {
 	case AF_INET:
-		seq_printf(s, "%pI4", &server->addr.sockAddr.sin_addr.s_addr);
+		seq_printf(s, "%pI4", &sa->sin_addr.s_addr);
 		break;
 	case AF_INET6:
-		seq_printf(s, "%pI6",
-			   &server->addr.sockAddr6.sin6_addr.s6_addr);
-		if (server->addr.sockAddr6.sin6_scope_id)
-			seq_printf(s, "%%%u",
-				   server->addr.sockAddr6.sin6_scope_id);
+		seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr);
+		if (sa6->sin6_scope_id)
+			seq_printf(s, "%%%u", sa6->sin6_scope_id);
 		break;
 	default:
 		seq_printf(s, "(unknown)");
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 7136c0c3e2f9..606ca8bb7102 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -163,10 +163,7 @@ struct TCP_Server_Info {
 	char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
 	char *hostname; /* hostname portion of UNC string */
 	struct socket *ssocket;
-	union {
-		struct sockaddr_in sockAddr;
-		struct sockaddr_in6 sockAddr6;
-	} addr;
+	struct sockaddr_storage dstaddr;
 	struct sockaddr_storage srcaddr; /* locally bind to this IP */
 	wait_queue_head_t response_q;
 	wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
@@ -210,7 +207,7 @@ struct TCP_Server_Info {
 	char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
 	/* 16th byte of RFC1001 workstation name is always null */
 	char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
-	__u32 sequence_number; /* needed for CIFS PDU signature */
+	__u32 sequence_number; /* for signing, protected by srv_mutex */
 	struct session_key session_key;
 	unsigned long lstrp; /* when we got last response from this server */
 	u16 dialect; /* dialect index that server chose */
@@ -456,6 +453,7 @@ struct cifsInodeInfo {
 	bool invalid_mapping:1;		/* pagecache is invalid */
 	u64 server_eof;			/* current file size on server */
 	u64 uniqueid;			/* server inode number */
+	u64 createtime;			/* creation time on server */
#ifdef CONFIG_CIFS_FSCACHE
 	struct fscache_cookie *fscache;
#endif
@@ -576,6 +574,7 @@ struct cifs_fattr {
 	u64		cf_uniqueid;
 	u64		cf_eof;
 	u64		cf_bytes;
+	u64		cf_createtime;
 	uid_t		cf_uid;
 	gid_t		cf_gid;
 	umode_t		cf_mode;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 67acfb3acad2..2f6795e524d3 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -401,15 +401,12 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
 		cFYI(1, "Kerberos only mechanism, enable extended security");
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
-	}
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-	else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
+	} else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
 		cFYI(1, "NTLMSSP only mechanism, enable extended security");
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	}
-#endif
 
 	count = 0;
 	for (i = 0; i < CIFS_NUM_PROT; i++) {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index cc1a8604a790..a65d311d163a 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -64,8 +64,8 @@ struct smb_vol {
 	char *UNC;
 	char *UNCip;
 	char *iocharset;  /* local code page for mapping to and from Unicode */
-	char source_rfc1001_name[16]; /* netbios name of client */
-	char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
+	char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
+	char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
 	uid_t cred_uid;
 	uid_t linux_uid;
 	gid_t linux_gid;
@@ -115,8 +115,8 @@ struct smb_vol {
#define TLINK_ERROR_EXPIRE	(1 * HZ)
#define TLINK_IDLE_EXPIRE	(600 * HZ)
 
-static int ipv4_connect(struct TCP_Server_Info *server);
-static int ipv6_connect(struct TCP_Server_Info *server);
+static int ip_connect(struct TCP_Server_Info *server);
+static int generic_ip_connect(struct TCP_Server_Info *server);
 static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
 static void cifs_prune_tlinks(struct work_struct *work);
 
@@ -200,10 +200,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	while ((server->tcpStatus != CifsExiting) &&
 	       (server->tcpStatus != CifsGood)) {
 		try_to_freeze();
-		if (server->addr.sockAddr6.sin6_family == AF_INET6)
-			rc = ipv6_connect(server);
-		else
-			rc = ipv4_connect(server);
+
+		/* we should try only the port we connected to before */
+		rc = generic_ip_connect(server);
 		if (rc) {
 			cFYI(1, "reconnect error %d", rc);
 			msleep(3000);
@@ -477,7 +476,7 @@ incomplete_rcv:
 			 * initialize frame)
 			 */
 			cifs_set_port((struct sockaddr *)
-					&server->addr.sockAddr, CIFS_PORT);
+					&server->dstaddr, CIFS_PORT);
 			cifs_reconnect(server);
 			csocket = server->ssocket;
 			wake_up(&server->response_q);
@@ -817,11 +816,11 @@ cifs_parse_mount_options(char *options, const char *devname,
 	 * informational, only used for servers that do not support
 	 * port 445 and it can be overridden at mount time
 	 */
-	memset(vol->source_rfc1001_name, 0x20, 15);
-	for (i = 0; i < strnlen(nodename, 15); i++)
+	memset(vol->source_rfc1001_name, 0x20, RFC1001_NAME_LEN);
+	for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++)
 		vol->source_rfc1001_name[i] = toupper(nodename[i]);
 
-	vol->source_rfc1001_name[15] = 0;
+	vol->source_rfc1001_name[RFC1001_NAME_LEN] = 0;
 	/* null target name indicates to use *SMBSERVR default called name
 	   if we end up sending RFC1001 session initialize */
 	vol->target_rfc1001_name[0] = 0;
@@ -985,13 +984,11 @@ cifs_parse_mount_options(char *options, const char *devname,
 				return 1;
 			} else if (strnicmp(value, "krb5", 4) == 0) {
 				vol->secFlg |= CIFSSEC_MAY_KRB5;
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 			} else if (strnicmp(value, "ntlmsspi", 8) == 0) {
 				vol->secFlg |= CIFSSEC_MAY_NTLMSSP |
 					CIFSSEC_MUST_SIGN;
 			} else if (strnicmp(value, "ntlmssp", 7) == 0) {
 				vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
-#endif
 			} else if (strnicmp(value, "ntlmv2i", 7) == 0) {
 				vol->secFlg |= CIFSSEC_MAY_NTLMV2 |
 					CIFSSEC_MUST_SIGN;
@@ -1168,22 +1165,22 @@ cifs_parse_mount_options(char *options, const char *devname,
 			if (!value || !*value || (*value == ' ')) {
 				cFYI(1, "invalid (empty) netbiosname");
 			} else {
-				memset(vol->source_rfc1001_name, 0x20, 15);
-				for (i = 0; i < 15; i++) {
-				/* BB are there cases in which a comma can be
-				   valid in this workstation netbios name (and need
-				   special handling)? */
-
-				/* We do not uppercase netbiosname for user */
+				memset(vol->source_rfc1001_name, 0x20,
+					RFC1001_NAME_LEN);
+				/*
+				 * FIXME: are there cases in which a comma can
+				 * be valid in workstation netbios name (and
+				 * need special handling)?
+				 */
+				for (i = 0; i < RFC1001_NAME_LEN; i++) {
+					/* don't ucase netbiosname for user */
 					if (value[i] == 0)
 						break;
-					else
-						vol->source_rfc1001_name[i] =
-							value[i];
+					vol->source_rfc1001_name[i] = value[i];
 				}
 				/* The string has 16th byte zero still from
 				   set at top of the function  */
-				if ((i == 15) && (value[i] != 0))
+				if (i == RFC1001_NAME_LEN && value[i] != 0)
 					printk(KERN_WARNING "CIFS: netbiosname"
 						" longer than 15 truncated.\n");
 			}
@@ -1193,7 +1190,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 				cFYI(1, "empty server netbiosname specified");
 			} else {
 				/* last byte, type, is 0x20 for servr type */
-				memset(vol->target_rfc1001_name, 0x20, 16);
+				memset(vol->target_rfc1001_name, 0x20,
+					RFC1001_NAME_LEN_WITH_NULL);
 
 				for (i = 0; i < 15; i++) {
 				/* BB are there cases in which a comma can be
@@ -1210,7 +1208,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 				}
 				/* The string has 16th byte zero still from
 				   set at top of the function  */
-				if ((i == 15) && (value[i] != 0))
+				if (i == RFC1001_NAME_LEN && value[i] != 0)
 					printk(KERN_WARNING "CIFS: server net"
 						"biosname longer than 15 truncated.\n");
 			}
@@ -1341,10 +1339,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 			vol->no_psx_acl = 0;
 		} else if (strnicmp(data, "noacl", 5) == 0) {
 			vol->no_psx_acl = 1;
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 		} else if (strnicmp(data, "locallease", 6) == 0) {
 			vol->local_lease = 1;
-#endif
 		} else if (strnicmp(data, "sign", 4) == 0) {
 			vol->secFlg |= CIFSSEC_MUST_SIGN;
 		} else if (strnicmp(data, "seal", 4) == 0) {
@@ -1454,35 +1450,71 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
 	}
 }
 
+/*
+ * If no port is specified in addr structure, we try to match with 445 port
+ * and if it fails - with 139 ports. It should be called only if address
+ * families of server and addr are equal.
+ */
+static bool
+match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
+{
+	unsigned short int port, *sport;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		sport = &((struct sockaddr_in *) &server->dstaddr)->sin_port;
+		port = ((struct sockaddr_in *) addr)->sin_port;
+		break;
+	case AF_INET6:
+		sport = &((struct sockaddr_in6 *) &server->dstaddr)->sin6_port;
+		port = ((struct sockaddr_in6 *) addr)->sin6_port;
+		break;
+	default:
+		WARN_ON(1);
+		return false;
+	}
+
+	if (!port) {
+		port = htons(CIFS_PORT);
+		if (port == *sport)
+			return true;
+
+		port = htons(RFC1001_PORT);
+	}
+
+	return port == *sport;
+}
 
 static bool
 match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
 	      struct sockaddr *srcaddr)
 {
-	struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
-	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
-
 	switch (addr->sa_family) {
-	case AF_INET:
-		if (addr4->sin_addr.s_addr !=
-		    server->addr.sockAddr.sin_addr.s_addr)
-			return false;
-		if (addr4->sin_port &&
-		    addr4->sin_port != server->addr.sockAddr.sin_port)
-			return false;
-		break;
-	case AF_INET6:
+	case AF_INET: {
+		struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+		struct sockaddr_in *srv_addr4 =
+					(struct sockaddr_in *)&server->dstaddr;
+
+		if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr)
+			return false;
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+		struct sockaddr_in6 *srv_addr6 =
+					(struct sockaddr_in6 *)&server->dstaddr;
+
 		if (!ipv6_addr_equal(&addr6->sin6_addr,
-				     &server->addr.sockAddr6.sin6_addr))
+				     &srv_addr6->sin6_addr))
 			return false;
-		if (addr6->sin6_scope_id !=
-		    server->addr.sockAddr6.sin6_scope_id)
-			return false;
-		if (addr6->sin6_port &&
-		    addr6->sin6_port != server->addr.sockAddr6.sin6_port)
+		if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id)
 			return false;
 		break;
 	}
+	default:
+		WARN_ON(1);
+		return false; /* don't expect to be here */
+	}
 
 	if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr))
 		return false;
@@ -1549,6 +1581,9 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
 				  (struct sockaddr *)&vol->srcaddr))
 			continue;
 
+		if (!match_port(server, addr))
+			continue;
+
 		if (!match_security(server, vol))
 			continue;
 
@@ -1681,14 +1716,13 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		cFYI(1, "attempting ipv6 connect");
 		/* BB should we allow ipv6 on port 139? */
 		/* other OS never observed in Wild doing 139 with v6 */
-		memcpy(&tcp_ses->addr.sockAddr6, sin_server6,
-			sizeof(struct sockaddr_in6));
-		rc = ipv6_connect(tcp_ses);
-	} else {
-		memcpy(&tcp_ses->addr.sockAddr, sin_server,
-			sizeof(struct sockaddr_in));
-		rc = ipv4_connect(tcp_ses);
-	}
+		memcpy(&tcp_ses->dstaddr, sin_server6,
+		       sizeof(struct sockaddr_in6));
+	} else
+		memcpy(&tcp_ses->dstaddr, sin_server,
+		       sizeof(struct sockaddr_in));
+
+	rc = ip_connect(tcp_ses);
 	if (rc < 0) {
 		cERROR(1, "Error connecting to socket. Aborting operation");
 		goto out_err_crypto_release;
@@ -1793,6 +1827,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 {
 	int rc = -ENOMEM, xid;
 	struct cifsSesInfo *ses;
+	struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
+	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
 
 	xid = GetXid();
 
@@ -1836,12 +1872,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 
 	/* new SMB session uses our server ref */
 	ses->server = server;
-	if (server->addr.sockAddr6.sin6_family == AF_INET6)
-		sprintf(ses->serverName, "%pI6",
-			&server->addr.sockAddr6.sin6_addr);
+	if (server->dstaddr.ss_family == AF_INET6)
+		sprintf(ses->serverName, "%pI6", &addr6->sin6_addr);
 	else
-		sprintf(ses->serverName, "%pI4",
-			&server->addr.sockAddr.sin_addr.s_addr);
+		sprintf(ses->serverName, "%pI4", &addr->sin_addr);
 
 	if (volume_info->username)
 		strncpy(ses->userName, volume_info->username,
@@ -2136,19 +2170,106 @@ bind_socket(struct TCP_Server_Info *server)
 }
 
 static int
-ipv4_connect(struct TCP_Server_Info *server)
+ip_rfc1001_connect(struct TCP_Server_Info *server)
+{
+	int rc = 0;
+	/*
+	 * some servers require RFC1001 sessinit before sending
+	 * negprot - BB check reconnection in case where second
+	 * sessinit is sent but no second negprot
+	 */
+	struct rfc1002_session_packet *ses_init_buf;
+	struct smb_hdr *smb_buf;
+	ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
+			       GFP_KERNEL);
+	if (ses_init_buf) {
+		ses_init_buf->trailer.session_req.called_len = 32;
+
+		if (server->server_RFC1001_name &&
+		    server->server_RFC1001_name[0] != 0)
+			rfc1002mangle(ses_init_buf->trailer.
+				      session_req.called_name,
+				      server->server_RFC1001_name,
+				      RFC1001_NAME_LEN_WITH_NULL);
+		else
+			rfc1002mangle(ses_init_buf->trailer.
+				      session_req.called_name,
+				      DEFAULT_CIFS_CALLED_NAME,
+				      RFC1001_NAME_LEN_WITH_NULL);
+
+		ses_init_buf->trailer.session_req.calling_len = 32;
+
+		/*
+		 * calling name ends in null (byte 16) from old smb
+		 * convention.
+		 */
+		if (server->workstation_RFC1001_name &&
+		    server->workstation_RFC1001_name[0] != 0)
+			rfc1002mangle(ses_init_buf->trailer.
+				      session_req.calling_name,
+				      server->workstation_RFC1001_name,
+				      RFC1001_NAME_LEN_WITH_NULL);
+		else
+			rfc1002mangle(ses_init_buf->trailer.
+				      session_req.calling_name,
+				      "LINUX_CIFS_CLNT",
+				      RFC1001_NAME_LEN_WITH_NULL);
+
+		ses_init_buf->trailer.session_req.scope1 = 0;
+		ses_init_buf->trailer.session_req.scope2 = 0;
+		smb_buf = (struct smb_hdr *)ses_init_buf;
+
+		/* sizeof RFC1002_SESSION_REQUEST with no scope */
+		smb_buf->smb_buf_length = 0x81000044;
+		rc = smb_send(server, smb_buf, 0x44);
+		kfree(ses_init_buf);
+		/*
+		 * RFC1001 layer in at least one server
+		 * requires very short break before negprot
+		 * presumably because not expecting negprot
+		 * to follow so fast.  This is a simple
+		 * solution that works without
+		 * complicating the code and causes no
+		 * significant slowing down on mount
+		 * for everyone else
+		 */
+		usleep_range(1000, 2000);
+	}
+	/*
+	 * else the negprot may still work without this
+	 * even though malloc failed
+	 */
+
+	return rc;
+}
+
+static int
+generic_ip_connect(struct TCP_Server_Info *server)
 {
 	int rc = 0;
-	int val;
-	bool connected = false;
-	__be16 orig_port = 0;
+	unsigned short int sport;
+	int slen, sfamily;
 	struct socket *socket = server->ssocket;
+	struct sockaddr *saddr;
+
+	saddr = (struct sockaddr *) &server->dstaddr;
+
+	if (server->dstaddr.ss_family == AF_INET6) {
+		sport = ((struct sockaddr_in6 *) saddr)->sin6_port;
+		slen = sizeof(struct sockaddr_in6);
+		sfamily = AF_INET6;
+	} else {
+		sport = ((struct sockaddr_in *) saddr)->sin_port;
+		slen = sizeof(struct sockaddr_in);
+		sfamily = AF_INET;
+	}
 
 	if (socket == NULL) {
-		rc = sock_create_kern(PF_INET, SOCK_STREAM,
+		rc = sock_create_kern(sfamily, SOCK_STREAM,
 				      IPPROTO_TCP, &socket);
 		if (rc < 0) {
 			cERROR(1, "Error %d creating socket", rc);
+			server->ssocket = NULL;
 			return rc;
 		}
 
@@ -2156,63 +2277,28 @@ ipv4_connect(struct TCP_Server_Info *server)
 		cFYI(1, "Socket created");
 		server->ssocket = socket;
 		socket->sk->sk_allocation = GFP_NOFS;
-		cifs_reclassify_socket4(socket);
+		if (sfamily == AF_INET6)
+			cifs_reclassify_socket6(socket);
+		else
+			cifs_reclassify_socket4(socket);
 	}
 
 	rc = bind_socket(server);
 	if (rc < 0)
 		return rc;
 
-	/* user overrode default port */
-	if (server->addr.sockAddr.sin_port) {
-		rc = socket->ops->connect(socket, (struct sockaddr *)
-					  &server->addr.sockAddr,
-					  sizeof(struct sockaddr_in), 0);
-		if (rc >= 0)
-			connected = true;
-	}
-
-	if (!connected) {
-		/* save original port so we can retry user specified port
-			later if fall back ports fail this time  */
-		orig_port = server->addr.sockAddr.sin_port;
-
-		/* do not retry on the same port we just failed on */
-		if (server->addr.sockAddr.sin_port != htons(CIFS_PORT)) {
-			server->addr.sockAddr.sin_port = htons(CIFS_PORT);
-			rc = socket->ops->connect(socket,
-					(struct sockaddr *)
-					&server->addr.sockAddr,
-					sizeof(struct sockaddr_in), 0);
-			if (rc >= 0)
-				connected = true;
-		}
-	}
-	if (!connected) {
-		server->addr.sockAddr.sin_port = htons(RFC1001_PORT);
-		rc = socket->ops->connect(socket, (struct sockaddr *)
-					  &server->addr.sockAddr,
-					  sizeof(struct sockaddr_in), 0);
-		if (rc >= 0)
-			connected = true;
-	}
-
-	/* give up here - unless we want to retry on different
-	   protocol families some day */
-	if (!connected) {
-		if (orig_port)
-			server->addr.sockAddr.sin_port = orig_port;
-		cFYI(1, "Error %d connecting to server via ipv4", rc);
+	rc = socket->ops->connect(socket, saddr, slen, 0);
+	if (rc < 0) {
+		cFYI(1, "Error %d connecting to server", rc);
 		sock_release(socket);
 		server->ssocket = NULL;
 		return rc;
 	}
 
-
 	/*
 	 * Eventually check for other socket options to change from
 	 * the default. sock_setsockopt not used because it expects
 	 * user space buffer
 	 */
 	socket->sk->sk_rcvtimeo = 7 * HZ;
 	socket->sk->sk_sndtimeo = 5 * HZ;
@@ -2226,7 +2312,7 @@ ipv4_connect(struct TCP_Server_Info *server)
 	}
 
 	if (server->tcp_nodelay) {
-		val = 1;
+		int val = 1;
 		rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
 				(char *)&val, sizeof(val));
 		if (rc)
@@ -2237,161 +2323,39 @@ ipv4_connect(struct TCP_Server_Info *server)
 		 socket->sk->sk_sndbuf,
 		 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
 
-	/* send RFC1001 sessinit */
-	if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) {
-		/* some servers require RFC1001 sessinit before sending
-		negprot - BB check reconnection in case where second
-		sessinit is sent but no second negprot */
-		struct rfc1002_session_packet *ses_init_buf;
-		struct smb_hdr *smb_buf;
-		ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
-				       GFP_KERNEL);
-		if (ses_init_buf) {
-			ses_init_buf->trailer.session_req.called_len = 32;
-			if (server->server_RFC1001_name &&
-			    server->server_RFC1001_name[0] != 0)
-				rfc1002mangle(ses_init_buf->trailer.
-					      session_req.called_name,
-					      server->server_RFC1001_name,
-					      RFC1001_NAME_LEN_WITH_NULL);
-			else
-				rfc1002mangle(ses_init_buf->trailer.
-					      session_req.called_name,
-					      DEFAULT_CIFS_CALLED_NAME,
-					      RFC1001_NAME_LEN_WITH_NULL);
-
-			ses_init_buf->trailer.session_req.calling_len = 32;
-
-			/* calling name ends in null (byte 16) from old smb
-			convention. */
-			if (server->workstation_RFC1001_name &&
-			    server->workstation_RFC1001_name[0] != 0)
-				rfc1002mangle(ses_init_buf->trailer.
-					      session_req.calling_name,
-					      server->workstation_RFC1001_name,
-					      RFC1001_NAME_LEN_WITH_NULL);
-			else
-				rfc1002mangle(ses_init_buf->trailer.
-					      session_req.calling_name,
-					      "LINUX_CIFS_CLNT",
-					      RFC1001_NAME_LEN_WITH_NULL);
-
-			ses_init_buf->trailer.session_req.scope1 = 0;
-			ses_init_buf->trailer.session_req.scope2 = 0;
-			smb_buf = (struct smb_hdr *)ses_init_buf;
-			/* sizeof RFC1002_SESSION_REQUEST with no scope */
-			smb_buf->smb_buf_length = 0x81000044;
-			rc = smb_send(server, smb_buf, 0x44);
-			kfree(ses_init_buf);
-			msleep(1); /* RFC1001 layer in at least one server
-				      requires very short break before negprot
-				      presumably because not expecting negprot
-				      to follow so fast.  This is a simple
-				      solution that works without
-				      complicating the code and causes no
-				      significant slowing down on mount
-				      for everyone else */
-		}
-		/* else the negprot may still work without this
-		even though malloc failed */
-
-	}
+	if (sport == htons(RFC1001_PORT))
+		rc = ip_rfc1001_connect(server);
 
 	return rc;
 }
 
 static int
-ipv6_connect(struct TCP_Server_Info *server)
+ip_connect(struct TCP_Server_Info *server)
 {
-	int rc = 0;
-	int val;
-	bool connected = false;
-	__be16 orig_port = 0;
-	struct socket *socket = server->ssocket;
+	unsigned short int *sport;
+	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
+	struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
 
-	if (socket == NULL) {
-		rc = sock_create_kern(PF_INET6, SOCK_STREAM,
-				      IPPROTO_TCP, &socket);
-		if (rc < 0) {
-			cERROR(1, "Error %d creating ipv6 socket", rc);
-			socket = NULL;
-			return rc;
-		}
+	if (server->dstaddr.ss_family == AF_INET6)
+		sport = &addr6->sin6_port;
+	else
+		sport = &addr->sin_port;
 
-		/* BB other socket options to set KEEPALIVE, NODELAY? */
-		cFYI(1, "ipv6 Socket created");
-		server->ssocket = socket;
-		socket->sk->sk_allocation = GFP_NOFS;
-		cifs_reclassify_socket6(socket);
-	}
+	if (*sport == 0) {
+		int rc;
 
-	rc = bind_socket(server);
-	if (rc < 0)
-		return rc;
+		/* try with 445 port at first */
+		*sport = htons(CIFS_PORT);
 
-	/* user overrode default port */
-	if (server->addr.sockAddr6.sin6_port) {
-		rc = socket->ops->connect(socket,
-				(struct sockaddr *) &server->addr.sockAddr6,
-				sizeof(struct sockaddr_in6), 0);
+		rc = generic_ip_connect(server);
 		if (rc >= 0)
-			connected = true;
-	}
-
-	if (!connected) {
-		/* save original port so we can retry user specified port
-			later if fall back ports fail this time  */
-
-		orig_port = server->addr.sockAddr6.sin6_port;
-		/* do not retry on the same port we just failed on */
-		if (server->addr.sockAddr6.sin6_port != htons(CIFS_PORT)) {
-			server->addr.sockAddr6.sin6_port = htons(CIFS_PORT);
-			rc = socket->ops->connect(socket, (struct sockaddr *)
-					&server->addr.sockAddr6,
-					sizeof(struct sockaddr_in6), 0);
-			if (rc >= 0)
-				connected = true;
-		}
-	}
-	if (!connected) {
-		server->addr.sockAddr6.sin6_port = htons(RFC1001_PORT);
-		rc = socket->ops->connect(socket, (struct sockaddr *)
-				&server->addr.sockAddr6,
-				sizeof(struct sockaddr_in6), 0);
-		if (rc >= 0)
-			connected = true;
-	}
-
-	/* give up here - unless we want to retry on different
-	   protocol families some day */
-	if (!connected) {
-		if (orig_port)
-			server->addr.sockAddr6.sin6_port = orig_port;
-		cFYI(1, "Error %d connecting to server via ipv6", rc);
-		sock_release(socket);
-		server->ssocket = NULL;
-		return rc;
-	}
-
-	/*
-	 * Eventually check for other socket options to change from
-	 * the default. sock_setsockopt not used because it expects
-	 * user space buffer
-	 */
-	socket->sk->sk_rcvtimeo = 7 * HZ;
-	socket->sk->sk_sndtimeo = 5 * HZ;
+			return rc;
 
-	if (server->tcp_nodelay) {
-		val = 1;
-		rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
-				(char *)&val, sizeof(val));
-		if (rc)
-			cFYI(1, "set TCP_NODELAY socket option error %d", rc);
+		/* if it failed, try with 139 port */
+		*sport = htons(RFC1001_PORT);
 	}
 
-	server->ssocket = socket;
-
-	return rc;
+	return generic_ip_connect(server);
 }
 
 void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index db2a58c00f7b..2e773825835e 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -293,10 +293,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 			args.uid = NO_CHANGE_64;
 			args.gid = NO_CHANGE_64;
 		}
-		CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
-				       cifs_sb->local_nls,
-				       cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		CIFSSMBUnixSetFileInfo(xid, tcon, &args, fileHandle,
+				       current->tgid);
 	} else {
 		/* BB implement mode setting via Windows security
 		   descriptors e.g. */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5a28660ca2b5..d843631c028d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -104,53 +104,6 @@ static inline int cifs_get_disposition(unsigned int flags)
 		return FILE_OPEN;
 }
 
-static inline int cifs_open_inode_helper(struct inode *inode,
-	struct cifsTconInfo *pTcon, __u32 oplock, FILE_ALL_INFO *buf,
-	char *full_path, int xid)
-{
-	struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
-	struct timespec temp;
-	int rc;
-
-	if (pCifsInode->clientCanCacheRead) {
-		/* we have the inode open somewhere else
-		   no need to discard cache data */
-		goto client_can_cache;
-	}
-
-	/* BB need same check in cifs_create too? */
-	/* if not oplocked, invalidate inode pages if mtime or file
-	   size changed */
-	temp = cifs_NTtimeToUnix(buf->LastWriteTime);
-	if (timespec_equal(&inode->i_mtime, &temp) &&
-			   (inode->i_size ==
-			    (loff_t)le64_to_cpu(buf->EndOfFile))) {
-		cFYI(1, "inode unchanged on server");
-	} else {
-		if (inode->i_mapping) {
-			/* BB no need to lock inode until after invalidate
-			   since namei code should already have it locked? */
-			rc = filemap_write_and_wait(inode->i_mapping);
-			mapping_set_error(inode->i_mapping, rc);
-		}
-		cFYI(1, "invalidating remote inode since open detected it "
-			 "changed");
-		invalidate_remote_inode(inode);
-	}
-
-client_can_cache:
-	if (pTcon->unix_ext)
-		rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
-					      xid);
-	else
-		rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
-					 xid, NULL);
-
-	cifs_set_oplock_level(pCifsInode, oplock);
-
-	return rc;
-}
-
 int cifs_posix_open(char *full_path, struct inode **pinode,
 		    struct super_block *sb, int mode, unsigned int f_flags,
 		    __u32 *poplock, __u16 *pnetfid, int xid)
@@ -213,6 +166,76 @@ posix_open_ret:
 	return rc;
 }
 
+static int
+cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
+	     struct cifsTconInfo *tcon, unsigned int f_flags, __u32 *poplock,
+	     __u16 *pnetfid, int xid)
+{
+	int rc;
+	int desiredAccess;
+	int disposition;
+	FILE_ALL_INFO *buf;
+
+	desiredAccess = cifs_convert_flags(f_flags);
+
+/*********************************************************************
+ *  open flag mapping table:
+ *
+ *	POSIX Flag            CIFS Disposition
+ *	----------            ----------------
+ *	O_CREAT               FILE_OPEN_IF
+ *	O_CREAT | O_EXCL      FILE_CREATE
+ *	O_CREAT | O_TRUNC     FILE_OVERWRITE_IF
+ *	O_TRUNC               FILE_OVERWRITE
+ *	none of the above     FILE_OPEN
+ *
+ *	Note that there is not a direct match between disposition
+ *	FILE_SUPERSEDE (ie create whether or not file exists although
+ *	O_CREAT | O_TRUNC is similar but truncates the existing
+ *	file rather than creating a new file as FILE_SUPERSEDE does
+ *	(which uses the attributes / metadata passed in on open call)
+ *?
+ *?  O_SYNC is a reasonable match to CIFS writethrough flag
+ *?  and the read write flags match reasonably.  O_LARGEFILE
+ *?  is irrelevant because largefile support is always used
+ *?  by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
+ *	 O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
+ *********************************************************************/
+
+	disposition = cifs_get_disposition(f_flags);
+
+	/* BB pass O_SYNC flag through on file attributes .. BB */
+
+	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (tcon->ses->capabilities & CAP_NT_SMBS)
+		rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
+			 desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
+			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
+				 & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	else
+		rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
+			desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
+			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
+				& CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	if (rc)
+		goto out;
+
+	if (tcon->unix_ext)
+		rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
+					      xid);
+	else
+		rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
+					 xid, pnetfid);
+
+out:
+	kfree(buf);
+	return rc;
+}
+
 struct cifsFileInfo *
 cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 		  struct tcon_link *tlink, __u32 oplock)
@@ -317,10 +340,8 @@ int cifs_open(struct inode *inode, struct file *file)
 	struct cifsFileInfo *pCifsFile = NULL;
 	struct cifsInodeInfo *pCifsInode;
 	char *full_path = NULL;
-	int desiredAccess;
-	int disposition;
+	bool posix_open_ok = false;
 	__u16 netfid;
-	FILE_ALL_INFO *buf = NULL;
 
 	xid = GetXid();
 
@@ -358,17 +379,7 @@ int cifs_open(struct inode *inode, struct file *file)
 				file->f_flags, &oplock, &netfid, xid);
 		if (rc == 0) {
 			cFYI(1, "posix open succeeded");
-
-			pCifsFile = cifs_new_fileinfo(netfid, file, tlink,
-						      oplock);
-			if (pCifsFile == NULL) {
-				CIFSSMBClose(xid, tcon, netfid);
-				rc = -ENOMEM;
-			}
-
-			cifs_fscache_set_inode_cookie(inode, file);
-
-			goto out;
+			posix_open_ok = true;
 		} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			if (tcon->ses->serverNOS)
 				cERROR(1, "server %s of type %s returned"
@@ -385,103 +396,39 @@ int cifs_open(struct inode *inode, struct file *file)
385 or DFS errors */ 396 or DFS errors */
386 } 397 }
387 398
388 desiredAccess = cifs_convert_flags(file->f_flags); 399 if (!posix_open_ok) {
389 400 rc = cifs_nt_open(full_path, inode, cifs_sb, tcon,
390/********************************************************************* 401 file->f_flags, &oplock, &netfid, xid);
391 * open flag mapping table: 402 if (rc)
392 * 403 goto out;
393 * POSIX Flag CIFS Disposition
394 * ---------- ----------------
395 * O_CREAT FILE_OPEN_IF
396 * O_CREAT | O_EXCL FILE_CREATE
397 * O_CREAT | O_TRUNC FILE_OVERWRITE_IF
398 * O_TRUNC FILE_OVERWRITE
399 * none of the above FILE_OPEN
400 *
401 * Note that there is not a direct match between disposition
402 * FILE_SUPERSEDE (ie create whether or not file exists although
403 * O_CREAT | O_TRUNC is similar but truncates the existing
404 * file rather than creating a new file as FILE_SUPERSEDE does
405 * (which uses the attributes / metadata passed in on open call)
406 *?
407 *? O_SYNC is a reasonable match to CIFS writethrough flag
408 *? and the read write flags match reasonably. O_LARGEFILE
409 *? is irrelevant because largefile support is always used
410 *? by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
411 * O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
412 *********************************************************************/
413
414 disposition = cifs_get_disposition(file->f_flags);
415
416 /* BB pass O_SYNC flag through on file attributes .. BB */
417
418 /* Also refresh inode by passing in file_info buf returned by SMBOpen
419 and calling get_inode_info with returned buf (at least helps
420 non-Unix server case) */
421
422 /* BB we can not do this if this is the second open of a file
423 and the first handle has writebehind data, we might be
424 able to simply do a filemap_fdatawrite/filemap_fdatawait first */
425 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
426 if (!buf) {
427 rc = -ENOMEM;
428 goto out;
429 }
430
431 if (tcon->ses->capabilities & CAP_NT_SMBS)
432 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
433 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
434 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
435 & CIFS_MOUNT_MAP_SPECIAL_CHR);
436 else
437 rc = -EIO; /* no NT SMB support fall into legacy open below */
438
439 if (rc == -EIO) {
440 /* Old server, try legacy style OpenX */
441 rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
442 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
443 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
444 & CIFS_MOUNT_MAP_SPECIAL_CHR);
445 }
446 if (rc) {
447 cFYI(1, "cifs_open returned 0x%x", rc);
448 goto out;
449 } 404 }
450 405
451 rc = cifs_open_inode_helper(inode, tcon, oplock, buf, full_path, xid);
452 if (rc != 0)
453 goto out;
454
455 pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock); 406 pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock);
456 if (pCifsFile == NULL) { 407 if (pCifsFile == NULL) {
408 CIFSSMBClose(xid, tcon, netfid);
457 rc = -ENOMEM; 409 rc = -ENOMEM;
458 goto out; 410 goto out;
459 } 411 }
460 412
461 cifs_fscache_set_inode_cookie(inode, file); 413 cifs_fscache_set_inode_cookie(inode, file);
462 414
463 if (oplock & CIFS_CREATE_ACTION) { 415 if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) {
464 /* time to set mode which we can not set earlier due to 416 /* time to set mode which we can not set earlier due to
465 problems creating new read-only files */ 417 problems creating new read-only files */
466 if (tcon->unix_ext) { 418 struct cifs_unix_set_info_args args = {
467 struct cifs_unix_set_info_args args = { 419 .mode = inode->i_mode,
468 .mode = inode->i_mode, 420 .uid = NO_CHANGE_64,
469 .uid = NO_CHANGE_64, 421 .gid = NO_CHANGE_64,
470 .gid = NO_CHANGE_64, 422 .ctime = NO_CHANGE_64,
471 .ctime = NO_CHANGE_64, 423 .atime = NO_CHANGE_64,
472 .atime = NO_CHANGE_64, 424 .mtime = NO_CHANGE_64,
473 .mtime = NO_CHANGE_64, 425 .device = 0,
474 .device = 0, 426 };
475 }; 427 CIFSSMBUnixSetFileInfo(xid, tcon, &args, netfid,
476 CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, 428 pCifsFile->pid);
477 cifs_sb->local_nls,
478 cifs_sb->mnt_cifs_flags &
479 CIFS_MOUNT_MAP_SPECIAL_CHR);
480 }
481 } 429 }
482 430
483out: 431out:
484 kfree(buf);
485 kfree(full_path); 432 kfree(full_path);
486 FreeXid(xid); 433 FreeXid(xid);
487 cifs_put_tlink(tlink); 434 cifs_put_tlink(tlink);
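[Note] The disposition table that moved into cifs_nt_open() above maps directly onto a short chain of flag tests. A minimal sketch of that mapping, assuming the CIFS kernel context (FILE_CREATE and friends come from cifspdu.h); the in-tree cifs_get_disposition() named in the hunk is the authoritative version:

static inline int disposition_example(unsigned int flags)
{
	if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
		return FILE_CREATE;		/* fail if it already exists */
	else if ((flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC))
		return FILE_OVERWRITE_IF;	/* create or truncate */
	else if (flags & O_CREAT)
		return FILE_OPEN_IF;		/* open, create if missing */
	else if (flags & O_TRUNC)
		return FILE_OVERWRITE;		/* truncate existing file */
	else
		return FILE_OPEN;		/* none of the above */
}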
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a853a89857a5..0c7e36910e31 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -518,6 +518,7 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
518 518
519 fattr->cf_eof = le64_to_cpu(info->EndOfFile); 519 fattr->cf_eof = le64_to_cpu(info->EndOfFile);
520 fattr->cf_bytes = le64_to_cpu(info->AllocationSize); 520 fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
521 fattr->cf_createtime = le64_to_cpu(info->CreationTime);
521 522
522 if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { 523 if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
523 fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; 524 fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
@@ -779,6 +780,10 @@ cifs_find_inode(struct inode *inode, void *opaque)
779 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) 780 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
780 return 0; 781 return 0;
781 782
783 /* use createtime like an i_generation field */
784 if (CIFS_I(inode)->createtime != fattr->cf_createtime)
785 return 0;
786
782 /* don't match inode of different type */ 787 /* don't match inode of different type */
783 if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT)) 788 if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
784 return 0; 789 return 0;
@@ -796,6 +801,7 @@ cifs_init_inode(struct inode *inode, void *opaque)
796 struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; 801 struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
797 802
798 CIFS_I(inode)->uniqueid = fattr->cf_uniqueid; 803 CIFS_I(inode)->uniqueid = fattr->cf_uniqueid;
804 CIFS_I(inode)->createtime = fattr->cf_createtime;
799 return 0; 805 return 0;
800} 806}
801 807
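[Note] The createtime checks added above tighten inode matching: a CIFS uniqueid can be recycled when a file is deleted and recreated on the server, and the creation time then acts as a cheap generation number. A condensed, illustrative sketch of the resulting iget5_locked() test callback (the real one is cifs_find_inode() in the hunk above):

static int find_inode_example(struct inode *inode, void *opaque)
{
	struct cifs_fattr *fattr = opaque;

	if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
		return 0;	/* a different file altogether */
	if (CIFS_I(inode)->createtime != fattr->cf_createtime)
		return 0;	/* uniqueid recycled: treat as a new inode */
	return 1;		/* same file, same generation: reuse it */
}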
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index ec5b68e3b928..76b1b37c9e6b 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -160,6 +160,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
160 fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes); 160 fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes);
161 fattr->cf_eof = le64_to_cpu(info->EndOfFile); 161 fattr->cf_eof = le64_to_cpu(info->EndOfFile);
162 fattr->cf_bytes = le64_to_cpu(info->AllocationSize); 162 fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
163 fattr->cf_createtime = le64_to_cpu(info->CreationTime);
163 fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); 164 fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
164 fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); 165 fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
165 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); 166 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7b01d3f6eed6..eb746486e49e 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -420,7 +420,6 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
420 return 0; 420 return 0;
421} 421}
422 422
423#ifdef CONFIG_CIFS_EXPERIMENTAL
424/* BB Move to ntlmssp.c eventually */ 423/* BB Move to ntlmssp.c eventually */
425 424
426/* We do not malloc the blob, it is passed in pbuffer, because 425/* We do not malloc the blob, it is passed in pbuffer, because
@@ -431,13 +430,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
431 NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer; 430 NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
432 __u32 flags; 431 __u32 flags;
433 432
433 memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE));
434 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); 434 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
435 sec_blob->MessageType = NtLmNegotiate; 435 sec_blob->MessageType = NtLmNegotiate;
436 436
437 /* BB is NTLMV2 session security format easier to use here? */ 437 /* BB is NTLMV2 session security format easier to use here? */
438 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | 438 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
439 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 439 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
440 NTLMSSP_NEGOTIATE_NTLM; 440 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
441 if (ses->server->secMode & 441 if (ses->server->secMode &
442 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { 442 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
443 flags |= NTLMSSP_NEGOTIATE_SIGN; 443 flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -446,7 +446,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
446 NTLMSSP_NEGOTIATE_EXTENDED_SEC; 446 NTLMSSP_NEGOTIATE_EXTENDED_SEC;
447 } 447 }
448 448
449 sec_blob->NegotiateFlags |= cpu_to_le32(flags); 449 sec_blob->NegotiateFlags = cpu_to_le32(flags);
450 450
451 sec_blob->WorkstationName.BufferOffset = 0; 451 sec_blob->WorkstationName.BufferOffset = 0;
452 sec_blob->WorkstationName.Length = 0; 452 sec_blob->WorkstationName.Length = 0;
@@ -477,7 +477,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
477 flags = NTLMSSP_NEGOTIATE_56 | 477 flags = NTLMSSP_NEGOTIATE_56 |
478 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | 478 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
479 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 479 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
480 NTLMSSP_NEGOTIATE_NTLM; 480 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
481 if (ses->server->secMode & 481 if (ses->server->secMode &
482 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 482 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
483 flags |= NTLMSSP_NEGOTIATE_SIGN; 483 flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -485,7 +485,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
485 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; 485 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
486 486
487 tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); 487 tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
488 sec_blob->NegotiateFlags |= cpu_to_le32(flags); 488 sec_blob->NegotiateFlags = cpu_to_le32(flags);
489 489
490 sec_blob->LmChallengeResponse.BufferOffset = 490 sec_blob->LmChallengeResponse.BufferOffset =
491 cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE)); 491 cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE));
@@ -544,8 +544,9 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
544 sec_blob->WorkstationName.MaximumLength = 0; 544 sec_blob->WorkstationName.MaximumLength = 0;
545 tmp += 2; 545 tmp += 2;
546 546
547 if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) && 547 if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) ||
548 !calc_seckey(ses)) { 548 (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC))
549 && !calc_seckey(ses)) {
549 memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); 550 memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
550 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); 551 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
551 sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE); 552 sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
@@ -563,17 +564,6 @@ setup_ntlmv2_ret:
563 return rc; 564 return rc;
564} 565}
565 566
566
567static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
568 struct cifsSesInfo *ses)
569{
570 build_ntlmssp_negotiate_blob(&pSMB->req.SecurityBlob[0], ses);
571 pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
572
573 return;
574}
575#endif
576
577int 567int
578CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, 568CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
579 const struct nls_table *nls_cp) 569 const struct nls_table *nls_cp)
@@ -814,71 +804,70 @@ ssetup_ntlmssp_authenticate:
814 rc = -ENOSYS; 804 rc = -ENOSYS;
815 goto ssetup_exit; 805 goto ssetup_exit;
816#endif /* CONFIG_CIFS_UPCALL */ 806#endif /* CONFIG_CIFS_UPCALL */
817 } else { 807 } else if (type == RawNTLMSSP) {
818#ifdef CONFIG_CIFS_EXPERIMENTAL 808 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
819 if (type == RawNTLMSSP) { 809 cERROR(1, "NTLMSSP requires Unicode support");
820 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { 810 rc = -ENOSYS;
821 cERROR(1, "NTLMSSP requires Unicode support"); 811 goto ssetup_exit;
822 rc = -ENOSYS; 812 }
813
814 cFYI(1, "ntlmssp session setup phase %d", phase);
815 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
816 capabilities |= CAP_EXTENDED_SECURITY;
817 pSMB->req.Capabilities |= cpu_to_le32(capabilities);
818 switch(phase) {
819 case NtLmNegotiate:
820 build_ntlmssp_negotiate_blob(
821 pSMB->req.SecurityBlob, ses);
822 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
823 iov[1].iov_base = pSMB->req.SecurityBlob;
824 pSMB->req.SecurityBlobLength =
825 cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
826 break;
827 case NtLmAuthenticate:
828 /*
829 * 5 is an empirical value, large enough to hold
 830 * authenticate message plus max 10 of av pairs,
831 * domain, user, workstation names, flags, etc.
832 */
833 ntlmsspblob = kzalloc(
834 5*sizeof(struct _AUTHENTICATE_MESSAGE),
835 GFP_KERNEL);
836 if (!ntlmsspblob) {
837 cERROR(1, "Can't allocate NTLMSSP blob");
838 rc = -ENOMEM;
823 goto ssetup_exit; 839 goto ssetup_exit;
824 } 840 }
825 841
826 cFYI(1, "ntlmssp session setup phase %d", phase); 842 rc = build_ntlmssp_auth_blob(ntlmsspblob,
827 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; 843 &blob_len, ses, nls_cp);
828 capabilities |= CAP_EXTENDED_SECURITY; 844 if (rc)
829 pSMB->req.Capabilities |= cpu_to_le32(capabilities);
830 if (phase == NtLmNegotiate) {
831 setup_ntlmssp_neg_req(pSMB, ses);
832 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
833 iov[1].iov_base = &pSMB->req.SecurityBlob[0];
834 } else if (phase == NtLmAuthenticate) {
835 /* 5 is an empirical value, large enought to
836 * hold authenticate message, max 10 of
837 * av paris, doamin,user,workstation mames,
838 * flags etc..
839 */
840 ntlmsspblob = kmalloc(
841 5*sizeof(struct _AUTHENTICATE_MESSAGE),
842 GFP_KERNEL);
843 if (!ntlmsspblob) {
844 cERROR(1, "Can't allocate NTLMSSP");
845 rc = -ENOMEM;
846 goto ssetup_exit;
847 }
848
849 rc = build_ntlmssp_auth_blob(ntlmsspblob,
850 &blob_len, ses, nls_cp);
851 if (rc)
852 goto ssetup_exit;
853 iov[1].iov_len = blob_len;
854 iov[1].iov_base = ntlmsspblob;
855 pSMB->req.SecurityBlobLength =
856 cpu_to_le16(blob_len);
857 /* Make sure that we tell the server that we
858 are using the uid that it just gave us back
859 on the response (challenge) */
860 smb_buf->Uid = ses->Suid;
861 } else {
862 cERROR(1, "invalid phase %d", phase);
863 rc = -ENOSYS;
864 goto ssetup_exit; 845 goto ssetup_exit;
865 } 846 iov[1].iov_len = blob_len;
866 /* unicode strings must be word aligned */ 847 iov[1].iov_base = ntlmsspblob;
867 if ((iov[0].iov_len + iov[1].iov_len) % 2) { 848 pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
868 *bcc_ptr = 0; 849 /*
869 bcc_ptr++; 850 * Make sure that we tell the server that we are using
870 } 851 * the uid that it just gave us back on the response
871 unicode_oslm_strings(&bcc_ptr, nls_cp); 852 * (challenge)
872 } else { 853 */
873 cERROR(1, "secType %d not supported!", type); 854 smb_buf->Uid = ses->Suid;
855 break;
856 default:
857 cERROR(1, "invalid phase %d", phase);
874 rc = -ENOSYS; 858 rc = -ENOSYS;
875 goto ssetup_exit; 859 goto ssetup_exit;
876 } 860 }
877#else 861 /* unicode strings must be word aligned */
862 if ((iov[0].iov_len + iov[1].iov_len) % 2) {
863 *bcc_ptr = 0;
864 bcc_ptr++;
865 }
866 unicode_oslm_strings(&bcc_ptr, nls_cp);
867 } else {
878 cERROR(1, "secType %d not supported!", type); 868 cERROR(1, "secType %d not supported!", type);
879 rc = -ENOSYS; 869 rc = -ENOSYS;
880 goto ssetup_exit; 870 goto ssetup_exit;
881#endif
882 } 871 }
883 872
884 iov[2].iov_base = str_area; 873 iov[2].iov_base = str_area;
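[Note] Two of the sess.c hunks above share one theme: the blob is now zeroed with memset() and NegotiateFlags is set by plain assignment rather than '|='. OR-ing into a field of a never-initialized buffer keeps whatever bits happened to be there, so the server could be sent flags that were never negotiated. A minimal sketch of the corrected pattern, assuming the cifs NEGOTIATE_MESSAGE type:

static void fill_negotiate_flags(unsigned char *pbuffer, __u32 flags)
{
	NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;

	memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE));	/* no stale bits */
	sec_blob->NegotiateFlags = cpu_to_le32(flags);	/* '=', not '|=' */
}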
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index e0588cdf4cc5..59ca81b16919 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -119,7 +119,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
119 if (ssocket == NULL) 119 if (ssocket == NULL)
120 return -ENOTSOCK; /* BB eventually add reconnect code here */ 120 return -ENOTSOCK; /* BB eventually add reconnect code here */
121 121
122 smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr; 122 smb_msg.msg_name = (struct sockaddr *) &server->dstaddr;
123 smb_msg.msg_namelen = sizeof(struct sockaddr); 123 smb_msg.msg_namelen = sizeof(struct sockaddr);
124 smb_msg.msg_control = NULL; 124 smb_msg.msg_control = NULL;
125 smb_msg.msg_controllen = 0; 125 smb_msg.msg_controllen = 0;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 37a34c2c622a..9c64ae9e4c1a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -63,6 +63,9 @@
63#define NEEDED_RMEM (4*1024*1024) 63#define NEEDED_RMEM (4*1024*1024)
64#define CONN_HASH_SIZE 32 64#define CONN_HASH_SIZE 32
65 65
66/* Number of messages to send before rescheduling */
67#define MAX_SEND_MSG_COUNT 25
68
66struct cbuf { 69struct cbuf {
67 unsigned int base; 70 unsigned int base;
68 unsigned int len; 71 unsigned int len;
@@ -108,6 +111,7 @@ struct connection {
108#define CF_INIT_PENDING 4 111#define CF_INIT_PENDING 4
109#define CF_IS_OTHERCON 5 112#define CF_IS_OTHERCON 5
110#define CF_CLOSE 6 113#define CF_CLOSE 6
114#define CF_APP_LIMITED 7
111 struct list_head writequeue; /* List of outgoing writequeue_entries */ 115 struct list_head writequeue; /* List of outgoing writequeue_entries */
112 spinlock_t writequeue_lock; 116 spinlock_t writequeue_lock;
113 int (*rx_action) (struct connection *); /* What to do when active */ 117 int (*rx_action) (struct connection *); /* What to do when active */
@@ -295,7 +299,17 @@ static void lowcomms_write_space(struct sock *sk)
295{ 299{
296 struct connection *con = sock2con(sk); 300 struct connection *con = sock2con(sk);
297 301
298 if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags)) 302 if (!con)
303 return;
304
305 clear_bit(SOCK_NOSPACE, &con->sock->flags);
306
307 if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
308 con->sock->sk->sk_write_pending--;
309 clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
310 }
311
312 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
299 queue_work(send_workqueue, &con->swork); 313 queue_work(send_workqueue, &con->swork);
300} 314}
301 315
@@ -915,6 +929,7 @@ static void tcp_connect_to_sock(struct connection *con)
915 struct sockaddr_storage saddr, src_addr; 929 struct sockaddr_storage saddr, src_addr;
916 int addr_len; 930 int addr_len;
917 struct socket *sock = NULL; 931 struct socket *sock = NULL;
932 int one = 1;
918 933
919 if (con->nodeid == 0) { 934 if (con->nodeid == 0) {
920 log_print("attempt to connect sock 0 foiled"); 935 log_print("attempt to connect sock 0 foiled");
@@ -960,6 +975,11 @@ static void tcp_connect_to_sock(struct connection *con)
960 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); 975 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
961 976
962 log_print("connecting to %d", con->nodeid); 977 log_print("connecting to %d", con->nodeid);
978
979 /* Turn off Nagle's algorithm */
980 kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
981 sizeof(one));
982
963 result = 983 result =
964 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, 984 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
965 O_NONBLOCK); 985 O_NONBLOCK);
@@ -1011,6 +1031,10 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
1011 goto create_out; 1031 goto create_out;
1012 } 1032 }
1013 1033
1034 /* Turn off Nagle's algorithm */
1035 kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
1036 sizeof(one));
1037
1014 result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, 1038 result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
1015 (char *)&one, sizeof(one)); 1039 (char *)&one, sizeof(one));
1016 1040
@@ -1297,6 +1321,7 @@ static void send_to_sock(struct connection *con)
1297 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; 1321 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1298 struct writequeue_entry *e; 1322 struct writequeue_entry *e;
1299 int len, offset; 1323 int len, offset;
1324 int count = 0;
1300 1325
1301 mutex_lock(&con->sock_mutex); 1326 mutex_lock(&con->sock_mutex);
1302 if (con->sock == NULL) 1327 if (con->sock == NULL)
@@ -1319,14 +1344,27 @@ static void send_to_sock(struct connection *con)
1319 ret = kernel_sendpage(con->sock, e->page, offset, len, 1344 ret = kernel_sendpage(con->sock, e->page, offset, len,
1320 msg_flags); 1345 msg_flags);
1321 if (ret == -EAGAIN || ret == 0) { 1346 if (ret == -EAGAIN || ret == 0) {
1347 if (ret == -EAGAIN &&
1348 test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
1349 !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
1350 /* Notify TCP that we're limited by the
1351 * application window size.
1352 */
1353 set_bit(SOCK_NOSPACE, &con->sock->flags);
1354 con->sock->sk->sk_write_pending++;
1355 }
1322 cond_resched(); 1356 cond_resched();
1323 goto out; 1357 goto out;
1324 } 1358 }
1325 if (ret <= 0) 1359 if (ret <= 0)
1326 goto send_error; 1360 goto send_error;
1327 } 1361 }
1328 /* Don't starve people filling buffers */ 1362
1363 /* Don't starve people filling buffers */
1364 if (++count >= MAX_SEND_MSG_COUNT) {
1329 cond_resched(); 1365 cond_resched();
1366 count = 0;
1367 }
1330 1368
1331 spin_lock(&con->writequeue_lock); 1369 spin_lock(&con->writequeue_lock);
1332 e->offset += ret; 1370 e->offset += ret;
@@ -1430,20 +1468,19 @@ static void work_stop(void)
1430 1468
1431static int work_start(void) 1469static int work_start(void)
1432{ 1470{
1433 int error; 1471 recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM |
1434 recv_workqueue = create_workqueue("dlm_recv"); 1472 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1435 error = IS_ERR(recv_workqueue); 1473 if (!recv_workqueue) {
1436 if (error) { 1474 log_print("can't start dlm_recv");
1437 log_print("can't start dlm_recv %d", error); 1475 return -ENOMEM;
1438 return error;
1439 } 1476 }
1440 1477
1441 send_workqueue = create_singlethread_workqueue("dlm_send"); 1478 send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM |
1442 error = IS_ERR(send_workqueue); 1479 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1443 if (error) { 1480 if (!send_workqueue) {
1444 log_print("can't start dlm_send %d", error); 1481 log_print("can't start dlm_send");
1445 destroy_workqueue(recv_workqueue); 1482 destroy_workqueue(recv_workqueue);
1446 return error; 1483 return -ENOMEM;
1447 } 1484 }
1448 1485
1449 return 0; 1486 return 0;
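[Note] The MAX_SEND_MSG_COUNT change above replaces a cond_resched() after every message with one per batch of 25, so a busy writequeue stays fair to other tasks without rescheduling on each send. The shape of the loop, reduced to a sketch (have_more_msgs() and send_one_msg() are hypothetical stand-ins for the writequeue walk):

extern bool have_more_msgs(void);	/* hypothetical writequeue test */
extern void send_one_msg(void);		/* hypothetical single send */

static void drain_writequeue_example(void)
{
	int count = 0;

	while (have_more_msgs()) {
		send_one_msg();
		/* don't starve others, but don't yield on every message */
		if (++count >= MAX_SEND_MSG_COUNT) {
			cond_resched();
			count = 0;
		}
	}
}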
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2709b34206ab..47cda410b548 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -28,21 +28,30 @@
28 28
29typedef struct ext2_dir_entry_2 ext2_dirent; 29typedef struct ext2_dir_entry_2 ext2_dirent;
30 30
31/*
32 * Tests against MAX_REC_LEN etc were put in place for 64k block
33 * sizes; if that is not possible on this arch, we can skip
34 * those tests and speed things up.
35 */
31static inline unsigned ext2_rec_len_from_disk(__le16 dlen) 36static inline unsigned ext2_rec_len_from_disk(__le16 dlen)
32{ 37{
33 unsigned len = le16_to_cpu(dlen); 38 unsigned len = le16_to_cpu(dlen);
34 39
40#if (PAGE_CACHE_SIZE >= 65536)
35 if (len == EXT2_MAX_REC_LEN) 41 if (len == EXT2_MAX_REC_LEN)
36 return 1 << 16; 42 return 1 << 16;
43#endif
37 return len; 44 return len;
38} 45}
39 46
40static inline __le16 ext2_rec_len_to_disk(unsigned len) 47static inline __le16 ext2_rec_len_to_disk(unsigned len)
41{ 48{
49#if (PAGE_CACHE_SIZE >= 65536)
42 if (len == (1 << 16)) 50 if (len == (1 << 16))
43 return cpu_to_le16(EXT2_MAX_REC_LEN); 51 return cpu_to_le16(EXT2_MAX_REC_LEN);
44 else 52 else
45 BUG_ON(len > (1 << 16)); 53 BUG_ON(len > (1 << 16));
54#endif
46 return cpu_to_le16(len); 55 return cpu_to_le16(len);
47} 56}
48 57
@@ -129,15 +138,15 @@ static void ext2_check_page(struct page *page, int quiet)
129 p = (ext2_dirent *)(kaddr + offs); 138 p = (ext2_dirent *)(kaddr + offs);
130 rec_len = ext2_rec_len_from_disk(p->rec_len); 139 rec_len = ext2_rec_len_from_disk(p->rec_len);
131 140
132 if (rec_len < EXT2_DIR_REC_LEN(1)) 141 if (unlikely(rec_len < EXT2_DIR_REC_LEN(1)))
133 goto Eshort; 142 goto Eshort;
134 if (rec_len & 3) 143 if (unlikely(rec_len & 3))
135 goto Ealign; 144 goto Ealign;
136 if (rec_len < EXT2_DIR_REC_LEN(p->name_len)) 145 if (unlikely(rec_len < EXT2_DIR_REC_LEN(p->name_len)))
137 goto Enamelen; 146 goto Enamelen;
138 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) 147 if (unlikely(((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)))
139 goto Espan; 148 goto Espan;
140 if (le32_to_cpu(p->inode) > max_inumber) 149 if (unlikely(le32_to_cpu(p->inode) > max_inumber))
141 goto Einumber; 150 goto Einumber;
142 } 151 }
143 if (offs != limit) 152 if (offs != limit)
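[Note] The #if guards above exist because rec_len is a 16-bit on-disk field: on architectures with 64KiB pages a single directory entry can span the whole block, and 65536 does not fit in a __le16, so EXT2_MAX_REC_LEN is stored as a sentinel. A round-trip sketch on such an architecture (kernel context assumed):

static void rec_len_roundtrip_example(void)
{
	unsigned len = 1 << 16;				/* entry fills a 64KiB block */
	__le16 disk = ext2_rec_len_to_disk(len);	/* stored as EXT2_MAX_REC_LEN */

	BUG_ON(ext2_rec_len_from_disk(disk) != len);	/* decodes back to 65536 */
}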
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index f8aecd2e3297..2e1d8341d827 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -67,7 +67,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
67 inode = NULL; 67 inode = NULL;
68 if (ino) { 68 if (ino) {
69 inode = ext2_iget(dir->i_sb, ino); 69 inode = ext2_iget(dir->i_sb, ino);
70 if (unlikely(IS_ERR(inode))) { 70 if (IS_ERR(inode)) {
71 if (PTR_ERR(inode) == -ESTALE) { 71 if (PTR_ERR(inode) == -ESTALE) {
72 ext2_error(dir->i_sb, __func__, 72 ext2_error(dir->i_sb, __func__,
73 "deleted inode referenced: %lu", 73 "deleted inode referenced: %lu",
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index e0c6380ff992..7731695e65d9 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -43,9 +43,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data);
43static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); 43static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
44static int ext2_sync_fs(struct super_block *sb, int wait); 44static int ext2_sync_fs(struct super_block *sb, int wait);
45 45
46void ext2_error (struct super_block * sb, const char * function, 46void ext2_error(struct super_block *sb, const char *function,
47 const char * fmt, ...) 47 const char *fmt, ...)
48{ 48{
49 struct va_format vaf;
49 va_list args; 50 va_list args;
50 struct ext2_sb_info *sbi = EXT2_SB(sb); 51 struct ext2_sb_info *sbi = EXT2_SB(sb);
51 struct ext2_super_block *es = sbi->s_es; 52 struct ext2_super_block *es = sbi->s_es;
@@ -59,9 +60,13 @@ void ext2_error (struct super_block * sb, const char * function,
59 } 60 }
60 61
61 va_start(args, fmt); 62 va_start(args, fmt);
62 printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function); 63
63 vprintk(fmt, args); 64 vaf.fmt = fmt;
64 printk("\n"); 65 vaf.va = &args;
66
67 printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n",
68 sb->s_id, function, &vaf);
69
65 va_end(args); 70 va_end(args);
66 71
67 if (test_opt(sb, ERRORS_PANIC)) 72 if (test_opt(sb, ERRORS_PANIC))
@@ -76,12 +81,16 @@ void ext2_error (struct super_block * sb, const char * function,
76void ext2_msg(struct super_block *sb, const char *prefix, 81void ext2_msg(struct super_block *sb, const char *prefix,
77 const char *fmt, ...) 82 const char *fmt, ...)
78{ 83{
84 struct va_format vaf;
79 va_list args; 85 va_list args;
80 86
81 va_start(args, fmt); 87 va_start(args, fmt);
82 printk("%sEXT2-fs (%s): ", prefix, sb->s_id); 88
83 vprintk(fmt, args); 89 vaf.fmt = fmt;
84 printk("\n"); 90 vaf.va = &args;
91
92 printk("%sEXT2-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
93
85 va_end(args); 94 va_end(args);
86} 95}
87 96
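[Note] Both ext2 hunks above adopt the %pV printk extension: the caller's format string and va_list are wrapped in a struct va_format and emitted by a single printk(), so the prefix, message body, and newline can no longer be interleaved with output from concurrent printk callers. A minimal sketch of the pattern with hypothetical names:

static void myfs_msg_example(struct super_block *sb, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* one atomic line instead of printk() + vprintk() + printk() */
	printk(KERN_INFO "myfs (%s): %pV\n", sb->s_id, &vaf);
	va_end(args);
}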
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index f84700be3274..c2e4dce984d2 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -199,14 +199,6 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
199 goto found; 199 goto found;
200 entry = next; 200 entry = next;
201 } 201 }
202 /* Check the remaining name entries */
203 while (!IS_LAST_ENTRY(entry)) {
204 struct ext2_xattr_entry *next =
205 EXT2_XATTR_NEXT(entry);
206 if ((char *)next >= end)
207 goto bad_block;
208 entry = next;
209 }
210 if (ext2_xattr_cache_insert(bh)) 202 if (ext2_xattr_cache_insert(bh))
211 ea_idebug(inode, "cache insert failed"); 203 ea_idebug(inode, "cache insert failed");
212 error = -ENODATA; 204 error = -ENODATA;
@@ -355,7 +347,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
355/* 347/*
356 * ext2_xattr_set() 348 * ext2_xattr_set()
357 * 349 *
358 * Create, replace or remove an extended attribute for this inode. Buffer 350 * Create, replace or remove an extended attribute for this inode. Value
359 * is NULL to remove an existing extended attribute, and non-NULL to 351 * is NULL to remove an existing extended attribute, and non-NULL to
360 * either replace an existing extended attribute, or create a new extended 352 * either replace an existing extended attribute, or create a new extended
361 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 353 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index b3db22649426..045995c8ce5a 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -20,6 +20,7 @@
20#include <linux/ext3_jbd.h> 20#include <linux/ext3_jbd.h>
21#include <linux/quotaops.h> 21#include <linux/quotaops.h>
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/blkdev.h>
23 24
24/* 25/*
25 * balloc.c contains the blocks allocation and deallocation routines 26 * balloc.c contains the blocks allocation and deallocation routines
@@ -39,6 +40,21 @@
39 40
40#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 41#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
41 42
43/*
44 * Calculate the block group number and offset, given a block number
45 */
46static void ext3_get_group_no_and_offset(struct super_block *sb,
47 ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
48{
49 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
50
51 blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
52 if (offsetp)
53 *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
54 if (blockgrpp)
55 *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
56}
57
42/** 58/**
43 * ext3_get_group_desc() -- load group descriptor from disk 59 * ext3_get_group_desc() -- load group descriptor from disk
44 * @sb: super block 60 * @sb: super block
@@ -1885,3 +1901,253 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
1885 return ext3_bg_num_gdb_meta(sb,group); 1901 return ext3_bg_num_gdb_meta(sb,group);
1886 1902
1887} 1903}
1904
1905/**
1906 * ext3_trim_all_free -- function to trim all free space in alloc. group
1907 * @sb: super block for file system
1908 * @group: allocation group to trim
1909 * @start: first group block to examine
1910 * @max: last group block to examine
1911 * @gdp: allocation group description structure
1912 * @minblocks: minimum extent block count
1913 *
1914 * ext3_trim_all_free walks through the group's block bitmap searching for free
1915 * blocks. When a free block is found, it tries to allocate this block and the
1916 * consecutive free blocks to get the biggest free extent possible, until it
1917 * reaches any used block. It then issues a TRIM command on this extent and frees
1918 * the extent in the block bitmap. This is done until the whole group is scanned.
1919 */
1920ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
1921 ext3_grpblk_t start, ext3_grpblk_t max,
1922 ext3_grpblk_t minblocks)
1923{
1924 handle_t *handle;
1925 ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
1926 ext3_fsblk_t discard_block;
1927 struct ext3_sb_info *sbi;
1928 struct buffer_head *gdp_bh, *bitmap_bh = NULL;
1929 struct ext3_group_desc *gdp;
1930 int err = 0, ret = 0;
1931
1932 /*
1933 * We will update one block bitmap, and one group descriptor
1934 */
1935 handle = ext3_journal_start_sb(sb, 2);
1936 if (IS_ERR(handle))
1937 return PTR_ERR(handle);
1938
1939 bitmap_bh = read_block_bitmap(sb, group);
1940 if (!bitmap_bh) {
1941 err = -EIO;
1942 goto err_out;
1943 }
1944
1945 BUFFER_TRACE(bitmap_bh, "getting undo access");
1946 err = ext3_journal_get_undo_access(handle, bitmap_bh);
1947 if (err)
1948 goto err_out;
1949
1950 gdp = ext3_get_group_desc(sb, group, &gdp_bh);
1951 if (!gdp) {
1952 err = -EIO;
1953 goto err_out;
1954 }
1955
1956 BUFFER_TRACE(gdp_bh, "get_write_access");
1957 err = ext3_journal_get_write_access(handle, gdp_bh);
1958 if (err)
1959 goto err_out;
1960
1961 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1962 sbi = EXT3_SB(sb);
1963
1964 /* Walk through the whole group */
1965 while (start < max) {
1966 start = bitmap_search_next_usable_block(start, bitmap_bh, max);
1967 if (start < 0)
1968 break;
1969 next = start;
1970
1971 /*
1972 * Allocate contiguous free extents by setting bits in the
1973 * block bitmap
1974 */
1975 while (next < max
1976 && claim_block(sb_bgl_lock(sbi, group),
1977 next, bitmap_bh)) {
1978 next++;
1979 }
1980
1981 /* We did not claim any blocks */
1982 if (next == start)
1983 continue;
1984
1985 discard_block = (ext3_fsblk_t)start +
1986 ext3_group_first_block_no(sb, group);
1987
1988 /* Update counters */
1989 spin_lock(sb_bgl_lock(sbi, group));
1990 le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
1991 spin_unlock(sb_bgl_lock(sbi, group));
1992 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
1993
1994 /* Do not issue a TRIM on extents smaller than minblocks */
1995 if ((next - start) < minblocks)
1996 goto free_extent;
1997
1998 /* Send the TRIM command down to the device */
1999 err = sb_issue_discard(sb, discard_block, next - start,
2000 GFP_NOFS, 0);
2001 count += (next - start);
2002free_extent:
2003 freed = 0;
2004
2005 /*
2006 * Clear bits in the bitmap
2007 */
2008 for (bit = start; bit < next; bit++) {
2009 BUFFER_TRACE(bitmap_bh, "clear bit");
2010 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
2011 bit, bitmap_bh->b_data)) {
2012 ext3_error(sb, __func__,
2013 "bit already cleared for block "E3FSBLK,
2014 (unsigned long)bit);
2015 BUFFER_TRACE(bitmap_bh, "bit already cleared");
2016 } else {
2017 freed++;
2018 }
2019 }
2020
2021 /* Update counters */
2022 spin_lock(sb_bgl_lock(sbi, group));
2023 le16_add_cpu(&gdp->bg_free_blocks_count, freed);
2024 spin_unlock(sb_bgl_lock(sbi, group));
2025 percpu_counter_add(&sbi->s_freeblocks_counter, freed);
2026
2027 start = next;
2028 if (err < 0) {
2029 if (err != -EOPNOTSUPP)
2030 ext3_warning(sb, __func__, "Discard command "
2031 "returned error %d\n", err);
2032 break;
2033 }
2034
2035 if (fatal_signal_pending(current)) {
2036 err = -ERESTARTSYS;
2037 break;
2038 }
2039
2040 cond_resched();
2041
2042 /* No more suitable extents */
2043 if ((free_blocks - count) < minblocks)
2044 break;
2045 }
2046
2047 /* We dirtied the bitmap block */
2048 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
2049 ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
2050 if (!err)
2051 err = ret;
2052
2053 /* And the group descriptor block */
2054 BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
2055 ret = ext3_journal_dirty_metadata(handle, gdp_bh);
2056 if (!err)
2057 err = ret;
2058
2059 ext3_debug("trimmed %d blocks in the group %d\n",
2060 count, group);
2061
2062err_out:
2063 if (err)
2064 count = err;
2065 ext3_journal_stop(handle);
2066 brelse(bitmap_bh);
2067
2068 return count;
2069}
2070
2071/**
2072 * ext3_trim_fs() -- trim ioctl handle function
2073 * @sb: superblock for filesystem
2074 * @start: First Byte to trim
2075 * @len: number of Bytes to trim from start
2076 * @minlen: minimum extent length in Bytes
2077 *
2078 * ext3_trim_fs goes through all allocation groups containing Bytes from
2079 * start to start+len. For each such group the ext3_trim_all_free function
2080 * is invoked to trim all free space.
2081 */
2082int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2083{
2084 ext3_grpblk_t last_block, first_block, free_blocks;
2085 unsigned long first_group, last_group;
2086 unsigned long group, ngroups;
2087 struct ext3_group_desc *gdp;
2088 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
2089 uint64_t start, len, minlen, trimmed;
2090 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
2091 int ret = 0;
2092
2093 start = range->start >> sb->s_blocksize_bits;
2094 len = range->len >> sb->s_blocksize_bits;
2095 minlen = range->minlen >> sb->s_blocksize_bits;
2096 trimmed = 0;
2097
2098 if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
2099 return -EINVAL;
2100 if (start >= max_blks)
2101 goto out;
2102 if (start < le32_to_cpu(es->s_first_data_block)) {
2103 len -= le32_to_cpu(es->s_first_data_block) - start;
2104 start = le32_to_cpu(es->s_first_data_block);
2105 }
2106 if (start + len > max_blks)
2107 len = max_blks - start;
2108
2109 ngroups = EXT3_SB(sb)->s_groups_count;
2110 smp_rmb();
2111
2112 /* Determine first and last group to examine based on start and len */
2113 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
2114 &first_group, &first_block);
2115 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len),
2116 &last_group, &last_block);
2117 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
2118 last_block = EXT3_BLOCKS_PER_GROUP(sb);
2119
2120 if (first_group > last_group)
2121 return -EINVAL;
2122
2123 for (group = first_group; group <= last_group; group++) {
2124 gdp = ext3_get_group_desc(sb, group, NULL);
2125 if (!gdp)
2126 break;
2127
2128 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
2129 if (free_blocks < minlen)
2130 continue;
2131
2132 if (len >= EXT3_BLOCKS_PER_GROUP(sb))
2133 len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block);
2134 else
2135 last_block = first_block + len;
2136
2137 ret = ext3_trim_all_free(sb, group, first_block,
2138 last_block, minlen);
2139 if (ret < 0)
2140 break;
2141
2142 trimmed += ret;
2143 first_block = 0;
2144 }
2145
2146 if (ret >= 0)
2147 ret = 0;
2148
2149out:
2150 range->len = trimmed * sb->s_blocksize;
2151
2152 return ret;
2153}
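[Note] The helper added at the top of balloc.c is plain div/mod arithmetic relative to s_first_data_block. A worked example under assumed geometry (1KiB blocks, s_first_data_block = 1, 8192 blocks per group): block 10000 is relative block 9999, giving group 9999 / 8192 = 1 and offset 9999 % 8192 = 1807. In code (usable only within balloc.c, where the helper is static):

/* somewhere in balloc.c, with sb already set up */
unsigned long group;
ext3_grpblk_t offset;

ext3_get_group_no_and_offset(sb, 10000, &group, &offset);
/* with the geometry above: group == 1, offset == 1807 */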
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index e2e72c367cf6..34f0a072b935 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -69,25 +69,26 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
69 const char * error_msg = NULL; 69 const char * error_msg = NULL;
70 const int rlen = ext3_rec_len_from_disk(de->rec_len); 70 const int rlen = ext3_rec_len_from_disk(de->rec_len);
71 71
72 if (rlen < EXT3_DIR_REC_LEN(1)) 72 if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
73 error_msg = "rec_len is smaller than minimal"; 73 error_msg = "rec_len is smaller than minimal";
74 else if (rlen % 4 != 0) 74 else if (unlikely(rlen % 4 != 0))
75 error_msg = "rec_len % 4 != 0"; 75 error_msg = "rec_len % 4 != 0";
76 else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) 76 else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
77 error_msg = "rec_len is too small for name_len"; 77 error_msg = "rec_len is too small for name_len";
78 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) 78 else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
79 error_msg = "directory entry across blocks"; 79 error_msg = "directory entry across blocks";
80 else if (le32_to_cpu(de->inode) > 80 else if (unlikely(le32_to_cpu(de->inode) >
81 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) 81 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
82 error_msg = "inode out of bounds"; 82 error_msg = "inode out of bounds";
83 83
84 if (error_msg != NULL) 84 if (unlikely(error_msg != NULL))
85 ext3_error (dir->i_sb, function, 85 ext3_error (dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
89 (unsigned long) le32_to_cpu(de->inode), 89 (unsigned long) le32_to_cpu(de->inode),
 90 rlen, de->name_len); 90 rlen, de->name_len);
91
91 return error_msg == NULL ? 1 : 0; 92 return error_msg == NULL ? 1 : 0;
92} 93}
93 94
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index a9580617edd2..ae94f6d949f5 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2145,13 +2145,15 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
2145 if (try_to_extend_transaction(handle, inode)) { 2145 if (try_to_extend_transaction(handle, inode)) {
2146 if (bh) { 2146 if (bh) {
2147 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 2147 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2148 ext3_journal_dirty_metadata(handle, bh); 2148 if (ext3_journal_dirty_metadata(handle, bh))
2149 return;
2149 } 2150 }
2150 ext3_mark_inode_dirty(handle, inode); 2151 ext3_mark_inode_dirty(handle, inode);
2151 truncate_restart_transaction(handle, inode); 2152 truncate_restart_transaction(handle, inode);
2152 if (bh) { 2153 if (bh) {
2153 BUFFER_TRACE(bh, "retaking write access"); 2154 BUFFER_TRACE(bh, "retaking write access");
2154 ext3_journal_get_write_access(handle, bh); 2155 if (ext3_journal_get_write_access(handle, bh))
2156 return;
2155 } 2157 }
2156 } 2158 }
2157 2159
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 88974814783a..fc080dd561f7 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -276,7 +276,29 @@ group_add_out:
276 mnt_drop_write(filp->f_path.mnt); 276 mnt_drop_write(filp->f_path.mnt);
277 return err; 277 return err;
278 } 278 }
279 case FITRIM: {
279 280
281 struct super_block *sb = inode->i_sb;
282 struct fstrim_range range;
283 int ret = 0;
284
285 if (!capable(CAP_SYS_ADMIN))
286 return -EPERM;
287
288 if (copy_from_user(&range, (struct fstrim_range *)arg,
289 sizeof(range)))
290 return -EFAULT;
291
292 ret = ext3_trim_fs(sb, &range);
293 if (ret < 0)
294 return ret;
295
296 if (copy_to_user((struct fstrim_range *)arg, &range,
297 sizeof(range)))
298 return -EFAULT;
299
300 return 0;
301 }
280 302
281 default: 303 default:
282 return -ENOTTY; 304 return -ENOTTY;
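[Note] From userspace, the new case above is reached via ioctl(fd, FITRIM, &range) on any file descriptor within the mounted filesystem. A minimal sketch (the mount point /mnt is an assumption; FITRIM and struct fstrim_range come from linux/fs.h, and CAP_SYS_ADMIN is required):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range = {
		.start  = 0,
		.len    = ~0ULL,	/* whole filesystem */
		.minlen = 4096,		/* skip extents under 4KiB */
	};
	int fd = open("/mnt", O_RDONLY);

	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	/* the kernel rewrites range.len to the bytes actually trimmed */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	return 0;
}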
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index bce9dce639b8..b27ba71810ec 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -858,6 +858,7 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
858 struct buffer_head * bh_use[NAMEI_RA_SIZE]; 858 struct buffer_head * bh_use[NAMEI_RA_SIZE];
859 struct buffer_head * bh, *ret = NULL; 859 struct buffer_head * bh, *ret = NULL;
860 unsigned long start, block, b; 860 unsigned long start, block, b;
861 const u8 *name = entry->name;
861 int ra_max = 0; /* Number of bh's in the readahead 862 int ra_max = 0; /* Number of bh's in the readahead
862 buffer, bh_use[] */ 863 buffer, bh_use[] */
863 int ra_ptr = 0; /* Current index into readahead 864 int ra_ptr = 0; /* Current index into readahead
@@ -871,6 +872,16 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
871 namelen = entry->len; 872 namelen = entry->len;
872 if (namelen > EXT3_NAME_LEN) 873 if (namelen > EXT3_NAME_LEN)
873 return NULL; 874 return NULL;
875 if ((namelen <= 2) && (name[0] == '.') &&
876 (name[1] == '.' || name[1] == 0)) {
877 /*
878 * "." or ".." will only be in the first block
879 * NFS may look up ".."; "." should be handled by the VFS
880 */
881 block = start = 0;
882 nblocks = 1;
883 goto restart;
884 }
874 if (is_dx(dir)) { 885 if (is_dx(dir)) {
875 bh = ext3_dx_find_entry(dir, entry, res_dir, &err); 886 bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
876 /* 887 /*
@@ -961,55 +972,35 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
961 struct qstr *entry, struct ext3_dir_entry_2 **res_dir, 972 struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
962 int *err) 973 int *err)
963{ 974{
964 struct super_block * sb; 975 struct super_block *sb = dir->i_sb;
965 struct dx_hash_info hinfo; 976 struct dx_hash_info hinfo;
966 u32 hash;
967 struct dx_frame frames[2], *frame; 977 struct dx_frame frames[2], *frame;
968 struct ext3_dir_entry_2 *de, *top;
969 struct buffer_head *bh; 978 struct buffer_head *bh;
970 unsigned long block; 979 unsigned long block;
971 int retval; 980 int retval;
972 int namelen = entry->len;
973 const u8 *name = entry->name;
974 981
975 sb = dir->i_sb; 982 if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
976 /* NFS may look up ".." - look at dx_root directory block */ 983 return NULL;
977 if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) {
978 if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
979 return NULL;
980 } else {
981 frame = frames;
982 frame->bh = NULL; /* for dx_release() */
983 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
984 dx_set_block(frame->at, 0); /* dx_root block is 0 */
985 }
986 hash = hinfo.hash;
987 do { 984 do {
988 block = dx_get_block(frame->at); 985 block = dx_get_block(frame->at);
989 if (!(bh = ext3_bread (NULL,dir, block, 0, err))) 986 if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
990 goto errout; 987 goto errout;
991 de = (struct ext3_dir_entry_2 *) bh->b_data;
992 top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
993 EXT3_DIR_REC_LEN(0));
994 for (; de < top; de = ext3_next_entry(de)) {
995 int off = (block << EXT3_BLOCK_SIZE_BITS(sb))
996 + ((char *) de - bh->b_data);
997
998 if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) {
999 brelse(bh);
1000 *err = ERR_BAD_DX_DIR;
1001 goto errout;
1002 }
1003 988
1004 if (ext3_match(namelen, name, de)) { 989 retval = search_dirblock(bh, dir, entry,
1005 *res_dir = de; 990 block << EXT3_BLOCK_SIZE_BITS(sb),
1006 dx_release(frames); 991 res_dir);
1007 return bh; 992 if (retval == 1) {
1008 } 993 dx_release(frames);
994 return bh;
1009 } 995 }
1010 brelse (bh); 996 brelse(bh);
997 if (retval == -1) {
998 *err = ERR_BAD_DX_DIR;
999 goto errout;
1000 }
1001
1011 /* Check to see if we should continue to search */ 1002 /* Check to see if we should continue to search */
1012 retval = ext3_htree_next_block(dir, hash, frame, 1003 retval = ext3_htree_next_block(dir, hinfo.hash, frame,
1013 frames, NULL); 1004 frames, NULL);
1014 if (retval < 0) { 1005 if (retval < 0) {
1015 ext3_warning(sb, __func__, 1006 ext3_warning(sb, __func__,
@@ -1047,7 +1038,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
1047 return ERR_PTR(-EIO); 1038 return ERR_PTR(-EIO);
1048 } 1039 }
1049 inode = ext3_iget(dir->i_sb, ino); 1040 inode = ext3_iget(dir->i_sb, ino);
1050 if (unlikely(IS_ERR(inode))) { 1041 if (IS_ERR(inode)) {
1051 if (PTR_ERR(inode) == -ESTALE) { 1042 if (PTR_ERR(inode) == -ESTALE) {
1052 ext3_error(dir->i_sb, __func__, 1043 ext3_error(dir->i_sb, __func__,
1053 "deleted inode referenced: %lu", 1044 "deleted inode referenced: %lu",
@@ -1607,7 +1598,9 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1607 if (err) 1598 if (err)
1608 goto journal_error; 1599 goto journal_error;
1609 } 1600 }
1610 ext3_journal_dirty_metadata(handle, frames[0].bh); 1601 err = ext3_journal_dirty_metadata(handle, frames[0].bh);
1602 if (err)
1603 goto journal_error;
1611 } 1604 }
1612 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1605 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1613 if (!de) 1606 if (!de)
@@ -1644,8 +1637,13 @@ static int ext3_delete_entry (handle_t *handle,
1644 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) 1637 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
1645 return -EIO; 1638 return -EIO;
1646 if (de == de_del) { 1639 if (de == de_del) {
1640 int err;
1641
1647 BUFFER_TRACE(bh, "get_write_access"); 1642 BUFFER_TRACE(bh, "get_write_access");
1648 ext3_journal_get_write_access(handle, bh); 1643 err = ext3_journal_get_write_access(handle, bh);
1644 if (err)
1645 goto journal_error;
1646
1649 if (pde) 1647 if (pde)
1650 pde->rec_len = ext3_rec_len_to_disk( 1648 pde->rec_len = ext3_rec_len_to_disk(
1651 ext3_rec_len_from_disk(pde->rec_len) + 1649 ext3_rec_len_from_disk(pde->rec_len) +
@@ -1654,7 +1652,12 @@ static int ext3_delete_entry (handle_t *handle,
1654 de->inode = 0; 1652 de->inode = 0;
1655 dir->i_version++; 1653 dir->i_version++;
1656 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 1654 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1657 ext3_journal_dirty_metadata(handle, bh); 1655 err = ext3_journal_dirty_metadata(handle, bh);
1656 if (err) {
1657journal_error:
1658 ext3_std_error(dir->i_sb, err);
1659 return err;
1660 }
1658 return 0; 1661 return 0;
1659 } 1662 }
1660 i += ext3_rec_len_from_disk(de->rec_len); 1663 i += ext3_rec_len_from_disk(de->rec_len);
@@ -1762,7 +1765,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1762{ 1765{
1763 handle_t *handle; 1766 handle_t *handle;
1764 struct inode * inode; 1767 struct inode * inode;
1765 struct buffer_head * dir_block; 1768 struct buffer_head * dir_block = NULL;
1766 struct ext3_dir_entry_2 * de; 1769 struct ext3_dir_entry_2 * de;
1767 int err, retries = 0; 1770 int err, retries = 0;
1768 1771
@@ -1790,15 +1793,14 @@ retry:
1790 inode->i_fop = &ext3_dir_operations; 1793 inode->i_fop = &ext3_dir_operations;
1791 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; 1794 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1792 dir_block = ext3_bread (handle, inode, 0, 1, &err); 1795 dir_block = ext3_bread (handle, inode, 0, 1, &err);
1793 if (!dir_block) { 1796 if (!dir_block)
1794 drop_nlink(inode); /* is this nlink == 0? */ 1797 goto out_clear_inode;
1795 unlock_new_inode(inode); 1798
1796 ext3_mark_inode_dirty(handle, inode);
1797 iput (inode);
1798 goto out_stop;
1799 }
1800 BUFFER_TRACE(dir_block, "get_write_access"); 1799 BUFFER_TRACE(dir_block, "get_write_access");
1801 ext3_journal_get_write_access(handle, dir_block); 1800 err = ext3_journal_get_write_access(handle, dir_block);
1801 if (err)
1802 goto out_clear_inode;
1803
1802 de = (struct ext3_dir_entry_2 *) dir_block->b_data; 1804 de = (struct ext3_dir_entry_2 *) dir_block->b_data;
1803 de->inode = cpu_to_le32(inode->i_ino); 1805 de->inode = cpu_to_le32(inode->i_ino);
1804 de->name_len = 1; 1806 de->name_len = 1;
@@ -1814,11 +1816,16 @@ retry:
1814 ext3_set_de_type(dir->i_sb, de, S_IFDIR); 1816 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1815 inode->i_nlink = 2; 1817 inode->i_nlink = 2;
1816 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); 1818 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
1817 ext3_journal_dirty_metadata(handle, dir_block); 1819 err = ext3_journal_dirty_metadata(handle, dir_block);
1818 brelse (dir_block); 1820 if (err)
1819 ext3_mark_inode_dirty(handle, inode); 1821 goto out_clear_inode;
1820 err = ext3_add_entry (handle, dentry, inode); 1822
1823 err = ext3_mark_inode_dirty(handle, inode);
1824 if (!err)
1825 err = ext3_add_entry (handle, dentry, inode);
1826
1821 if (err) { 1827 if (err) {
1828out_clear_inode:
1822 inode->i_nlink = 0; 1829 inode->i_nlink = 0;
1823 unlock_new_inode(inode); 1830 unlock_new_inode(inode);
1824 ext3_mark_inode_dirty(handle, inode); 1831 ext3_mark_inode_dirty(handle, inode);
@@ -1827,10 +1834,14 @@ retry:
1827 } 1834 }
1828 inc_nlink(dir); 1835 inc_nlink(dir);
1829 ext3_update_dx_flag(dir); 1836 ext3_update_dx_flag(dir);
1830 ext3_mark_inode_dirty(handle, dir); 1837 err = ext3_mark_inode_dirty(handle, dir);
1838 if (err)
1839 goto out_clear_inode;
1840
1831 d_instantiate(dentry, inode); 1841 d_instantiate(dentry, inode);
1832 unlock_new_inode(inode); 1842 unlock_new_inode(inode);
1833out_stop: 1843out_stop:
1844 brelse(dir_block);
1834 ext3_journal_stop(handle); 1845 ext3_journal_stop(handle);
1835 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) 1846 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1836 goto retry; 1847 goto retry;
@@ -2353,7 +2364,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2353 goto end_rename; 2364 goto end_rename;
2354 } else { 2365 } else {
2355 BUFFER_TRACE(new_bh, "get write access"); 2366 BUFFER_TRACE(new_bh, "get write access");
2356 ext3_journal_get_write_access(handle, new_bh); 2367 retval = ext3_journal_get_write_access(handle, new_bh);
2368 if (retval)
2369 goto journal_error;
2357 new_de->inode = cpu_to_le32(old_inode->i_ino); 2370 new_de->inode = cpu_to_le32(old_inode->i_ino);
2358 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, 2371 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2359 EXT3_FEATURE_INCOMPAT_FILETYPE)) 2372 EXT3_FEATURE_INCOMPAT_FILETYPE))
@@ -2362,7 +2375,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2362 new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; 2375 new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
2363 ext3_mark_inode_dirty(handle, new_dir); 2376 ext3_mark_inode_dirty(handle, new_dir);
2364 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); 2377 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
2365 ext3_journal_dirty_metadata(handle, new_bh); 2378 retval = ext3_journal_dirty_metadata(handle, new_bh);
2379 if (retval)
2380 goto journal_error;
2366 brelse(new_bh); 2381 brelse(new_bh);
2367 new_bh = NULL; 2382 new_bh = NULL;
2368 } 2383 }
@@ -2411,10 +2426,17 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2411 ext3_update_dx_flag(old_dir); 2426 ext3_update_dx_flag(old_dir);
2412 if (dir_bh) { 2427 if (dir_bh) {
2413 BUFFER_TRACE(dir_bh, "get_write_access"); 2428 BUFFER_TRACE(dir_bh, "get_write_access");
2414 ext3_journal_get_write_access(handle, dir_bh); 2429 retval = ext3_journal_get_write_access(handle, dir_bh);
2430 if (retval)
2431 goto journal_error;
2415 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2432 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2416 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); 2433 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
2417 ext3_journal_dirty_metadata(handle, dir_bh); 2434 retval = ext3_journal_dirty_metadata(handle, dir_bh);
2435 if (retval) {
2436journal_error:
2437 ext3_std_error(new_dir->i_sb, retval);
2438 goto end_rename;
2439 }
2418 drop_nlink(old_dir); 2440 drop_nlink(old_dir);
2419 if (new_inode) { 2441 if (new_inode) {
2420 drop_nlink(new_inode); 2442 drop_nlink(new_inode);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index e746d30b1232..108b142e11ed 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -249,7 +249,11 @@ static int setup_new_group_blocks(struct super_block *sb,
 			memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
 			set_buffer_uptodate(gdb);
 			unlock_buffer(gdb);
-			ext3_journal_dirty_metadata(handle, gdb);
+			err = ext3_journal_dirty_metadata(handle, gdb);
+			if (err) {
+				brelse(gdb);
+				goto exit_bh;
+			}
 			ext3_set_bit(bit, bh->b_data);
 			brelse(gdb);
 		}
@@ -269,7 +273,11 @@ static int setup_new_group_blocks(struct super_block *sb,
 			err = PTR_ERR(gdb);
 			goto exit_bh;
 		}
-		ext3_journal_dirty_metadata(handle, gdb);
+		err = ext3_journal_dirty_metadata(handle, gdb);
+		if (err) {
+			brelse(gdb);
+			goto exit_bh;
+		}
 		ext3_set_bit(bit, bh->b_data);
 		brelse(gdb);
 	}
@@ -295,7 +303,11 @@ static int setup_new_group_blocks(struct super_block *sb,
 			err = PTR_ERR(it);
 			goto exit_bh;
 		}
-		ext3_journal_dirty_metadata(handle, it);
+		err = ext3_journal_dirty_metadata(handle, it);
+		if (err) {
+			brelse(it);
+			goto exit_bh;
+		}
 		brelse(it);
 		ext3_set_bit(bit, bh->b_data);
 	}
@@ -306,7 +318,9 @@ static int setup_new_group_blocks(struct super_block *sb,
 
 	mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
 			bh->b_data);
-	ext3_journal_dirty_metadata(handle, bh);
+	err = ext3_journal_dirty_metadata(handle, bh);
+	if (err)
+		goto exit_bh;
 	brelse(bh);
 
 	/* Mark unused entries in inode bitmap used */
@@ -319,7 +333,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 
 	mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
 			bh->b_data);
-	ext3_journal_dirty_metadata(handle, bh);
+	err = ext3_journal_dirty_metadata(handle, bh);
 exit_bh:
 	brelse(bh);
 
@@ -503,12 +517,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	 * reserved inode, and will become GDT blocks (primary and backup).
 	 */
 	data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
-	ext3_journal_dirty_metadata(handle, dind);
+	err = ext3_journal_dirty_metadata(handle, dind);
+	if (err)
+		goto exit_group_desc;
 	brelse(dind);
+	dind = NULL;
 	inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
-	ext3_mark_iloc_dirty(handle, inode, &iloc);
+	err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+	if (err)
+		goto exit_group_desc;
 	memset((*primary)->b_data, 0, sb->s_blocksize);
-	ext3_journal_dirty_metadata(handle, *primary);
+	err = ext3_journal_dirty_metadata(handle, *primary);
+	if (err)
+		goto exit_group_desc;
 
 	o_group_desc = EXT3_SB(sb)->s_group_desc;
 	memcpy(n_group_desc, o_group_desc,
@@ -519,10 +540,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	kfree(o_group_desc);
 
 	le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
-	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+	err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+	if (err)
+		goto exit_inode;
 
 	return 0;
 
+exit_group_desc:
+	kfree(n_group_desc);
exit_inode:
 	//ext3_journal_release_buffer(handle, iloc.bh);
 	brelse(iloc.bh);
@@ -706,16 +731,20 @@ static void update_backups(struct super_block *sb,
 		}
 		ext3_debug("update metadata backup %#04lx\n",
 			   (unsigned long)bh->b_blocknr);
-		if ((err = ext3_journal_get_write_access(handle, bh)))
+		if ((err = ext3_journal_get_write_access(handle, bh))) {
+			brelse(bh);
 			break;
+		}
 		lock_buffer(bh);
 		memcpy(bh->b_data, data, size);
 		if (rest)
 			memset(bh->b_data + size, 0, rest);
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
-		ext3_journal_dirty_metadata(handle, bh);
+		err = ext3_journal_dirty_metadata(handle, bh);
 		brelse(bh);
+		if (err)
+			break;
 	}
 	if ((err2 = ext3_journal_stop(handle)) && !err)
 		err = err2;
@@ -922,7 +951,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
 	/* Update the global fs size fields */
 	sbi->s_groups_count++;
 
-	ext3_journal_dirty_metadata(handle, primary);
+	err = ext3_journal_dirty_metadata(handle, primary);
+	if (err)
+		goto exit_journal;
 
 	/* Update the reserved block counts only once the new group is
 	 * active. */
@@ -934,7 +965,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
 	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT3_INODES_PER_GROUP(sb));
 
-	ext3_journal_dirty_metadata(handle, sbi->s_sbh);
+	err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
 
exit_journal:
 	mutex_unlock(&sbi->s_resize_lock);
@@ -1064,8 +1095,14 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 		goto exit_put;
 	}
 	es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
-	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+	err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
+	if (err) {
+		ext3_warning(sb, __func__,
+			     "error %d on journal dirty metadata", err);
+		ext3_journal_stop(handle);
+		goto exit_put;
+	}
 	ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
 		   o_blocks_count, o_blocks_count + add);
 	ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 77ce1616f725..b7d0554631e4 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -143,12 +143,16 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
 void ext3_msg(struct super_block *sb, const char *prefix,
 	      const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
 
 	va_start(args, fmt);
-	printk("%sEXT3-fs (%s): ", prefix, sb->s_id);
-	vprintk(fmt, args);
-	printk("\n");
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+
 	va_end(args);
 }
 
@@ -195,15 +199,20 @@ static void ext3_handle_error(struct super_block *sb)
 		sb->s_id);
 }
 
-void ext3_error (struct super_block * sb, const char * function,
-		 const char * fmt, ...)
+void ext3_error(struct super_block *sb, const char *function,
+		const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
 
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function);
-	vprintk(fmt, args);
-	printk("\n");
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n",
+	       sb->s_id, function, &vaf);
+
 	va_end(args);
 
 	ext3_handle_error(sb);
@@ -274,15 +283,20 @@ void __ext3_std_error (struct super_block * sb, const char * function,
  * case we take the easy way out and panic immediately.
  */
 
-void ext3_abort (struct super_block * sb, const char * function,
-		 const char * fmt, ...)
+void ext3_abort(struct super_block *sb, const char *function,
+		const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
 
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function);
-	vprintk(fmt, args);
-	printk("\n");
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n",
+	       sb->s_id, function, &vaf);
+
 	va_end(args);
 
 	if (test_opt(sb, ERRORS_PANIC))
@@ -300,16 +314,20 @@ void ext3_abort (struct super_block * sb, const char * function,
 	journal_abort(EXT3_SB(sb)->s_journal, -EIO);
 }
 
-void ext3_warning (struct super_block * sb, const char * function,
-		   const char * fmt, ...)
+void ext3_warning(struct super_block *sb, const char *function,
+		  const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
 
 	va_start(args, fmt);
-	printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ",
-	       sb->s_id, function);
-	vprintk(fmt, args);
-	printk("\n");
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n",
+	       sb->s_id, function, &vaf);
+
 	va_end(args);
 }
 
@@ -1848,13 +1866,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	if (generic_check_addressable(sb->s_blocksize_bits,
-				      le32_to_cpu(es->s_blocks_count))) {
+	err = generic_check_addressable(sb->s_blocksize_bits,
+					le32_to_cpu(es->s_blocks_count));
+	if (err) {
 		ext3_msg(sb, KERN_ERR,
 			"error: filesystem is too large to mount safely");
 		if (sizeof(sector_t) < 8)
 			ext3_msg(sb, KERN_ERR,
 				"error: CONFIG_LBDAF not enabled");
+		ret = err;
 		goto failed_mount;
 	}
 
@@ -2297,7 +2317,7 @@ static int ext3_load_journal(struct super_block *sb,
 	EXT3_SB(sb)->s_journal = journal;
 	ext3_clear_journal_err(sb, es);
 
-	if (journal_devnum &&
+	if (!really_read_only && journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
 		es->s_journal_dev = cpu_to_le32(journal_devnum);
 
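The hunks above replace a printk/vprintk/printk triple with one printk using the kernel's %pV extension and struct va_format, so each message reaches the log as a single atomic line instead of three interleavable pieces. A rough userspace approximation follows; libc has no %pV, so this sketch formats into a buffer first, and ext3_msg_like() is an invented name:

#include <stdarg.h>
#include <stdio.h>

static void ext3_msg_like(const char *prefix, const char *dev,
			  const char *fmt, ...)
{
	char body[256];
	va_list args;

	va_start(args, fmt);
	vsnprintf(body, sizeof(body), fmt, args);
	va_end(args);

	/* one call, so concurrent messages can't interleave mid-line */
	fprintf(stderr, "%sEXT3-fs (%s): %s\n", prefix, dev, body);
}

int main(void)
{
	ext3_msg_like("<3>", "sda1", "error %d on journal dirty metadata", -5);
	return 0;
}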
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e69dc6dfaa89..32e6cc23bd9a 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -925,7 +925,7 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
 /*
  * ext3_xattr_set_handle()
  *
- * Create, replace or remove an extended attribute for this inode. Buffer
+ * Create, replace or remove an extended attribute for this inode. Value
  * is NULL to remove an existing extended attribute, and non-NULL to
  * either replace an existing extended attribute, or create a new extended
  * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 14c3af26c671..adf96b822781 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -592,7 +592,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	 * Account for the allocated meta blocks.  We will never
 	 * fail EDQUOT for metdata, but we do account for it.
	 */
-	if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
+	if (!(*errp) &&
+	    ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
 		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 		EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ece76fb6a40c..164c56092e58 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -60,9 +60,13 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
 	return (ext4_filetype_table[filetype]);
 }
 
-
+/*
+ * Return 0 if the directory entry is OK, and 1 if there is a problem
+ *
+ * Note: this is the opposite of what ext2 and ext3 historically returned...
+ */
 int __ext4_check_dir_entry(const char *function, unsigned int line,
-			   struct inode *dir,
+			   struct inode *dir, struct file *filp,
			   struct ext4_dir_entry_2 *de,
			   struct buffer_head *bh,
			   unsigned int offset)
@@ -71,26 +75,37 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 	const int rlen = ext4_rec_len_from_disk(de->rec_len,
 						dir->i_sb->s_blocksize);
 
-	if (rlen < EXT4_DIR_REC_LEN(1))
+	if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
 		error_msg = "rec_len is smaller than minimal";
-	else if (rlen % 4 != 0)
+	else if (unlikely(rlen % 4 != 0))
 		error_msg = "rec_len % 4 != 0";
-	else if (rlen < EXT4_DIR_REC_LEN(de->name_len))
+	else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
 		error_msg = "rec_len is too small for name_len";
-	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+	else if (unlikely(((char *) de - bh->b_data) + rlen >
+			  dir->i_sb->s_blocksize))
 		error_msg = "directory entry across blocks";
-	else if (le32_to_cpu(de->inode) >
-		 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))
+	else if (unlikely(le32_to_cpu(de->inode) >
+			  le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
 		error_msg = "inode out of bounds";
+	else
+		return 0;
 
-	if (error_msg != NULL)
-		ext4_error_inode(dir, function, line, bh->b_blocknr,
-			"bad entry in directory: %s - "
-			"offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
-			error_msg, (unsigned) (offset%bh->b_size), offset,
-			le32_to_cpu(de->inode),
-			rlen, de->name_len);
-	return error_msg == NULL ? 1 : 0;
+	if (filp)
+		ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
+				"bad entry in directory: %s - offset=%u(%u), "
+				"inode=%u, rec_len=%d, name_len=%d",
+				error_msg, (unsigned) (offset%bh->b_size),
+				offset, le32_to_cpu(de->inode),
+				rlen, de->name_len);
+	else
+		ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
+				"bad entry in directory: %s - offset=%u(%u), "
+				"inode=%u, rec_len=%d, name_len=%d",
+				error_msg, (unsigned) (offset%bh->b_size),
+				offset, le32_to_cpu(de->inode),
+				rlen, de->name_len);
+
+	return 1;
 }
 
 static int ext4_readdir(struct file *filp,
@@ -152,8 +167,9 @@ static int ext4_readdir(struct file *filp,
		 */
		if (!bh) {
			if (!dir_has_error) {
-				EXT4_ERROR_INODE(inode, "directory "
-					   "contains a hole at offset %Lu",
+				EXT4_ERROR_FILE(filp, 0,
+						"directory contains a "
+						"hole at offset %llu",
					   (unsigned long long) filp->f_pos);
				dir_has_error = 1;
			}
@@ -194,8 +210,8 @@ revalidate:
		while (!error && filp->f_pos < inode->i_size
		       && offset < sb->s_blocksize) {
			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-			if (!ext4_check_dir_entry(inode, de,
-						  bh, offset)) {
+			if (ext4_check_dir_entry(inode, filp, de,
+						 bh, offset)) {
				/*
				 * On error, skip the f_pos to the next block
				 */
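After this change __ext4_check_dir_entry() returns 0 for a good entry and 1 for a corrupt one, the opposite of the old ext2/ext3 convention, and callers wrap the check in unlikely(). A compilable sketch of the validation cascade follows; the record layout and limits are simplified, made-up stand-ins for the on-disk ext4 format:

#include <stdint.h>
#include <stdio.h>

struct dir_entry {
	uint32_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
};

/* 8-byte header plus the name, rounded up to a 4-byte boundary */
#define REC_LEN(name_len) (8 + (((name_len) + 3) & ~3u))

static int check_dir_entry(const struct dir_entry *de,
			   unsigned offset, unsigned blocksize,
			   uint32_t inodes_count)
{
	const char *error_msg;

	if (de->rec_len < REC_LEN(1))
		error_msg = "rec_len is smaller than minimal";
	else if (de->rec_len % 4 != 0)
		error_msg = "rec_len % 4 != 0";
	else if (de->rec_len < REC_LEN(de->name_len))
		error_msg = "rec_len is too small for name_len";
	else if (offset + de->rec_len > blocksize)
		error_msg = "directory entry across blocks";
	else if (de->inode > inodes_count)
		error_msg = "inode out of bounds";
	else
		return 0;	/* entry is OK */

	fprintf(stderr, "bad entry in directory: %s\n", error_msg);
	return 1;
}

int main(void)
{
	struct dir_entry good = { 11, 16, 5 };
	struct dir_entry bad  = { 11, 6, 5 };

	printf("good -> %d, bad -> %d\n",
	       check_dir_entry(&good, 0, 4096, 1024),
	       check_dir_entry(&bad, 0, 4096, 1024));
	return 0;
}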
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 94ce3d7a1c4b..bab2387fba43 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -62,8 +62,8 @@
 #define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
	ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
 
-#define EXT4_ERROR_FILE(file, fmt, a...) \
-	ext4_error_file(__func__, __LINE__, (file), (fmt), ## a)
+#define EXT4_ERROR_FILE(file, block, fmt, a...) \
+	ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
 
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
@@ -561,22 +561,6 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
 #endif
 
-
-/*
- * Mount options
- */
-struct ext4_mount_options {
-	unsigned long s_mount_opt;
-	uid_t s_resuid;
-	gid_t s_resgid;
-	unsigned long s_commit_interval;
-	u32 s_min_batch_time, s_max_batch_time;
-#ifdef CONFIG_QUOTA
-	int s_jquota_fmt;
-	char *s_qf_names[MAXQUOTAS];
-#endif
-};
-
 /* Max physical block we can addres w/o extents */
 #define EXT4_MAX_BLOCK_FILE_PHYS	0xFFFFFFFF
 
@@ -709,6 +693,8 @@ do { \
	if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra))	\
		ext4_decode_extra_time(&(inode)->xtime,				\
				       raw_inode->xtime ## _extra);		\
+	else									\
+		(inode)->xtime.tv_nsec = 0;					\
 } while (0)
 
 #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)				\
@@ -719,6 +705,8 @@ do { \
	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))		\
		ext4_decode_extra_time(&(einode)->xtime,			\
				       raw_inode->xtime ## _extra);		\
+	else									\
+		(einode)->xtime.tv_nsec = 0;					\
 } while (0)
 
 #define i_disk_version osd1.linux1.l_i_version
@@ -750,12 +738,13 @@ do { \
 
 /*
  * storage for cached extent
+ * If ec_len == 0, then the cache is invalid.
+ * If ec_start == 0, then the cache represents a gap (null mapping)
  */
 struct ext4_ext_cache {
	ext4_fsblk_t	ec_start;
	ext4_lblk_t	ec_block;
	__u32		ec_len; /* must be 32bit to return holes */
-	__u32		ec_type;
 };
 
 /*
@@ -774,10 +763,12 @@ struct ext4_inode_info {
	 * near to their parent directory's inode.
	 */
	ext4_group_t	i_block_group;
+	ext4_lblk_t	i_dir_start_lookup;
+#if (BITS_PER_LONG < 64)
	unsigned long	i_state_flags;		/* Dynamic state flags */
+#endif
	unsigned long	i_flags;
 
-	ext4_lblk_t		i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
	/*
	 * Extended attributes can be read independently of the main file
@@ -820,7 +811,7 @@ struct ext4_inode_info {
	 */
	struct rw_semaphore i_data_sem;
	struct inode vfs_inode;
-	struct jbd2_inode jinode;
+	struct jbd2_inode *jinode;
 
	struct ext4_ext_cache i_cached_extent;
	/*
@@ -840,14 +831,12 @@ struct ext4_inode_info {
	unsigned int i_reserved_data_blocks;
	unsigned int i_reserved_meta_blocks;
	unsigned int i_allocated_meta_blocks;
-	unsigned short i_delalloc_reserved_flag;
-	sector_t i_da_metadata_calc_last_lblock;
+	ext4_lblk_t i_da_metadata_calc_last_lblock;
	int i_da_metadata_calc_len;
 
	/* on-disk additional length */
	__u16 i_extra_isize;
 
-	spinlock_t i_block_reservation_lock;
 #ifdef CONFIG_QUOTA
	/* quota space reservation, managed internally by quota code */
	qsize_t i_reserved_quota;
@@ -856,9 +845,11 @@ struct ext4_inode_info {
	/* completed IOs that might need unwritten extents handling */
	struct list_head i_completed_io_list;
	spinlock_t i_completed_io_lock;
+	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
	/* current io_end structure for async DIO write*/
	ext4_io_end_t *cur_aio_dio;
-	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
+
+	spinlock_t i_block_reservation_lock;
 
	/*
	 * Transactions that contain inode's metadata needed to complete
@@ -917,11 +908,20 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
 #define EXT4_MOUNT_INIT_INODE_TABLE	0x80000000 /* Initialize uninitialized itables */
 
-#define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
-#define set_opt(o, opt)			o |= EXT4_MOUNT_##opt
+#define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
+						~EXT4_MOUNT_##opt
+#define set_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt |= \
+						EXT4_MOUNT_##opt
 #define test_opt(sb, opt)		(EXT4_SB(sb)->s_mount_opt & \
					 EXT4_MOUNT_##opt)
 
+#define clear_opt2(sb, opt)		EXT4_SB(sb)->s_mount_opt2 &= \
+						~EXT4_MOUNT2_##opt
+#define set_opt2(sb, opt)		EXT4_SB(sb)->s_mount_opt2 |= \
+						EXT4_MOUNT2_##opt
+#define test_opt2(sb, opt)		(EXT4_SB(sb)->s_mount_opt2 & \
+					 EXT4_MOUNT2_##opt)
+
 #define ext4_set_bit			ext2_set_bit
 #define ext4_set_bit_atomic		ext2_set_bit_atomic
 #define ext4_clear_bit			ext2_clear_bit
@@ -1087,6 +1087,7 @@ struct ext4_sb_info {
	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
	struct buffer_head **s_group_desc;
	unsigned int s_mount_opt;
+	unsigned int s_mount_opt2;
	unsigned int s_mount_flags;
	ext4_fsblk_t s_sb_block;
	uid_t s_resuid;
@@ -1237,24 +1238,39 @@ enum {
	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
	EXT4_STATE_NEWENTRY,		/* File just added to dir */
+	EXT4_STATE_DELALLOC_RESERVED,	/* blks already reserved for delalloc */
 };
 
-#define EXT4_INODE_BIT_FNS(name, field)					\
+#define EXT4_INODE_BIT_FNS(name, field, offset)				\
 static inline int ext4_test_inode_##name(struct inode *inode, int bit)	\
 {									\
-	return test_bit(bit, &EXT4_I(inode)->i_##field);		\
+	return test_bit(bit + (offset), &EXT4_I(inode)->i_##field);	\
 }									\
 static inline void ext4_set_inode_##name(struct inode *inode, int bit)	\
 {									\
-	set_bit(bit, &EXT4_I(inode)->i_##field);			\
+	set_bit(bit + (offset), &EXT4_I(inode)->i_##field);		\
 }									\
 static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
 {									\
-	clear_bit(bit, &EXT4_I(inode)->i_##field);			\
+	clear_bit(bit + (offset), &EXT4_I(inode)->i_##field);		\
 }
 
-EXT4_INODE_BIT_FNS(flag, flags)
-EXT4_INODE_BIT_FNS(state, state_flags)
+EXT4_INODE_BIT_FNS(flag, flags, 0)
+#if (BITS_PER_LONG < 64)
+EXT4_INODE_BIT_FNS(state, state_flags, 0)
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+	(ei)->i_state_flags = 0;
+}
+#else
+EXT4_INODE_BIT_FNS(state, flags, 32)
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+	/* We depend on the fact that callers will set i_flags */
+}
+#endif
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
@@ -1642,10 +1658,12 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
 
 /* dir.c */
 extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
+				  struct file *,
				  struct ext4_dir_entry_2 *,
				  struct buffer_head *, unsigned int);
-#define ext4_check_dir_entry(dir, de, bh, offset) \
-	__ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset))
+#define ext4_check_dir_entry(dir, filp, de, bh, offset) \
+	unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
+					(de), (bh), (offset)))
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
				   __u32 minor_hash,
				   struct ext4_dir_entry_2 *dirent);
@@ -1653,6 +1671,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, int);
+extern int ext4_flush_completed_IO(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1752,8 +1771,8 @@ extern void ext4_error_inode(struct inode *, const char *, unsigned int,
			     ext4_fsblk_t, const char *, ...)
	__attribute__ ((format (printf, 5, 6)));
 extern void ext4_error_file(struct file *, const char *, unsigned int,
-			    const char *, ...)
-	__attribute__ ((format (printf, 4, 5)));
+			    ext4_fsblk_t, const char *, ...)
+	__attribute__ ((format (printf, 5, 6)));
 extern void __ext4_std_error(struct super_block *, const char *,
			     unsigned int, int);
 extern void __ext4_abort(struct super_block *, const char *, unsigned int,
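The offset parameter added to EXT4_INODE_BIT_FNS() is what lets 64-bit kernels pack the dynamic state bits into the upper half of the same unsigned long that holds i_flags, while 32-bit kernels keep a separate i_state_flags word. A small userspace model of that layout trick; plain bit ops stand in for the kernel's atomic test_bit()/set_bit(), and all names here are illustrative:

#include <limits.h>
#include <stdio.h>

struct inode_info {
	unsigned long flags;		/* persistent flag bits 0..31 */
#if ULONG_MAX <= 0xFFFFFFFFUL
	unsigned long state_flags;	/* 32-bit: states need their own word */
#endif
};

#if ULONG_MAX <= 0xFFFFFFFFUL
#define STATE_OFFSET 0
static unsigned long *state_word(struct inode_info *ei)
{
	return &ei->state_flags;
}
#else
#define STATE_OFFSET 32			/* states ride above the flags */
static unsigned long *state_word(struct inode_info *ei)
{
	return &ei->flags;
}
#endif

static void set_state(struct inode_info *ei, int bit)
{
	*state_word(ei) |= 1UL << (bit + STATE_OFFSET);
}

static int test_state(struct inode_info *ei, int bit)
{
	return (*state_word(ei) >> (bit + STATE_OFFSET)) & 1;
}

int main(void)
{
	struct inode_info ei = { 0 };

	set_state(&ei, 3);
	printf("state bit 3: %d, flags word: %#lx\n",
	       test_state(&ei, 3), ei.flags);
	return 0;
}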
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 28ce70fd9cd0..2e29abb30f76 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -119,10 +119,6 @@ struct ext4_ext_path {
  * structure for external API
  */
 
-#define EXT4_EXT_CACHE_NO	0
-#define EXT4_EXT_CACHE_GAP	1
-#define EXT4_EXT_CACHE_EXTENT	2
-
 /*
  * to be called by ext4_ext_walk_space()
  * negative retcode - error
@@ -197,7 +193,7 @@ static inline unsigned short ext_depth(struct inode *inode)
 static inline void
 ext4_ext_invalidate_cache(struct inode *inode)
 {
-	EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO;
+	EXT4_I(inode)->i_cached_extent.ec_len = 0;
 }
 
 static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
@@ -278,7 +274,7 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
 }
 
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
-					 sector_t lblocks);
+					 ext4_lblk_t lblocks);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
						   int num,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b0bd792c58c5..d8b992e658c1 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -253,7 +253,7 @@ static inline int ext4_journal_force_commit(journal_t *journal)
 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
	if (ext4_handle_valid(handle))
-		return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+		return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
	return 0;
 }
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0554c48cb1fd..e910720e8bb8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -117,11 +117,33 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
		struct ext4_extent *ex;
		depth = path->p_depth;
 
-		/* try to predict block placement */
+		/*
+		 * Try to predict block placement assuming that we are
+		 * filling in a file which will eventually be
+		 * non-sparse --- i.e., in the case of libbfd writing
+		 * an ELF object sections out-of-order but in a way
+		 * the eventually results in a contiguous object or
+		 * executable file, or some database extending a table
+		 * space file.  However, this is actually somewhat
+		 * non-ideal if we are writing a sparse file such as
+		 * qemu or KVM writing a raw image file that is going
+		 * to stay fairly sparse, since it will end up
+		 * fragmenting the file system's free space.  Maybe we
+		 * should have some hueristics or some way to allow
+		 * userspace to pass a hint to file system,
+		 * especiially if the latter case turns out to be
+		 * common.
+		 */
		ex = path[depth].p_ext;
-		if (ex)
-			return (ext4_ext_pblock(ex) +
-				(block - le32_to_cpu(ex->ee_block)));
+		if (ex) {
+			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
+			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
+
+			if (block > ext_block)
+				return ext_pblk + (block - ext_block);
+			else
+				return ext_pblk - (ext_block - block);
+		}
 
		/* it looks like index is empty;
		 * try to find starting block from index itself */
@@ -244,7 +266,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
  * to allocate @blocks
  * Worse case is one block per extent
  */
-int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
+int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
	struct ext4_inode_info *ei = EXT4_I(inode);
	int idxs, num = 0;
@@ -1872,12 +1894,10 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
			cbex.ec_block = start;
			cbex.ec_len = end - start;
			cbex.ec_start = 0;
-			cbex.ec_type = EXT4_EXT_CACHE_GAP;
		} else {
			cbex.ec_block = le32_to_cpu(ex->ee_block);
			cbex.ec_len = ext4_ext_get_actual_len(ex);
			cbex.ec_start = ext4_ext_pblock(ex);
-			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
		}
 
		if (unlikely(cbex.ec_len == 0)) {
@@ -1917,13 +1937,12 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 
 static void
 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
-			__u32 len, ext4_fsblk_t start, int type)
+			__u32 len, ext4_fsblk_t start)
 {
	struct ext4_ext_cache *cex;
	BUG_ON(len == 0);
	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	cex = &EXT4_I(inode)->i_cached_extent;
-	cex->ec_type = type;
	cex->ec_block = block;
	cex->ec_len = len;
	cex->ec_start = start;
@@ -1976,15 +1995,18 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
	}
 
	ext_debug(" -> %u:%lu\n", lblock, len);
-	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
+	ext4_ext_put_in_cache(inode, lblock, len, 0);
 }
 
+/*
+ * Return 0 if cache is invalid; 1 if the cache is valid
+ */
 static int
 ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
			struct ext4_extent *ex)
 {
	struct ext4_ext_cache *cex;
-	int ret = EXT4_EXT_CACHE_NO;
+	int ret = 0;
 
	/*
	 * We borrow i_block_reservation_lock to protect i_cached_extent
@@ -1993,11 +2015,9 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
	cex = &EXT4_I(inode)->i_cached_extent;
 
	/* has cache valid data? */
-	if (cex->ec_type == EXT4_EXT_CACHE_NO)
+	if (cex->ec_len == 0)
		goto errout;
 
-	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
-			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
	if (in_range(block, cex->ec_block, cex->ec_len)) {
		ex->ee_block = cpu_to_le32(cex->ec_block);
		ext4_ext_store_pblock(ex, cex->ec_start);
@@ -2005,7 +2025,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
		ext_debug("%u cached by %u:%u:%llu\n",
				block,
				cex->ec_block, cex->ec_len, cex->ec_start);
-		ret = cex->ec_type;
+		ret = 1;
	}
errout:
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -3082,7 +3102,7 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
  * Handle EOFBLOCKS_FL flag, clearing it if necessary
  */
 static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
-			      struct ext4_map_blocks *map,
+			      ext4_lblk_t lblk,
			      struct ext4_ext_path *path,
			      unsigned int len)
 {
@@ -3112,7 +3132,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
	 * this turns out to be false, we can bail out from this
	 * function immediately.
	 */
-	if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
+	if (lblk + len < le32_to_cpu(last_ex->ee_block) +
	    ext4_ext_get_actual_len(last_ex))
		return 0;
	/*
@@ -3168,8 +3188,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
							path);
		if (ret >= 0) {
			ext4_update_inode_fsync_trans(handle, inode, 1);
-			err = check_eofblocks_fl(handle, inode, map, path,
-						 map->m_len);
+			err = check_eofblocks_fl(handle, inode, map->m_lblk,
+						 path, map->m_len);
		} else
			err = ret;
		goto out2;
@@ -3199,7 +3219,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
	if (ret >= 0) {
		ext4_update_inode_fsync_trans(handle, inode, 1);
-		err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
+		err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
+					 map->m_len);
		if (err < 0)
			goto out2;
	}
@@ -3276,7 +3297,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
	struct ext4_extent_header *eh;
	struct ext4_extent newex, *ex;
	ext4_fsblk_t newblock;
-	int err = 0, depth, ret, cache_type;
+	int err = 0, depth, ret;
	unsigned int allocated = 0;
	struct ext4_allocation_request ar;
	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3285,9 +3306,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
		  map->m_lblk, map->m_len, inode->i_ino);
 
	/* check in cache */
-	cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
-	if (cache_type) {
-		if (cache_type == EXT4_EXT_CACHE_GAP) {
+	if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
+		if (!newex.ee_start_lo && !newex.ee_start_hi) {
			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
				/*
				 * block isn't allocated yet and
@@ -3296,7 +3316,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
				goto out2;
			}
			/* we should allocate requested block */
-		} else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
+		} else {
			/* block is already allocated */
			newblock = map->m_lblk
				   - le32_to_cpu(newex.ee_block)
@@ -3305,8 +3325,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
			allocated = ext4_ext_get_actual_len(&newex) -
				(map->m_lblk - le32_to_cpu(newex.ee_block));
			goto out;
-		} else {
-			BUG();
		}
	}
 
@@ -3357,8 +3375,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
			/* Do not put uninitialized extent in the cache */
			if (!ext4_ext_is_uninitialized(ex)) {
				ext4_ext_put_in_cache(inode, ee_block,
-							ee_len, ee_start,
-							EXT4_EXT_CACHE_EXTENT);
+							ee_len, ee_start);
				goto out;
			}
			ret = ext4_ext_handle_uninitialized_extents(handle,
@@ -3456,7 +3473,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
		map->m_flags |= EXT4_MAP_UNINIT;
	}
 
-	err = check_eofblocks_fl(handle, inode, map, path, ar.len);
+	err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
	if (err)
		goto out2;
 
@@ -3490,8 +3507,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
	 * when it is _not_ an uninitialized extent.
	 */
	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
-		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
-						EXT4_EXT_CACHE_EXTENT);
+		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
		ext4_update_inode_fsync_trans(handle, inode, 1);
	} else
		ext4_update_inode_fsync_trans(handle, inode, 0);
@@ -3519,6 +3535,12 @@ void ext4_ext_truncate(struct inode *inode)
	int err = 0;
 
	/*
+	 * finish any pending end_io work so we won't run the risk of
+	 * converting any truncated blocks to initialized later
+	 */
+	ext4_flush_completed_IO(inode);
+
+	/*
	 * probably first extent we're gonna free will be last in block
	 */
	err = ext4_writepage_trans_blocks(inode);
@@ -3767,7 +3789,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 
	logical = (__u64)newex->ec_block << blksize_bits;
 
-	if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
+	if (newex->ec_start == 0) {
		pgoff_t offset;
		struct page *page;
		struct buffer_head *bh = NULL;
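With ec_type gone, the extent cache above encodes its state in the remaining fields: ec_len == 0 means the cache is invalid, and ec_start == 0 marks a cached gap. A self-contained sketch of a lookup against such a cache, with simplified stand-in types for ext4_lblk_t/ext4_fsblk_t:

#include <stdint.h>
#include <stdio.h>

struct ext_cache {
	uint64_t ec_start;	/* physical start; 0 => gap */
	uint32_t ec_block;	/* logical start */
	uint32_t ec_len;	/* 0 => cache invalid */
};

static void cache_invalidate(struct ext_cache *cex)
{
	cex->ec_len = 0;
}

/* Return 1 and describe the hit if block is covered, else 0. */
static int cache_lookup(const struct ext_cache *cex, uint32_t block)
{
	if (cex->ec_len == 0)
		return 0;	/* nothing cached */
	if (block < cex->ec_block || block >= cex->ec_block + cex->ec_len)
		return 0;	/* cached range doesn't cover block */
	if (cex->ec_start == 0)
		printf("block %u: cached gap\n", (unsigned)block);
	else
		printf("block %u: cached at %llu\n", (unsigned)block,
		       (unsigned long long)(cex->ec_start +
					    (block - cex->ec_block)));
	return 1;
}

int main(void)
{
	struct ext_cache cex = { 9000, 100, 8 };	/* blocks 100..107 */

	cache_lookup(&cex, 103);	/* hit, mapped */
	cex.ec_start = 0;
	cache_lookup(&cex, 103);	/* hit, gap */
	cache_invalidate(&cex);
	printf("after invalidate: %d\n", cache_lookup(&cex, 103));
	return 0;
}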
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5a5c55ddceef..bb003dc9ffff 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -104,6 +104,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 {
	struct super_block *sb = inode->i_sb;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_inode_info *ei = EXT4_I(inode);
	struct vfsmount *mnt = filp->f_path.mnt;
	struct path path;
	char buf[64], *cp;
@@ -127,6 +128,27 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
			ext4_mark_super_dirty(sb);
		}
	}
+	/*
+	 * Set up the jbd2_inode if we are opening the inode for
+	 * writing and the journal is present
+	 */
+	if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
+		struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
+
+		spin_lock(&inode->i_lock);
+		if (!ei->jinode) {
+			if (!jinode) {
+				spin_unlock(&inode->i_lock);
+				return -ENOMEM;
+			}
+			ei->jinode = jinode;
+			jbd2_journal_init_jbd_inode(ei->jinode, inode);
+			jinode = NULL;
+		}
+		spin_unlock(&inode->i_lock);
+		if (unlikely(jinode != NULL))
+			jbd2_free_inode(jinode);
+	}
	return dquot_file_open(inode, filp);
 }
 
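ext4_file_open() now allocates the jbd2_inode lazily, on the first open for write: allocate outside the lock, install under i_lock only if nobody beat us to it, and free the copy that lost the race. The same double-checked pattern in a userspace sketch, with pthreads and malloc standing in for i_lock and jbd2_alloc_inode():

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct jinode { int dummy; };

struct inode {
	pthread_mutex_t lock;
	struct jinode *jinode;	/* NULL until the first writer opens */
};

static int setup_jinode(struct inode *inode)
{
	struct jinode *jinode = NULL;

	if (inode->jinode)	/* fast path: already set up */
		return 0;

	jinode = malloc(sizeof(*jinode));	/* allocate outside the lock */

	pthread_mutex_lock(&inode->lock);
	if (!inode->jinode) {
		if (!jinode) {
			pthread_mutex_unlock(&inode->lock);
			return -1;	/* -ENOMEM in the real code */
		}
		inode->jinode = jinode;
		jinode = NULL;	/* ownership transferred */
	}
	pthread_mutex_unlock(&inode->lock);

	free(jinode);	/* non-NULL only if we lost the race */
	return 0;
}

int main(void)
{
	struct inode inode = { PTHREAD_MUTEX_INITIALIZER, NULL };

	printf("first:  %d (jinode %p)\n", setup_jinode(&inode),
	       (void *)inode.jinode);
	printf("second: %d (jinode %p)\n", setup_jinode(&inode),
	       (void *)inode.jinode);
	return 0;
}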
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index c1a7bc923cf6..7829b287822a 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode)
  * to written.
  * The function return the number of pending IOs on success.
  */
-static int flush_completed_IO(struct inode *inode)
+extern int ext4_flush_completed_IO(struct inode *inode)
 {
	ext4_io_end_t *io;
	struct ext4_inode_info *ei = EXT4_I(inode);
@@ -169,7 +169,7 @@ int ext4_sync_file(struct file *file, int datasync)
	if (inode->i_sb->s_flags & MS_RDONLY)
		return 0;
 
-	ret = flush_completed_IO(inode);
+	ret = ext4_flush_completed_IO(inode);
	if (ret < 0)
		return ret;
 
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1ce240a23ebb..eb9097aec6f0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1027,7 +1027,7 @@ got:
	inode->i_generation = sbi->s_next_generation++;
	spin_unlock(&sbi->s_next_gen_lock);
 
-	ei->i_state_flags = 0;
+	ext4_clear_state_flags(ei);	/* Only relevant on 32-bit archs */
	ext4_set_inode_state(inode, EXT4_STATE_NEW);
 
	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e659597b690b..e80fc513eacc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -40,6 +40,7 @@
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/ratelimit.h>
43 44
44#include "ext4_jbd2.h" 45#include "ext4_jbd2.h"
45#include "xattr.h" 46#include "xattr.h"
@@ -54,10 +55,17 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
54 loff_t new_size) 55 loff_t new_size)
55{ 56{
56 trace_ext4_begin_ordered_truncate(inode, new_size); 57 trace_ext4_begin_ordered_truncate(inode, new_size);
57 return jbd2_journal_begin_ordered_truncate( 58 /*
58 EXT4_SB(inode->i_sb)->s_journal, 59 * If jinode is zero, then we never opened the file for
59 &EXT4_I(inode)->jinode, 60 * writing, so there's no need to call
60 new_size); 61 * jbd2_journal_begin_ordered_truncate() since there's no
62 * outstanding writes we need to flush.
63 */
64 if (!EXT4_I(inode)->jinode)
65 return 0;
66 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
67 EXT4_I(inode)->jinode,
68 new_size);
61} 69}
62 70
63static void ext4_invalidatepage(struct page *page, unsigned long offset); 71static void ext4_invalidatepage(struct page *page, unsigned long offset);
@@ -552,7 +560,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
552} 560}
553 561
554/** 562/**
555 * ext4_blks_to_allocate: Look up the block map and count the number 563 * ext4_blks_to_allocate - Look up the block map and count the number
556 * of direct blocks need to be allocated for the given branch. 564 * of direct blocks need to be allocated for the given branch.
557 * 565 *
558 * @branch: chain of indirect blocks 566 * @branch: chain of indirect blocks
@@ -591,13 +599,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
591 599
592/** 600/**
593 * ext4_alloc_blocks: multiple allocate blocks needed for a branch 601 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
602 * @handle: handle for this transaction
603 * @inode: inode which needs allocated blocks
604 * @iblock: the logical block to start allocated at
605 * @goal: preferred physical block of allocation
594 * @indirect_blks: the number of blocks need to allocate for indirect 606 * @indirect_blks: the number of blocks need to allocate for indirect
595 * blocks 607 * blocks
596 * 608 * @blks: number of desired blocks
597 * @new_blocks: on return it will store the new block numbers for 609 * @new_blocks: on return it will store the new block numbers for
598 * the indirect blocks(if needed) and the first direct block, 610 * the indirect blocks(if needed) and the first direct block,
599 * @blks: on return it will store the total number of allocated 611 * @err: on return it will store the error code
600 * direct blocks 612 *
613 * This function will return the number of blocks allocated as
614 * requested by the passed-in parameters.
601 */ 615 */
602static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 616static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
603 ext4_lblk_t iblock, ext4_fsblk_t goal, 617 ext4_lblk_t iblock, ext4_fsblk_t goal,
@@ -711,9 +725,11 @@ failed_out:
711 725
712/** 726/**
713 * ext4_alloc_branch - allocate and set up a chain of blocks. 727 * ext4_alloc_branch - allocate and set up a chain of blocks.
728 * @handle: handle for this transaction
714 * @inode: owner 729 * @inode: owner
715 * @indirect_blks: number of allocated indirect blocks 730 * @indirect_blks: number of allocated indirect blocks
716 * @blks: number of allocated direct blocks 731 * @blks: number of allocated direct blocks
732 * @goal: preferred place for allocation
717 * @offsets: offsets (in the blocks) to store the pointers to next. 733 * @offsets: offsets (in the blocks) to store the pointers to next.
718 * @branch: place to store the chain in. 734 * @branch: place to store the chain in.
719 * 735 *
@@ -826,6 +842,7 @@ failed:
826 842
827/** 843/**
828 * ext4_splice_branch - splice the allocated branch onto inode. 844 * ext4_splice_branch - splice the allocated branch onto inode.
845 * @handle: handle for this transaction
829 * @inode: owner 846 * @inode: owner
830 * @block: (logical) number of block we are adding 847 * @block: (logical) number of block we are adding
831 * @chain: chain of indirect blocks (with a missing link - see 848 * @chain: chain of indirect blocks (with a missing link - see
@@ -1081,7 +1098,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1081 * Calculate the number of metadata blocks need to reserve 1098 * Calculate the number of metadata blocks need to reserve
1082 * to allocate a block located at @lblock 1099 * to allocate a block located at @lblock
1083 */ 1100 */
1084static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1101static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1085{ 1102{
1086 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1103 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1087 return ext4_ext_calc_metadata_amount(inode, lblock); 1104 return ext4_ext_calc_metadata_amount(inode, lblock);
@@ -1320,7 +1337,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1320 * avoid double accounting 1337 * avoid double accounting
1321 */ 1338 */
1322 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1339 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1323 EXT4_I(inode)->i_delalloc_reserved_flag = 1; 1340 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1324 /* 1341 /*
1325 * We need to check for EXT4 here because migrate 1342 * We need to check for EXT4 here because migrate
1326 * could have changed the inode type in between 1343 * could have changed the inode type in between
@@ -1350,7 +1367,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1350 ext4_da_update_reserve_space(inode, retval, 1); 1367 ext4_da_update_reserve_space(inode, retval, 1);
1351 } 1368 }
1352 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1369 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1353 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1370 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1354 1371
1355 up_write((&EXT4_I(inode)->i_data_sem)); 1372 up_write((&EXT4_I(inode)->i_data_sem));
1356 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1373 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
@@ -1878,7 +1895,7 @@ static int ext4_journalled_write_end(struct file *file,
1878/* 1895/*
1879 * Reserve a single block located at lblock 1896 * Reserve a single block located at lblock
1880 */ 1897 */
1881static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) 1898static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1882{ 1899{
1883 int retries = 0; 1900 int retries = 0;
1884 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1901 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2239,7 +2256,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2239 * affects functions in many different parts of the allocation 2256 * affects functions in many different parts of the allocation
2240 * call path. This flag exists primarily because we don't 2257 * call path. This flag exists primarily because we don't
2241 * want to change *many* call functions, so ext4_map_blocks() 2258 * want to change *many* call functions, so ext4_map_blocks()
2242 * will set the magic i_delalloc_reserved_flag once the 2259 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
2243 * inode's allocation semaphore is taken. 2260 * inode's allocation semaphore is taken.
2244 * 2261 *
2245 * If the blocks in questions were delalloc blocks, set 2262 * If the blocks in questions were delalloc blocks, set
@@ -3720,8 +3737,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3720retry: 3737retry:
3721 io_end = ext4_init_io_end(inode, GFP_ATOMIC); 3738 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3722 if (!io_end) { 3739 if (!io_end) {
3723 if (printk_ratelimit()) 3740 pr_warning_ratelimited("%s: allocation fail\n", __func__);
3724 printk(KERN_WARNING "%s: allocation fail\n", __func__);
3725 schedule(); 3741 schedule();
3726 goto retry; 3742 goto retry;
3727 } 3743 }
@@ -4045,7 +4061,7 @@ int ext4_block_truncate_page(handle_t *handle,
4045 if (ext4_should_journal_data(inode)) { 4061 if (ext4_should_journal_data(inode)) {
4046 err = ext4_handle_dirty_metadata(handle, inode, bh); 4062 err = ext4_handle_dirty_metadata(handle, inode, bh);
4047 } else { 4063 } else {
4048 if (ext4_should_order_data(inode)) 4064 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
4049 err = ext4_jbd2_file_inode(handle, inode); 4065 err = ext4_jbd2_file_inode(handle, inode);
4050 mark_buffer_dirty(bh); 4066 mark_buffer_dirty(bh);
4051 } 4067 }
@@ -4169,6 +4185,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 {
 	__le32 *p;
 	int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+	int err;
 
 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
 		flags |= EXT4_FREE_BLOCKS_METADATA;
@@ -4184,11 +4201,23 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 	if (try_to_extend_transaction(handle, inode)) {
 		if (bh) {
 			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-			ext4_handle_dirty_metadata(handle, inode, bh);
+			err = ext4_handle_dirty_metadata(handle, inode, bh);
+			if (unlikely(err)) {
+				ext4_std_error(inode->i_sb, err);
+				return 1;
+			}
+		}
+		err = ext4_mark_inode_dirty(handle, inode);
+		if (unlikely(err)) {
+			ext4_std_error(inode->i_sb, err);
+			return 1;
+		}
+		err = ext4_truncate_restart_trans(handle, inode,
+						  blocks_for_truncate(inode));
+		if (unlikely(err)) {
+			ext4_std_error(inode->i_sb, err);
+			return 1;
 		}
-		ext4_mark_inode_dirty(handle, inode);
-		ext4_truncate_restart_trans(handle, inode,
-					    blocks_for_truncate(inode));
 		if (bh) {
 			BUFFER_TRACE(bh, "retaking write access");
 			ext4_journal_get_write_access(handle, bh);
@@ -4349,6 +4378,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 					   (__le32 *) bh->b_data,
 					   (__le32 *) bh->b_data + addr_per_block,
 					   depth);
+			brelse(bh);
 
 			/*
 			 * Everything below this this pointer has been
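
The one-line hunk above fixes a buffer_head leak in ext4_free_branches(): sb_bread() returns a referenced buffer, and the reference taken for each indirect block was never dropped, so every indirect block touched by a large truncate stayed pinned in memory. The pairing the fix restores, in sketch form (walk_entries() is a hypothetical stand-in for the recursive descent):

	struct buffer_head *bh = sb_bread(sb, nr);	/* takes a reference */
	if (!bh)
		return;
	walk_entries((__le32 *) bh->b_data);	/* hypothetical: consume the block */
	brelse(bh);				/* drop the reference as soon as possible */
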
@@ -4859,7 +4889,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	}
 	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
 
-	ei->i_state_flags = 0;
+	ext4_clear_state_flags(ei);	/* Only relevant on 32-bit archs */
 	ei->i_dir_start_lookup = 0;
 	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
 	/* We now have enough fields to check if the inode was active or not.
@@ -5118,7 +5148,7 @@ static int ext4_do_update_inode(handle_t *handle,
 	if (ext4_inode_blocks_set(handle, raw_inode, ei))
 		goto out_brelse;
 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
-	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
+	raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
 	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
 	    cpu_to_le32(EXT4_OS_HURD))
 		raw_inode->i_file_acl_high =
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5b4d4e3a4d58..851f49b2f9d2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2608,18 +2608,12 @@ int ext4_mb_release(struct super_block *sb)
 static inline int ext4_issue_discard(struct super_block *sb,
 		ext4_group_t block_group, ext4_grpblk_t block, int count)
 {
-	int ret;
 	ext4_fsblk_t discard_block;
 
 	discard_block = block + ext4_group_first_block_no(sb, block_group);
 	trace_ext4_discard_blocks(sb,
 			(unsigned long long) discard_block, count);
-	ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
-	if (ret == -EOPNOTSUPP) {
-		ext4_warning(sb, "discard not supported, disabling");
-		clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
-	}
-	return ret;
+	return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
 }
 
 /*
@@ -2631,7 +2625,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 	struct super_block *sb = journal->j_private;
 	struct ext4_buddy e4b;
 	struct ext4_group_info *db;
-	int err, count = 0, count2 = 0;
+	int err, ret, count = 0, count2 = 0;
 	struct ext4_free_data *entry;
 	struct list_head *l, *ltmp;
 
@@ -2641,9 +2635,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
			 entry->count, entry->group, entry);
 
-		if (test_opt(sb, DISCARD))
-			ext4_issue_discard(sb, entry->group,
-					entry->start_blk, entry->count);
+		if (test_opt(sb, DISCARD)) {
+			ret = ext4_issue_discard(sb, entry->group,
+					entry->start_blk, entry->count);
+			if (unlikely(ret == -EOPNOTSUPP)) {
+				ext4_warning(sb, "discard not supported, "
+					     "disabling");
+				clear_opt(sb, DISCARD);
+			}
+		}
 
 		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
 		/* we expect to find existing buddy because it's pinned */
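
With the -EOPNOTSUPP handling hoisted out, ext4_issue_discard() is reduced to translating a group-relative block number into a filesystem block and calling sb_issue_discard(); each caller now applies its own policy (the commit callback above disables the discard mount option, while the FITRIM path further down returns the error to the ioctl caller). For orientation, sb_issue_discard() is approximately the following wrapper; treat the body as a sketch of the block-to-sector conversion rather than the verbatim helper:

	static inline int sb_issue_discard(struct super_block *sb, sector_t block,
					   sector_t nr_blocks, gfp_t gfp_mask,
					   unsigned long flags)
	{
		/* filesystem blocks to 512-byte sectors */
		return blkdev_issue_discard(sb->s_bdev,
				block << (sb->s_blocksize_bits - 9),
				nr_blocks << (sb->s_blocksize_bits - 9),
				gfp_mask, flags);
	}
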
@@ -3881,19 +3881,6 @@ repeat:
 	}
 }
 
-/*
- * finds all preallocated spaces and return blocks being freed to them
- * if preallocated space becomes full (no block is used from the space)
- * then the function frees space in buddy
- * XXX: at the moment, truncate (which is the only way to free blocks)
- * discards all preallocations
- */
-static void ext4_mb_return_to_preallocation(struct inode *inode,
-					struct ext4_buddy *e4b,
-					sector_t block, int count)
-{
-	BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
-}
 #ifdef CONFIG_EXT4_DEBUG
 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 {
@@ -4283,7 +4270,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	 * EDQUOT check, as blocks and quotas have been already
 	 * reserved when data being copied into pagecache.
 	 */
-	if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+	if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
 		ar->flags |= EXT4_MB_DELALLOC_RESERVED;
 	else {
 		/* Without delayed allocation we need to verify
@@ -4380,7 +4367,8 @@ out:
 	if (inquota && ar->len < inquota)
 		dquot_free_block(ar->inode, inquota - ar->len);
 	if (!ar->len) {
-		if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+		if (!ext4_test_inode_state(ar->inode,
+					   EXT4_STATE_DELALLOC_RESERVED))
 			/* release all the reserved blocks if non delalloc */
 			percpu_counter_sub(&sbi->s_dirtyblocks_counter,
 						reserv_blks);
@@ -4626,7 +4614,11 @@ do_more:
 	 * blocks being freed are metadata. these blocks shouldn't
 	 * be used until this transaction is committed
 	 */
 	new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+	if (!new_entry) {
+		err = -ENOMEM;
+		goto error_return;
+	}
 	new_entry->start_blk = bit;
 	new_entry->group = block_group;
 	new_entry->count = count;
@@ -4643,7 +4635,6 @@ do_more:
 		ext4_lock_group(sb, block_group);
 		mb_clear_bits(bitmap_bh->b_data, bit, count);
 		mb_free_blocks(inode, &e4b, bit, count);
-		ext4_mb_return_to_preallocation(inode, &e4b, block, count);
 	}
 
 	ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4718,8 +4709,6 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
 	ext4_unlock_group(sb, group);
 
 	ret = ext4_issue_discard(sb, group, start, count);
-	if (ret)
-		ext4_std_error(sb, ret);
 
 	ext4_lock_group(sb, group);
 	mb_free_blocks(NULL, e4b, start, ex.fe_len);
@@ -4819,6 +4808,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	ext4_group_t group, ngroups = ext4_get_groups_count(sb);
 	ext4_grpblk_t cnt = 0, first_block, last_block;
 	uint64_t start, len, minlen, trimmed;
+	ext4_fsblk_t first_data_blk =
+			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
 	int ret = 0;
 
 	start = range->start >> sb->s_blocksize_bits;
@@ -4828,6 +4819,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
 
 	if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
 		return -EINVAL;
+	if (start < first_data_blk) {
+		len -= first_data_blk - start;
+		start = first_data_blk;
+	}
 
 	/* Determine first and last group to examine based on start and len */
 	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
@@ -4851,7 +4846,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
 		if (len >= EXT4_BLOCKS_PER_GROUP(sb))
 			len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
 		else
-			last_block = len;
+			last_block = first_block + len;
 
 		if (e4b.bd_info->bb_free >= minlen) {
 			cnt = ext4_trim_all_free(sb, &e4b, first_block,
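
Two fixes in ext4_trim_fs() deserve spelling out. First, on a filesystem with 1 KiB blocks s_first_data_block is 1, so a FITRIM range starting at block 0 must be clamped before the group lookup, or the first group is searched for a block that does not exist. Second, in the final, partially covered group, the end of the range is relative to where the range entered the group, not to the group start; a worked example:

	/*
	 * Suppose the trim range enters its last group at first_block = 100
	 * with len = 50 blocks left to trim.
	 *
	 *   before:  last_block = len;                => examines [100, 50), i.e. nothing
	 *   after:   last_block = first_block + len;  => examines [100, 150), as intended
	 */
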
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 25f3a974b725..b0a126f23c20 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -496,7 +496,7 @@ int ext4_ext_migrate(struct inode *inode)
 	goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
 		EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
 	tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
-				   S_IFREG, 0, goal);
+				   S_IFREG, NULL, goal);
 	if (IS_ERR(tmp_inode)) {
 		retval = -ENOMEM;
 		ext4_journal_stop(handle);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index dc40e75cba88..5485390d32c5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -581,9 +581,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 					   dir->i_sb->s_blocksize -
 					   EXT4_DIR_REC_LEN(0));
 	for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
-		if (!ext4_check_dir_entry(dir, de, bh,
-				(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
-					+((char *)de - bh->b_data))) {
+		if (ext4_check_dir_entry(dir, NULL, de, bh,
+				(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+					+ ((char *)de - bh->b_data))) {
 			/* On error, skip the f_pos to the next block. */
 			dir_file->f_pos = (dir_file->f_pos |
 					(dir->i_sb->s_blocksize - 1)) + 1;
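
Every ext4_check_dir_entry() call in this file changes polarity: the helper used to return nonzero for a valid entry and now returns nonzero for a corrupted one, which is why the leading ! disappears at each call site. It also gains a struct file argument (NULL here) so error reports can name the offending file. A sketch of the likely new wrapper; the exact macro lives in fs/ext4/ext4.h, and __ext4_check_dir_entry() is assumed to report the corruption before returning 1:

	#define ext4_check_dir_entry(dir, filp, de, bh, offset)		\
		unlikely(__ext4_check_dir_entry(__func__, __LINE__,	\
				(dir), (filp), (de), (bh), (offset)))
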
@@ -820,7 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh,
 		if ((char *) de + namelen <= dlimit &&
 		    ext4_match (namelen, name, de)) {
 			/* found a match - just to be sure, do a full check */
-			if (!ext4_check_dir_entry(dir, de, bh, offset))
+			if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
 				return -1;
 			*res_dir = de;
 			return 1;
@@ -1036,7 +1036,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 		return ERR_PTR(-EIO);
 	}
 	inode = ext4_iget(dir->i_sb, ino);
-	if (unlikely(IS_ERR(inode))) {
+	if (IS_ERR(inode)) {
 		if (PTR_ERR(inode) == -ESTALE) {
 			EXT4_ERROR_INODE(dir,
 					 "deleted inode referenced: %u",
@@ -1269,7 +1269,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	de = (struct ext4_dir_entry_2 *)bh->b_data;
 	top = bh->b_data + blocksize - reclen;
 	while ((char *) de <= top) {
-		if (!ext4_check_dir_entry(dir, de, bh, offset))
+		if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
 			return -EIO;
 		if (ext4_match(namelen, name, de))
 			return -EEXIST;
@@ -1602,7 +1602,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			if (err)
 				goto journal_error;
 		}
-		ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
+		err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
+		if (err) {
+			ext4_std_error(inode->i_sb, err);
+			goto cleanup;
+		}
 	}
 	de = do_split(handle, dir, &bh, frame, &hinfo, &err);
 	if (!de)
@@ -1630,17 +1634,21 @@ static int ext4_delete_entry(handle_t *handle,
 {
 	struct ext4_dir_entry_2 *de, *pde;
 	unsigned int blocksize = dir->i_sb->s_blocksize;
-	int i;
+	int i, err;
 
 	i = 0;
 	pde = NULL;
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	while (i < bh->b_size) {
-		if (!ext4_check_dir_entry(dir, de, bh, i))
+		if (ext4_check_dir_entry(dir, NULL, de, bh, i))
 			return -EIO;
 		if (de == de_del) {
 			BUFFER_TRACE(bh, "get_write_access");
-			ext4_journal_get_write_access(handle, bh);
+			err = ext4_journal_get_write_access(handle, bh);
+			if (unlikely(err)) {
+				ext4_std_error(dir->i_sb, err);
+				return err;
+			}
 			if (pde)
 				pde->rec_len = ext4_rec_len_to_disk(
 					ext4_rec_len_from_disk(pde->rec_len,
@@ -1652,7 +1660,11 @@ static int ext4_delete_entry(handle_t *handle,
 			de->inode = 0;
 			dir->i_version++;
 			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-			ext4_handle_dirty_metadata(handle, dir, bh);
+			err = ext4_handle_dirty_metadata(handle, dir, bh);
+			if (unlikely(err)) {
+				ext4_std_error(dir->i_sb, err);
+				return err;
+			}
 			return 0;
 		}
 		i += ext4_rec_len_from_disk(de->rec_len, blocksize);
@@ -1789,7 +1801,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	handle_t *handle;
 	struct inode *inode;
-	struct buffer_head *dir_block;
+	struct buffer_head *dir_block = NULL;
 	struct ext4_dir_entry_2 *de;
 	unsigned int blocksize = dir->i_sb->s_blocksize;
 	int err, retries = 0;
@@ -1822,7 +1834,9 @@ retry:
 	if (!dir_block)
 		goto out_clear_inode;
 	BUFFER_TRACE(dir_block, "get_write_access");
-	ext4_journal_get_write_access(handle, dir_block);
+	err = ext4_journal_get_write_access(handle, dir_block);
+	if (err)
+		goto out_clear_inode;
 	de = (struct ext4_dir_entry_2 *) dir_block->b_data;
 	de->inode = cpu_to_le32(inode->i_ino);
 	de->name_len = 1;
@@ -1839,10 +1853,12 @@ retry:
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
 	inode->i_nlink = 2;
 	BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
-	ext4_handle_dirty_metadata(handle, dir, dir_block);
-	brelse(dir_block);
-	ext4_mark_inode_dirty(handle, inode);
-	err = ext4_add_entry(handle, dentry, inode);
+	err = ext4_handle_dirty_metadata(handle, dir, dir_block);
+	if (err)
+		goto out_clear_inode;
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (!err)
+		err = ext4_add_entry(handle, dentry, inode);
 	if (err) {
 out_clear_inode:
 		clear_nlink(inode);
@@ -1853,10 +1869,13 @@ out_clear_inode:
 	}
 	ext4_inc_count(handle, dir);
 	ext4_update_dx_flag(dir);
-	ext4_mark_inode_dirty(handle, dir);
+	err = ext4_mark_inode_dirty(handle, dir);
+	if (err)
+		goto out_clear_inode;
 	d_instantiate(dentry, inode);
 	unlock_new_inode(inode);
out_stop:
+	brelse(dir_block);
 	ext4_journal_stop(handle);
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
@@ -1919,7 +1938,7 @@ static int empty_dir(struct inode *inode)
 		}
 		de = (struct ext4_dir_entry_2 *) bh->b_data;
 	}
-	if (!ext4_check_dir_entry(inode, de, bh, offset)) {
+	if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
 		de = (struct ext4_dir_entry_2 *)(bh->b_data +
 						 sb->s_blocksize);
 		offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2407,7 +2426,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 					ext4_current_time(new_dir);
 		ext4_mark_inode_dirty(handle, new_dir);
 		BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
-		ext4_handle_dirty_metadata(handle, new_dir, new_bh);
+		retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
+		if (unlikely(retval)) {
+			ext4_std_error(new_dir->i_sb, retval);
+			goto end_rename;
+		}
 		brelse(new_bh);
 		new_bh = NULL;
 	}
@@ -2459,7 +2482,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
 						cpu_to_le32(new_dir->i_ino);
 		BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
-		ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
+		retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
+		if (retval) {
+			ext4_std_error(old_dir->i_sb, retval);
+			goto end_rename;
+		}
 		ext4_dec_count(handle, old_dir);
 		if (new_inode) {
 			/* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index beacce11ac50..7270dcfca92a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -44,7 +44,7 @@ int __init ext4_init_pageio(void)
 	if (io_page_cachep == NULL)
 		return -ENOMEM;
 	io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
-	if (io_page_cachep == NULL) {
+	if (io_end_cachep == NULL) {
 		kmem_cache_destroy(io_page_cachep);
 		return -ENOMEM;
 	}
@@ -158,11 +158,8 @@ static void ext4_end_io_work(struct work_struct *work)
 
 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
 {
-	ext4_io_end_t *io = NULL;
-
-	io = kmem_cache_alloc(io_end_cachep, flags);
+	ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
 	if (io) {
-		memset(io, 0, sizeof(*io));
 		atomic_inc(&EXT4_I(inode)->i_ioend_count);
 		io->inode = inode;
 		INIT_WORK(&io->work, ext4_end_io_work);
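
The first hunk above fixes a copy-and-paste bug: after creating io_end_cachep, the NULL check was testing io_page_cachep, so a failed cache creation went unnoticed. The second hunk replaces an open-coded allocate-then-memset with kmem_cache_zalloc(), which is the same allocation with __GFP_ZERO folded in, roughly this inline from <linux/slab.h> (a sketch, modulo kernel version):

	static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
	{
		return kmem_cache_alloc(k, flags | __GFP_ZERO);	/* zeroed object */
	}
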
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 981c8477adab..3ecc6e45d2f9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -220,7 +220,11 @@ static int setup_new_group_blocks(struct super_block *sb,
 			memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
 			set_buffer_uptodate(gdb);
 			unlock_buffer(gdb);
-			ext4_handle_dirty_metadata(handle, NULL, gdb);
+			err = ext4_handle_dirty_metadata(handle, NULL, gdb);
+			if (unlikely(err)) {
+				brelse(gdb);
+				goto exit_bh;
+			}
 			ext4_set_bit(bit, bh->b_data);
 			brelse(gdb);
 		}
@@ -258,7 +262,11 @@ static int setup_new_group_blocks(struct super_block *sb,
 
 	ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
 			     bh->b_data);
-	ext4_handle_dirty_metadata(handle, NULL, bh);
+	err = ext4_handle_dirty_metadata(handle, NULL, bh);
+	if (unlikely(err)) {
+		ext4_std_error(sb, err);
+		goto exit_bh;
+	}
 	brelse(bh);
 	/* Mark unused entries in inode bitmap used */
 	ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
@@ -270,7 +278,9 @@ static int setup_new_group_blocks(struct super_block *sb,
 
 	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
 			     bh->b_data);
-	ext4_handle_dirty_metadata(handle, NULL, bh);
+	err = ext4_handle_dirty_metadata(handle, NULL, bh);
+	if (unlikely(err))
+		ext4_std_error(sb, err);
 exit_bh:
 	brelse(bh);
 
@@ -422,17 +432,21 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 		goto exit_dind;
 	}
 
-	if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh)))
+	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+	if (unlikely(err))
 		goto exit_dind;
 
-	if ((err = ext4_journal_get_write_access(handle, *primary)))
+	err = ext4_journal_get_write_access(handle, *primary);
+	if (unlikely(err))
 		goto exit_sbh;
 
-	if ((err = ext4_journal_get_write_access(handle, dind)))
-		goto exit_primary;
+	err = ext4_journal_get_write_access(handle, dind);
+	if (unlikely(err))
+		ext4_std_error(sb, err);
 
 	/* ext4_reserve_inode_write() gets a reference on the iloc */
-	if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (unlikely(err))
 		goto exit_dindj;
 
 	n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
@@ -454,12 +468,20 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	 * reserved inode, and will become GDT blocks (primary and backup).
 	 */
 	data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
-	ext4_handle_dirty_metadata(handle, NULL, dind);
-	brelse(dind);
+	err = ext4_handle_dirty_metadata(handle, NULL, dind);
+	if (unlikely(err)) {
+		ext4_std_error(sb, err);
+		goto exit_inode;
+	}
 	inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
 	ext4_mark_iloc_dirty(handle, inode, &iloc);
 	memset((*primary)->b_data, 0, sb->s_blocksize);
-	ext4_handle_dirty_metadata(handle, NULL, *primary);
+	err = ext4_handle_dirty_metadata(handle, NULL, *primary);
+	if (unlikely(err)) {
+		ext4_std_error(sb, err);
+		goto exit_inode;
+	}
+	brelse(dind);
 
 	o_group_desc = EXT4_SB(sb)->s_group_desc;
 	memcpy(n_group_desc, o_group_desc,
@@ -470,19 +492,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	kfree(o_group_desc);
 
 	le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
-	ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+	err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+	if (err)
+		ext4_std_error(sb, err);
 
-	return 0;
+	return err;
 
 exit_inode:
 	/* ext4_journal_release_buffer(handle, iloc.bh); */
 	brelse(iloc.bh);
 exit_dindj:
 	/* ext4_journal_release_buffer(handle, dind); */
-exit_primary:
-	/* ext4_journal_release_buffer(handle, *primary); */
 exit_sbh:
-	/* ext4_journal_release_buffer(handle, *primary); */
+	/* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
exit_dind:
 	brelse(dind);
 exit_bh:
@@ -665,7 +687,9 @@ static void update_backups(struct super_block *sb,
 		memset(bh->b_data + size, 0, rest);
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
-		ext4_handle_dirty_metadata(handle, NULL, bh);
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (unlikely(err))
+			ext4_std_error(sb, err);
 		brelse(bh);
 	}
 	if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -883,7 +907,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	/* Update the global fs size fields */
 	sbi->s_groups_count++;
 
-	ext4_handle_dirty_metadata(handle, NULL, primary);
+	err = ext4_handle_dirty_metadata(handle, NULL, primary);
+	if (unlikely(err)) {
+		ext4_std_error(sb, err);
+		goto exit_journal;
+	}
 
 	/* Update the reserved block counts only once the new group is
 	 * active. */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cd37f9d5e447..29c80f6d8b27 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -388,13 +388,14 @@ static void ext4_handle_error(struct super_block *sb)
 void __ext4_error(struct super_block *sb, const char *function,
 		  unsigned int line, const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
 
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ",
-	       sb->s_id, function, line, current->comm);
-	vprintk(fmt, args);
-	printk("\n");
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
+	       sb->s_id, function, line, current->comm, &vaf);
 	va_end(args);
 
 	ext4_handle_error(sb);
@@ -405,28 +406,31 @@ void ext4_error_inode(struct inode *inode, const char *function,
 		      const char *fmt, ...)
 {
 	va_list args;
+	struct va_format vaf;
 	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 
 	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
 	es->s_last_error_block = cpu_to_le64(block);
 	save_error_info(inode->i_sb, function, line);
 	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
 	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
 	       inode->i_sb->s_id, function, line, inode->i_ino);
 	if (block)
-		printk("block %llu: ", block);
-	printk("comm %s: ", current->comm);
-	vprintk(fmt, args);
-	printk("\n");
+		printk(KERN_CONT "block %llu: ", block);
+	printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf);
 	va_end(args);
 
 	ext4_handle_error(inode->i_sb);
 }
 
 void ext4_error_file(struct file *file, const char *function,
-		     unsigned int line, const char *fmt, ...)
+		     unsigned int line, ext4_fsblk_t block,
+		     const char *fmt, ...)
 {
 	va_list args;
+	struct va_format vaf;
 	struct ext4_super_block *es;
 	struct inode *inode = file->f_dentry->d_inode;
 	char pathname[80], *path;
@@ -434,17 +438,18 @@ void ext4_error_file(struct file *file, const char *function,
 	es = EXT4_SB(inode->i_sb)->s_es;
 	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
 	save_error_info(inode->i_sb, function, line);
-	va_start(args, fmt);
 	path = d_path(&(file->f_path), pathname, sizeof(pathname));
-	if (!path)
+	if (IS_ERR(path))
 		path = "(unknown)";
 	printk(KERN_CRIT
-	       "EXT4-fs error (device %s): %s:%d: inode #%lu "
-	       "(comm %s path %s): ",
-	       inode->i_sb->s_id, function, line, inode->i_ino,
-	       current->comm, path);
-	vprintk(fmt, args);
-	printk("\n");
+	       "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
+	       inode->i_sb->s_id, function, line, inode->i_ino);
+	if (block)
+		printk(KERN_CONT "block %llu: ", block);
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf);
 	va_end(args);
 
 	ext4_handle_error(inode->i_sb);
@@ -543,28 +548,29 @@ void __ext4_abort(struct super_block *sb, const char *function,
 		panic("EXT4-fs panic from previous error\n");
 }
 
-void ext4_msg (struct super_block * sb, const char *prefix,
-		   const char *fmt, ...)
+void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
 
 	va_start(args, fmt);
-	printk("%sEXT4-fs (%s): ", prefix, sb->s_id);
-	vprintk(fmt, args);
-	printk("\n");
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
 	va_end(args);
 }
 
 void __ext4_warning(struct super_block *sb, const char *function,
 		    unsigned int line, const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
 
 	va_start(args, fmt);
-	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ",
-	       sb->s_id, function, line);
-	vprintk(fmt, args);
-	printk("\n");
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
+	       sb->s_id, function, line, &vaf);
 	va_end(args);
 }
 
@@ -575,21 +581,25 @@ void __ext4_grp_locked_error(const char *function, unsigned int line,
 __releases(bitlock)
 __acquires(bitlock)
 {
+	struct va_format vaf;
 	va_list args;
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 
 	es->s_last_error_ino = cpu_to_le32(ino);
 	es->s_last_error_block = cpu_to_le64(block);
 	__save_error_info(sb, function, line);
+
 	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
 	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
 	       sb->s_id, function, line, grp);
 	if (ino)
-		printk("inode %lu: ", ino);
+		printk(KERN_CONT "inode %lu: ", ino);
 	if (block)
-		printk("block %llu:", (unsigned long long) block);
-	vprintk(fmt, args);
-	printk("\n");
+		printk(KERN_CONT "block %llu:", (unsigned long long) block);
+	printk(KERN_CONT "%pV\n", &vaf);
 	va_end(args);
 
 	if (test_opt(sb, ERRORS_CONT)) {
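
All the fs/ext4/super.c hunks above converge on one idiom: instead of a printk()/vprintk()/printk("\n") triple, whose pieces can interleave with messages from other CPUs, the format string and va_list are wrapped in a struct va_format and expanded through the %pV extension inside a single printk(). The struct, from <linux/kernel.h>, is just:

	struct va_format {
		const char *fmt;
		va_list *va;
	};

When vsnprintf() meets %pV it recurses into vaf->fmt with vaf->va, so the caller's varargs are rendered inline and the message goes out as one printk() call rather than three.
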
@@ -808,21 +818,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
 	INIT_LIST_HEAD(&ei->i_prealloc_list);
 	spin_lock_init(&ei->i_prealloc_lock);
-	/*
-	 * Note: We can be called before EXT4_SB(sb)->s_journal is set,
-	 * therefore it can be null here. Don't check it, just initialize
-	 * jinode.
-	 */
-	jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
 	ei->i_reserved_data_blocks = 0;
 	ei->i_reserved_meta_blocks = 0;
 	ei->i_allocated_meta_blocks = 0;
 	ei->i_da_metadata_calc_len = 0;
-	ei->i_delalloc_reserved_flag = 0;
 	spin_lock_init(&(ei->i_block_reservation_lock));
 #ifdef CONFIG_QUOTA
 	ei->i_reserved_quota = 0;
 #endif
+	ei->jinode = NULL;
 	INIT_LIST_HEAD(&ei->i_completed_io_list);
 	spin_lock_init(&ei->i_completed_io_lock);
 	ei->cur_aio_dio = NULL;
@@ -898,9 +902,12 @@ void ext4_clear_inode(struct inode *inode)
 	end_writeback(inode);
 	dquot_drop(inode);
 	ext4_discard_preallocations(inode);
-	if (EXT4_JOURNAL(inode))
-		jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
-				       &EXT4_I(inode)->jinode);
+	if (EXT4_I(inode)->jinode) {
+		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
+					       EXT4_I(inode)->jinode);
+		jbd2_free_inode(EXT4_I(inode)->jinode);
+		EXT4_I(inode)->jinode = NULL;
+	}
 }
 
 static inline void ext4_show_quota_options(struct seq_file *seq,
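
The two hunks above stop embedding a struct jbd2_inode in every ext4_inode_info: the pointer starts out NULL and the object is freed, after jbd2 releases it, when the inode is cleared. This is also why ext4_block_truncate_page() earlier now checks EXT4_I(inode)->jinode before calling ext4_jbd2_file_inode(). The allocation side is not shown in this diff; a sketch of the lazy-initialization pattern it implies, where ext4_setup_jinode() is a hypothetical name and jbd2_alloc_inode() is assumed to be the counterpart of the jbd2_free_inode() call visible above:

	/* Hypothetical helper; locking around the publish is omitted. */
	static int ext4_setup_jinode(struct inode *inode)
	{
		struct ext4_inode_info *ei = EXT4_I(inode);
		struct jbd2_inode *jinode;

		if (ei->jinode)				/* already set up */
			return 0;
		jinode = jbd2_alloc_inode(GFP_KERNEL);	/* assumed jbd2 slab helper */
		if (!jinode)
			return -ENOMEM;
		ei->jinode = jinode;
		jbd2_journal_init_jbd_inode(ei->jinode, inode);
		return 0;
	}
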
@@ -1393,7 +1400,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
 		sbi->s_qf_names[qtype] = NULL;
 		return 0;
 	}
-	set_opt(sbi->s_mount_opt, QUOTA);
+	set_opt(sb, QUOTA);
 	return 1;
 }
 
@@ -1448,21 +1455,21 @@ static int parse_options(char *options, struct super_block *sb,
 		switch (token) {
 		case Opt_bsd_df:
 			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-			clear_opt(sbi->s_mount_opt, MINIX_DF);
+			clear_opt(sb, MINIX_DF);
 			break;
 		case Opt_minix_df:
 			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-			set_opt(sbi->s_mount_opt, MINIX_DF);
+			set_opt(sb, MINIX_DF);
 
 			break;
 		case Opt_grpid:
 			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-			set_opt(sbi->s_mount_opt, GRPID);
+			set_opt(sb, GRPID);
 
 			break;
 		case Opt_nogrpid:
 			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-			clear_opt(sbi->s_mount_opt, GRPID);
+			clear_opt(sb, GRPID);
 
 			break;
 		case Opt_resuid:
@@ -1480,38 +1487,38 @@ static int parse_options(char *options, struct super_block *sb,
 			/* *sb_block = match_int(&args[0]); */
 			break;
 		case Opt_err_panic:
-			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
-			clear_opt(sbi->s_mount_opt, ERRORS_RO);
-			set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+			clear_opt(sb, ERRORS_CONT);
+			clear_opt(sb, ERRORS_RO);
+			set_opt(sb, ERRORS_PANIC);
 			break;
 		case Opt_err_ro:
-			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
-			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
-			set_opt(sbi->s_mount_opt, ERRORS_RO);
+			clear_opt(sb, ERRORS_CONT);
+			clear_opt(sb, ERRORS_PANIC);
+			set_opt(sb, ERRORS_RO);
 			break;
 		case Opt_err_cont:
-			clear_opt(sbi->s_mount_opt, ERRORS_RO);
-			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
-			set_opt(sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt(sb, ERRORS_RO);
+			clear_opt(sb, ERRORS_PANIC);
+			set_opt(sb, ERRORS_CONT);
 			break;
 		case Opt_nouid32:
-			set_opt(sbi->s_mount_opt, NO_UID32);
+			set_opt(sb, NO_UID32);
 			break;
 		case Opt_debug:
-			set_opt(sbi->s_mount_opt, DEBUG);
+			set_opt(sb, DEBUG);
 			break;
 		case Opt_oldalloc:
-			set_opt(sbi->s_mount_opt, OLDALLOC);
+			set_opt(sb, OLDALLOC);
 			break;
 		case Opt_orlov:
-			clear_opt(sbi->s_mount_opt, OLDALLOC);
+			clear_opt(sb, OLDALLOC);
 			break;
 #ifdef CONFIG_EXT4_FS_XATTR
 		case Opt_user_xattr:
-			set_opt(sbi->s_mount_opt, XATTR_USER);
+			set_opt(sb, XATTR_USER);
 			break;
 		case Opt_nouser_xattr:
-			clear_opt(sbi->s_mount_opt, XATTR_USER);
+			clear_opt(sb, XATTR_USER);
 			break;
 #else
 		case Opt_user_xattr:
@@ -1521,10 +1528,10 @@ static int parse_options(char *options, struct super_block *sb,
 #endif
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 		case Opt_acl:
-			set_opt(sbi->s_mount_opt, POSIX_ACL);
+			set_opt(sb, POSIX_ACL);
 			break;
 		case Opt_noacl:
-			clear_opt(sbi->s_mount_opt, POSIX_ACL);
+			clear_opt(sb, POSIX_ACL);
 			break;
 #else
 		case Opt_acl:
@@ -1543,7 +1550,7 @@ static int parse_options(char *options, struct super_block *sb,
 					 "Cannot specify journal on remount");
 				return 0;
 			}
-			set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
+			set_opt(sb, UPDATE_JOURNAL);
 			break;
 		case Opt_journal_dev:
 			if (is_remount) {
@@ -1556,14 +1563,14 @@ static int parse_options(char *options, struct super_block *sb,
 			*journal_devnum = option;
 			break;
 		case Opt_journal_checksum:
-			set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
+			set_opt(sb, JOURNAL_CHECKSUM);
 			break;
 		case Opt_journal_async_commit:
-			set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
-			set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
+			set_opt(sb, JOURNAL_ASYNC_COMMIT);
+			set_opt(sb, JOURNAL_CHECKSUM);
 			break;
 		case Opt_noload:
-			set_opt(sbi->s_mount_opt, NOLOAD);
+			set_opt(sb, NOLOAD);
 			break;
 		case Opt_commit:
 			if (match_int(&args[0], &option))
@@ -1606,15 +1613,15 @@ static int parse_options(char *options, struct super_block *sb,
 				return 0;
 			}
 		} else {
-			clear_opt(sbi->s_mount_opt, DATA_FLAGS);
+			clear_opt(sb, DATA_FLAGS);
 			sbi->s_mount_opt |= data_opt;
 		}
 		break;
 	case Opt_data_err_abort:
-		set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+		set_opt(sb, DATA_ERR_ABORT);
 		break;
 	case Opt_data_err_ignore:
-		clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+		clear_opt(sb, DATA_ERR_ABORT);
 		break;
 #ifdef CONFIG_QUOTA
 	case Opt_usrjquota:
@@ -1654,12 +1661,12 @@ set_qf_format:
 		break;
 	case Opt_quota:
 	case Opt_usrquota:
-		set_opt(sbi->s_mount_opt, QUOTA);
-		set_opt(sbi->s_mount_opt, USRQUOTA);
+		set_opt(sb, QUOTA);
+		set_opt(sb, USRQUOTA);
 		break;
 	case Opt_grpquota:
-		set_opt(sbi->s_mount_opt, QUOTA);
-		set_opt(sbi->s_mount_opt, GRPQUOTA);
+		set_opt(sb, QUOTA);
+		set_opt(sb, GRPQUOTA);
 		break;
 	case Opt_noquota:
 		if (sb_any_quota_loaded(sb)) {
@@ -1667,9 +1674,9 @@ set_qf_format:
 			 "options when quota turned on");
 			return 0;
 		}
-		clear_opt(sbi->s_mount_opt, QUOTA);
-		clear_opt(sbi->s_mount_opt, USRQUOTA);
-		clear_opt(sbi->s_mount_opt, GRPQUOTA);
+		clear_opt(sb, QUOTA);
+		clear_opt(sb, USRQUOTA);
+		clear_opt(sb, GRPQUOTA);
 		break;
 #else
 	case Opt_quota:
@@ -1695,7 +1702,7 @@ set_qf_format:
 		sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
 		break;
 	case Opt_nobarrier:
-		clear_opt(sbi->s_mount_opt, BARRIER);
+		clear_opt(sb, BARRIER);
 		break;
 	case Opt_barrier:
 		if (args[0].from) {
@@ -1704,9 +1711,9 @@ set_qf_format:
 		} else
 			option = 1;	/* No argument, default to 1 */
 		if (option)
-			set_opt(sbi->s_mount_opt, BARRIER);
+			set_opt(sb, BARRIER);
 		else
-			clear_opt(sbi->s_mount_opt, BARRIER);
+			clear_opt(sb, BARRIER);
 		break;
 	case Opt_ignore:
 		break;
@@ -1730,17 +1737,17 @@ set_qf_format:
 			 "Ignoring deprecated bh option");
 		break;
 	case Opt_i_version:
-		set_opt(sbi->s_mount_opt, I_VERSION);
+		set_opt(sb, I_VERSION);
 		sb->s_flags |= MS_I_VERSION;
 		break;
 	case Opt_nodelalloc:
-		clear_opt(sbi->s_mount_opt, DELALLOC);
+		clear_opt(sb, DELALLOC);
 		break;
 	case Opt_mblk_io_submit:
-		set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
+		set_opt(sb, MBLK_IO_SUBMIT);
 		break;
 	case Opt_nomblk_io_submit:
-		clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
+		clear_opt(sb, MBLK_IO_SUBMIT);
 		break;
 	case Opt_stripe:
 		if (match_int(&args[0], &option))
@@ -1750,13 +1757,13 @@ set_qf_format:
 		sbi->s_stripe = option;
 		break;
 	case Opt_delalloc:
-		set_opt(sbi->s_mount_opt, DELALLOC);
+		set_opt(sb, DELALLOC);
 		break;
 	case Opt_block_validity:
-		set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+		set_opt(sb, BLOCK_VALIDITY);
 		break;
 	case Opt_noblock_validity:
-		clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+		clear_opt(sb, BLOCK_VALIDITY);
 		break;
 	case Opt_inode_readahead_blks:
 		if (match_int(&args[0], &option))
@@ -1780,7 +1787,7 @@ set_qf_format:
 			 option);
 		break;
 	case Opt_noauto_da_alloc:
-		set_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+		set_opt(sb, NO_AUTO_DA_ALLOC);
 		break;
 	case Opt_auto_da_alloc:
 		if (args[0].from) {
@@ -1789,24 +1796,24 @@ set_qf_format:
 		} else
 			option = 1;	/* No argument, default to 1 */
 		if (option)
-			clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+			clear_opt(sb, NO_AUTO_DA_ALLOC);
 		else
-			set_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+			set_opt(sb, NO_AUTO_DA_ALLOC);
 		break;
 	case Opt_discard:
-		set_opt(sbi->s_mount_opt, DISCARD);
+		set_opt(sb, DISCARD);
 		break;
 	case Opt_nodiscard:
-		clear_opt(sbi->s_mount_opt, DISCARD);
+		clear_opt(sb, DISCARD);
 		break;
 	case Opt_dioread_nolock:
-		set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+		set_opt(sb, DIOREAD_NOLOCK);
 		break;
 	case Opt_dioread_lock:
-		clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+		clear_opt(sb, DIOREAD_NOLOCK);
 		break;
 	case Opt_init_inode_table:
-		set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+		set_opt(sb, INIT_INODE_TABLE);
 		if (args[0].from) {
 			if (match_int(&args[0], &option))
 				return 0;
@@ -1817,7 +1824,7 @@ set_qf_format:
 		sbi->s_li_wait_mult = option;
 		break;
 	case Opt_noinit_inode_table:
-		clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+		clear_opt(sb, INIT_INODE_TABLE);
 		break;
 	default:
 		ext4_msg(sb, KERN_ERR,
@@ -1829,10 +1836,10 @@ set_qf_format:
#ifdef CONFIG_QUOTA
 	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
 		if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
-			clear_opt(sbi->s_mount_opt, USRQUOTA);
+			clear_opt(sb, USRQUOTA);
 
 		if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
-			clear_opt(sbi->s_mount_opt, GRPQUOTA);
+			clear_opt(sb, GRPQUOTA);
 
 		if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
 			ext4_msg(sb, KERN_ERR, "old and new quota "
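
The mechanical set_opt()/clear_opt() churn through set_qf_name() and parse_options() reflects a signature change: the macros now take the super_block rather than the flags word, so call sites stop reaching into sbi->s_mount_opt directly and the macros can route options to a second flags word later (s_mount_opt2, which shows up in the ext4_setup_super() debug output below). Plausibly the before and after in fs/ext4/ext4.h look like this sketch:

	/* before */
	#define set_opt(o, opt)		o |= EXT4_MOUNT_##opt
	#define clear_opt(o, opt)	o &= ~EXT4_MOUNT_##opt

	/* after (sketch) */
	#define set_opt(sb, opt)	(EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_##opt)
	#define clear_opt(sb, opt)	(EXT4_SB(sb)->s_mount_opt &= ~EXT4_MOUNT_##opt)
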
@@ -1902,12 +1909,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 	ext4_commit_super(sb, 1);
 	if (test_opt(sb, DEBUG))
 		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
-				"bpg=%lu, ipg=%lu, mo=%04x]\n",
+				"bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
 			sb->s_blocksize,
 			sbi->s_groups_count,
 			EXT4_BLOCKS_PER_GROUP(sb),
 			EXT4_INODES_PER_GROUP(sb),
-			sbi->s_mount_opt);
+			sbi->s_mount_opt, sbi->s_mount_opt2);
 
 	return res;
 }
@@ -1937,14 +1944,13 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	size = flex_group_count * sizeof(struct flex_groups);
 	sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
 	if (sbi->s_flex_groups == NULL) {
-		sbi->s_flex_groups = vmalloc(size);
-		if (sbi->s_flex_groups)
-			memset(sbi->s_flex_groups, 0, size);
-	}
-	if (sbi->s_flex_groups == NULL) {
-		ext4_msg(sb, KERN_ERR, "not enough memory for "
-				"%u flex groups", flex_group_count);
-		goto failed;
+		sbi->s_flex_groups = vzalloc(size);
+		if (sbi->s_flex_groups == NULL) {
+			ext4_msg(sb, KERN_ERR,
+				 "not enough memory for %u flex groups",
+				 flex_group_count);
+			goto failed;
+		}
 	}
 
 	for (i = 0; i < sbi->s_groups_count; i++) {
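
ext4_fill_flex_info() keeps the kmalloc-first, vmalloc-fallback strategy for the flex-group array (flex_group_count can be large on big filesystems) but switches the fallback to vzalloc(), dropping the manual memset. The pattern, including the matching free side that dual allocation requires, in sketch form:

	/* Sketch of the dual-allocation pattern; error handling trimmed. */
	void *p = kzalloc(size, GFP_KERNEL);	/* fast path: physically contiguous */
	if (!p)
		p = vzalloc(size);		/* fallback: virtually contiguous, pre-zeroed */

	/* The free side must match whichever allocator succeeded: */
	if (is_vmalloc_addr(p))
		vfree(p);
	else
		kfree(p);
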
@@ -2923,7 +2929,7 @@ static int ext4_register_li_request(struct super_block *sb,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_li_request *elr;
 	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
-	int ret;
+	int ret = 0;
 
 	if (sbi->s_li_request != NULL)
 		return 0;
@@ -3078,41 +3084,41 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	/* Set defaults before we parse the mount options */
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
-	set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+	set_opt(sb, INIT_INODE_TABLE);
 	if (def_mount_opts & EXT4_DEFM_DEBUG)
-		set_opt(sbi->s_mount_opt, DEBUG);
+		set_opt(sb, DEBUG);
 	if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
 		ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
 			 "2.6.38");
-		set_opt(sbi->s_mount_opt, GRPID);
+		set_opt(sb, GRPID);
 	}
 	if (def_mount_opts & EXT4_DEFM_UID16)
-		set_opt(sbi->s_mount_opt, NO_UID32);
+		set_opt(sb, NO_UID32);
#ifdef CONFIG_EXT4_FS_XATTR
 	if (def_mount_opts & EXT4_DEFM_XATTR_USER)
-		set_opt(sbi->s_mount_opt, XATTR_USER);
+		set_opt(sb, XATTR_USER);
#endif
#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (def_mount_opts & EXT4_DEFM_ACL)
-		set_opt(sbi->s_mount_opt, POSIX_ACL);
+		set_opt(sb, POSIX_ACL);
#endif
 	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
-		set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+		set_opt(sb, JOURNAL_DATA);
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
-		set_opt(sbi->s_mount_opt, ORDERED_DATA);
+		set_opt(sb, ORDERED_DATA);
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
-		set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
+		set_opt(sb, WRITEBACK_DATA);
 
 	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
-		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+		set_opt(sb, ERRORS_PANIC);
 	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
-		set_opt(sbi->s_mount_opt, ERRORS_CONT);
+		set_opt(sb, ERRORS_CONT);
 	else
-		set_opt(sbi->s_mount_opt, ERRORS_RO);
+		set_opt(sb, ERRORS_RO);
 	if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
-		set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+		set_opt(sb, BLOCK_VALIDITY);
 	if (def_mount_opts & EXT4_DEFM_DISCARD)
-		set_opt(sbi->s_mount_opt, DISCARD);
+		set_opt(sb, DISCARD);
 
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -3121,7 +3127,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
 
 	if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
-		set_opt(sbi->s_mount_opt, BARRIER);
+		set_opt(sb, BARRIER);
 
 	/*
 	 * enable delayed allocation by default
@@ -3129,7 +3135,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	if (!IS_EXT3_SB(sb) &&
 	    ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
-		set_opt(sbi->s_mount_opt, DELALLOC);
+		set_opt(sb, DELALLOC);
 
 	if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
 			   &journal_devnum, &journal_ioprio, NULL, 0)) {
@@ -3432,8 +3438,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				"suppressed and not mounted read-only");
 			goto failed_mount_wq;
 		} else {
-			clear_opt(sbi->s_mount_opt, DATA_FLAGS);
-			set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
+			clear_opt(sb, DATA_FLAGS);
+			set_opt(sb, WRITEBACK_DATA);
 			sbi->s_journal = NULL;
 			needs_recovery = 0;
 			goto no_journal;
@@ -3471,9 +3477,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	if (jbd2_journal_check_available_features
 	    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
-		set_opt(sbi->s_mount_opt, ORDERED_DATA);
+		set_opt(sb, ORDERED_DATA);
 	else
-		set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+		set_opt(sb, JOURNAL_DATA);
 		break;
 
 	case EXT4_MOUNT_ORDERED_DATA:
@@ -3563,18 +3569,18 @@ no_journal:
 	    (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
 		ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
3565 "requested data journaling mode"); 3571 "requested data journaling mode");
3566 clear_opt(sbi->s_mount_opt, DELALLOC); 3572 clear_opt(sb, DELALLOC);
3567 } 3573 }
3568 if (test_opt(sb, DIOREAD_NOLOCK)) { 3574 if (test_opt(sb, DIOREAD_NOLOCK)) {
3569 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 3575 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3570 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3576 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3571 "option - requested data journaling mode"); 3577 "option - requested data journaling mode");
3572 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3578 clear_opt(sb, DIOREAD_NOLOCK);
3573 } 3579 }
3574 if (sb->s_blocksize < PAGE_SIZE) { 3580 if (sb->s_blocksize < PAGE_SIZE) {
3575 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3581 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3576 "option - block size is too small"); 3582 "option - block size is too small");
3577 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3583 clear_opt(sb, DIOREAD_NOLOCK);
3578 } 3584 }
3579 } 3585 }
3580 3586
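Every conversion in this hunk hinges on set_opt() and clear_opt() now taking the superblock instead of the raw s_mount_opt word. The new definitions live in fs/ext4/ext4.h, outside this excerpt; the shape implied by the call sites is roughly:

#define set_opt(sb, opt)	(EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_##opt)
#define clear_opt(sb, opt)	(EXT4_SB(sb)->s_mount_opt &= ~EXT4_MOUNT_##opt)

Hiding the EXT4_SB() dereference inside the macro is what later lets a second option word (s_mount_opt2, used below in the remount code) be added without touching every call site again.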
@@ -4173,6 +4179,22 @@ static int ext4_unfreeze(struct super_block *sb)
4173 return 0; 4179 return 0;
4174} 4180}
4175 4181
4182/*
4183 * Structure to save mount options for ext4_remount's benefit
4184 */
4185struct ext4_mount_options {
4186 unsigned long s_mount_opt;
4187 unsigned long s_mount_opt2;
4188 uid_t s_resuid;
4189 gid_t s_resgid;
4190 unsigned long s_commit_interval;
4191 u32 s_min_batch_time, s_max_batch_time;
4192#ifdef CONFIG_QUOTA
4193 int s_jquota_fmt;
4194 char *s_qf_names[MAXQUOTAS];
4195#endif
4196};
4197
4176static int ext4_remount(struct super_block *sb, int *flags, char *data) 4198static int ext4_remount(struct super_block *sb, int *flags, char *data)
4177{ 4199{
4178 struct ext4_super_block *es; 4200 struct ext4_super_block *es;
@@ -4193,6 +4215,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4193 lock_super(sb); 4215 lock_super(sb);
4194 old_sb_flags = sb->s_flags; 4216 old_sb_flags = sb->s_flags;
4195 old_opts.s_mount_opt = sbi->s_mount_opt; 4217 old_opts.s_mount_opt = sbi->s_mount_opt;
4218 old_opts.s_mount_opt2 = sbi->s_mount_opt2;
4196 old_opts.s_resuid = sbi->s_resuid; 4219 old_opts.s_resuid = sbi->s_resuid;
4197 old_opts.s_resgid = sbi->s_resgid; 4220 old_opts.s_resgid = sbi->s_resgid;
4198 old_opts.s_commit_interval = sbi->s_commit_interval; 4221 old_opts.s_commit_interval = sbi->s_commit_interval;
@@ -4346,6 +4369,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4346restore_opts: 4369restore_opts:
4347 sb->s_flags = old_sb_flags; 4370 sb->s_flags = old_sb_flags;
4348 sbi->s_mount_opt = old_opts.s_mount_opt; 4371 sbi->s_mount_opt = old_opts.s_mount_opt;
4372 sbi->s_mount_opt2 = old_opts.s_mount_opt2;
4349 sbi->s_resuid = old_opts.s_resuid; 4373 sbi->s_resuid = old_opts.s_resuid;
4350 sbi->s_resgid = old_opts.s_resgid; 4374 sbi->s_resgid = old_opts.s_resgid;
4351 sbi->s_commit_interval = old_opts.s_commit_interval; 4375 sbi->s_commit_interval = old_opts.s_commit_interval;
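Note that restore_opts copies back both option words wholesale rather than undoing individual bits: parse_options() may have flipped any subset of them before failing, so snapshot-and-restore is the only safe rollback. A compressed sketch of that discipline (hypothetical wrapper; the real parse_options() takes more arguments than shown):

static int remount_sketch(struct super_block *sb, char *data)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_mount_options old;

	old.s_mount_opt  = sbi->s_mount_opt;	/* snapshot first */
	old.s_mount_opt2 = sbi->s_mount_opt2;

	if (parse_options(data, sb, NULL, NULL, NULL, 1))
		return 0;

	/* Roll back every bit parse_options() may have touched. */
	sbi->s_mount_opt  = old.s_mount_opt;
	sbi->s_mount_opt2 = old.s_mount_opt2;
	return -EINVAL;
}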
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fa4b899da4b3..fc32176eee39 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -427,23 +427,23 @@ cleanup:
427static int 427static int
428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) 428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
429{ 429{
430 int i_error, b_error; 430 int ret, ret2;
431 431
432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem); 432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
433 i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size); 433 ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
434 if (i_error < 0) { 434 if (ret < 0)
435 b_error = 0; 435 goto errout;
436 } else { 436 if (buffer) {
437 if (buffer) { 437 buffer += ret;
438 buffer += i_error; 438 buffer_size -= ret;
439 buffer_size -= i_error;
440 }
441 b_error = ext4_xattr_block_list(dentry, buffer, buffer_size);
442 if (b_error < 0)
443 i_error = 0;
444 } 439 }
440 ret = ext4_xattr_block_list(dentry, buffer, buffer_size);
441 if (ret < 0)
442 goto errout;
443 ret += ret2;
444errout:
445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem); 445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
446 return i_error + b_error; 446 return ret;
447} 447}
448 448
449/* 449/*
@@ -947,7 +947,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
947/* 947/*
948 * ext4_xattr_set_handle() 948 * ext4_xattr_set_handle()
949 * 949 *
950 * Create, replace or remove an extended attribute for this inode. Buffer 950 * Create, replace or remove an extended attribute for this inode. Value
951 * is NULL to remove an existing extended attribute, and non-NULL to 951 * is NULL to remove an existing extended attribute, and non-NULL to
952 * either replace an existing extended attribute, or create a new extended 952 * either replace an existing extended attribute, or create a new extended
953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
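The ext4_xattr_list() rewrite above replaces the i_error/b_error juggling with one running total: ret2 parks the in-inode byte count, ret is reused for the block pass, and any negative value short-circuits to errout so the semaphore is dropped on exactly one path. The underlying contract is plain listxattr concatenation, which a self-contained sketch makes explicit (producer signatures here are hypothetical):

#include <stddef.h>

/* Each producer writes NUL-terminated names into buf and returns the
 * byte count (or the count it *would* write when buf is NULL, which is
 * how callers size their buffers), or a negative errno. */
static int list_two(int (*a)(char *, size_t), int (*b)(char *, size_t),
		    char *buf, size_t size)
{
	int ret, ret2;

	ret = ret2 = a(buf, size);
	if (ret < 0)
		return ret;
	if (buf) {
		buf += ret;	/* append second list after the first */
		size -= ret;
	}
	ret = b(buf, size);
	if (ret < 0)
		return ret;
	return ret + ret2;	/* total for both lists */
}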
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6e07696308dc..cf8d28d1fbad 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -251,6 +251,20 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
251 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 251 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
252} 252}
253 253
254void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
255 u64 nodeid, u64 nlookup)
256{
257 forget->forget_one.nodeid = nodeid;
258 forget->forget_one.nlookup = nlookup;
259
260 spin_lock(&fc->lock);
261 fc->forget_list_tail->next = forget;
262 fc->forget_list_tail = forget;
263 wake_up(&fc->waitq);
264 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
265 spin_unlock(&fc->lock);
266}
267
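fuse_queue_forget() above is the entire producer side of the new forget path: a singly linked list rooted at a dummy node in fuse_conn plus a tail pointer makes enqueue O(1) under fc->lock, with no fuse_req allocation on the (memory-tight) eviction path. A stand-alone user-space model of the queue shape (names mirror the kernel ones, but nothing below is kernel API):

#include <stddef.h>

struct forget_link {
	struct forget_link *next;
};

struct conn {
	struct forget_link head;	/* dummy node, never dequeued */
	struct forget_link *tail;	/* == &head while queue is empty */
};

static void conn_init(struct conn *c)
{
	c->head.next = NULL;
	c->tail = &c->head;
}

static void enqueue(struct conn *c, struct forget_link *f)
{
	f->next = NULL;		/* the kernel gets this from kzalloc */
	c->tail->next = f;	/* append after the current tail */
	c->tail = f;
}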
254static void flush_bg_queue(struct fuse_conn *fc) 268static void flush_bg_queue(struct fuse_conn *fc)
255{ 269{
256 while (fc->active_background < fc->max_background && 270 while (fc->active_background < fc->max_background &&
@@ -438,12 +452,6 @@ static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
438 } 452 }
439} 453}
440 454
441void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
442{
443 req->isreply = 0;
444 fuse_request_send_nowait(fc, req);
445}
446
447void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) 455void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
448{ 456{
449 req->isreply = 1; 457 req->isreply = 1;
@@ -896,9 +904,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
896 return err; 904 return err;
897} 905}
898 906
907static int forget_pending(struct fuse_conn *fc)
908{
909 return fc->forget_list_head.next != NULL;
910}
911
899static int request_pending(struct fuse_conn *fc) 912static int request_pending(struct fuse_conn *fc)
900{ 913{
901 return !list_empty(&fc->pending) || !list_empty(&fc->interrupts); 914 return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) ||
915 forget_pending(fc);
902} 916}
903 917
904/* Wait until a request is available on the pending list */ 918/* Wait until a request is available on the pending list */
@@ -960,6 +974,120 @@ __releases(fc->lock)
960 return err ? err : reqsize; 974 return err ? err : reqsize;
961} 975}
962 976
977static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
978 unsigned max,
979 unsigned *countp)
980{
981 struct fuse_forget_link *head = fc->forget_list_head.next;
982 struct fuse_forget_link **newhead = &head;
983 unsigned count;
984
985 for (count = 0; *newhead != NULL && count < max; count++)
986 newhead = &(*newhead)->next;
987
988 fc->forget_list_head.next = *newhead;
989 *newhead = NULL;
990 if (fc->forget_list_head.next == NULL)
991 fc->forget_list_tail = &fc->forget_list_head;
992
993 if (countp != NULL)
994 *countp = count;
995
996 return head;
997}
998
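dequeue_forget() detaches up to max entries with a pointer-to-pointer walk: after the loop, newhead addresses the next field of the last node being taken, so splitting the chain is two assignments, and the tail is re-armed whenever the queue drains. Continuing the user-space model from above:

static struct forget_link *dequeue(struct conn *c, unsigned int max,
				   unsigned int *countp)
{
	struct forget_link *head = c->head.next;
	struct forget_link **newhead = &head;
	unsigned int count;

	for (count = 0; *newhead != NULL && count < max; count++)
		newhead = &(*newhead)->next;

	c->head.next = *newhead;	/* remainder stays queued */
	*newhead = NULL;		/* detach the first `count` nodes */
	if (c->head.next == NULL)
		c->tail = &c->head;	/* drained: reset the tail */

	if (countp != NULL)
		*countp = count;
	return head;
}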
999static int fuse_read_single_forget(struct fuse_conn *fc,
1000 struct fuse_copy_state *cs,
1001 size_t nbytes)
1002__releases(fc->lock)
1003{
1004 int err;
1005 struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL);
1006 struct fuse_forget_in arg = {
1007 .nlookup = forget->forget_one.nlookup,
1008 };
1009 struct fuse_in_header ih = {
1010 .opcode = FUSE_FORGET,
1011 .nodeid = forget->forget_one.nodeid,
1012 .unique = fuse_get_unique(fc),
1013 .len = sizeof(ih) + sizeof(arg),
1014 };
1015
1016 spin_unlock(&fc->lock);
1017 kfree(forget);
1018 if (nbytes < ih.len)
1019 return -EINVAL;
1020
1021 err = fuse_copy_one(cs, &ih, sizeof(ih));
1022 if (!err)
1023 err = fuse_copy_one(cs, &arg, sizeof(arg));
1024 fuse_copy_finish(cs);
1025
1026 if (err)
1027 return err;
1028
1029 return ih.len;
1030}
1031
1032static int fuse_read_batch_forget(struct fuse_conn *fc,
1033 struct fuse_copy_state *cs, size_t nbytes)
1034__releases(fc->lock)
1035{
1036 int err;
1037 unsigned max_forgets;
1038 unsigned count;
1039 struct fuse_forget_link *head;
1040 struct fuse_batch_forget_in arg = { .count = 0 };
1041 struct fuse_in_header ih = {
1042 .opcode = FUSE_BATCH_FORGET,
1043 .unique = fuse_get_unique(fc),
1044 .len = sizeof(ih) + sizeof(arg),
1045 };
1046
1047 if (nbytes < ih.len) {
1048 spin_unlock(&fc->lock);
1049 return -EINVAL;
1050 }
1051
1052 max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
1053 head = dequeue_forget(fc, max_forgets, &count);
1054 spin_unlock(&fc->lock);
1055
1056 arg.count = count;
1057 ih.len += count * sizeof(struct fuse_forget_one);
1058 err = fuse_copy_one(cs, &ih, sizeof(ih));
1059 if (!err)
1060 err = fuse_copy_one(cs, &arg, sizeof(arg));
1061
1062 while (head) {
1063 struct fuse_forget_link *forget = head;
1064
1065 if (!err) {
1066 err = fuse_copy_one(cs, &forget->forget_one,
1067 sizeof(forget->forget_one));
1068 }
1069 head = forget->next;
1070 kfree(forget);
1071 }
1072
1073 fuse_copy_finish(cs);
1074
1075 if (err)
1076 return err;
1077
1078 return ih.len;
1079}
1080
1081static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs,
1082 size_t nbytes)
1083__releases(fc->lock)
1084{
1085 if (fc->minor < 16 || fc->forget_list_head.next->next == NULL)
1086 return fuse_read_single_forget(fc, cs, nbytes);
1087 else
1088 return fuse_read_batch_forget(fc, cs, nbytes);
1089}
1090
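fuse_read_forget() picks the wire format by negotiated minor version: pre-7.16 servers only understand one FUSE_FORGET per message, while newer ones take a FUSE_BATCH_FORGET whose header length is patched up once the entries are counted. Assuming the upstream struct sizes (fuse_in_header 40 bytes, fuse_batch_forget_in 8, fuse_forget_one 16 -- an assumption about the ABI, not shown in this diff), a 4096-byte read buffer batches:

	max_forgets = (4096 - (40 + 8)) / 16 = 253 entries

and ih.len is then fixed up to 48 + count * 16 before the copy-out.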
963/* 1091/*
964 * Read a single request into the userspace filesystem's buffer. This 1092 * Read a single request into the userspace filesystem's buffer. This
965 * function waits until a request is available, then removes it from 1093 * function waits until a request is available, then removes it from
@@ -998,6 +1126,14 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
998 return fuse_read_interrupt(fc, cs, nbytes, req); 1126 return fuse_read_interrupt(fc, cs, nbytes, req);
999 } 1127 }
1000 1128
1129 if (forget_pending(fc)) {
1130 if (list_empty(&fc->pending) || fc->forget_batch-- > 0)
1131 return fuse_read_forget(fc, cs, nbytes);
1132
1133 if (fc->forget_batch <= -8)
1134 fc->forget_batch = 16;
1135 }
1136
1001 req = list_entry(fc->pending.next, struct fuse_req, list); 1137 req = list_entry(fc->pending.next, struct fuse_req, list);
1002 req->state = FUSE_REQ_READING; 1138 req->state = FUSE_REQ_READING;
1003 list_move(&req->list, &fc->io); 1139 list_move(&req->list, &fc->io);
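The counter juggling in the hunk above rations the forget fast path when regular requests are also waiting: every miss decrements forget_batch, and once it sinks to -8 it is re-armed to 16, settling into roughly sixteen forgets for every eight requests under sustained load. A user-space rendition of just the counter, with both queues assumed permanently non-empty:

#include <stdio.h>

int main(void)
{
	int forget_batch = 0;	/* fuse_conn is kzalloc'd, so starts at 0 */
	int i;

	for (i = 0; i < 48; i++) {
		if (forget_batch-- > 0) {
			putchar('F');	/* serve a queued forget */
			continue;
		}
		if (forget_batch <= -8)
			forget_batch = 16;	/* re-arm after 8 misses */
		putchar('R');		/* serve a regular request */
	}
	putchar('\n');	/* prints RRRRRRRR, then FFFF...F (16), repeating */
	return 0;
}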
@@ -1090,7 +1226,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1090 if (!fc) 1226 if (!fc)
1091 return -EPERM; 1227 return -EPERM;
1092 1228
1093 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); 1229 bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
1094 if (!bufs) 1230 if (!bufs)
1095 return -ENOMEM; 1231 return -ENOMEM;
1096 1232
@@ -1626,7 +1762,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
1626 if (!fc) 1762 if (!fc)
1627 return -EPERM; 1763 return -EPERM;
1628 1764
1629 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); 1765 bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
1630 if (!bufs) 1766 if (!bufs)
1631 return -ENOMEM; 1767 return -ENOMEM;
1632 1768
@@ -1770,6 +1906,8 @@ __acquires(fc->lock)
1770 flush_bg_queue(fc); 1906 flush_bg_queue(fc);
1771 end_requests(fc, &fc->pending); 1907 end_requests(fc, &fc->pending);
1772 end_requests(fc, &fc->processing); 1908 end_requests(fc, &fc->processing);
1909 while (forget_pending(fc))
1910 kfree(dequeue_forget(fc, 1, NULL));
1773} 1911}
1774 1912
1775/* 1913/*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index f738599fd8cd..042af7346ec1 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -10,9 +10,9 @@
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/file.h> 12#include <linux/file.h>
13#include <linux/gfp.h>
14#include <linux/sched.h> 13#include <linux/sched.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
15#include <linux/slab.h>
16 16
17#if BITS_PER_LONG >= 64 17#if BITS_PER_LONG >= 64
18static inline void fuse_dentry_settime(struct dentry *entry, u64 time) 18static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
@@ -169,7 +169,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
169 struct fuse_entry_out outarg; 169 struct fuse_entry_out outarg;
170 struct fuse_conn *fc; 170 struct fuse_conn *fc;
171 struct fuse_req *req; 171 struct fuse_req *req;
172 struct fuse_req *forget_req; 172 struct fuse_forget_link *forget;
173 struct dentry *parent; 173 struct dentry *parent;
174 u64 attr_version; 174 u64 attr_version;
175 175
@@ -182,8 +182,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
182 if (IS_ERR(req)) 182 if (IS_ERR(req))
183 return 0; 183 return 0;
184 184
185 forget_req = fuse_get_req(fc); 185 forget = fuse_alloc_forget();
186 if (IS_ERR(forget_req)) { 186 if (!forget) {
187 fuse_put_request(fc, req); 187 fuse_put_request(fc, req);
188 return 0; 188 return 0;
189 } 189 }
@@ -203,15 +203,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
203 if (!err) { 203 if (!err) {
204 struct fuse_inode *fi = get_fuse_inode(inode); 204 struct fuse_inode *fi = get_fuse_inode(inode);
205 if (outarg.nodeid != get_node_id(inode)) { 205 if (outarg.nodeid != get_node_id(inode)) {
206 fuse_send_forget(fc, forget_req, 206 fuse_queue_forget(fc, forget, outarg.nodeid, 1);
207 outarg.nodeid, 1);
208 return 0; 207 return 0;
209 } 208 }
210 spin_lock(&fc->lock); 209 spin_lock(&fc->lock);
211 fi->nlookup++; 210 fi->nlookup++;
212 spin_unlock(&fc->lock); 211 spin_unlock(&fc->lock);
213 } 212 }
214 fuse_put_request(fc, forget_req); 213 kfree(forget);
215 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) 214 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
216 return 0; 215 return 0;
217 216
@@ -263,7 +262,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
263{ 262{
264 struct fuse_conn *fc = get_fuse_conn_super(sb); 263 struct fuse_conn *fc = get_fuse_conn_super(sb);
265 struct fuse_req *req; 264 struct fuse_req *req;
266 struct fuse_req *forget_req; 265 struct fuse_forget_link *forget;
267 u64 attr_version; 266 u64 attr_version;
268 int err; 267 int err;
269 268
@@ -277,9 +276,9 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
277 if (IS_ERR(req)) 276 if (IS_ERR(req))
278 goto out; 277 goto out;
279 278
280 forget_req = fuse_get_req(fc); 279 forget = fuse_alloc_forget();
281 err = PTR_ERR(forget_req); 280 err = -ENOMEM;
282 if (IS_ERR(forget_req)) { 281 if (!forget) {
283 fuse_put_request(fc, req); 282 fuse_put_request(fc, req);
284 goto out; 283 goto out;
285 } 284 }
@@ -305,13 +304,13 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
305 attr_version); 304 attr_version);
306 err = -ENOMEM; 305 err = -ENOMEM;
307 if (!*inode) { 306 if (!*inode) {
308 fuse_send_forget(fc, forget_req, outarg->nodeid, 1); 307 fuse_queue_forget(fc, forget, outarg->nodeid, 1);
309 goto out; 308 goto out;
310 } 309 }
311 err = 0; 310 err = 0;
312 311
313 out_put_forget: 312 out_put_forget:
314 fuse_put_request(fc, forget_req); 313 kfree(forget);
315 out: 314 out:
316 return err; 315 return err;
317} 316}
@@ -378,7 +377,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
378 struct inode *inode; 377 struct inode *inode;
379 struct fuse_conn *fc = get_fuse_conn(dir); 378 struct fuse_conn *fc = get_fuse_conn(dir);
380 struct fuse_req *req; 379 struct fuse_req *req;
381 struct fuse_req *forget_req; 380 struct fuse_forget_link *forget;
382 struct fuse_create_in inarg; 381 struct fuse_create_in inarg;
383 struct fuse_open_out outopen; 382 struct fuse_open_out outopen;
384 struct fuse_entry_out outentry; 383 struct fuse_entry_out outentry;
@@ -392,9 +391,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
392 if (flags & O_DIRECT) 391 if (flags & O_DIRECT)
393 return -EINVAL; 392 return -EINVAL;
394 393
395 forget_req = fuse_get_req(fc); 394 forget = fuse_alloc_forget();
396 if (IS_ERR(forget_req)) 395 if (!forget)
397 return PTR_ERR(forget_req); 396 return -ENOMEM;
398 397
399 req = fuse_get_req(fc); 398 req = fuse_get_req(fc);
400 err = PTR_ERR(req); 399 err = PTR_ERR(req);
@@ -452,10 +451,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
452 if (!inode) { 451 if (!inode) {
453 flags &= ~(O_CREAT | O_EXCL | O_TRUNC); 452 flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
454 fuse_sync_release(ff, flags); 453 fuse_sync_release(ff, flags);
455 fuse_send_forget(fc, forget_req, outentry.nodeid, 1); 454 fuse_queue_forget(fc, forget, outentry.nodeid, 1);
456 return -ENOMEM; 455 return -ENOMEM;
457 } 456 }
458 fuse_put_request(fc, forget_req); 457 kfree(forget);
459 d_instantiate(entry, inode); 458 d_instantiate(entry, inode);
460 fuse_change_entry_timeout(entry, &outentry); 459 fuse_change_entry_timeout(entry, &outentry);
461 fuse_invalidate_attr(dir); 460 fuse_invalidate_attr(dir);
@@ -473,7 +472,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
473 out_put_request: 472 out_put_request:
474 fuse_put_request(fc, req); 473 fuse_put_request(fc, req);
475 out_put_forget_req: 474 out_put_forget_req:
476 fuse_put_request(fc, forget_req); 475 kfree(forget);
477 return err; 476 return err;
478} 477}
479 478
@@ -487,12 +486,12 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
487 struct fuse_entry_out outarg; 486 struct fuse_entry_out outarg;
488 struct inode *inode; 487 struct inode *inode;
489 int err; 488 int err;
490 struct fuse_req *forget_req; 489 struct fuse_forget_link *forget;
491 490
492 forget_req = fuse_get_req(fc); 491 forget = fuse_alloc_forget();
493 if (IS_ERR(forget_req)) { 492 if (!forget) {
494 fuse_put_request(fc, req); 493 fuse_put_request(fc, req);
495 return PTR_ERR(forget_req); 494 return -ENOMEM;
496 } 495 }
497 496
498 memset(&outarg, 0, sizeof(outarg)); 497 memset(&outarg, 0, sizeof(outarg));
@@ -519,10 +518,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
519 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, 518 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
520 &outarg.attr, entry_attr_timeout(&outarg), 0); 519 &outarg.attr, entry_attr_timeout(&outarg), 0);
521 if (!inode) { 520 if (!inode) {
522 fuse_send_forget(fc, forget_req, outarg.nodeid, 1); 521 fuse_queue_forget(fc, forget, outarg.nodeid, 1);
523 return -ENOMEM; 522 return -ENOMEM;
524 } 523 }
525 fuse_put_request(fc, forget_req); 524 kfree(forget);
526 525
527 if (S_ISDIR(inode->i_mode)) { 526 if (S_ISDIR(inode->i_mode)) {
528 struct dentry *alias; 527 struct dentry *alias;
@@ -545,7 +544,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
545 return 0; 544 return 0;
546 545
547 out_put_forget_req: 546 out_put_forget_req:
548 fuse_put_request(fc, forget_req); 547 kfree(forget);
549 return err; 548 return err;
550} 549}
551 550
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8b984a2cebbd..95da1bc1c826 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1634,9 +1634,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1634 * and 64bit. Fortunately we can determine which structure the server 1634 * and 64bit. Fortunately we can determine which structure the server
1635 * used from the size of the reply. 1635 * used from the size of the reply.
1636 */ 1636 */
1637static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src, 1637static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
1638 size_t transferred, unsigned count, 1638 size_t transferred, unsigned count,
1639 bool is_compat) 1639 bool is_compat)
1640{ 1640{
1641#ifdef CONFIG_COMPAT 1641#ifdef CONFIG_COMPAT
1642 if (count * sizeof(struct compat_iovec) == transferred) { 1642 if (count * sizeof(struct compat_iovec) == transferred) {
@@ -1680,6 +1680,42 @@ static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
1680 return 0; 1680 return 0;
1681} 1681}
1682 1682
1683static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
1684 void *src, size_t transferred, unsigned count,
1685 bool is_compat)
1686{
1687 unsigned i;
1688 struct fuse_ioctl_iovec *fiov = src;
1689
1690 if (fc->minor < 16) {
1691 return fuse_copy_ioctl_iovec_old(dst, src, transferred,
1692 count, is_compat);
1693 }
1694
1695 if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
1696 return -EIO;
1697
1698 for (i = 0; i < count; i++) {
1699 /* Did the server supply an inappropriate value? */
1700 if (fiov[i].base != (unsigned long) fiov[i].base ||
1701 fiov[i].len != (unsigned long) fiov[i].len)
1702 return -EIO;
1703
1704 dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
1705 dst[i].iov_len = (size_t) fiov[i].len;
1706
1707#ifdef CONFIG_COMPAT
1708 if (is_compat &&
1709 (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
1710 (compat_size_t) dst[i].iov_len != fiov[i].len))
1711 return -EIO;
1712#endif
1713 }
1714
1715 return 0;
1716}
1717
1718
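fuse_copy_ioctl_iovec() trusts nothing from the server: on 7.16+ the iovecs arrive as fixed-width 64-bit fuse_ioctl_iovec entries, and each base and len is round-tripped through the native width to reject values a 32-bit kernel (or a compat caller) cannot represent. The whole check reduces to a narrowing comparison:

#include <stdint.h>
#include <stdbool.h>

/* True iff v survives a round trip through unsigned long, i.e. fits the
 * native word.  On a 32-bit build, 0x100000000 fails; on 64-bit,
 * everything passes and the test compiles away. */
static bool fits_native(uint64_t v)
{
	return v == (unsigned long)v;
}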
1683/* 1719/*
1684 * For ioctls, there is no generic way to determine how much memory 1720 * For ioctls, there is no generic way to determine how much memory
1685 * needs to be read and/or written. Furthermore, ioctls are allowed 1721 * needs to be read and/or written. Furthermore, ioctls are allowed
@@ -1740,18 +1776,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1740 struct fuse_ioctl_out outarg; 1776 struct fuse_ioctl_out outarg;
1741 struct fuse_req *req = NULL; 1777 struct fuse_req *req = NULL;
1742 struct page **pages = NULL; 1778 struct page **pages = NULL;
1743 struct page *iov_page = NULL; 1779 struct iovec *iov_page = NULL;
1744 struct iovec *in_iov = NULL, *out_iov = NULL; 1780 struct iovec *in_iov = NULL, *out_iov = NULL;
1745 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; 1781 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
1746 size_t in_size, out_size, transferred; 1782 size_t in_size, out_size, transferred;
1747 int err; 1783 int err;
1748 1784
1785#if BITS_PER_LONG == 32
1786 inarg.flags |= FUSE_IOCTL_32BIT;
1787#else
1788 if (flags & FUSE_IOCTL_COMPAT)
1789 inarg.flags |= FUSE_IOCTL_32BIT;
1790#endif
1791
1749 /* assume all the iovs returned by client always fit in a page */ 1792 /* assume all the iovs returned by client always fit in a page */
1750 BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); 1793 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1751 1794
1752 err = -ENOMEM; 1795 err = -ENOMEM;
1753 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); 1796 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
1754 iov_page = alloc_page(GFP_KERNEL); 1797 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
1755 if (!pages || !iov_page) 1798 if (!pages || !iov_page)
1756 goto out; 1799 goto out;
1757 1800
@@ -1760,7 +1803,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1760 * RETRY from server is not allowed. 1803 * RETRY from server is not allowed.
1761 */ 1804 */
1762 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { 1805 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
1763 struct iovec *iov = page_address(iov_page); 1806 struct iovec *iov = iov_page;
1764 1807
1765 iov->iov_base = (void __user *)arg; 1808 iov->iov_base = (void __user *)arg;
1766 iov->iov_len = _IOC_SIZE(cmd); 1809 iov->iov_len = _IOC_SIZE(cmd);
@@ -1841,7 +1884,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1841 1884
1842 /* did it ask for retry? */ 1885 /* did it ask for retry? */
1843 if (outarg.flags & FUSE_IOCTL_RETRY) { 1886 if (outarg.flags & FUSE_IOCTL_RETRY) {
1844 char *vaddr; 1887 void *vaddr;
1845 1888
1846 /* no retry if in restricted mode */ 1889 /* no retry if in restricted mode */
1847 err = -EIO; 1890 err = -EIO;
@@ -1862,14 +1905,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1862 goto out; 1905 goto out;
1863 1906
1864 vaddr = kmap_atomic(pages[0], KM_USER0); 1907 vaddr = kmap_atomic(pages[0], KM_USER0);
1865 err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr, 1908 err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
1866 transferred, in_iovs + out_iovs, 1909 transferred, in_iovs + out_iovs,
1867 (flags & FUSE_IOCTL_COMPAT) != 0); 1910 (flags & FUSE_IOCTL_COMPAT) != 0);
1868 kunmap_atomic(vaddr, KM_USER0); 1911 kunmap_atomic(vaddr, KM_USER0);
1869 if (err) 1912 if (err)
1870 goto out; 1913 goto out;
1871 1914
1872 in_iov = page_address(iov_page); 1915 in_iov = iov_page;
1873 out_iov = in_iov + in_iovs; 1916 out_iov = in_iov + in_iovs;
1874 1917
1875 err = fuse_verify_ioctl_iov(in_iov, in_iovs); 1918 err = fuse_verify_ioctl_iov(in_iov, in_iovs);
@@ -1891,8 +1934,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1891 out: 1934 out:
1892 if (req) 1935 if (req)
1893 fuse_put_request(fc, req); 1936 fuse_put_request(fc, req);
1894 if (iov_page) 1937 free_page((unsigned long) iov_page);
1895 __free_page(iov_page);
1896 while (num_pages) 1938 while (num_pages)
1897 __free_page(pages[--num_pages]); 1939 __free_page(pages[--num_pages]);
1898 kfree(pages); 1940 kfree(pages);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 57d4a3a0f102..ae5744a2f9e9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -53,6 +53,12 @@ extern struct mutex fuse_mutex;
53extern unsigned max_user_bgreq; 53extern unsigned max_user_bgreq;
54extern unsigned max_user_congthresh; 54extern unsigned max_user_congthresh;
55 55
56/* One forget request */
57struct fuse_forget_link {
58 struct fuse_forget_one forget_one;
59 struct fuse_forget_link *next;
60};
61
56/** FUSE inode */ 62/** FUSE inode */
57struct fuse_inode { 63struct fuse_inode {
58 /** Inode data */ 64 /** Inode data */
@@ -66,7 +72,7 @@ struct fuse_inode {
66 u64 nlookup; 72 u64 nlookup;
67 73
68 /** The request used for sending the FORGET message */ 74 /** The request used for sending the FORGET message */
69 struct fuse_req *forget_req; 75 struct fuse_forget_link *forget;
70 76
71 /** Time in jiffies until the file attributes are valid */ 77 /** Time in jiffies until the file attributes are valid */
72 u64 i_time; 78 u64 i_time;
@@ -255,7 +261,6 @@ struct fuse_req {
255 261
256 /** Data for asynchronous requests */ 262 /** Data for asynchronous requests */
257 union { 263 union {
258 struct fuse_forget_in forget_in;
259 struct { 264 struct {
260 struct fuse_release_in in; 265 struct fuse_release_in in;
261 struct path path; 266 struct path path;
@@ -369,6 +374,13 @@ struct fuse_conn {
369 /** Pending interrupts */ 374 /** Pending interrupts */
370 struct list_head interrupts; 375 struct list_head interrupts;
371 376
377 /** Queue of pending forgets */
378 struct fuse_forget_link forget_list_head;
379 struct fuse_forget_link *forget_list_tail;
380
381 /** Batching of FORGET requests (positive indicates FORGET batch) */
382 int forget_batch;
383
372 /** Flag indicating if connection is blocked. This will be 384 /** Flag indicating if connection is blocked. This will be
373 the case before the INIT reply is received, and if there 385 the case before the INIT reply is received, and if there
374 are too many outstanding background requests */ 386 are too many outstanding background requests */
@@ -543,8 +555,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
543/** 555/**
544 * Send FORGET command 556 * Send FORGET command
545 */ 557 */
546void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, 558void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
547 u64 nodeid, u64 nlookup); 559 u64 nodeid, u64 nlookup);
560
561struct fuse_forget_link *fuse_alloc_forget(void);
548 562
549/** 563/**
550 * Initialize READ or READDIR request 564 * Initialize READ or READDIR request
@@ -656,11 +670,6 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
656void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); 670void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
657 671
658/** 672/**
659 * Send a request with no reply
660 */
661void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
662
663/**
664 * Send a request in the background 673 * Send a request in the background
665 */ 674 */
666void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); 675void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a8b31da19b93..f62b32cffea9 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -71,6 +71,11 @@ struct fuse_mount_data {
71 unsigned blksize; 71 unsigned blksize;
72}; 72};
73 73
74struct fuse_forget_link *fuse_alloc_forget(void)
75{
76 return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
77}
78
74static struct inode *fuse_alloc_inode(struct super_block *sb) 79static struct inode *fuse_alloc_inode(struct super_block *sb)
75{ 80{
76 struct inode *inode; 81 struct inode *inode;
@@ -90,8 +95,8 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
90 INIT_LIST_HEAD(&fi->queued_writes); 95 INIT_LIST_HEAD(&fi->queued_writes);
91 INIT_LIST_HEAD(&fi->writepages); 96 INIT_LIST_HEAD(&fi->writepages);
92 init_waitqueue_head(&fi->page_waitq); 97 init_waitqueue_head(&fi->page_waitq);
93 fi->forget_req = fuse_request_alloc(); 98 fi->forget = fuse_alloc_forget();
94 if (!fi->forget_req) { 99 if (!fi->forget) {
95 kmem_cache_free(fuse_inode_cachep, inode); 100 kmem_cache_free(fuse_inode_cachep, inode);
96 return NULL; 101 return NULL;
97 } 102 }
@@ -111,24 +116,10 @@ static void fuse_destroy_inode(struct inode *inode)
111 struct fuse_inode *fi = get_fuse_inode(inode); 116 struct fuse_inode *fi = get_fuse_inode(inode);
112 BUG_ON(!list_empty(&fi->write_files)); 117 BUG_ON(!list_empty(&fi->write_files));
113 BUG_ON(!list_empty(&fi->queued_writes)); 118 BUG_ON(!list_empty(&fi->queued_writes));
114 if (fi->forget_req) 119 kfree(fi->forget);
115 fuse_request_free(fi->forget_req);
116 call_rcu(&inode->i_rcu, fuse_i_callback); 120 call_rcu(&inode->i_rcu, fuse_i_callback);
117} 121}
118 122
119void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
120 u64 nodeid, u64 nlookup)
121{
122 struct fuse_forget_in *inarg = &req->misc.forget_in;
123 inarg->nlookup = nlookup;
124 req->in.h.opcode = FUSE_FORGET;
125 req->in.h.nodeid = nodeid;
126 req->in.numargs = 1;
127 req->in.args[0].size = sizeof(struct fuse_forget_in);
128 req->in.args[0].value = inarg;
129 fuse_request_send_noreply(fc, req);
130}
131
132static void fuse_evict_inode(struct inode *inode) 123static void fuse_evict_inode(struct inode *inode)
133{ 124{
134 truncate_inode_pages(&inode->i_data, 0); 125 truncate_inode_pages(&inode->i_data, 0);
@@ -136,8 +127,8 @@ static void fuse_evict_inode(struct inode *inode)
136 if (inode->i_sb->s_flags & MS_ACTIVE) { 127 if (inode->i_sb->s_flags & MS_ACTIVE) {
137 struct fuse_conn *fc = get_fuse_conn(inode); 128 struct fuse_conn *fc = get_fuse_conn(inode);
138 struct fuse_inode *fi = get_fuse_inode(inode); 129 struct fuse_inode *fi = get_fuse_inode(inode);
139 fuse_send_forget(fc, fi->forget_req, fi->nodeid, fi->nlookup); 130 fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
140 fi->forget_req = NULL; 131 fi->forget = NULL;
141 } 132 }
142} 133}
143 134
@@ -541,6 +532,7 @@ void fuse_conn_init(struct fuse_conn *fc)
541 INIT_LIST_HEAD(&fc->interrupts); 532 INIT_LIST_HEAD(&fc->interrupts);
542 INIT_LIST_HEAD(&fc->bg_queue); 533 INIT_LIST_HEAD(&fc->bg_queue);
543 INIT_LIST_HEAD(&fc->entry); 534 INIT_LIST_HEAD(&fc->entry);
535 fc->forget_list_tail = &fc->forget_list_head;
544 atomic_set(&fc->num_waiting, 0); 536 atomic_set(&fc->num_waiting, 0);
545 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; 537 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
546 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; 538 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 8d3d2b4a0a7d..a79790c06275 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -11,6 +11,7 @@
11#define __INCORE_DOT_H__ 11#define __INCORE_DOT_H__
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/kobject.h>
14#include <linux/workqueue.h> 15#include <linux/workqueue.h>
15#include <linux/dlm.h> 16#include <linux/dlm.h>
16#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f837ba953529..9e4686900f18 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -43,6 +43,7 @@
43#include <linux/vmalloc.h> 43#include <linux/vmalloc.h>
44#include <linux/backing-dev.h> 44#include <linux/backing-dev.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/ratelimit.h>
46 47
47#define CREATE_TRACE_POINTS 48#define CREATE_TRACE_POINTS
48#include <trace/events/jbd2.h> 49#include <trace/events/jbd2.h>
@@ -93,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_file_inode);
93EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); 94EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
94EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); 95EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
95EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); 96EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
97EXPORT_SYMBOL(jbd2_inode_cache);
96 98
97static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 99static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
98static void __journal_abort_soft (journal_t *journal, int errno); 100static void __journal_abort_soft (journal_t *journal, int errno);
@@ -827,7 +829,7 @@ static journal_t * journal_init_common (void)
827 829
828 journal = kzalloc(sizeof(*journal), GFP_KERNEL); 830 journal = kzalloc(sizeof(*journal), GFP_KERNEL);
829 if (!journal) 831 if (!journal)
830 goto fail; 832 return NULL;
831 833
832 init_waitqueue_head(&journal->j_wait_transaction_locked); 834 init_waitqueue_head(&journal->j_wait_transaction_locked);
833 init_waitqueue_head(&journal->j_wait_logspace); 835 init_waitqueue_head(&journal->j_wait_logspace);
@@ -852,14 +854,12 @@ static journal_t * journal_init_common (void)
852 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); 854 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
853 if (err) { 855 if (err) {
854 kfree(journal); 856 kfree(journal);
855 goto fail; 857 return NULL;
856 } 858 }
857 859
858 spin_lock_init(&journal->j_history_lock); 860 spin_lock_init(&journal->j_history_lock);
859 861
860 return journal; 862 return journal;
861fail:
862 return NULL;
863} 863}
864 864
865/* jbd2_journal_init_dev and jbd2_journal_init_inode: 865/* jbd2_journal_init_dev and jbd2_journal_init_inode:
@@ -1982,7 +1982,6 @@ static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
1982static struct journal_head *journal_alloc_journal_head(void) 1982static struct journal_head *journal_alloc_journal_head(void)
1983{ 1983{
1984 struct journal_head *ret; 1984 struct journal_head *ret;
1985 static unsigned long last_warning;
1986 1985
1987#ifdef CONFIG_JBD2_DEBUG 1986#ifdef CONFIG_JBD2_DEBUG
1988 atomic_inc(&nr_journal_heads); 1987 atomic_inc(&nr_journal_heads);
@@ -1990,11 +1989,7 @@ static struct journal_head *journal_alloc_journal_head(void)
1990 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 1989 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
1991 if (!ret) { 1990 if (!ret) {
1992 jbd_debug(1, "out of memory for journal_head\n"); 1991 jbd_debug(1, "out of memory for journal_head\n");
1993 if (time_after(jiffies, last_warning + 5*HZ)) { 1992 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
1994 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1995 __func__);
1996 last_warning = jiffies;
1997 }
1998 while (!ret) { 1993 while (!ret) {
1999 yield(); 1994 yield();
2000 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 1995 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
@@ -2292,17 +2287,19 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
2292 2287
2293#endif 2288#endif
2294 2289
2295struct kmem_cache *jbd2_handle_cache; 2290struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
2296 2291
2297static int __init journal_init_handle_cache(void) 2292static int __init journal_init_handle_cache(void)
2298{ 2293{
2299 jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", 2294 jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
2300 sizeof(handle_t),
2301 0, /* offset */
2302 SLAB_TEMPORARY, /* flags */
2303 NULL); /* ctor */
2304 if (jbd2_handle_cache == NULL) { 2295 if (jbd2_handle_cache == NULL) {
2305 printk(KERN_EMERG "JBD: failed to create handle cache\n"); 2296 printk(KERN_EMERG "JBD2: failed to create handle cache\n");
2297 return -ENOMEM;
2298 }
2299 jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
2300 if (jbd2_inode_cache == NULL) {
2301 printk(KERN_EMERG "JBD2: failed to create inode cache\n");
2302 kmem_cache_destroy(jbd2_handle_cache);
2306 return -ENOMEM; 2303 return -ENOMEM;
2307 } 2304 }
2308 return 0; 2305 return 0;
@@ -2312,6 +2309,9 @@ static void jbd2_journal_destroy_handle_cache(void)
2312{ 2309{
2313 if (jbd2_handle_cache) 2310 if (jbd2_handle_cache)
2314 kmem_cache_destroy(jbd2_handle_cache); 2311 kmem_cache_destroy(jbd2_handle_cache);
2312 if (jbd2_inode_cache)
2313 kmem_cache_destroy(jbd2_inode_cache);
2314
2315} 2315}
2316 2316
2317/* 2317/*
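The five-argument kmem_cache_create() call collapses because KMEM_CACHE() (from linux/slab.h) derives the cache name, object size, and alignment from the struct type itself:

#define KMEM_CACHE(__struct, __flags) \
	kmem_cache_create(#__struct, sizeof(struct __struct), \
			  __alignof__(struct __struct), (__flags), NULL)

so jbd2_handle_cache keeps its slab name ("jbd2_journal_handle") while the new jbd2_inode cache comes correctly sized and aligned for free.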
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 2bc4d5f116f1..1cad869494f0 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -299,10 +299,10 @@ int jbd2_journal_skip_recovery(journal_t *journal)
299#ifdef CONFIG_JBD2_DEBUG 299#ifdef CONFIG_JBD2_DEBUG
300 int dropped = info.end_transaction - 300 int dropped = info.end_transaction -
301 be32_to_cpu(journal->j_superblock->s_sequence); 301 be32_to_cpu(journal->j_superblock->s_sequence);
302#endif
303 jbd_debug(1, 302 jbd_debug(1,
304 "JBD: ignoring %d transaction%s from the journal.\n", 303 "JBD: ignoring %d transaction%s from the journal.\n",
305 dropped, (dropped == 1) ? "" : "s"); 304 dropped, (dropped == 1) ? "" : "s");
305#endif
306 journal->j_transaction_sequence = ++info.end_transaction; 306 journal->j_transaction_sequence = ++info.end_transaction;
307 } 307 }
308 308
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6bf0a242613e..394893242ae3 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -340,9 +340,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
340 jbd2_free_handle(handle); 340 jbd2_free_handle(handle);
341 current->journal_info = NULL; 341 current->journal_info = NULL;
342 handle = ERR_PTR(err); 342 handle = ERR_PTR(err);
343 goto out;
344 } 343 }
345out:
346 return handle; 344 return handle;
347} 345}
348EXPORT_SYMBOL(jbd2__journal_start); 346EXPORT_SYMBOL(jbd2__journal_start);
@@ -589,7 +587,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
589 transaction = handle->h_transaction; 587 transaction = handle->h_transaction;
590 journal = transaction->t_journal; 588 journal = transaction->t_journal;
591 589
592 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); 590 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
593 591
594 JBUFFER_TRACE(jh, "entry"); 592 JBUFFER_TRACE(jh, "entry");
595repeat: 593repeat:
@@ -774,7 +772,7 @@ done:
774 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), 772 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
775 "Possible IO failure.\n"); 773 "Possible IO failure.\n");
776 page = jh2bh(jh)->b_page; 774 page = jh2bh(jh)->b_page;
777 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; 775 offset = offset_in_page(jh2bh(jh)->b_data);
778 source = kmap_atomic(page, KM_USER0); 776 source = kmap_atomic(page, KM_USER0);
779 /* Fire data frozen trigger just before we copy the data */ 777 /* Fire data frozen trigger just before we copy the data */
780 jbd2_buffer_frozen_trigger(jh, source + offset, 778 jbd2_buffer_frozen_trigger(jh, source + offset,
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 97f6073ab339..ca58d64374ca 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-$(CONFIG_LOCKD) += lockd.o 5obj-$(CONFIG_LOCKD) += lockd.o
6 6
7lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ 7lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
8 svcproc.o svcsubs.o mon.o xdr.o grace.o 8 svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o
9lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o 9lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
10lockd-objs := $(lockd-objs-y) 10lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
new file mode 100644
index 000000000000..f848b52c67b1
--- /dev/null
+++ b/fs/lockd/clnt4xdr.c
@@ -0,0 +1,605 @@
1/*
2 * linux/fs/lockd/clnt4xdr.c
3 *
4 * XDR functions to encode/decode NLM version 4 RPC arguments and results.
5 *
6 * NLM client-side only.
7 *
8 * Copyright (C) 2010, Oracle. All rights reserved.
9 */
10
11#include <linux/types.h>
12#include <linux/sunrpc/xdr.h>
13#include <linux/sunrpc/clnt.h>
14#include <linux/sunrpc/stats.h>
15#include <linux/lockd/lockd.h>
16
17#define NLMDBG_FACILITY NLMDBG_XDR
18
19#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
20# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
21#endif
22
23#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
24# error "NLM host name cannot be larger than NLM's maximum string length!"
25#endif
26
27/*
28 * Declare the space requirements for NLM arguments and replies as
29 * number of 32bit-words
30 */
31#define NLM4_void_sz (0)
32#define NLM4_cookie_sz (1+(NLM_MAXCOOKIELEN>>2))
33#define NLM4_caller_sz (1+(NLMCLNT_OHSIZE>>2))
34#define NLM4_owner_sz (1+(NLMCLNT_OHSIZE>>2))
35#define NLM4_fhandle_sz (1+(NFS3_FHSIZE>>2))
36#define NLM4_lock_sz (5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz)
37#define NLM4_holder_sz (6+NLM4_owner_sz)
38
39#define NLM4_testargs_sz (NLM4_cookie_sz+1+NLM4_lock_sz)
40#define NLM4_lockargs_sz (NLM4_cookie_sz+4+NLM4_lock_sz)
41#define NLM4_cancargs_sz (NLM4_cookie_sz+2+NLM4_lock_sz)
42#define NLM4_unlockargs_sz (NLM4_cookie_sz+NLM4_lock_sz)
43
44#define NLM4_testres_sz (NLM4_cookie_sz+1+NLM4_holder_sz)
45#define NLM4_res_sz (NLM4_cookie_sz+1)
46#define NLM4_norep_sz (0)
47
48
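These _sz constants budget RPC buffers in 32-bit XDR words: a variable-length opaque costs one word for its length plus its payload rounded up to whole words, hence the recurring 1+(len>>2) shape. Taking the customary NLM_MAXCOOKIELEN of 32 bytes (an assumption here; the constant is defined elsewhere in lockd's headers):

/*
 *	NLM4_cookie_sz = 1 + (32 >> 2)      = 9 words  = 36 bytes max
 *	NLM4_res_sz    = NLM4_cookie_sz + 1 = 10 words (cookie + status)
 */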
49static s64 loff_t_to_s64(loff_t offset)
50{
51 s64 res;
52
53 if (offset >= NLM4_OFFSET_MAX)
54 res = NLM4_OFFSET_MAX;
55 else if (offset <= -NLM4_OFFSET_MAX)
56 res = -NLM4_OFFSET_MAX;
57 else
58 res = offset;
59 return res;
60}
61
62static void nlm4_compute_offsets(const struct nlm_lock *lock,
63 u64 *l_offset, u64 *l_len)
64{
65 const struct file_lock *fl = &lock->fl;
66
67 BUG_ON(fl->fl_start > NLM4_OFFSET_MAX);
68 BUG_ON(fl->fl_end > NLM4_OFFSET_MAX &&
69 fl->fl_end != OFFSET_MAX);
70
71 *l_offset = loff_t_to_s64(fl->fl_start);
72 if (fl->fl_end == OFFSET_MAX)
73 *l_len = 0;
74 else
75 *l_len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
76}
77
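nlm4_compute_offsets() translates the VFS's inclusive [fl_start, fl_end] byte range into the protocol's offset-plus-length pair, with a lock to end-of-file (fl_end == OFFSET_MAX) sent as l_len = 0. Two worked cases:

/*
 *	bytes 100..199:  l_offset = 100, l_len = 199 - 100 + 1 = 100
 *	bytes 100..EOF:  l_offset = 100, l_len = 0  ("until the end")
 */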
78/*
79 * Handle decode buffer overflows out-of-line.
80 */
81static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
82{
83 dprintk("lockd: %s prematurely hit the end of our receive buffer. "
84 "Remaining buffer length is %tu words.\n",
85 func, xdr->end - xdr->p);
86}
87
88
89/*
90 * Encode/decode NLMv4 basic data types
91 *
92 * Basic NLMv4 data types are defined in Appendix II, section 6.1.4
93 * of RFC 1813: "NFS Version 3 Protocol Specification" and in Chapter
94 * 10 of X/Open's "Protocols for Interworking: XNFS, Version 3W".
95 *
96 * Not all basic data types have their own encoding and decoding
97 * functions. For run-time efficiency, some data types are encoded
98 * or decoded inline.
99 */
100
101static void encode_bool(struct xdr_stream *xdr, const int value)
102{
103 __be32 *p;
104
105 p = xdr_reserve_space(xdr, 4);
106 *p = value ? xdr_one : xdr_zero;
107}
108
109static void encode_int32(struct xdr_stream *xdr, const s32 value)
110{
111 __be32 *p;
112
113 p = xdr_reserve_space(xdr, 4);
114 *p = cpu_to_be32(value);
115}
116
117/*
118 * typedef opaque netobj<MAXNETOBJ_SZ>
119 */
120static void encode_netobj(struct xdr_stream *xdr,
121 const u8 *data, const unsigned int length)
122{
123 __be32 *p;
124
125 BUG_ON(length > XDR_MAX_NETOBJ);
126 p = xdr_reserve_space(xdr, 4 + length);
127 xdr_encode_opaque(p, data, length);
128}
129
130static int decode_netobj(struct xdr_stream *xdr,
131 struct xdr_netobj *obj)
132{
133 u32 length;
134 __be32 *p;
135
136 p = xdr_inline_decode(xdr, 4);
137 if (unlikely(p == NULL))
138 goto out_overflow;
139 length = be32_to_cpup(p++);
140 if (unlikely(length > XDR_MAX_NETOBJ))
141 goto out_size;
142 obj->len = length;
143 obj->data = (u8 *)p;
144 return 0;
145out_size:
146 dprintk("NFS: returned netobj was too long: %u\n", length);
147 return -EIO;
148out_overflow:
149 print_overflow_msg(__func__, xdr);
150 return -EIO;
151}
152
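encode_netobj() and decode_netobj() handle the XDR opaque<> type: a big-endian length word, then the data, padded to a four-byte boundary. A five-byte object therefore takes twelve bytes on the wire:

/*
 *	00 00 00 05   de ad be ef 01   00 00 00
 *	`-- length --'`----- data ----'`- pad -'
 */

Note the decode side is zero-copy: obj->data points straight into the RPC receive buffer, so the result is only valid as long as that buffer is.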
153/*
154 * netobj cookie;
155 */
156static void encode_cookie(struct xdr_stream *xdr,
157 const struct nlm_cookie *cookie)
158{
159 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
160 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
161}
162
163static int decode_cookie(struct xdr_stream *xdr,
164 struct nlm_cookie *cookie)
165{
166 u32 length;
167 __be32 *p;
168
169 p = xdr_inline_decode(xdr, 4);
170 if (unlikely(p == NULL))
171 goto out_overflow;
172 length = be32_to_cpup(p++);
173 /* apparently HPUX can return empty cookies */
174 if (length == 0)
175 goto out_hpux;
176 if (length > NLM_MAXCOOKIELEN)
177 goto out_size;
178 p = xdr_inline_decode(xdr, length);
179 if (unlikely(p == NULL))
180 goto out_overflow;
181 cookie->len = length;
182 memcpy(cookie->data, p, length);
183 return 0;
184out_hpux:
185 cookie->len = 4;
186 memset(cookie->data, 0, 4);
187 return 0;
188out_size:
189 dprintk("NFS: returned cookie was too long: %u\n", length);
190 return -EIO;
191out_overflow:
192 print_overflow_msg(__func__, xdr);
193 return -EIO;
194}
195
196/*
197 * netobj fh;
198 */
199static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
200{
201 BUG_ON(fh->size > NFS3_FHSIZE);
202 encode_netobj(xdr, (u8 *)&fh->data, fh->size);
203}
204
205/*
206 * enum nlm4_stats {
207 * NLM4_GRANTED = 0,
208 * NLM4_DENIED = 1,
209 * NLM4_DENIED_NOLOCKS = 2,
210 * NLM4_BLOCKED = 3,
211 * NLM4_DENIED_GRACE_PERIOD = 4,
212 * NLM4_DEADLCK = 5,
213 * NLM4_ROFS = 6,
214 * NLM4_STALE_FH = 7,
215 * NLM4_FBIG = 8,
216 * NLM4_FAILED = 9
217 * };
218 *
219 * struct nlm4_stat {
220 * nlm4_stats stat;
221 * };
222 *
223 * NB: we don't swap bytes for the NLM status values. The upper
224 * layers deal directly with the status value in network byte
225 * order.
226 */
227static void encode_nlm4_stat(struct xdr_stream *xdr,
228 const __be32 stat)
229{
230 __be32 *p;
231
232 BUG_ON(be32_to_cpu(stat) > NLM_FAILED);
233 p = xdr_reserve_space(xdr, 4);
234 *p = stat;
235}
236
237static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat)
238{
239 __be32 *p;
240
241 p = xdr_inline_decode(xdr, 4);
242 if (unlikely(p == NULL))
243 goto out_overflow;
244 if (unlikely(*p > nlm4_failed))
245 goto out_bad_xdr;
246 *stat = *p;
247 return 0;
248out_bad_xdr:
249 dprintk("%s: server returned invalid nlm4_stats value: %u\n",
250 __func__, be32_to_cpup(p));
251 return -EIO;
252out_overflow:
253 print_overflow_msg(__func__, xdr);
254 return -EIO;
255}
256
257/*
258 * struct nlm4_holder {
259 * bool exclusive;
260 * int32 svid;
261 * netobj oh;
262 * uint64 l_offset;
263 * uint64 l_len;
264 * };
265 */
266static void encode_nlm4_holder(struct xdr_stream *xdr,
267 const struct nlm_res *result)
268{
269 const struct nlm_lock *lock = &result->lock;
270 u64 l_offset, l_len;
271 __be32 *p;
272
273 encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
274 encode_int32(xdr, lock->svid);
275 encode_netobj(xdr, lock->oh.data, lock->oh.len);
276
277 p = xdr_reserve_space(xdr, 4 + 4);
278 nlm4_compute_offsets(lock, &l_offset, &l_len);
279 p = xdr_encode_hyper(p, l_offset);
280 xdr_encode_hyper(p, l_len);
281}
282
283static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
284{
285 struct nlm_lock *lock = &result->lock;
286 struct file_lock *fl = &lock->fl;
287 u64 l_offset, l_len;
288 u32 exclusive;
289 int error;
290 __be32 *p;
291 s32 end;
292
293 memset(lock, 0, sizeof(*lock));
294 locks_init_lock(fl);
295
296 p = xdr_inline_decode(xdr, 4 + 4);
297 if (unlikely(p == NULL))
298 goto out_overflow;
299 exclusive = be32_to_cpup(p++);
300 lock->svid = be32_to_cpup(p);
301 fl->fl_pid = (pid_t)lock->svid;
302
303 error = decode_netobj(xdr, &lock->oh);
304 if (unlikely(error))
305 goto out;
306
307 p = xdr_inline_decode(xdr, 8 + 8);
308 if (unlikely(p == NULL))
309 goto out_overflow;
310
311 fl->fl_flags = FL_POSIX;
312 fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
313 p = xdr_decode_hyper(p, &l_offset);
314 xdr_decode_hyper(p, &l_len);
315 end = l_offset + l_len - 1;
316
317 fl->fl_start = (loff_t)l_offset;
318 if (l_len == 0 || end < 0)
319 fl->fl_end = OFFSET_MAX;
320 else
321 fl->fl_end = (loff_t)end;
322 error = 0;
323out:
324 return error;
325out_overflow:
326 print_overflow_msg(__func__, xdr);
327 return -EIO;
328}
329
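Going the other way, decode_nlm4_holder() rebuilds fl_end as l_offset + l_len - 1 through a signed 32-bit intermediate, so both l_len == 0 and an arithmetic wrap read as a lock to end-of-file:

/*
 *	l_offset = 100,        l_len = 100  ->  fl_start = 100, fl_end = 199
 *	l_offset = 100,        l_len = 0    ->  fl_end = OFFSET_MAX
 *	l_offset = 0x7fffffff, l_len = 2    ->  end wraps negative,
 *						fl_end = OFFSET_MAX
 */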
330/*
331 * string caller_name<LM_MAXSTRLEN>;
332 */
333static void encode_caller_name(struct xdr_stream *xdr, const char *name)
334{
335 /* NB: client-side does not set lock->len */
336 u32 length = strlen(name);
337 __be32 *p;
338
339 BUG_ON(length > NLM_MAXSTRLEN);
340 p = xdr_reserve_space(xdr, 4 + length);
341 xdr_encode_opaque(p, name, length);
342}
343
344/*
345 * struct nlm4_lock {
346 * string caller_name<LM_MAXSTRLEN>;
347 * netobj fh;
348 * netobj oh;
349 * int32 svid;
350 * uint64 l_offset;
351 * uint64 l_len;
352 * };
353 */
354static void encode_nlm4_lock(struct xdr_stream *xdr,
355 const struct nlm_lock *lock)
356{
357 u64 l_offset, l_len;
358 __be32 *p;
359
360 encode_caller_name(xdr, lock->caller);
361 encode_fh(xdr, &lock->fh);
362 encode_netobj(xdr, lock->oh.data, lock->oh.len);
363
364 p = xdr_reserve_space(xdr, 4 + 8 + 8);
365 *p++ = cpu_to_be32(lock->svid);
366
367 nlm4_compute_offsets(lock, &l_offset, &l_len);
368 p = xdr_encode_hyper(p, l_offset);
369 xdr_encode_hyper(p, l_len);
370}
371
372
373/*
374 * NLMv4 XDR encode functions
375 *
376 * NLMv4 argument types are defined in Appendix II of RFC 1813:
377 * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
378 * "Protocols for Interworking: XNFS, Version 3W".
379 */
380
381/*
382 * struct nlm4_testargs {
383 * netobj cookie;
384 * bool exclusive;
385 * struct nlm4_lock alock;
386 * };
387 */
388static void nlm4_xdr_enc_testargs(struct rpc_rqst *req,
389 struct xdr_stream *xdr,
390 const struct nlm_args *args)
391{
392 const struct nlm_lock *lock = &args->lock;
393
394 encode_cookie(xdr, &args->cookie);
395 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
396 encode_nlm4_lock(xdr, lock);
397}
398
399/*
400 * struct nlm4_lockargs {
401 * netobj cookie;
402 * bool block;
403 * bool exclusive;
404 * struct nlm4_lock alock;
405 * bool reclaim;
406 * int state;
407 * };
408 */
409static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req,
410 struct xdr_stream *xdr,
411 const struct nlm_args *args)
412{
413 const struct nlm_lock *lock = &args->lock;
414
415 encode_cookie(xdr, &args->cookie);
416 encode_bool(xdr, args->block);
417 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
418 encode_nlm4_lock(xdr, lock);
419 encode_bool(xdr, args->reclaim);
420 encode_int32(xdr, args->state);
421}
422
423/*
424 * struct nlm4_cancargs {
425 * netobj cookie;
426 * bool block;
427 * bool exclusive;
428 * struct nlm4_lock alock;
429 * };
430 */
431static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req,
432 struct xdr_stream *xdr,
433 const struct nlm_args *args)
434{
435 const struct nlm_lock *lock = &args->lock;
436
437 encode_cookie(xdr, &args->cookie);
438 encode_bool(xdr, args->block);
439 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
440 encode_nlm4_lock(xdr, lock);
441}
442
443/*
444 * struct nlm4_unlockargs {
445 * netobj cookie;
446 * struct nlm4_lock alock;
447 * };
448 */
449static void nlm4_xdr_enc_unlockargs(struct rpc_rqst *req,
450 struct xdr_stream *xdr,
451 const struct nlm_args *args)
452{
453 const struct nlm_lock *lock = &args->lock;
454
455 encode_cookie(xdr, &args->cookie);
456 encode_nlm4_lock(xdr, lock);
457}
458
459/*
460 * struct nlm4_res {
461 * netobj cookie;
462 * nlm4_stat stat;
463 * };
464 */
465static void nlm4_xdr_enc_res(struct rpc_rqst *req,
466 struct xdr_stream *xdr,
467 const struct nlm_res *result)
468{
469 encode_cookie(xdr, &result->cookie);
470 encode_nlm4_stat(xdr, result->status);
471}
472
473/*
474 * union nlm4_testrply switch (nlm4_stats stat) {
475 * case NLM4_DENIED:
476 * struct nlm4_holder holder;
477 * default:
478 * void;
479 * };
480 *
481 * struct nlm4_testres {
482 * netobj cookie;
483 * nlm4_testrply test_stat;
484 * };
485 */
486static void nlm4_xdr_enc_testres(struct rpc_rqst *req,
487 struct xdr_stream *xdr,
488 const struct nlm_res *result)
489{
490 encode_cookie(xdr, &result->cookie);
491 encode_nlm4_stat(xdr, result->status);
492 if (result->status == nlm_lck_denied)
493 encode_nlm4_holder(xdr, result);
494}
495
496
497/*
498 * NLMv4 XDR decode functions
499 *
500 * NLMv4 result types are defined in Appendix II of RFC 1813:
501 * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
502 * "Protocols for Interworking: XNFS, Version 3W".
503 */
504
505/*
506 * union nlm4_testrply switch (nlm4_stats stat) {
507 * case NLM4_DENIED:
508 * struct nlm4_holder holder;
509 * default:
510 * void;
511 * };
512 *
513 * struct nlm4_testres {
514 * netobj cookie;
515 * nlm4_testrply test_stat;
516 * };
517 */
518static int decode_nlm4_testrply(struct xdr_stream *xdr,
519 struct nlm_res *result)
520{
521 int error;
522
523 error = decode_nlm4_stat(xdr, &result->status);
524 if (unlikely(error))
525 goto out;
526 if (result->status == nlm_lck_denied)
527 error = decode_nlm4_holder(xdr, result);
528out:
529 return error;
530}
531
532static int nlm4_xdr_dec_testres(struct rpc_rqst *req,
533 struct xdr_stream *xdr,
534 struct nlm_res *result)
535{
536 int error;
537
538 error = decode_cookie(xdr, &result->cookie);
539 if (unlikely(error))
540 goto out;
541 error = decode_nlm4_testrply(xdr, result);
542out:
543 return error;
544}
545
546/*
547 * struct nlm4_res {
548 * netobj cookie;
549 * nlm4_stat stat;
550 * };
551 */
552static int nlm4_xdr_dec_res(struct rpc_rqst *req,
553 struct xdr_stream *xdr,
554 struct nlm_res *result)
555{
556 int error;
557
558 error = decode_cookie(xdr, &result->cookie);
559 if (unlikely(error))
560 goto out;
561 error = decode_nlm4_stat(xdr, &result->status);
562out:
563 return error;
564}
565
566
567/*
568 * For NLM, a void procedure really returns nothing
569 */
570#define nlm4_xdr_dec_norep NULL
571
572#define PROC(proc, argtype, restype) \
573[NLMPROC_##proc] = { \
574 .p_proc = NLMPROC_##proc, \
575 .p_encode = (kxdreproc_t)nlm4_xdr_enc_##argtype, \
576 .p_decode = (kxdrdproc_t)nlm4_xdr_dec_##restype, \
577 .p_arglen = NLM4_##argtype##_sz, \
578 .p_replen = NLM4_##restype##_sz, \
579 .p_statidx = NLMPROC_##proc, \
580 .p_name = #proc, \
581 }
582
583static struct rpc_procinfo nlm4_procedures[] = {
584 PROC(TEST, testargs, testres),
585 PROC(LOCK, lockargs, res),
586 PROC(CANCEL, cancargs, res),
587 PROC(UNLOCK, unlockargs, res),
588 PROC(GRANTED, testargs, res),
589 PROC(TEST_MSG, testargs, norep),
590 PROC(LOCK_MSG, lockargs, norep),
591 PROC(CANCEL_MSG, cancargs, norep),
592 PROC(UNLOCK_MSG, unlockargs, norep),
593 PROC(GRANTED_MSG, testargs, norep),
594 PROC(TEST_RES, testres, norep),
595 PROC(LOCK_RES, res, norep),
596 PROC(CANCEL_RES, res, norep),
597 PROC(UNLOCK_RES, res, norep),
598 PROC(GRANTED_RES, res, norep),
599};
600
601struct rpc_version nlm_version4 = {
602 .number = 4,
603 .nrprocs = ARRAY_SIZE(nlm4_procedures),
604 .procs = nlm4_procedures,
605};
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 25509eb28fd7..8d4ea8351e3d 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -79,7 +79,7 @@ EXPORT_SYMBOL_GPL(nlmclnt_init);
79 */ 79 */
80void nlmclnt_done(struct nlm_host *host) 80void nlmclnt_done(struct nlm_host *host)
81{ 81{
82 nlm_release_host(host); 82 nlmclnt_release_host(host);
83 lockd_down(); 83 lockd_down();
84} 84}
85EXPORT_SYMBOL_GPL(nlmclnt_done); 85EXPORT_SYMBOL_GPL(nlmclnt_done);
@@ -273,7 +273,7 @@ restart:
273 spin_unlock(&nlm_blocked_lock); 273 spin_unlock(&nlm_blocked_lock);
274 274
275 /* Release host handle after use */ 275 /* Release host handle after use */
276 nlm_release_host(host); 276 nlmclnt_release_host(host);
277 lockd_down(); 277 lockd_down();
278 return 0; 278 return 0;
279} 279}
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 332c54cf75e0..adb45ec9038c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -58,7 +58,7 @@ static void nlm_put_lockowner(struct nlm_lockowner *lockowner)
58 return; 58 return;
59 list_del(&lockowner->list); 59 list_del(&lockowner->list);
60 spin_unlock(&lockowner->host->h_lock); 60 spin_unlock(&lockowner->host->h_lock);
61 nlm_release_host(lockowner->host); 61 nlmclnt_release_host(lockowner->host);
62 kfree(lockowner); 62 kfree(lockowner);
63} 63}
64 64
@@ -207,22 +207,22 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
207 printk("nlm_alloc_call: failed, waiting for memory\n"); 207 printk("nlm_alloc_call: failed, waiting for memory\n");
208 schedule_timeout_interruptible(5*HZ); 208 schedule_timeout_interruptible(5*HZ);
209 } 209 }
210 nlm_release_host(host); 210 nlmclnt_release_host(host);
211 return NULL; 211 return NULL;
212} 212}
213 213
214void nlm_release_call(struct nlm_rqst *call) 214void nlmclnt_release_call(struct nlm_rqst *call)
215{ 215{
216 if (!atomic_dec_and_test(&call->a_count)) 216 if (!atomic_dec_and_test(&call->a_count))
217 return; 217 return;
218 nlm_release_host(call->a_host); 218 nlmclnt_release_host(call->a_host);
219 nlmclnt_release_lockargs(call); 219 nlmclnt_release_lockargs(call);
220 kfree(call); 220 kfree(call);
221} 221}
222 222
223static void nlmclnt_rpc_release(void *data) 223static void nlmclnt_rpc_release(void *data)
224{ 224{
225 nlm_release_call(data); 225 nlmclnt_release_call(data);
226} 226}
227 227
228static int nlm_wait_on_grace(wait_queue_head_t *queue) 228static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -436,7 +436,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
436 status = nlm_stat_to_errno(req->a_res.status); 436 status = nlm_stat_to_errno(req->a_res.status);
437 } 437 }
438out: 438out:
439 nlm_release_call(req); 439 nlmclnt_release_call(req);
440 return status; 440 return status;
441} 441}
442 442
@@ -593,7 +593,7 @@ again:
593out_unblock: 593out_unblock:
594 nlmclnt_finish_block(block); 594 nlmclnt_finish_block(block);
595out: 595out:
596 nlm_release_call(req); 596 nlmclnt_release_call(req);
597 return status; 597 return status;
598out_unlock: 598out_unlock:
599 /* Fatal error: ensure that we remove the lock altogether */ 599 /* Fatal error: ensure that we remove the lock altogether */
@@ -694,7 +694,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
694 /* What to do now? I'm out of my depth... */ 694 /* What to do now? I'm out of my depth... */
695 status = -ENOLCK; 695 status = -ENOLCK;
696out: 696out:
697 nlm_release_call(req); 697 nlmclnt_release_call(req);
698 return status; 698 return status;
699} 699}
700 700
@@ -755,7 +755,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
755 NLMPROC_CANCEL, &nlmclnt_cancel_ops); 755 NLMPROC_CANCEL, &nlmclnt_cancel_ops);
756 if (status == 0 && req->a_res.status == nlm_lck_denied) 756 if (status == 0 && req->a_res.status == nlm_lck_denied)
757 status = -ENOLCK; 757 status = -ENOLCK;
758 nlm_release_call(req); 758 nlmclnt_release_call(req);
759 return status; 759 return status;
760} 760}
761 761
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
new file mode 100644
index 000000000000..180ac34feb9a
--- /dev/null
+++ b/fs/lockd/clntxdr.c
@@ -0,0 +1,627 @@
1/*
2 * linux/fs/lockd/clntxdr.c
3 *
4 * XDR functions to encode/decode NLM version 3 RPC arguments and results.
5 * NLM version 3 is backwards compatible with NLM versions 1 and 2.
6 *
7 * NLM client-side only.
8 *
9 * Copyright (C) 2010, Oracle. All rights reserved.
10 */
11
12#include <linux/types.h>
13#include <linux/sunrpc/xdr.h>
14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/stats.h>
16#include <linux/lockd/lockd.h>
17
18#define NLMDBG_FACILITY NLMDBG_XDR
19
20#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
21# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
22#endif
23
24/*
25 * Declare the space requirements for NLM arguments and replies as
26 * number of 32bit-words
27 */
28#define NLM_cookie_sz (1+(NLM_MAXCOOKIELEN>>2))
29#define NLM_caller_sz (1+(NLMCLNT_OHSIZE>>2))
30#define NLM_owner_sz (1+(NLMCLNT_OHSIZE>>2))
31#define NLM_fhandle_sz (1+(NFS2_FHSIZE>>2))
32#define NLM_lock_sz (3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz)
33#define NLM_holder_sz (4+NLM_owner_sz)
34
35#define NLM_testargs_sz (NLM_cookie_sz+1+NLM_lock_sz)
36#define NLM_lockargs_sz (NLM_cookie_sz+4+NLM_lock_sz)
37#define NLM_cancargs_sz (NLM_cookie_sz+2+NLM_lock_sz)
38#define NLM_unlockargs_sz (NLM_cookie_sz+NLM_lock_sz)
39
40#define NLM_testres_sz (NLM_cookie_sz+1+NLM_holder_sz)
41#define NLM_res_sz (NLM_cookie_sz+1)
42#define NLM_norep_sz (0)
43
44
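/*
 * Worked example of the word-count arithmetic above: each _sz macro
 * counts 32-bit XDR words, one word for a variable-length item's
 * length plus its payload rounded up.  Assuming, purely for
 * illustration, NLM_MAXCOOKIELEN = 32 and NLMCLNT_OHSIZE = 64:
 *
 *   NLM_cookie_sz   = 1 +  8 =  9 words
 *   NLM_caller_sz   = 1 + 16 = 17 words
 *   NLM_owner_sz    = 1 + 16 = 17 words
 *   NLM_fhandle_sz  = 1 +  8 =  9 words
 *   NLM_lock_sz     = 3 + 17 + 17 + 9 = 46 words
 *   NLM_testargs_sz = 9 + 1 + 46 = 56 words = 224 bytes on the wire
 */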
45static s32 loff_t_to_s32(loff_t offset)
46{
47 s32 res;
48
49 if (offset >= NLM_OFFSET_MAX)
50 res = NLM_OFFSET_MAX;
51 else if (offset <= -NLM_OFFSET_MAX)
52 res = -NLM_OFFSET_MAX;
53 else
54 res = offset;
55 return res;
56}
57
58static void nlm_compute_offsets(const struct nlm_lock *lock,
59 u32 *l_offset, u32 *l_len)
60{
61 const struct file_lock *fl = &lock->fl;
62
63 BUG_ON(fl->fl_start > NLM_OFFSET_MAX);
64 BUG_ON(fl->fl_end > NLM_OFFSET_MAX &&
65 fl->fl_end != OFFSET_MAX);
66
67 *l_offset = loff_t_to_s32(fl->fl_start);
68 if (fl->fl_end == OFFSET_MAX)
69 *l_len = 0;
70 else
71 *l_len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
72}
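
/*
 * A hedged user-space sketch (names illustrative) of the encode-side
 * mapping used here: a POSIX byte range [fl_start, fl_end] becomes an
 * NLMv3 (offset, length) pair of unsigned 32-bit values, with
 * fl_end == OFFSET_MAX encoded as length zero ("whole file") and
 * out-of-range values clamped as in loff_t_to_s32() above.
 */
#include <stdint.h>
#include <stdio.h>

#define OFFSET_MAX     INT64_MAX
#define NLM_OFFSET_MAX INT32_MAX

static int32_t clamp_s32(int64_t offset)
{
        if (offset >= NLM_OFFSET_MAX)
                return NLM_OFFSET_MAX;
        if (offset <= -NLM_OFFSET_MAX)
                return -NLM_OFFSET_MAX;
        return (int32_t)offset;
}

static void posix_range_to_nlm(int64_t fl_start, int64_t fl_end,
                               uint32_t *l_offset, uint32_t *l_len)
{
        *l_offset = (uint32_t)clamp_s32(fl_start);
        if (fl_end == OFFSET_MAX)
                *l_len = 0;                     /* zero length: whole file */
        else
                *l_len = (uint32_t)clamp_s32(fl_end - fl_start + 1);
}

int main(void)
{
        uint32_t off, len;

        posix_range_to_nlm(0, OFFSET_MAX, &off, &len);
        printf("off=%u len=%u\n", off, len);    /* off=0 len=0 */
        return 0;
}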
73
74/*
75 * Handle decode buffer overflows out-of-line.
76 */
77static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
78{
79 dprintk("lockd: %s prematurely hit the end of our receive buffer. "
80 "Remaining buffer length is %tu words.\n",
81 func, xdr->end - xdr->p);
82}
83
84
85/*
86 * Encode/decode NLMv3 basic data types
87 *
88 * Basic NLMv3 data types are not defined in an IETF standards
89 * document. X/Open has a description of these data types that
90 * is useful. See Chapter 10 of "Protocols for Interworking:
91 * XNFS, Version 3W".
92 *
93 * Not all basic data types have their own encoding and decoding
94 * functions. For run-time efficiency, some data types are encoded
95 * or decoded inline.
96 */
97
98static void encode_bool(struct xdr_stream *xdr, const int value)
99{
100 __be32 *p;
101
102 p = xdr_reserve_space(xdr, 4);
103 *p = value ? xdr_one : xdr_zero;
104}
105
106static void encode_int32(struct xdr_stream *xdr, const s32 value)
107{
108 __be32 *p;
109
110 p = xdr_reserve_space(xdr, 4);
111 *p = cpu_to_be32(value);
112}
113
114/*
115 * typedef opaque netobj<MAXNETOBJ_SZ>
116 */
117static void encode_netobj(struct xdr_stream *xdr,
118 const u8 *data, const unsigned int length)
119{
120 __be32 *p;
121
122 BUG_ON(length > XDR_MAX_NETOBJ);
123 p = xdr_reserve_space(xdr, 4 + length);
124 xdr_encode_opaque(p, data, length);
125}
126
127static int decode_netobj(struct xdr_stream *xdr,
128 struct xdr_netobj *obj)
129{
130 u32 length;
131 __be32 *p;
132
133 p = xdr_inline_decode(xdr, 4);
134 if (unlikely(p == NULL))
135 goto out_overflow;
136 length = be32_to_cpup(p++);
137 if (unlikely(length > XDR_MAX_NETOBJ))
138 goto out_size;
139 obj->len = length;
140 obj->data = (u8 *)p;
141 return 0;
142out_size:
143 dprintk("NFS: returned netobj was too long: %u\n", length);
144 return -EIO;
145out_overflow:
146 print_overflow_msg(__func__, xdr);
147 return -EIO;
148}
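
/*
 * The wire format decode_netobj() walks is the standard XDR
 * variable-length opaque: a 4-byte big-endian length, the bytes
 * themselves, then zero padding to the next 4-byte boundary.  A
 * self-contained user-space decoder (illustrative only; note that,
 * like the function above, it points at the data in place rather
 * than copying it):
 */
#include <stdint.h>
#include <stdio.h>

struct netobj {
        uint32_t       len;
        const uint8_t *data;
};

static const uint8_t *decode_opaque(const uint8_t *p, const uint8_t *end,
                                    struct netobj *obj)
{
        uint32_t len, padded;

        if (end - p < 4)
                return NULL;            /* short buffer */
        len = (uint32_t)p[0] << 24 | (uint32_t)p[1] << 16 |
              (uint32_t)p[2] << 8  | (uint32_t)p[3];
        p += 4;
        padded = (len + 3) & ~3u;
        if (padded < len)
                return NULL;            /* length overflow */
        if ((size_t)(end - p) < padded)
                return NULL;
        obj->len  = len;
        obj->data = p;                  /* points into the buffer */
        return p + padded;              /* skip data plus padding */
}

int main(void)
{
        /* "abc" as an XDR opaque: length 3, data, one pad byte */
        static const uint8_t wire[] = { 0, 0, 0, 3, 'a', 'b', 'c', 0 };
        struct netobj obj;

        if (decode_opaque(wire, wire + sizeof(wire), &obj))
                printf("%.*s\n", (int)obj.len, (const char *)obj.data);
        return 0;
}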
149
150/*
151 * netobj cookie;
152 */
153static void encode_cookie(struct xdr_stream *xdr,
154 const struct nlm_cookie *cookie)
155{
156 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
157 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
158}
159
160static int decode_cookie(struct xdr_stream *xdr,
161 struct nlm_cookie *cookie)
162{
163 u32 length;
164 __be32 *p;
165
166 p = xdr_inline_decode(xdr, 4);
167 if (unlikely(p == NULL))
168 goto out_overflow;
169 length = be32_to_cpup(p++);
170 /* apparently HPUX can return empty cookies */
171 if (length == 0)
172 goto out_hpux;
173 if (length > NLM_MAXCOOKIELEN)
174 goto out_size;
175 p = xdr_inline_decode(xdr, length);
176 if (unlikely(p == NULL))
177 goto out_overflow;
178 cookie->len = length;
179 memcpy(cookie->data, p, length);
180 return 0;
181out_hpux:
182 cookie->len = 4;
183 memset(cookie->data, 0, 4);
184 return 0;
185out_size:
186 dprintk("NFS: returned cookie was too long: %u\n", length);
187 return -EIO;
188out_overflow:
189 print_overflow_msg(__func__, xdr);
190 return -EIO;
191}
192
193/*
194 * netobj fh;
195 */
196static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
197{
198 BUG_ON(fh->size != NFS2_FHSIZE);
199 encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE);
200}
201
202/*
203 * enum nlm_stats {
204 * LCK_GRANTED = 0,
205 * LCK_DENIED = 1,
206 * LCK_DENIED_NOLOCKS = 2,
207 * LCK_BLOCKED = 3,
208 * LCK_DENIED_GRACE_PERIOD = 4
209 * };
210 *
211 *
212 * struct nlm_stat {
213 * nlm_stats stat;
214 * };
215 *
216 * NB: we don't swap bytes for the NLM status values. The upper
217 * layers deal directly with the status value in network byte
218 * order.
219 */
220
221static void encode_nlm_stat(struct xdr_stream *xdr,
222 const __be32 stat)
223{
224 __be32 *p;
225
226 BUG_ON(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD);
227 p = xdr_reserve_space(xdr, 4);
228 *p = stat;
229}
230
231static int decode_nlm_stat(struct xdr_stream *xdr,
232 __be32 *stat)
233{
234 __be32 *p;
235
236 p = xdr_inline_decode(xdr, 4);
237 if (unlikely(p == NULL))
238 goto out_overflow;
239 if (unlikely(*p > nlm_lck_denied_grace_period))
240 goto out_enum;
241 *stat = *p;
242 return 0;
243out_enum:
244 dprintk("%s: server returned invalid nlm_stats value: %u\n",
245 __func__, be32_to_cpup(p));
246 return -EIO;
247out_overflow:
248 print_overflow_msg(__func__, xdr);
249 return -EIO;
250}
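
/*
 * Toy demonstration of the convention described above: the status is
 * left in network byte order end to end, so the upper layers compare
 * it against constants that are themselves stored pre-swapped (as the
 * nlm_lck_* values are).  No ntohl() is ever needed.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t wire_status = htonl(1);        /* LCK_DENIED, as received */

        if (wire_status == htonl(1))            /* swapped-to-swapped compare */
                printf("status: LCK_DENIED\n");
        return 0;
}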
251
252/*
253 * struct nlm_holder {
254 * bool exclusive;
255 * int uppid;
256 * netobj oh;
257 * unsigned l_offset;
258 * unsigned l_len;
259 * };
260 */
261static void encode_nlm_holder(struct xdr_stream *xdr,
262 const struct nlm_res *result)
263{
264 const struct nlm_lock *lock = &result->lock;
265 u32 l_offset, l_len;
266 __be32 *p;
267
268 encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
269 encode_int32(xdr, lock->svid);
270 encode_netobj(xdr, lock->oh.data, lock->oh.len);
271
272 p = xdr_reserve_space(xdr, 4 + 4);
273 nlm_compute_offsets(lock, &l_offset, &l_len);
274 *p++ = cpu_to_be32(l_offset);
275 *p = cpu_to_be32(l_len);
276}
277
278static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
279{
280 struct nlm_lock *lock = &result->lock;
281 struct file_lock *fl = &lock->fl;
282 u32 exclusive, l_offset, l_len;
283 int error;
284 __be32 *p;
285 s32 end;
286
287 memset(lock, 0, sizeof(*lock));
288 locks_init_lock(fl);
289
290 p = xdr_inline_decode(xdr, 4 + 4);
291 if (unlikely(p == NULL))
292 goto out_overflow;
293 exclusive = be32_to_cpup(p++);
294 lock->svid = be32_to_cpup(p);
295 fl->fl_pid = (pid_t)lock->svid;
296
297 error = decode_netobj(xdr, &lock->oh);
298 if (unlikely(error))
299 goto out;
300
301 p = xdr_inline_decode(xdr, 4 + 4);
302 if (unlikely(p == NULL))
303 goto out_overflow;
304
305 fl->fl_flags = FL_POSIX;
306 fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
307 l_offset = be32_to_cpup(p++);
308 l_len = be32_to_cpup(p);
309 end = l_offset + l_len - 1;
310
311 fl->fl_start = (loff_t)l_offset;
312 if (l_len == 0 || end < 0)
313 fl->fl_end = OFFSET_MAX;
314 else
315 fl->fl_end = (loff_t)end;
316 error = 0;
317out:
318 return error;
319out_overflow:
320 print_overflow_msg(__func__, xdr);
321 return -EIO;
322}
323
324/*
325 * string caller_name<LM_MAXSTRLEN>;
326 */
327static void encode_caller_name(struct xdr_stream *xdr, const char *name)
328{
329 /* NB: client-side does not set lock->len */
330 u32 length = strlen(name);
331 __be32 *p;
332
333 BUG_ON(length > NLM_MAXSTRLEN);
334 p = xdr_reserve_space(xdr, 4 + length);
335 xdr_encode_opaque(p, name, length);
336}
337
338/*
339 * struct nlm_lock {
340 * string caller_name<LM_MAXSTRLEN>;
341 * netobj fh;
342 * netobj oh;
343 * int uppid;
344 * unsigned l_offset;
345 * unsigned l_len;
346 * };
347 */
348static void encode_nlm_lock(struct xdr_stream *xdr,
349 const struct nlm_lock *lock)
350{
351 u32 l_offset, l_len;
352 __be32 *p;
353
354 encode_caller_name(xdr, lock->caller);
355 encode_fh(xdr, &lock->fh);
356 encode_netobj(xdr, lock->oh.data, lock->oh.len);
357
358 p = xdr_reserve_space(xdr, 4 + 4 + 4);
359 *p++ = cpu_to_be32(lock->svid);
360
361 nlm_compute_offsets(lock, &l_offset, &l_len);
362 *p++ = cpu_to_be32(l_offset);
363 *p = cpu_to_be32(l_len);
364}
365
366
367/*
368 * NLMv3 XDR encode functions
369 *
370 * NLMv3 argument types are defined in Chapter 10 of The Open Group's
371 * "Protocols for Interworking: XNFS, Version 3W".
372 */
373
374/*
375 * struct nlm_testargs {
376 * netobj cookie;
377 * bool exclusive;
378 * struct nlm_lock alock;
379 * };
380 */
381static void nlm_xdr_enc_testargs(struct rpc_rqst *req,
382 struct xdr_stream *xdr,
383 const struct nlm_args *args)
384{
385 const struct nlm_lock *lock = &args->lock;
386
387 encode_cookie(xdr, &args->cookie);
388 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
389 encode_nlm_lock(xdr, lock);
390}
391
392/*
393 * struct nlm_lockargs {
394 * netobj cookie;
395 * bool block;
396 * bool exclusive;
397 * struct nlm_lock alock;
398 * bool reclaim;
399 * int state;
400 * };
401 */
402static void nlm_xdr_enc_lockargs(struct rpc_rqst *req,
403 struct xdr_stream *xdr,
404 const struct nlm_args *args)
405{
406 const struct nlm_lock *lock = &args->lock;
407
408 encode_cookie(xdr, &args->cookie);
409 encode_bool(xdr, args->block);
410 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
411 encode_nlm_lock(xdr, lock);
412 encode_bool(xdr, args->reclaim);
413 encode_int32(xdr, args->state);
414}
415
416/*
417 * struct nlm_cancargs {
418 * netobj cookie;
419 * bool block;
420 * bool exclusive;
421 * struct nlm_lock alock;
422 * };
423 */
424static void nlm_xdr_enc_cancargs(struct rpc_rqst *req,
425 struct xdr_stream *xdr,
426 const struct nlm_args *args)
427{
428 const struct nlm_lock *lock = &args->lock;
429
430 encode_cookie(xdr, &args->cookie);
431 encode_bool(xdr, args->block);
432 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
433 encode_nlm_lock(xdr, lock);
434}
435
436/*
437 * struct nlm_unlockargs {
438 * netobj cookie;
439 * struct nlm_lock alock;
440 * };
441 */
442static void nlm_xdr_enc_unlockargs(struct rpc_rqst *req,
443 struct xdr_stream *xdr,
444 const struct nlm_args *args)
445{
446 const struct nlm_lock *lock = &args->lock;
447
448 encode_cookie(xdr, &args->cookie);
449 encode_nlm_lock(xdr, lock);
450}
451
452/*
453 * struct nlm_res {
454 * netobj cookie;
455 * nlm_stat stat;
456 * };
457 */
458static void nlm_xdr_enc_res(struct rpc_rqst *req,
459 struct xdr_stream *xdr,
460 const struct nlm_res *result)
461{
462 encode_cookie(xdr, &result->cookie);
463 encode_nlm_stat(xdr, result->status);
464}
465
466/*
467 * union nlm_testrply switch (nlm_stats stat) {
468 * case LCK_DENIED:
469 * struct nlm_holder holder;
470 * default:
471 * void;
472 * };
473 *
474 * struct nlm_testres {
475 * netobj cookie;
476 * nlm_testrply test_stat;
477 * };
478 */
479static void encode_nlm_testrply(struct xdr_stream *xdr,
480 const struct nlm_res *result)
481{
482 if (result->status == nlm_lck_denied)
483 encode_nlm_holder(xdr, result);
484}
485
486static void nlm_xdr_enc_testres(struct rpc_rqst *req,
487 struct xdr_stream *xdr,
488 const struct nlm_res *result)
489{
490 encode_cookie(xdr, &result->cookie);
491 encode_nlm_stat(xdr, result->status);
492 encode_nlm_testrply(xdr, result);
493}
494
495
496/*
497 * NLMv3 XDR decode functions
498 *
499 * NLMv3 result types are defined in Chapter 10 of The Open Group's
500 * "Protocols for Interworking: XNFS, Version 3W".
501 */
502
503/*
504 * union nlm_testrply switch (nlm_stats stat) {
505 * case LCK_DENIED:
506 * struct nlm_holder holder;
507 * default:
508 * void;
509 * };
510 *
511 * struct nlm_testres {
512 * netobj cookie;
513 * nlm_testrply test_stat;
514 * };
515 */
516static int decode_nlm_testrply(struct xdr_stream *xdr,
517 struct nlm_res *result)
518{
519 int error;
520
521 error = decode_nlm_stat(xdr, &result->status);
522 if (unlikely(error))
523 goto out;
524 if (result->status == nlm_lck_denied)
525 error = decode_nlm_holder(xdr, result);
526out:
527 return error;
528}
529
530static int nlm_xdr_dec_testres(struct rpc_rqst *req,
531 struct xdr_stream *xdr,
532 struct nlm_res *result)
533{
534 int error;
535
536 error = decode_cookie(xdr, &result->cookie);
537 if (unlikely(error))
538 goto out;
539 error = decode_nlm_testrply(xdr, result);
540out:
541 return error;
542}
543
544/*
545 * struct nlm_res {
546 * netobj cookie;
547 * nlm_stat stat;
548 * };
549 */
550static int nlm_xdr_dec_res(struct rpc_rqst *req,
551 struct xdr_stream *xdr,
552 struct nlm_res *result)
553{
554 int error;
555
556 error = decode_cookie(xdr, &result->cookie);
557 if (unlikely(error))
558 goto out;
559 error = decode_nlm_stat(xdr, &result->status);
560out:
561 return error;
562}
563
564
565/*
566 * For NLM, a void procedure really returns nothing
567 */
568#define nlm_xdr_dec_norep NULL
569
570#define PROC(proc, argtype, restype) \
571[NLMPROC_##proc] = { \
572 .p_proc = NLMPROC_##proc, \
573 .p_encode = (kxdreproc_t)nlm_xdr_enc_##argtype, \
574 .p_decode = (kxdrdproc_t)nlm_xdr_dec_##restype, \
575 .p_arglen = NLM_##argtype##_sz, \
576 .p_replen = NLM_##restype##_sz, \
577 .p_statidx = NLMPROC_##proc, \
578 .p_name = #proc, \
579 }
580
581static struct rpc_procinfo nlm_procedures[] = {
582 PROC(TEST, testargs, testres),
583 PROC(LOCK, lockargs, res),
584 PROC(CANCEL, cancargs, res),
585 PROC(UNLOCK, unlockargs, res),
586 PROC(GRANTED, testargs, res),
587 PROC(TEST_MSG, testargs, norep),
588 PROC(LOCK_MSG, lockargs, norep),
589 PROC(CANCEL_MSG, cancargs, norep),
590 PROC(UNLOCK_MSG, unlockargs, norep),
591 PROC(GRANTED_MSG, testargs, norep),
592 PROC(TEST_RES, testres, norep),
593 PROC(LOCK_RES, res, norep),
594 PROC(CANCEL_RES, res, norep),
595 PROC(UNLOCK_RES, res, norep),
596 PROC(GRANTED_RES, res, norep),
597};
598
599static struct rpc_version nlm_version1 = {
600 .number = 1,
601 .nrprocs = ARRAY_SIZE(nlm_procedures),
602 .procs = nlm_procedures,
603};
604
605static struct rpc_version nlm_version3 = {
606 .number = 3,
607 .nrprocs = ARRAY_SIZE(nlm_procedures),
608 .procs = nlm_procedures,
609};
610
611static struct rpc_version *nlm_versions[] = {
612 [1] = &nlm_version1,
613 [3] = &nlm_version3,
614#ifdef CONFIG_LOCKD_V4
615 [4] = &nlm_version4,
616#endif
617};
618
619static struct rpc_stat nlm_rpc_stats;
620
621struct rpc_program nlm_program = {
622 .name = "lockd",
623 .number = NLM_PROGRAM,
624 .nrvers = ARRAY_SIZE(nlm_versions),
625 .version = nlm_versions,
626 .stats = &nlm_rpc_stats,
627};
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index ed0c59fe23ce..5f1bcb2f06f3 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -25,9 +25,22 @@
25#define NLM_HOST_EXPIRE (300 * HZ) 25#define NLM_HOST_EXPIRE (300 * HZ)
26#define NLM_HOST_COLLECT (120 * HZ) 26#define NLM_HOST_COLLECT (120 * HZ)
27 27
28static struct hlist_head nlm_hosts[NLM_HOST_NRHASH]; 28static struct hlist_head nlm_server_hosts[NLM_HOST_NRHASH];
29static struct hlist_head nlm_client_hosts[NLM_HOST_NRHASH];
30
31#define for_each_host(host, pos, chain, table) \
32 for ((chain) = (table); \
33 (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
34 hlist_for_each_entry((host), (pos), (chain), h_hash)
35
36#define for_each_host_safe(host, pos, next, chain, table) \
37 for ((chain) = (table); \
38 (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
39 hlist_for_each_entry_safe((host), (pos), (next), \
40 (chain), h_hash)
41
29static unsigned long next_gc; 42static unsigned long next_gc;
30static int nrhosts; 43static unsigned long nrhosts;
31static DEFINE_MUTEX(nlm_host_mutex); 44static DEFINE_MUTEX(nlm_host_mutex);
32 45
33static void nlm_gc_hosts(void); 46static void nlm_gc_hosts(void);
@@ -40,8 +53,6 @@ struct nlm_lookup_host_info {
40 const u32 version; /* NLM version to search for */ 53 const u32 version; /* NLM version to search for */
41 const char *hostname; /* remote's hostname */ 54 const char *hostname; /* remote's hostname */
42 const size_t hostname_len; /* its length */ 55 const size_t hostname_len; /* its length */
43 const struct sockaddr *src_sap; /* our address (optional) */
44 const size_t src_len; /* its length */
45 const int noresvport; /* use non-priv port */ 56 const int noresvport; /* use non-priv port */
46}; 57};
47 58
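
/*
 * A compact user-space sketch of the chained-hash cache the lookup
 * routines below implement: hash the peer, walk one chain, and move a
 * hit to the head so recently used hosts stay cheap to find.  Types
 * and names here are illustrative, not the kernel hlist API.
 */
#include <stdio.h>
#include <string.h>

#define NRHASH 32

struct host {
        char         name[32];
        struct host *next;
};

static struct host *table[NRHASH];

static unsigned int hash_name(const char *name)
{
        unsigned int h = 0;

        while (*name)
                h = h * 31 + (unsigned char)*name++;
        return h % NRHASH;
}

static struct host *lookup(const char *name)
{
        unsigned int  b = hash_name(name);
        struct host **pp, *h;

        for (pp = &table[b]; (h = *pp) != NULL; pp = &h->next) {
                if (strcmp(h->name, name) != 0)
                        continue;
                *pp = h->next;          /* unlink the hit ... */
                h->next = table[b];     /* ... and move it to the head */
                table[b] = h;
                return h;
        }
        return NULL;
}

int main(void)
{
        static struct host a = { .name = "alpha" };

        a.next = table[hash_name("alpha")];
        table[hash_name("alpha")] = &a;
        printf("%s\n", lookup("alpha") ? "found" : "missing");
        return 0;
}
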
@@ -88,127 +99,83 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
88} 99}
89 100
90/* 101/*
91 * Common host lookup routine for server & client 102 * Allocate and initialize an nlm_host. Common to both client and server.
92 */ 103 */
93static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) 104static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
105 struct nsm_handle *nsm)
94{ 106{
95 struct hlist_head *chain; 107 struct nlm_host *host = NULL;
96 struct hlist_node *pos; 108 unsigned long now = jiffies;
97 struct nlm_host *host;
98 struct nsm_handle *nsm = NULL;
99
100 mutex_lock(&nlm_host_mutex);
101 109
102 if (time_after_eq(jiffies, next_gc)) 110 if (nsm != NULL)
103 nlm_gc_hosts();
104
105 /* We may keep several nlm_host objects for a peer, because each
106 * nlm_host is identified by
107 * (address, protocol, version, server/client)
108 * We could probably simplify this a little by putting all those
109 * different NLM rpc_clients into one single nlm_host object.
110 * This would allow us to have one nlm_host per address.
111 */
112 chain = &nlm_hosts[nlm_hash_address(ni->sap)];
113 hlist_for_each_entry(host, pos, chain, h_hash) {
114 if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
115 continue;
116
117 /* See if we have an NSM handle for this client */
118 if (!nsm)
119 nsm = host->h_nsmhandle;
120
121 if (host->h_proto != ni->protocol)
122 continue;
123 if (host->h_version != ni->version)
124 continue;
125 if (host->h_server != ni->server)
126 continue;
127 if (ni->server && ni->src_len != 0 &&
128 !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
129 continue;
130
131 /* Move to head of hash chain. */
132 hlist_del(&host->h_hash);
133 hlist_add_head(&host->h_hash, chain);
134
135 nlm_get_host(host);
136 dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
137 host->h_name, host->h_addrbuf);
138 goto out;
139 }
140
141 /*
142 * The host wasn't in our hash table. If we don't
143 * have an NSM handle for it yet, create one.
144 */
145 if (nsm)
146 atomic_inc(&nsm->sm_count); 111 atomic_inc(&nsm->sm_count);
147 else { 112 else {
148 host = NULL; 113 host = NULL;
149 nsm = nsm_get_handle(ni->sap, ni->salen, 114 nsm = nsm_get_handle(ni->sap, ni->salen,
150 ni->hostname, ni->hostname_len); 115 ni->hostname, ni->hostname_len);
151 if (!nsm) { 116 if (unlikely(nsm == NULL)) {
152 dprintk("lockd: nlm_lookup_host failed; " 117 dprintk("lockd: %s failed; no nsm handle\n",
153 "no nsm handle\n"); 118 __func__);
154 goto out; 119 goto out;
155 } 120 }
156 } 121 }
157 122
158 host = kzalloc(sizeof(*host), GFP_KERNEL); 123 host = kmalloc(sizeof(*host), GFP_KERNEL);
159 if (!host) { 124 if (unlikely(host == NULL)) {
125 dprintk("lockd: %s failed; no memory\n", __func__);
160 nsm_release(nsm); 126 nsm_release(nsm);
161 dprintk("lockd: nlm_lookup_host failed; no memory\n");
162 goto out; 127 goto out;
163 } 128 }
164 host->h_name = nsm->sm_name; 129
165 host->h_addrbuf = nsm->sm_addrbuf;
166 memcpy(nlm_addr(host), ni->sap, ni->salen); 130 memcpy(nlm_addr(host), ni->sap, ni->salen);
167 host->h_addrlen = ni->salen; 131 host->h_addrlen = ni->salen;
168 rpc_set_port(nlm_addr(host), 0); 132 rpc_set_port(nlm_addr(host), 0);
169 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); 133 host->h_srcaddrlen = 0;
170 host->h_srcaddrlen = ni->src_len; 134
135 host->h_rpcclnt = NULL;
136 host->h_name = nsm->sm_name;
171 host->h_version = ni->version; 137 host->h_version = ni->version;
172 host->h_proto = ni->protocol; 138 host->h_proto = ni->protocol;
173 host->h_rpcclnt = NULL; 139 host->h_reclaiming = 0;
174 mutex_init(&host->h_mutex); 140 host->h_server = ni->server;
175 host->h_nextrebind = jiffies + NLM_HOST_REBIND; 141 host->h_noresvport = ni->noresvport;
176 host->h_expires = jiffies + NLM_HOST_EXPIRE; 142 host->h_inuse = 0;
177 atomic_set(&host->h_count, 1);
178 init_waitqueue_head(&host->h_gracewait); 143 init_waitqueue_head(&host->h_gracewait);
179 init_rwsem(&host->h_rwsem); 144 init_rwsem(&host->h_rwsem);
180 host->h_state = 0; /* pseudo NSM state */ 145 host->h_state = 0;
181 host->h_nsmstate = 0; /* real NSM state */ 146 host->h_nsmstate = 0;
182 host->h_nsmhandle = nsm; 147 host->h_pidcount = 0;
183 host->h_server = ni->server; 148 atomic_set(&host->h_count, 1);
184 host->h_noresvport = ni->noresvport; 149 mutex_init(&host->h_mutex);
185 hlist_add_head(&host->h_hash, chain); 150 host->h_nextrebind = now + NLM_HOST_REBIND;
151 host->h_expires = now + NLM_HOST_EXPIRE;
186 INIT_LIST_HEAD(&host->h_lockowners); 152 INIT_LIST_HEAD(&host->h_lockowners);
187 spin_lock_init(&host->h_lock); 153 spin_lock_init(&host->h_lock);
188 INIT_LIST_HEAD(&host->h_granted); 154 INIT_LIST_HEAD(&host->h_granted);
189 INIT_LIST_HEAD(&host->h_reclaim); 155 INIT_LIST_HEAD(&host->h_reclaim);
190 156 host->h_nsmhandle = nsm;
191 nrhosts++; 157 host->h_addrbuf = nsm->sm_addrbuf;
192
193 dprintk("lockd: nlm_lookup_host created host %s\n",
194 host->h_name);
195 158
196out: 159out:
197 mutex_unlock(&nlm_host_mutex);
198 return host; 160 return host;
199} 161}
200 162
201/* 163/*
202 * Destroy a host 164 * Destroy an nlm_host and free associated resources
165 *
166 * Caller must hold nlm_host_mutex.
203 */ 167 */
204static void 168static void nlm_destroy_host_locked(struct nlm_host *host)
205nlm_destroy_host(struct nlm_host *host)
206{ 169{
207 struct rpc_clnt *clnt; 170 struct rpc_clnt *clnt;
208 171
172 dprintk("lockd: destroy host %s\n", host->h_name);
173
209 BUG_ON(!list_empty(&host->h_lockowners)); 174 BUG_ON(!list_empty(&host->h_lockowners));
210 BUG_ON(atomic_read(&host->h_count)); 175 BUG_ON(atomic_read(&host->h_count));
211 176
177 hlist_del_init(&host->h_hash);
178
212 nsm_unmonitor(host); 179 nsm_unmonitor(host);
213 nsm_release(host->h_nsmhandle); 180 nsm_release(host->h_nsmhandle);
214 181
@@ -216,6 +183,8 @@ nlm_destroy_host(struct nlm_host *host)
216 if (clnt != NULL) 183 if (clnt != NULL)
217 rpc_shutdown_client(clnt); 184 rpc_shutdown_client(clnt);
218 kfree(host); 185 kfree(host);
186
187 nrhosts--;
219} 188}
220 189
221/** 190/**
@@ -249,12 +218,76 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
249 .hostname_len = strlen(hostname), 218 .hostname_len = strlen(hostname),
250 .noresvport = noresvport, 219 .noresvport = noresvport,
251 }; 220 };
221 struct hlist_head *chain;
222 struct hlist_node *pos;
223 struct nlm_host *host;
224 struct nsm_handle *nsm = NULL;
252 225
253 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, 226 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
254 (hostname ? hostname : "<none>"), version, 227 (hostname ? hostname : "<none>"), version,
255 (protocol == IPPROTO_UDP ? "udp" : "tcp")); 228 (protocol == IPPROTO_UDP ? "udp" : "tcp"));
256 229
257 return nlm_lookup_host(&ni); 230 mutex_lock(&nlm_host_mutex);
231
232 chain = &nlm_client_hosts[nlm_hash_address(sap)];
233 hlist_for_each_entry(host, pos, chain, h_hash) {
234 if (!rpc_cmp_addr(nlm_addr(host), sap))
235 continue;
236
237 /* Same address. Share an NSM handle if we already have one */
238 if (nsm == NULL)
239 nsm = host->h_nsmhandle;
240
241 if (host->h_proto != protocol)
242 continue;
243 if (host->h_version != version)
244 continue;
245
246 nlm_get_host(host);
247 dprintk("lockd: %s found host %s (%s)\n", __func__,
248 host->h_name, host->h_addrbuf);
249 goto out;
250 }
251
252 host = nlm_alloc_host(&ni, nsm);
253 if (unlikely(host == NULL))
254 goto out;
255
256 hlist_add_head(&host->h_hash, chain);
257 nrhosts++;
258
259 dprintk("lockd: %s created host %s (%s)\n", __func__,
260 host->h_name, host->h_addrbuf);
261
262out:
263 mutex_unlock(&nlm_host_mutex);
264 return host;
265}
266
267/**
268 * nlmclnt_release_host - release client nlm_host
269 * @host: nlm_host to release
270 *
271 */
272void nlmclnt_release_host(struct nlm_host *host)
273{
274 if (host == NULL)
275 return;
276
277 dprintk("lockd: release client host %s\n", host->h_name);
278
279 BUG_ON(atomic_read(&host->h_count) < 0);
280 BUG_ON(host->h_server);
281
282 if (atomic_dec_and_test(&host->h_count)) {
283 BUG_ON(!list_empty(&host->h_lockowners));
284 BUG_ON(!list_empty(&host->h_granted));
285 BUG_ON(!list_empty(&host->h_reclaim));
286
287 mutex_lock(&nlm_host_mutex);
288 nlm_destroy_host_locked(host);
289 mutex_unlock(&nlm_host_mutex);
290 }
258} 291}
259 292
260/** 293/**
@@ -279,12 +312,18 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
279 const char *hostname, 312 const char *hostname,
280 const size_t hostname_len) 313 const size_t hostname_len)
281{ 314{
315 struct hlist_head *chain;
316 struct hlist_node *pos;
317 struct nlm_host *host = NULL;
318 struct nsm_handle *nsm = NULL;
282 struct sockaddr_in sin = { 319 struct sockaddr_in sin = {
283 .sin_family = AF_INET, 320 .sin_family = AF_INET,
284 }; 321 };
285 struct sockaddr_in6 sin6 = { 322 struct sockaddr_in6 sin6 = {
286 .sin6_family = AF_INET6, 323 .sin6_family = AF_INET6,
287 }; 324 };
325 struct sockaddr *src_sap;
326 size_t src_len = rqstp->rq_addrlen;
288 struct nlm_lookup_host_info ni = { 327 struct nlm_lookup_host_info ni = {
289 .server = 1, 328 .server = 1,
290 .sap = svc_addr(rqstp), 329 .sap = svc_addr(rqstp),
@@ -293,27 +332,91 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
293 .version = rqstp->rq_vers, 332 .version = rqstp->rq_vers,
294 .hostname = hostname, 333 .hostname = hostname,
295 .hostname_len = hostname_len, 334 .hostname_len = hostname_len,
296 .src_len = rqstp->rq_addrlen,
297 }; 335 };
298 336
299 dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, 337 dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
300 (int)hostname_len, hostname, rqstp->rq_vers, 338 (int)hostname_len, hostname, rqstp->rq_vers,
301 (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); 339 (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
302 340
341 mutex_lock(&nlm_host_mutex);
342
303 switch (ni.sap->sa_family) { 343 switch (ni.sap->sa_family) {
304 case AF_INET: 344 case AF_INET:
305 sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; 345 sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
306 ni.src_sap = (struct sockaddr *)&sin; 346 src_sap = (struct sockaddr *)&sin;
307 break; 347 break;
308 case AF_INET6: 348 case AF_INET6:
309 ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); 349 ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
310 ni.src_sap = (struct sockaddr *)&sin6; 350 src_sap = (struct sockaddr *)&sin6;
311 break; 351 break;
312 default: 352 default:
313 return NULL; 353 dprintk("lockd: %s failed; unrecognized address family\n",
354 __func__);
355 goto out;
356 }
357
358 if (time_after_eq(jiffies, next_gc))
359 nlm_gc_hosts();
360
361 chain = &nlm_server_hosts[nlm_hash_address(ni.sap)];
362 hlist_for_each_entry(host, pos, chain, h_hash) {
363 if (!rpc_cmp_addr(nlm_addr(host), ni.sap))
364 continue;
365
366 /* Same address. Share an NSM handle if we already have one */
367 if (nsm == NULL)
368 nsm = host->h_nsmhandle;
369
370 if (host->h_proto != ni.protocol)
371 continue;
372 if (host->h_version != ni.version)
373 continue;
374 if (!rpc_cmp_addr(nlm_srcaddr(host), src_sap))
375 continue;
376
377 /* Move to head of hash chain. */
378 hlist_del(&host->h_hash);
379 hlist_add_head(&host->h_hash, chain);
380
381 nlm_get_host(host);
382 dprintk("lockd: %s found host %s (%s)\n",
383 __func__, host->h_name, host->h_addrbuf);
384 goto out;
314 } 385 }
315 386
316 return nlm_lookup_host(&ni); 387 host = nlm_alloc_host(&ni, nsm);
388 if (unlikely(host == NULL))
389 goto out;
390
391 memcpy(nlm_srcaddr(host), src_sap, src_len);
392 host->h_srcaddrlen = src_len;
393 hlist_add_head(&host->h_hash, chain);
394 nrhosts++;
395
396 dprintk("lockd: %s created host %s (%s)\n",
397 __func__, host->h_name, host->h_addrbuf);
398
399out:
400 mutex_unlock(&nlm_host_mutex);
401 return host;
402}
403
404/**
405 * nlmsvc_release_host - release server nlm_host
406 * @host: nlm_host to release
407 *
408 * Host is destroyed later in nlm_gc_hosts().
409 */
410void nlmsvc_release_host(struct nlm_host *host)
411{
412 if (host == NULL)
413 return;
414
415 dprintk("lockd: release server host %s\n", host->h_name);
416
417 BUG_ON(atomic_read(&host->h_count) < 0);
418 BUG_ON(!host->h_server);
419 atomic_dec(&host->h_count);
317} 420}
318 421
319/* 422/*
@@ -413,20 +516,28 @@ struct nlm_host * nlm_get_host(struct nlm_host *host)
413 return host; 516 return host;
414} 517}
415 518
416/* 519static struct nlm_host *next_host_state(struct hlist_head *cache,
417 * Release NLM host after use 520 struct nsm_handle *nsm,
418 */ 521 const struct nlm_reboot *info)
419void nlm_release_host(struct nlm_host *host)
420{ 522{
421 if (host != NULL) { 523 struct nlm_host *host = NULL;
422 dprintk("lockd: release host %s\n", host->h_name); 524 struct hlist_head *chain;
423 BUG_ON(atomic_read(&host->h_count) < 0); 525 struct hlist_node *pos;
424 if (atomic_dec_and_test(&host->h_count)) { 526
425 BUG_ON(!list_empty(&host->h_lockowners)); 527 mutex_lock(&nlm_host_mutex);
426 BUG_ON(!list_empty(&host->h_granted)); 528 for_each_host(host, pos, chain, cache) {
427 BUG_ON(!list_empty(&host->h_reclaim)); 529 if (host->h_nsmhandle == nsm
530 && host->h_nsmstate != info->state) {
531 host->h_nsmstate = info->state;
532 host->h_state++;
533
534 nlm_get_host(host);
535 goto out;
428 } 536 }
429 } 537 }
538out:
539 mutex_unlock(&nlm_host_mutex);
540 return host;
430} 541}
431 542
432/** 543/**
@@ -438,8 +549,6 @@ void nlm_release_host(struct nlm_host *host)
438 */ 549 */
439void nlm_host_rebooted(const struct nlm_reboot *info) 550void nlm_host_rebooted(const struct nlm_reboot *info)
440{ 551{
441 struct hlist_head *chain;
442 struct hlist_node *pos;
443 struct nsm_handle *nsm; 552 struct nsm_handle *nsm;
444 struct nlm_host *host; 553 struct nlm_host *host;
445 554
@@ -452,32 +561,15 @@ void nlm_host_rebooted(const struct nlm_reboot *info)
452 * lock for this. 561 * lock for this.
453 * To avoid processing a host several times, we match the nsmstate. 562 * To avoid processing a host several times, we match the nsmstate.
454 */ 563 */
455again: mutex_lock(&nlm_host_mutex); 564 while ((host = next_host_state(nlm_server_hosts, nsm, info)) != NULL) {
456 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 565 nlmsvc_free_host_resources(host);
457 hlist_for_each_entry(host, pos, chain, h_hash) { 566 nlmsvc_release_host(host);
458 if (host->h_nsmhandle == nsm
459 && host->h_nsmstate != info->state) {
460 host->h_nsmstate = info->state;
461 host->h_state++;
462
463 nlm_get_host(host);
464 mutex_unlock(&nlm_host_mutex);
465
466 if (host->h_server) {
467 /* We're server for this guy, just ditch
468 * all the locks he held. */
469 nlmsvc_free_host_resources(host);
470 } else {
471 /* He's the server, initiate lock recovery. */
472 nlmclnt_recovery(host);
473 }
474
475 nlm_release_host(host);
476 goto again;
477 }
478 }
479 } 567 }
480 mutex_unlock(&nlm_host_mutex); 568 while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) {
569 nlmclnt_recovery(host);
570 nlmclnt_release_host(host);
571 }
572
481 nsm_release(nsm); 573 nsm_release(nsm);
482} 574}
483 575
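
/*
 * A toy, single-threaded reduction of the pattern next_host_state()
 * enables above: each scan finds one host that has not yet seen the
 * new NSM state, updates it (so it cannot match again), and hands it
 * back; the caller then does the slow work with no lock held and
 * simply loops until the scan comes back empty.  Names and types are
 * illustrative only.
 */
#include <stddef.h>
#include <stdio.h>

struct host {
        const char *name;
        int         nsmstate;
};

static struct host hosts[] = {
        { "alpha", 1 }, { "beta", 1 }, { "gamma", 2 },
};

static struct host *next_host_state(int new_state)
{
        for (size_t i = 0; i < sizeof(hosts) / sizeof(hosts[0]); i++) {
                if (hosts[i].nsmstate != new_state) {
                        hosts[i].nsmstate = new_state;  /* won't match again */
                        return &hosts[i];
                }
        }
        return NULL;
}

int main(void)
{
        struct host *h;

        while ((h = next_host_state(2)) != NULL)
                printf("processing reboot of %s\n", h->name);  /* alpha, beta */
        return 0;
}
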
@@ -497,13 +589,11 @@ nlm_shutdown_hosts(void)
497 589
498 /* First, make all hosts eligible for gc */ 590 /* First, make all hosts eligible for gc */
499 dprintk("lockd: nuking all hosts...\n"); 591 dprintk("lockd: nuking all hosts...\n");
500 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 592 for_each_host(host, pos, chain, nlm_server_hosts) {
501 hlist_for_each_entry(host, pos, chain, h_hash) { 593 host->h_expires = jiffies - 1;
502 host->h_expires = jiffies - 1; 594 if (host->h_rpcclnt) {
503 if (host->h_rpcclnt) { 595 rpc_shutdown_client(host->h_rpcclnt);
504 rpc_shutdown_client(host->h_rpcclnt); 596 host->h_rpcclnt = NULL;
505 host->h_rpcclnt = NULL;
506 }
507 } 597 }
508 } 598 }
509 599
@@ -512,15 +602,13 @@ nlm_shutdown_hosts(void)
512 mutex_unlock(&nlm_host_mutex); 602 mutex_unlock(&nlm_host_mutex);
513 603
514 /* complain if any hosts are left */ 604 /* complain if any hosts are left */
515 if (nrhosts) { 605 if (nrhosts != 0) {
516 printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); 606 printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
517 dprintk("lockd: %d hosts left:\n", nrhosts); 607 dprintk("lockd: %lu hosts left:\n", nrhosts);
518 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 608 for_each_host(host, pos, chain, nlm_server_hosts) {
519 hlist_for_each_entry(host, pos, chain, h_hash) { 609 dprintk(" %s (cnt %d use %d exp %ld)\n",
520 dprintk(" %s (cnt %d use %d exp %ld)\n", 610 host->h_name, atomic_read(&host->h_count),
521 host->h_name, atomic_read(&host->h_count), 611 host->h_inuse, host->h_expires);
522 host->h_inuse, host->h_expires);
523 }
524 } 612 }
525 } 613 }
526} 614}
@@ -538,29 +626,22 @@ nlm_gc_hosts(void)
538 struct nlm_host *host; 626 struct nlm_host *host;
539 627
540 dprintk("lockd: host garbage collection\n"); 628 dprintk("lockd: host garbage collection\n");
541 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 629 for_each_host(host, pos, chain, nlm_server_hosts)
542 hlist_for_each_entry(host, pos, chain, h_hash) 630 host->h_inuse = 0;
543 host->h_inuse = 0;
544 }
545 631
546 /* Mark all hosts that hold locks, blocks or shares */ 632 /* Mark all hosts that hold locks, blocks or shares */
547 nlmsvc_mark_resources(); 633 nlmsvc_mark_resources();
548 634
549 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 635 for_each_host_safe(host, pos, next, chain, nlm_server_hosts) {
550 hlist_for_each_entry_safe(host, pos, next, chain, h_hash) { 636 if (atomic_read(&host->h_count) || host->h_inuse
551 if (atomic_read(&host->h_count) || host->h_inuse 637 || time_before(jiffies, host->h_expires)) {
552 || time_before(jiffies, host->h_expires)) { 638 dprintk("nlm_gc_hosts skipping %s "
553 dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n", 639 "(cnt %d use %d exp %ld)\n",
554 host->h_name, atomic_read(&host->h_count), 640 host->h_name, atomic_read(&host->h_count),
555 host->h_inuse, host->h_expires); 641 host->h_inuse, host->h_expires);
556 continue; 642 continue;
557 }
558 dprintk("lockd: delete host %s\n", host->h_name);
559 hlist_del_init(&host->h_hash);
560
561 nlm_destroy_host(host);
562 nrhosts--;
563 } 643 }
644 nlm_destroy_host_locked(host);
564 } 645 }
565 646
566 next_gc = jiffies + NLM_HOST_COLLECT; 647 next_gc = jiffies + NLM_HOST_COLLECT;
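
/*
 * The garbage collector above is a small mark-and-sweep: clear every
 * in-use mark, let the lock subsystem re-mark live hosts, then free
 * whatever is left unmarked, unreferenced, and expired.  A toy
 * reduction (illustrative names; no locking or expiry times):
 */
#include <stddef.h>
#include <stdio.h>

struct host {
        const char *name;
        int         refcount;
        int         inuse;
};

static struct host hosts[] = {
        { "alpha", 0, 0 }, { "beta", 1, 0 }, { "gamma", 0, 0 },
};

#define NHOSTS (sizeof(hosts) / sizeof(hosts[0]))

static void mark_resources(void)
{
        hosts[2].inuse = 1;     /* stands in for nlmsvc_mark_resources() */
}

int main(void)
{
        size_t i;

        for (i = 0; i < NHOSTS; i++)
                hosts[i].inuse = 0;             /* clear all marks */
        mark_resources();                       /* re-mark live hosts */
        for (i = 0; i < NHOSTS; i++) {
                if (hosts[i].refcount || hosts[i].inuse)
                        continue;               /* referenced or still live */
                printf("destroying %s\n", hosts[i].name);
        }
        return 0;
}
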
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e0c918949644..23d7451b2938 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -401,26 +401,22 @@ void nsm_release(struct nsm_handle *nsm)
401 * Status Monitor wire protocol. 401 * Status Monitor wire protocol.
402 */ 402 */
403 403
404static int encode_nsm_string(struct xdr_stream *xdr, const char *string) 404static void encode_nsm_string(struct xdr_stream *xdr, const char *string)
405{ 405{
406 const u32 len = strlen(string); 406 const u32 len = strlen(string);
407 __be32 *p; 407 __be32 *p;
408 408
409 if (unlikely(len > SM_MAXSTRLEN)) 409 BUG_ON(len > SM_MAXSTRLEN);
410 return -EIO; 410 p = xdr_reserve_space(xdr, 4 + len);
411 p = xdr_reserve_space(xdr, sizeof(u32) + len);
412 if (unlikely(p == NULL))
413 return -EIO;
414 xdr_encode_opaque(p, string, len); 411 xdr_encode_opaque(p, string, len);
415 return 0;
416} 412}
417 413
418/* 414/*
419 * "mon_name" specifies the host to be monitored. 415 * "mon_name" specifies the host to be monitored.
420 */ 416 */
421static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) 417static void encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
422{ 418{
423 return encode_nsm_string(xdr, argp->mon_name); 419 encode_nsm_string(xdr, argp->mon_name);
424} 420}
425 421
426/* 422/*
@@ -429,35 +425,25 @@ static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
429 * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name" 425 * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
430 * has changed. 426 * has changed.
431 */ 427 */
432static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) 428static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
433{ 429{
434 int status;
435 __be32 *p; 430 __be32 *p;
436 431
437 status = encode_nsm_string(xdr, utsname()->nodename); 432 encode_nsm_string(xdr, utsname()->nodename);
438 if (unlikely(status != 0)) 433 p = xdr_reserve_space(xdr, 4 + 4 + 4);
439 return status; 434 *p++ = cpu_to_be32(argp->prog);
440 p = xdr_reserve_space(xdr, 3 * sizeof(u32)); 435 *p++ = cpu_to_be32(argp->vers);
441 if (unlikely(p == NULL)) 436 *p = cpu_to_be32(argp->proc);
442 return -EIO;
443 *p++ = htonl(argp->prog);
444 *p++ = htonl(argp->vers);
445 *p++ = htonl(argp->proc);
446 return 0;
447} 437}
448 438
449/* 439/*
450 * The "mon_id" argument specifies the non-private arguments 440 * The "mon_id" argument specifies the non-private arguments
451 * of an NSMPROC_MON or NSMPROC_UNMON call. 441 * of an NSMPROC_MON or NSMPROC_UNMON call.
452 */ 442 */
453static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) 443static void encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
454{ 444{
455 int status; 445 encode_mon_name(xdr, argp);
456 446 encode_my_id(xdr, argp);
457 status = encode_mon_name(xdr, argp);
458 if (unlikely(status != 0))
459 return status;
460 return encode_my_id(xdr, argp);
461} 447}
462 448
463/* 449/*
@@ -465,68 +451,56 @@ static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
465 * by the NSMPROC_MON call. This information will be supplied in the 451 * by the NSMPROC_MON call. This information will be supplied in the
466 * NLMPROC_SM_NOTIFY call. 452 * NLMPROC_SM_NOTIFY call.
467 */ 453 */
468static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) 454static void encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
469{ 455{
470 __be32 *p; 456 __be32 *p;
471 457
472 p = xdr_reserve_space(xdr, SM_PRIV_SIZE); 458 p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
473 if (unlikely(p == NULL))
474 return -EIO;
475 xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE); 459 xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
476 return 0;
477} 460}
478 461
479static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p, 462static void nsm_xdr_enc_mon(struct rpc_rqst *req, struct xdr_stream *xdr,
480 const struct nsm_args *argp) 463 const struct nsm_args *argp)
481{ 464{
482 struct xdr_stream xdr; 465 encode_mon_id(xdr, argp);
483 int status; 466 encode_priv(xdr, argp);
484
485 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
486 status = encode_mon_id(&xdr, argp);
487 if (unlikely(status))
488 return status;
489 return encode_priv(&xdr, argp);
490} 467}
491 468
492static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p, 469static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr,
493 const struct nsm_args *argp) 470 const struct nsm_args *argp)
494{ 471{
495 struct xdr_stream xdr; 472 encode_mon_id(xdr, argp);
496
497 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
498 return encode_mon_id(&xdr, argp);
499} 473}
500 474
501static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p, 475static int nsm_xdr_dec_stat_res(struct rpc_rqst *rqstp,
502 struct nsm_res *resp) 476 struct xdr_stream *xdr,
477 struct nsm_res *resp)
503{ 478{
504 struct xdr_stream xdr; 479 __be32 *p;
505 480
506 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 481 p = xdr_inline_decode(xdr, 4 + 4);
507 p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
508 if (unlikely(p == NULL)) 482 if (unlikely(p == NULL))
509 return -EIO; 483 return -EIO;
510 resp->status = ntohl(*p++); 484 resp->status = be32_to_cpup(p++);
511 resp->state = ntohl(*p); 485 resp->state = be32_to_cpup(p);
512 486
513 dprintk("lockd: xdr_dec_stat_res status %d state %d\n", 487 dprintk("lockd: %s status %d state %d\n",
514 resp->status, resp->state); 488 __func__, resp->status, resp->state);
515 return 0; 489 return 0;
516} 490}
517 491
518static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p, 492static int nsm_xdr_dec_stat(struct rpc_rqst *rqstp,
519 struct nsm_res *resp) 493 struct xdr_stream *xdr,
494 struct nsm_res *resp)
520{ 495{
521 struct xdr_stream xdr; 496 __be32 *p;
522 497
523 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 498 p = xdr_inline_decode(xdr, 4);
524 p = xdr_inline_decode(&xdr, sizeof(u32));
525 if (unlikely(p == NULL)) 499 if (unlikely(p == NULL))
526 return -EIO; 500 return -EIO;
527 resp->state = ntohl(*p); 501 resp->state = be32_to_cpup(p);
528 502
529 dprintk("lockd: xdr_dec_stat state %d\n", resp->state); 503 dprintk("lockd: %s state %d\n", __func__, resp->state);
530 return 0; 504 return 0;
531} 505}
532 506
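
/*
 * The encoders above all reduce to the XDR opaque/string layout:
 * reserve a length word plus payload, write a big-endian length, copy
 * the bytes, and zero the pad out to the next 4-byte boundary.  A
 * self-contained user-space sketch of that encode step (names are
 * illustrative only):
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t *encode_opaque(uint8_t *p, const void *data, uint32_t len)
{
        uint32_t padded = (len + 3) & ~3u;

        p[0] = (uint8_t)(len >> 24);
        p[1] = (uint8_t)(len >> 16);
        p[2] = (uint8_t)(len >> 8);
        p[3] = (uint8_t)len;
        p += 4;
        memcpy(p, data, len);
        memset(p + len, 0, padded - len);       /* XDR zero padding */
        return p + padded;
}

int main(void)
{
        uint8_t  buf[16];
        uint8_t *end = encode_opaque(buf, "abc", 3);

        printf("encoded %d bytes\n", (int)(end - buf)); /* 8: 4 + 3 + 1 pad */
        return 0;
}
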
@@ -542,8 +516,8 @@ static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
542static struct rpc_procinfo nsm_procedures[] = { 516static struct rpc_procinfo nsm_procedures[] = {
543[NSMPROC_MON] = { 517[NSMPROC_MON] = {
544 .p_proc = NSMPROC_MON, 518 .p_proc = NSMPROC_MON,
545 .p_encode = (kxdrproc_t)xdr_enc_mon, 519 .p_encode = (kxdreproc_t)nsm_xdr_enc_mon,
546 .p_decode = (kxdrproc_t)xdr_dec_stat_res, 520 .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat_res,
547 .p_arglen = SM_mon_sz, 521 .p_arglen = SM_mon_sz,
548 .p_replen = SM_monres_sz, 522 .p_replen = SM_monres_sz,
549 .p_statidx = NSMPROC_MON, 523 .p_statidx = NSMPROC_MON,
@@ -551,8 +525,8 @@ static struct rpc_procinfo nsm_procedures[] = {
551 }, 525 },
552[NSMPROC_UNMON] = { 526[NSMPROC_UNMON] = {
553 .p_proc = NSMPROC_UNMON, 527 .p_proc = NSMPROC_UNMON,
554 .p_encode = (kxdrproc_t)xdr_enc_unmon, 528 .p_encode = (kxdreproc_t)nsm_xdr_enc_unmon,
555 .p_decode = (kxdrproc_t)xdr_dec_stat, 529 .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat,
556 .p_arglen = SM_mon_id_sz, 530 .p_arglen = SM_mon_id_sz,
557 .p_replen = SM_unmonres_sz, 531 .p_replen = SM_unmonres_sz,
558 .p_statidx = NSMPROC_UNMON, 532 .p_statidx = NSMPROC_UNMON,
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 38d261192453..9a41fdc19511 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -51,7 +51,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
51 return 0; 51 return 0;
52 52
53no_locks: 53no_locks:
54 nlm_release_host(host); 54 nlmsvc_release_host(host);
55 if (error) 55 if (error)
56 return error; 56 return error;
57 return nlm_lck_denied_nolocks; 57 return nlm_lck_denied_nolocks;
@@ -92,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
92 else 92 else
93 dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); 93 dprintk("lockd: TEST4 status %d\n", ntohl(resp->status));
94 94
95 nlm_release_host(host); 95 nlmsvc_release_host(host);
96 nlm_release_file(file); 96 nlm_release_file(file);
97 return rc; 97 return rc;
98} 98}
@@ -134,7 +134,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
134 else 134 else
135 dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); 135 dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
136 136
137 nlm_release_host(host); 137 nlmsvc_release_host(host);
138 nlm_release_file(file); 138 nlm_release_file(file);
139 return rc; 139 return rc;
140} 140}
@@ -164,7 +164,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
164 resp->status = nlmsvc_cancel_blocked(file, &argp->lock); 164 resp->status = nlmsvc_cancel_blocked(file, &argp->lock);
165 165
166 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); 166 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status));
167 nlm_release_host(host); 167 nlmsvc_release_host(host);
168 nlm_release_file(file); 168 nlm_release_file(file);
169 return rpc_success; 169 return rpc_success;
170} 170}
@@ -197,7 +197,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
197 resp->status = nlmsvc_unlock(file, &argp->lock); 197 resp->status = nlmsvc_unlock(file, &argp->lock);
198 198
199 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); 199 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status));
200 nlm_release_host(host); 200 nlmsvc_release_host(host);
201 nlm_release_file(file); 201 nlm_release_file(file);
202 return rpc_success; 202 return rpc_success;
203} 203}
@@ -229,7 +229,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
229 229
230static void nlm4svc_callback_release(void *data) 230static void nlm4svc_callback_release(void *data)
231{ 231{
232 nlm_release_call(data); 232 nlmsvc_release_call(data);
233} 233}
234 234
235static const struct rpc_call_ops nlm4svc_callback_ops = { 235static const struct rpc_call_ops nlm4svc_callback_ops = {
@@ -261,7 +261,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
261 261
262 stat = func(rqstp, argp, &call->a_res); 262 stat = func(rqstp, argp, &call->a_res);
263 if (stat != 0) { 263 if (stat != 0) {
264 nlm_release_call(call); 264 nlmsvc_release_call(call);
265 return stat; 265 return stat;
266 } 266 }
267 267
@@ -334,7 +334,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
334 resp->status = nlmsvc_share_file(host, file, argp); 334 resp->status = nlmsvc_share_file(host, file, argp);
335 335
336 dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); 336 dprintk("lockd: SHARE status %d\n", ntohl(resp->status));
337 nlm_release_host(host); 337 nlmsvc_release_host(host);
338 nlm_release_file(file); 338 nlm_release_file(file);
339 return rpc_success; 339 return rpc_success;
340} 340}
@@ -367,7 +367,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
367 resp->status = nlmsvc_unshare_file(host, file, argp); 367 resp->status = nlmsvc_unshare_file(host, file, argp);
368 368
369 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); 369 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status));
370 nlm_release_host(host); 370 nlmsvc_release_host(host);
371 nlm_release_file(file); 371 nlm_release_file(file);
372 return rpc_success; 372 return rpc_success;
373} 373}
@@ -399,7 +399,7 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
399 return rpc_success; 399 return rpc_success;
400 400
401 nlmsvc_free_host_resources(host); 401 nlmsvc_free_host_resources(host);
402 nlm_release_host(host); 402 nlmsvc_release_host(host);
403 return rpc_success; 403 return rpc_success;
404} 404}
405 405
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index ef5659b211e9..6e31695d046f 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -46,6 +46,7 @@ static void nlmsvc_remove_block(struct nlm_block *block);
 static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
 static void nlmsvc_freegrantargs(struct nlm_rqst *call);
 static const struct rpc_call_ops nlmsvc_grant_ops;
+static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);
 
 /*
  * The list of blocked locks to retry
@@ -233,7 +234,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
 failed_free:
 	kfree(block);
 failed:
-	nlm_release_call(call);
+	nlmsvc_release_call(call);
 	return NULL;
 }
 
@@ -266,7 +267,7 @@ static void nlmsvc_free_block(struct kref *kref)
 	mutex_unlock(&file->f_mutex);
 
 	nlmsvc_freegrantargs(block->b_call);
-	nlm_release_call(block->b_call);
+	nlmsvc_release_call(block->b_call);
 	nlm_release_file(block->b_file);
 	kfree(block->b_fl);
 	kfree(block);
@@ -934,3 +935,32 @@ nlmsvc_retry_blocked(void)
 
 	return timeout;
 }
+
+#ifdef RPC_DEBUG
+static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
+{
+	/*
+	 * We can get away with a static buffer because we're only
+	 * called with BKL held.
+	 */
+	static char buf[2*NLM_MAXCOOKIELEN+1];
+	unsigned int i, len = sizeof(buf);
+	char *p = buf;
+
+	len--;	/* allow for trailing \0 */
+	if (len < 3)
+		return "???";
+	for (i = 0 ; i < cookie->len ; i++) {
+		if (len < 2) {
+			strcpy(p-3, "...");
+			break;
+		}
+		sprintf(p, "%02x", cookie->data[i]);
+		p += 2;
+		len -= 2;
+	}
+	*p = '\0';
+
+	return buf;
+}
+#endif
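
For reference, the nlmdbg_cookie2a() helper moved above renders an opaque NLM cookie as a hex string for dprintk(), truncating with "..." once the output would overflow its buffer. A stand-alone user-space sketch of the same formatting logic (NLM_MAXCOOKIELEN is assumed to be 32, matching the kernel's definition in include/linux/lockd/xdr.h):

	#include <stdio.h>
	#include <string.h>

	#define NLM_MAXCOOKIELEN 32	/* assumed; mirrors the kernel header */

	struct nlm_cookie {
		unsigned char data[NLM_MAXCOOKIELEN];
		unsigned int len;
	};

	/* Render a cookie as hex; truncate with "..." if it cannot fit. */
	static const char *cookie2a(const struct nlm_cookie *cookie)
	{
		static char buf[2 * NLM_MAXCOOKIELEN + 1];
		unsigned int i, len = sizeof(buf) - 1;	/* reserve the '\0' */
		char *p = buf;

		for (i = 0; i < cookie->len; i++) {
			if (len < 2) {			/* no room for another byte */
				strcpy(p - 3, "...");
				break;
			}
			sprintf(p, "%02x", cookie->data[i]);
			p += 2;
			len -= 2;
		}
		*p = '\0';
		return buf;
	}

	int main(void)
	{
		struct nlm_cookie c = { .data = { 0xde, 0xad, 0xbe, 0xef }, .len = 4 };
		printf("%s\n", cookie2a(&c));	/* prints "deadbeef" */
		return 0;
	}

The static buffer is only safe because, as the comment in the patch notes, callers are serialized; the sketch inherits that assumption.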
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 0caea5310ac3..d27aab11f324 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -80,7 +80,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
 	return 0;
 
 no_locks:
-	nlm_release_host(host);
+	nlmsvc_release_host(host);
 	if (error)
 		return error;
 	return nlm_lck_denied_nolocks;
@@ -122,7 +122,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 	dprintk("lockd: TEST status %d vers %d\n",
 		ntohl(resp->status), rqstp->rq_vers);
 
-	nlm_release_host(host);
+	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rc;
 }
@@ -164,7 +164,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	else
 		dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
 
-	nlm_release_host(host);
+	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rc;
 }
@@ -194,7 +194,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock));
 
 	dprintk("lockd: CANCEL status %d\n", ntohl(resp->status));
-	nlm_release_host(host);
+	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
 }
@@ -227,7 +227,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->status = cast_status(nlmsvc_unlock(file, &argp->lock));
 
 	dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status));
-	nlm_release_host(host);
+	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
 }
@@ -257,9 +257,17 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
 			-task->tk_status);
 }
 
+void nlmsvc_release_call(struct nlm_rqst *call)
+{
+	if (!atomic_dec_and_test(&call->a_count))
+		return;
+	nlmsvc_release_host(call->a_host);
+	kfree(call);
+}
+
 static void nlmsvc_callback_release(void *data)
 {
-	nlm_release_call(data);
+	nlmsvc_release_call(data);
 }
 
 static const struct rpc_call_ops nlmsvc_callback_ops = {
@@ -291,7 +299,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
 
 	stat = func(rqstp, argp, &call->a_res);
 	if (stat != 0) {
-		nlm_release_call(call);
+		nlmsvc_release_call(call);
 		return stat;
 	}
 
@@ -366,7 +374,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->status = cast_status(nlmsvc_share_file(host, file, argp));
 
 	dprintk("lockd: SHARE status %d\n", ntohl(resp->status));
-	nlm_release_host(host);
+	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
 }
@@ -399,7 +407,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->status = cast_status(nlmsvc_unshare_file(host, file, argp));
 
 	dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status));
-	nlm_release_host(host);
+	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rpc_success;
 }
@@ -431,7 +439,7 @@ nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
 		return rpc_success;
 
 	nlmsvc_free_host_resources(host);
-	nlm_release_host(host);
+	nlmsvc_release_host(host);
 	return rpc_success;
 }
 
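
The nlmsvc_release_call() added above is the usual atomic reference-count release idiom: only the caller that drops the final reference frees the request and releases the host it pins. A minimal sketch of the pattern in portable C11 (hypothetical names; the kernel path uses atomic_dec_and_test() on call->a_count):

	#include <stdatomic.h>
	#include <stdlib.h>

	struct call {
		atomic_int refcount;	/* plays the role of call->a_count */
		void *host;		/* resource pinned for the call's lifetime */
	};

	static void release_host(void *host)
	{
		(void)host;		/* drop the host reference here */
	}

	/* Drop one reference; whoever drops the last one frees everything. */
	static void call_put(struct call *c)
	{
		/* atomic_fetch_sub returns the old value: old == 1 means last ref */
		if (atomic_fetch_sub(&c->refcount, 1) != 1)
			return;
		release_host(c->host);
		free(c);
	}

The point of splitting nlm_release_call() into client and server variants is visible here: the server variant must release its host through nlmsvc_release_host(), which this patch pairs with the server-side host reference counting.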
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index b583ab0a4cbb..964666c68a86 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -149,37 +149,6 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
 }
 
 /*
- * Encode a lock as part of an NLM call
- */
-static __be32 *
-nlm_encode_lock(__be32 *p, struct nlm_lock *lock)
-{
-	struct file_lock	*fl = &lock->fl;
-	__s32			start, len;
-
-	if (!(p = xdr_encode_string(p, lock->caller))
-	 || !(p = nlm_encode_fh(p, &lock->fh))
-	 || !(p = nlm_encode_oh(p, &lock->oh)))
-		return NULL;
-
-	if (fl->fl_start > NLM_OFFSET_MAX
-	 || (fl->fl_end > NLM_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
-		return NULL;
-
-	start = loff_t_to_s32(fl->fl_start);
-	if (fl->fl_end == OFFSET_MAX)
-		len = 0;
-	else
-		len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
-
-	*p++ = htonl(lock->svid);
-	*p++ = htonl(start);
-	*p++ = htonl(len);
-
-	return p;
-}
-
-/*
  * Encode result of a TEST/TEST_MSG call
  */
 static __be32 *
@@ -372,259 +341,3 @@ nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_ressize_check(rqstp, p);
 }
-
-/*
- * Now, the client side XDR functions
- */
-#ifdef NLMCLNT_SUPPORT_SHARES
-static int
-nlmclt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr)
-{
-	return 0;
-}
-#endif
-
-static int
-nlmclt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-	struct nlm_lock	*lock = &argp->lock;
-
-	if (!(p = nlm_encode_cookie(p, &argp->cookie)))
-		return -EIO;
-	*p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-	if (!(p = nlm_encode_lock(p, lock)))
-		return -EIO;
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
-}
-
-static int
-nlmclt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-	if (!(p = nlm_decode_cookie(p, &resp->cookie)))
-		return -EIO;
-	resp->status = *p++;
-	if (resp->status == nlm_lck_denied) {
-		struct file_lock	*fl = &resp->lock.fl;
-		u32			excl;
-		s32			start, len, end;
-
-		memset(&resp->lock, 0, sizeof(resp->lock));
-		locks_init_lock(fl);
-		excl = ntohl(*p++);
-		resp->lock.svid = ntohl(*p++);
-		fl->fl_pid = (pid_t)resp->lock.svid;
-		if (!(p = nlm_decode_oh(p, &resp->lock.oh)))
-			return -EIO;
-
-		fl->fl_flags = FL_POSIX;
-		fl->fl_type  = excl? F_WRLCK : F_RDLCK;
-		start = ntohl(*p++);
-		len = ntohl(*p++);
-		end = start + len - 1;
-
-		fl->fl_start = s32_to_loff_t(start);
-		if (len == 0 || end < 0)
-			fl->fl_end = OFFSET_MAX;
-		else
-			fl->fl_end = s32_to_loff_t(end);
-	}
-	return 0;
-}
-
-
-static int
-nlmclt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-	struct nlm_lock	*lock = &argp->lock;
-
-	if (!(p = nlm_encode_cookie(p, &argp->cookie)))
-		return -EIO;
-	*p++ = argp->block? xdr_one : xdr_zero;
-	*p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-	if (!(p = nlm_encode_lock(p, lock)))
-		return -EIO;
-	*p++ = argp->reclaim? xdr_one : xdr_zero;
-	*p++ = htonl(argp->state);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
-}
-
-static int
-nlmclt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-	struct nlm_lock	*lock = &argp->lock;
-
-	if (!(p = nlm_encode_cookie(p, &argp->cookie)))
-		return -EIO;
-	*p++ = argp->block? xdr_one : xdr_zero;
-	*p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-	if (!(p = nlm_encode_lock(p, lock)))
-		return -EIO;
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
-}
-
-static int
-nlmclt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-	struct nlm_lock	*lock = &argp->lock;
-
-	if (!(p = nlm_encode_cookie(p, &argp->cookie)))
-		return -EIO;
-	if (!(p = nlm_encode_lock(p, lock)))
-		return -EIO;
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
-}
-
-static int
-nlmclt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-	if (!(p = nlm_encode_cookie(p, &resp->cookie)))
-		return -EIO;
-	*p++ = resp->status;
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
-}
-
-static int
-nlmclt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-	if (!(p = nlm_encode_testres(p, resp)))
-		return -EIO;
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
-}
-
-static int
-nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-	if (!(p = nlm_decode_cookie(p, &resp->cookie)))
-		return -EIO;
-	resp->status = *p++;
-	return 0;
-}
-
-#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
-#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
-#endif
-
-/*
- * Buffer requirements for NLM
- */
-#define NLM_void_sz		0
-#define NLM_cookie_sz		1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
-#define NLM_caller_sz		1+XDR_QUADLEN(NLMCLNT_OHSIZE)
-#define NLM_owner_sz		1+XDR_QUADLEN(NLMCLNT_OHSIZE)
-#define NLM_fhandle_sz		1+XDR_QUADLEN(NFS2_FHSIZE)
-#define NLM_lock_sz		3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz
-#define NLM_holder_sz		4+NLM_owner_sz
-
-#define NLM_testargs_sz		NLM_cookie_sz+1+NLM_lock_sz
-#define NLM_lockargs_sz		NLM_cookie_sz+4+NLM_lock_sz
-#define NLM_cancargs_sz		NLM_cookie_sz+2+NLM_lock_sz
-#define NLM_unlockargs_sz	NLM_cookie_sz+NLM_lock_sz
-
-#define NLM_testres_sz		NLM_cookie_sz+1+NLM_holder_sz
-#define NLM_res_sz		NLM_cookie_sz+1
-#define NLM_norep_sz		0
-
-/*
- * For NLM, a void procedure really returns nothing
- */
-#define nlmclt_decode_norep	NULL
-
-#define PROC(proc, argtype, restype)				\
-[NLMPROC_##proc] = {						\
-	.p_proc      = NLMPROC_##proc,				\
-	.p_encode    = (kxdrproc_t) nlmclt_encode_##argtype,	\
-	.p_decode    = (kxdrproc_t) nlmclt_decode_##restype,	\
-	.p_arglen    = NLM_##argtype##_sz,			\
-	.p_replen    = NLM_##restype##_sz,			\
-	.p_statidx   = NLMPROC_##proc,				\
-	.p_name      = #proc,					\
-	}
-
-static struct rpc_procinfo	nlm_procedures[] = {
-	PROC(TEST,		testargs,	testres),
-	PROC(LOCK,		lockargs,	res),
-	PROC(CANCEL,		cancargs,	res),
-	PROC(UNLOCK,		unlockargs,	res),
-	PROC(GRANTED,		testargs,	res),
-	PROC(TEST_MSG,		testargs,	norep),
-	PROC(LOCK_MSG,		lockargs,	norep),
-	PROC(CANCEL_MSG,	cancargs,	norep),
-	PROC(UNLOCK_MSG,	unlockargs,	norep),
-	PROC(GRANTED_MSG,	testargs,	norep),
-	PROC(TEST_RES,		testres,	norep),
-	PROC(LOCK_RES,		res,		norep),
-	PROC(CANCEL_RES,	res,		norep),
-	PROC(UNLOCK_RES,	res,		norep),
-	PROC(GRANTED_RES,	res,		norep),
-#ifdef NLMCLNT_SUPPORT_SHARES
-	PROC(SHARE,		shareargs,	shareres),
-	PROC(UNSHARE,		shareargs,	shareres),
-	PROC(NM_LOCK,		lockargs,	res),
-	PROC(FREE_ALL,		notify,		void),
-#endif
-};
-
-static struct rpc_version	nlm_version1 = {
-	.number		= 1,
-	.nrprocs	= 16,
-	.procs		= nlm_procedures,
-};
-
-static struct rpc_version	nlm_version3 = {
-	.number		= 3,
-	.nrprocs	= 24,
-	.procs		= nlm_procedures,
-};
-
-static struct rpc_version	*nlm_versions[] = {
-	[1] = &nlm_version1,
-	[3] = &nlm_version3,
-#ifdef CONFIG_LOCKD_V4
-	[4] = &nlm_version4,
-#endif
-};
-
-static struct rpc_stat		nlm_stats;
-
-struct rpc_program		nlm_program = {
-	.name		= "lockd",
-	.number		= NLM_PROGRAM,
-	.nrvers		= ARRAY_SIZE(nlm_versions),
-	.version	= nlm_versions,
-	.stats		= &nlm_stats,
-};
-
-#ifdef RPC_DEBUG
-const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
-{
-	/*
-	 * We can get away with a static buffer because we're only
-	 * called with BKL held.
-	 */
-	static char buf[2*NLM_MAXCOOKIELEN+1];
-	unsigned int i, len = sizeof(buf);
-	char *p = buf;
-
-	len--;	/* allow for trailing \0 */
-	if (len < 3)
-		return "???";
-	for (i = 0 ; i < cookie->len ; i++) {
-		if (len < 2) {
-			strcpy(p-3, "...");
-			break;
-		}
-		sprintf(p, "%02x", cookie->data[i]);
-		p += 2;
-		len -= 2;
-	}
-	*p = '\0';
-
-	return buf;
-}
-#endif
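
The client-side XDR removed here moves to the new fs/lockd/clntxdr.c (see the diffstat at the top of this patch). The PROC() macro in the removed table builds one struct rpc_procinfo entry per NLM procedure; a compilable hand-expansion of what PROC(TEST, testargs, testres) produced, for orientation (field values marked illustrative are assumptions, not the kernel's exact numbers):

	/* Illustrative expansion of PROC(TEST, testargs, testres). */
	struct rpc_procinfo_sketch {
		unsigned int	p_proc;
		int		(*p_encode)(void);
		int		(*p_decode)(void);
		unsigned int	p_arglen;	/* in 32-bit XDR words */
		unsigned int	p_replen;	/* in 32-bit XDR words */
		unsigned int	p_statidx;
		const char	*p_name;
	};

	enum { NLMPROC_TEST = 1 };

	static int encode_testargs_stub(void) { return 0; }
	static int decode_testres_stub(void)  { return 0; }

	static struct rpc_procinfo_sketch nlm_procedures_sketch[] = {
		[NLMPROC_TEST] = {
			.p_proc    = NLMPROC_TEST,
			.p_encode  = encode_testargs_stub,
			.p_decode  = decode_testres_stub,
			.p_arglen  = 29,	/* cookie + excl flag + lock; illustrative */
			.p_replen  = 13,	/* cookie + status + holder; illustrative */
			.p_statidx = NLMPROC_TEST,
			.p_name    = "TEST",
		},
	};

The array-designator form means each procedure lands at the slot matching its NLM procedure number, which is why nrprocs can simply be the table's logical length.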
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index ad9dbbc9145d..dfa4789cd460 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -93,15 +93,6 @@ nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
 	return p + XDR_QUADLEN(f->size);
 }
 
-static __be32 *
-nlm4_encode_fh(__be32 *p, struct nfs_fh *f)
-{
-	*p++ = htonl(f->size);
-	if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */
-	memcpy(p, f->data, f->size);
-	return p + XDR_QUADLEN(f->size);
-}
-
 /*
  * Encode and decode owner handle
  */
@@ -112,12 +103,6 @@ nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh)
 }
 
 static __be32 *
-nlm4_encode_oh(__be32 *p, struct xdr_netobj *oh)
-{
-	return xdr_encode_netobj(p, oh);
-}
-
-static __be32 *
 nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
 {
 	struct file_lock	*fl = &lock->fl;
@@ -150,38 +135,6 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
 }
 
 /*
- * Encode a lock as part of an NLM call
- */
-static __be32 *
-nlm4_encode_lock(__be32 *p, struct nlm_lock *lock)
-{
-	struct file_lock	*fl = &lock->fl;
-	__s64			start, len;
-
-	if (!(p = xdr_encode_string(p, lock->caller))
-	 || !(p = nlm4_encode_fh(p, &lock->fh))
-	 || !(p = nlm4_encode_oh(p, &lock->oh)))
-		return NULL;
-
-	if (fl->fl_start > NLM4_OFFSET_MAX
-	 || (fl->fl_end > NLM4_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
-		return NULL;
-
-	*p++ = htonl(lock->svid);
-
-	start = loff_t_to_s64(fl->fl_start);
-	if (fl->fl_end == OFFSET_MAX)
-		len = 0;
-	else
-		len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
-
-	p = xdr_encode_hyper(p, start);
-	p = xdr_encode_hyper(p, len);
-
-	return p;
-}
-
-/*
  * Encode result of a TEST/TEST_MSG call
  */
 static __be32 *
@@ -379,211 +332,3 @@ nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_ressize_check(rqstp, p);
 }
-
-/*
- * Now, the client side XDR functions
- */
-#ifdef NLMCLNT_SUPPORT_SHARES
-static int
-nlm4clt_decode_void(struct rpc_rqst *req, __be32 *p, void *ptr)
-{
-	return 0;
-}
-#endif
-
-static int
-nlm4clt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-	struct nlm_lock	*lock = &argp->lock;
-
-	if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
-		return -EIO;
-	*p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-	if (!(p = nlm4_encode_lock(p, lock)))
-		return -EIO;
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
-}
-
-static int
-nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-	if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
-		return -EIO;
-	resp->status = *p++;
-	if (resp->status == nlm_lck_denied) {
-		struct file_lock	*fl = &resp->lock.fl;
-		u32			excl;
-		__u64			start, len;
-		__s64			end;
-
-		memset(&resp->lock, 0, sizeof(resp->lock));
-		locks_init_lock(fl);
-		excl = ntohl(*p++);
-		resp->lock.svid = ntohl(*p++);
-		fl->fl_pid = (pid_t)resp->lock.svid;
-		if (!(p = nlm4_decode_oh(p, &resp->lock.oh)))
-			return -EIO;
-
-		fl->fl_flags = FL_POSIX;
-		fl->fl_type  = excl? F_WRLCK : F_RDLCK;
-		p = xdr_decode_hyper(p, &start);
-		p = xdr_decode_hyper(p, &len);
-		end = start + len - 1;
-
-		fl->fl_start = s64_to_loff_t(start);
-		if (len == 0 || end < 0)
-			fl->fl_end = OFFSET_MAX;
-		else
-			fl->fl_end = s64_to_loff_t(end);
-	}
-	return 0;
-}
-
-
-static int
-nlm4clt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-	struct nlm_lock	*lock = &argp->lock;
-
-	if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
-		return -EIO;
-	*p++ = argp->block? xdr_one : xdr_zero;
-	*p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-	if (!(p = nlm4_encode_lock(p, lock)))
-		return -EIO;
-	*p++ = argp->reclaim? xdr_one : xdr_zero;
-	*p++ = htonl(argp->state);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
-}
-
-static int
-nlm4clt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-	struct nlm_lock	*lock = &argp->lock;
-
-	if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
-		return -EIO;
-	*p++ = argp->block? xdr_one : xdr_zero;
-	*p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
-	if (!(p = nlm4_encode_lock(p, lock)))
-		return -EIO;
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
-}
-
-static int
-nlm4clt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
-{
-	struct nlm_lock	*lock = &argp->lock;
-
-	if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
-		return -EIO;
-	if (!(p = nlm4_encode_lock(p, lock)))
-		return -EIO;
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
-}
-
-static int
-nlm4clt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-	if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
-		return -EIO;
-	*p++ = resp->status;
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
-}
-
-static int
-nlm4clt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-	if (!(p = nlm4_encode_testres(p, resp)))
-		return -EIO;
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
-}
-
-static int
-nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
-{
-	if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
-		return -EIO;
-	resp->status = *p++;
-	return 0;
-}
-
-#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
-#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
-#endif
-
-#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
-#  error "NLM host name cannot be larger than NLM's maximum string length!"
-#endif
-
-/*
- * Buffer requirements for NLM
- */
-#define NLM4_void_sz		0
-#define NLM4_cookie_sz		1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
-#define NLM4_caller_sz		1+XDR_QUADLEN(NLMCLNT_OHSIZE)
-#define NLM4_owner_sz		1+XDR_QUADLEN(NLMCLNT_OHSIZE)
-#define NLM4_fhandle_sz		1+XDR_QUADLEN(NFS3_FHSIZE)
-#define NLM4_lock_sz		5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz
-#define NLM4_holder_sz		6+NLM4_owner_sz
-
-#define NLM4_testargs_sz	NLM4_cookie_sz+1+NLM4_lock_sz
-#define NLM4_lockargs_sz	NLM4_cookie_sz+4+NLM4_lock_sz
-#define NLM4_cancargs_sz	NLM4_cookie_sz+2+NLM4_lock_sz
-#define NLM4_unlockargs_sz	NLM4_cookie_sz+NLM4_lock_sz
-
-#define NLM4_testres_sz		NLM4_cookie_sz+1+NLM4_holder_sz
-#define NLM4_res_sz		NLM4_cookie_sz+1
-#define NLM4_norep_sz		0
-
-/*
- * For NLM, a void procedure really returns nothing
- */
-#define nlm4clt_decode_norep	NULL
-
-#define PROC(proc, argtype, restype)				\
-[NLMPROC_##proc] = {						\
-	.p_proc      = NLMPROC_##proc,				\
-	.p_encode    = (kxdrproc_t) nlm4clt_encode_##argtype,	\
-	.p_decode    = (kxdrproc_t) nlm4clt_decode_##restype,	\
-	.p_arglen    = NLM4_##argtype##_sz,			\
-	.p_replen    = NLM4_##restype##_sz,			\
-	.p_statidx   = NLMPROC_##proc,				\
-	.p_name      = #proc,					\
-	}
-
-static struct rpc_procinfo	nlm4_procedures[] = {
-	PROC(TEST,		testargs,	testres),
-	PROC(LOCK,		lockargs,	res),
-	PROC(CANCEL,		cancargs,	res),
-	PROC(UNLOCK,		unlockargs,	res),
-	PROC(GRANTED,		testargs,	res),
-	PROC(TEST_MSG,		testargs,	norep),
-	PROC(LOCK_MSG,		lockargs,	norep),
-	PROC(CANCEL_MSG,	cancargs,	norep),
-	PROC(UNLOCK_MSG,	unlockargs,	norep),
-	PROC(GRANTED_MSG,	testargs,	norep),
-	PROC(TEST_RES,		testres,	norep),
-	PROC(LOCK_RES,		res,		norep),
-	PROC(CANCEL_RES,	res,		norep),
-	PROC(UNLOCK_RES,	res,		norep),
-	PROC(GRANTED_RES,	res,		norep),
-#ifdef NLMCLNT_SUPPORT_SHARES
-	PROC(SHARE,		shareargs,	shareres),
-	PROC(UNSHARE,		shareargs,	shareres),
-	PROC(NM_LOCK,		lockargs,	res),
-	PROC(FREE_ALL,		notify,		void),
-#endif
-};
-
-struct rpc_version	nlm_version4 = {
-	.number		= 4,
-	.nrprocs	= 24,
-	.procs		= nlm4_procedures,
-};
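
The *_sz macros in the removed block count 32-bit XDR words, not bytes. A short self-contained sketch of the arithmetic behind NLM4_lock_sz, assuming a hypothetical 64-byte owner buffer (the real NLMCLNT_OHSIZE is derived from host-name limits elsewhere in lockd; NFS3_FHSIZE is 64 bytes):

	#include <stdio.h>

	#define XDR_QUADLEN(n)	(((n) + 3) >> 2)	/* bytes -> 4-byte words */
	#define NLMCLNT_OHSIZE	64			/* hypothetical owner size */
	#define NFS3_FHSIZE	64

	#define NLM4_caller_sz	(1 + XDR_QUADLEN(NLMCLNT_OHSIZE))	/* length word + data */
	#define NLM4_owner_sz	(1 + XDR_QUADLEN(NLMCLNT_OHSIZE))
	#define NLM4_fhandle_sz	(1 + XDR_QUADLEN(NFS3_FHSIZE))

	/* svid (1 word) + offset hyper (2 words) + length hyper (2 words) = 5 fixed */
	#define NLM4_lock_sz	(5 + NLM4_caller_sz + NLM4_owner_sz + NLM4_fhandle_sz)

	int main(void)
	{
		printf("NLM4_lock_sz = %d words (%d bytes)\n",
		       NLM4_lock_sz, NLM4_lock_sz * 4);	/* 56 words, 224 bytes */
		return 0;
	}

The NLMv4 lock needs 5 fixed words where NLMv3 needed 3, because offset and length grow from 32-bit to 64-bit hyper values; that is exactly the difference between the removed NLM_lock_sz and NLM4_lock_sz definitions.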
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 93444747237b..a25444ab2baf 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -76,18 +76,6 @@ EXPORT_SYMBOL(mb_cache_entry_find_first);
 EXPORT_SYMBOL(mb_cache_entry_find_next);
 #endif
 
-struct mb_cache {
-	struct list_head		c_cache_list;
-	const char			*c_name;
-	atomic_t			c_entry_count;
-	int				c_max_entries;
-	int				c_bucket_bits;
-	struct kmem_cache		*c_entry_cache;
-	struct list_head		*c_block_hash;
-	struct list_head		*c_index_hash;
-};
-
-
 /*
  * Global data: list of all mbcache's, lru list, and a spinlock for
  * accessing cache data structures on SMP machines. The lru list is
diff --git a/fs/namei.c b/fs/namei.c
index 19433cdba011..24ece10470b6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -202,7 +202,7 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
  * @inode:	inode to check access rights for
  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
  * @check_acl:	optional callback to check for Posix ACLs
- * @flags	IPERM_FLAG_ flags.
+ * @flags:	IPERM_FLAG_ flags.
  *
  * Used to check for read/write/execute permissions on a file.
  * We use "fsuid" for this, letting us set arbitrary permissions
@@ -407,7 +407,7 @@ void path_put_long(struct path *path)
 /**
  * nameidata_drop_rcu - drop this nameidata out of rcu-walk
  * @nd: nameidata pathwalk data to drop
- * @Returns: 0 on success, -ECHLID on failure
+ * Returns: 0 on success, -ECHILD on failure
  *
  * Path walking has 2 modes, rcu-walk and ref-walk (see
  * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
@@ -468,7 +468,7 @@ static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
  * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
  * @nd: nameidata pathwalk data to drop
  * @dentry: dentry to drop
- * @Returns: 0 on success, -ECHLID on failure
+ * Returns: 0 on success, -ECHILD on failure
  *
  * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
  * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
@@ -530,7 +530,7 @@ static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct d
 /**
  * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
  * @nd: nameidata pathwalk data to drop
- * @Returns: 0 on success, -ECHLID on failure
+ * Returns: 0 on success, -ECHILD on failure
  *
  * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
  * nd->path should be the final element of the lookup, so nd->root is discarded.
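
These namei.c hunks are kernel-doc corrections: parameter lines take the "@name:" form, while a return-value note is a plain "Returns:" line, not a bogus "@Returns:" parameter (the old form also hid the -ECHLID typo). A minimal sketch of the expected shape:

	/**
	 * example_drop_rcu - drop a pathwalk out of rcu-walk (illustrative)
	 * @nd: nameidata pathwalk data to drop
	 * Returns: 0 on success, -ECHILD on failure
	 *
	 * Longer description of the function goes here.
	 */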
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 93a8b3bd69e3..199016528fcb 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -16,9 +16,7 @@
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/sunrpc/svcauth_gss.h>
-#if defined(CONFIG_NFS_V4_1)
 #include <linux/sunrpc/bc_xprt.h>
-#endif
 
 #include <net/inet_sock.h>
 
@@ -137,6 +135,33 @@ out_err:
 
 #if defined(CONFIG_NFS_V4_1)
 /*
+ * CB_SEQUENCE operations will fail until the callback sessionid is set.
+ */
+int nfs4_set_callback_sessionid(struct nfs_client *clp)
+{
+	struct svc_serv *serv = clp->cl_rpcclient->cl_xprt->bc_serv;
+	struct nfs4_sessionid *bc_sid;
+
+	if (!serv->sv_bc_xprt)
+		return -EINVAL;
+
+	/* on success freed in xprt_free */
+	bc_sid = kmalloc(sizeof(struct nfs4_sessionid), GFP_KERNEL);
+	if (!bc_sid)
+		return -ENOMEM;
+	memcpy(bc_sid->data, &clp->cl_session->sess_id.data,
+	       NFS4_MAX_SESSIONID_LEN);
+	spin_lock_bh(&serv->sv_cb_lock);
+	serv->sv_bc_xprt->xpt_bc_sid = bc_sid;
+	spin_unlock_bh(&serv->sv_cb_lock);
+	dprintk("%s set xpt_bc_sid=%u:%u:%u:%u for sv_bc_xprt %p\n", __func__,
+		((u32 *)bc_sid->data)[0], ((u32 *)bc_sid->data)[1],
+		((u32 *)bc_sid->data)[2], ((u32 *)bc_sid->data)[3],
+		serv->sv_bc_xprt);
+	return 0;
+}
+
+/*
  * The callback service for NFSv4.1 callbacks
  */
 static int
@@ -177,30 +202,38 @@ nfs41_callback_svc(void *vrqstp)
 struct svc_rqst *
 nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
 {
-	struct svc_xprt *bc_xprt;
-	struct svc_rqst *rqstp = ERR_PTR(-ENOMEM);
+	struct svc_rqst *rqstp;
+	int ret;
 
-	dprintk("--> %s\n", __func__);
-	/* Create a svc_sock for the service */
-	bc_xprt = svc_sock_create(serv, xprt->prot);
-	if (!bc_xprt)
+	/*
+	 * Create an svc_sock for the back channel service that shares the
+	 * fore channel connection.
+	 * Returns the input port (0) and sets the svc_serv bc_xprt on success
+	 */
+	ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0,
+			      SVC_SOCK_ANONYMOUS);
+	if (ret < 0) {
+		rqstp = ERR_PTR(ret);
 		goto out;
+	}
 
 	/*
 	 * Save the svc_serv in the transport so that it can
 	 * be referenced when the session backchannel is initialized
 	 */
-	serv->bc_xprt = bc_xprt;
 	xprt->bc_serv = serv;
 
 	INIT_LIST_HEAD(&serv->sv_cb_list);
 	spin_lock_init(&serv->sv_cb_lock);
 	init_waitqueue_head(&serv->sv_cb_waitq);
 	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
-	if (IS_ERR(rqstp))
-		svc_sock_destroy(bc_xprt);
+	if (IS_ERR(rqstp)) {
+		svc_xprt_put(serv->sv_bc_xprt);
+		serv->sv_bc_xprt = NULL;
+	}
 out:
-	dprintk("--> %s return %p\n", __func__, rqstp);
+	dprintk("--> %s return %ld\n", __func__,
+		IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
 	return rqstp;
 }
 
@@ -233,6 +266,10 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
 				    struct nfs_callback_data *cb_info)
 {
 }
+int nfs4_set_callback_sessionid(struct nfs_client *clp)
+{
+	return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 /*
@@ -328,6 +365,9 @@ static int check_gss_callback_principal(struct nfs_client *clp,
 	struct rpc_clnt *r = clp->cl_rpcclient;
 	char *p = svc_gss_principal(rqstp);
 
+	/* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
+	if (clp->cl_minorversion != 0)
+		return SVC_DROP;
 	/*
 	 * It might just be a normal user principal, in which case
 	 * userspace won't bother to tell us the name at all.
@@ -345,6 +385,23 @@ static int check_gss_callback_principal(struct nfs_client *clp,
 	return SVC_OK;
 }
 
+/* pg_authenticate method helper */
+static struct nfs_client *nfs_cb_find_client(struct svc_rqst *rqstp)
+{
+	struct nfs4_sessionid *sessionid = bc_xprt_sid(rqstp);
+	int is_cb_compound = rqstp->rq_proc == CB_COMPOUND ? 1 : 0;
+
+	dprintk("--> %s rq_proc %d\n", __func__, rqstp->rq_proc);
+	if (svc_is_backchannel(rqstp))
+		/* Sessionid (usually) set after CB_NULL ping */
+		return nfs4_find_client_sessionid(svc_addr(rqstp), sessionid,
+						  is_cb_compound);
+	else
+		/* No callback identifier in pg_authenticate */
+		return nfs4_find_client_no_ident(svc_addr(rqstp));
+}
+
+/* pg_authenticate method for nfsv4 callback threads. */
 static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 {
 	struct nfs_client *clp;
@@ -352,7 +409,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 	int ret = SVC_OK;
 
 	/* Don't talk to strangers */
-	clp = nfs_find_client(svc_addr(rqstp), 4);
+	clp = nfs_cb_find_client(rqstp);
 	if (clp == NULL)
 		return SVC_DROP;
 
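
nfs_cb_find_client() above switches the lookup strategy when the request arrives on the shared back channel, where only the sessionid identifies the client. A self-contained sketch of the 16-byte sessionid comparison that now gates v4.1 callbacks (NFS4_MAX_SESSIONID_LEN is assumed to match the kernel's definition):

	#include <string.h>

	#define NFS4_MAX_SESSIONID_LEN 16	/* assumed; mirrors the kernel header */

	struct nfs4_sessionid {
		unsigned char data[NFS4_MAX_SESSIONID_LEN];
	};

	/* CB_SEQUENCE is only valid on the connection whose session it names. */
	static int cb_session_matches(const struct nfs4_sessionid *incoming,
				      const struct nfs4_sessionid *svc_sid)
	{
		return memcmp(incoming->data, svc_sid->data,
			      NFS4_MAX_SESSIONID_LEN) == 0;
	}

This is why the v4.0-era nfs_find_client(addr, 4) lookup no longer fits: on a shared TCP connection the source address alone cannot distinguish clients, but the sessionid stored on the transport can.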
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 85a7cfd1b8dd..d3b44f9bd747 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -34,10 +34,17 @@ enum nfs4_callback_opnum {
 	OP_CB_ILLEGAL = 10044,
 };
 
+struct cb_process_state {
+	__be32			drc_status;
+	struct nfs_client	*clp;
+	struct nfs4_sessionid	*svc_sid; /* v4.1 callback service sessionid */
+};
+
 struct cb_compound_hdr_arg {
 	unsigned int taglen;
 	const char *tag;
 	unsigned int minorversion;
+	unsigned int cb_ident; /* v4.0 callback identifier */
 	unsigned nops;
 };
 
@@ -103,14 +110,23 @@ struct cb_sequenceres {
 	uint32_t			csr_target_highestslotid;
 };
 
-extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
-				       struct cb_sequenceres *res);
+extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
+				     struct cb_sequenceres *res,
+				     struct cb_process_state *cps);
 
 extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
 					     const nfs4_stateid *stateid);
 
 #define RCA4_TYPE_MASK_RDATA_DLG	0
 #define RCA4_TYPE_MASK_WDATA_DLG	1
+#define RCA4_TYPE_MASK_DIR_DLG		2
+#define RCA4_TYPE_MASK_FILE_LAYOUT	3
+#define RCA4_TYPE_MASK_BLK_LAYOUT	4
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN	8
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX	9
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN	12
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX	15
+#define RCA4_TYPE_MASK_ALL		0xf31f
 
 struct cb_recallanyargs {
 	struct sockaddr	*craa_addr;
@@ -118,25 +134,52 @@ struct cb_recallanyargs {
 	uint32_t	craa_type_mask;
 };
 
-extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy);
+extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
+				      void *dummy,
+				      struct cb_process_state *cps);
 
 struct cb_recallslotargs {
 	struct sockaddr	*crsa_addr;
 	uint32_t	crsa_target_max_slots;
 };
-extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args,
-					 void *dummy);
+extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
+				       void *dummy,
+				       struct cb_process_state *cps);
+
+struct cb_layoutrecallargs {
+	struct sockaddr		*cbl_addr;
+	uint32_t		cbl_recall_type;
+	uint32_t		cbl_layout_type;
+	uint32_t		cbl_layoutchanged;
+	union {
+		struct {
+			struct nfs_fh		cbl_fh;
+			struct pnfs_layout_range cbl_range;
+			nfs4_stateid		cbl_stateid;
+		};
+		struct nfs_fsid		cbl_fsid;
+	};
+};
 
-#endif /* CONFIG_NFS_V4_1 */
+extern unsigned nfs4_callback_layoutrecall(
+	struct cb_layoutrecallargs *args,
+	void *dummy, struct cb_process_state *cps);
 
-extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
-extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
+extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
+extern void nfs4_cb_take_slot(struct nfs_client *clp);
+#endif /* CONFIG_NFS_V4_1 */
 
+extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
+				    struct cb_getattrres *res,
+				    struct cb_process_state *cps);
+extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
+				   struct cb_process_state *cps);
 #ifdef CONFIG_NFS_V4
 extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
 extern void nfs_callback_down(int minorversion);
 extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
 					    const nfs4_stateid *stateid);
+extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
 #endif /* CONFIG_NFS_V4 */
 /*
  * nfs41: Callbacks are expected to not cause substantial latency,
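
The new struct cb_process_state threads per-compound state from CB_SEQUENCE to the remaining operations: v4.1 ops no longer look up the client themselves but rely on cps->clp, which only CB_SEQUENCE fills in. A toy stand-alone illustration of that contract (hypothetical names, not the kernel API):

	#include <stdio.h>

	struct nfs_client { const char *name; };

	/* Mirrors struct cb_process_state from callback.h (svc_sid omitted). */
	struct cb_process_state {
		int drc_status;
		struct nfs_client *clp;	/* NULL until CB_SEQUENCE succeeds */
	};

	#define NFS4ERR_OP_NOT_IN_SESSION 10071

	/* Any v4.1 op: refuse to run until cb_sequence has set cps->clp. */
	static int cb_recall_any(struct cb_process_state *cps)
	{
		if (!cps->clp)
			return NFS4ERR_OP_NOT_IN_SESSION;
		printf("recall_any for client %s\n", cps->clp->name);
		return 0;
	}

	int main(void)
	{
		struct nfs_client clnt = { "client-a" };
		struct cb_process_state cps = { 0, NULL };

		printf("before sequence: %d\n", cb_recall_any(&cps)); /* 10071 */
		cps.clp = &clnt;	/* what CB_SEQUENCE does on success */
		printf("after sequence: %d\n", cb_recall_any(&cps));  /* 0 */
		return 0;
	}

This also explains the removed nfs_put_client() calls throughout callback_proc.c: the client reference taken in cb_sequence is now dropped once, when the whole compound finishes.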
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2950fca0c61b..4bb91cb2620d 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -12,30 +12,33 @@
 #include "callback.h"
 #include "delegation.h"
 #include "internal.h"
+#include "pnfs.h"
 
 #ifdef NFS_DEBUG
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
 #endif
 
-__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
+__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
+			     struct cb_getattrres *res,
+			     struct cb_process_state *cps)
 {
-	struct nfs_client *clp;
 	struct nfs_delegation *delegation;
 	struct nfs_inode *nfsi;
 	struct inode *inode;
 
+	res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+	if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
+		goto out;
+
 	res->bitmap[0] = res->bitmap[1] = 0;
 	res->status = htonl(NFS4ERR_BADHANDLE);
-	clp = nfs_find_client(args->addr, 4);
-	if (clp == NULL)
-		goto out;
 
 	dprintk("NFS: GETATTR callback request from %s\n",
-		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
 
-	inode = nfs_delegation_find_inode(clp, &args->fh);
+	inode = nfs_delegation_find_inode(cps->clp, &args->fh);
 	if (inode == NULL)
-		goto out_putclient;
+		goto out;
 	nfsi = NFS_I(inode);
 	rcu_read_lock();
 	delegation = rcu_dereference(nfsi->delegation);
@@ -55,49 +58,41 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
 out_iput:
 	rcu_read_unlock();
 	iput(inode);
-out_putclient:
-	nfs_put_client(clp);
 out:
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
 	return res->status;
 }
 
-__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
+__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
+			    struct cb_process_state *cps)
 {
-	struct nfs_client *clp;
 	struct inode *inode;
 	__be32 res;
 
-	res = htonl(NFS4ERR_BADHANDLE);
-	clp = nfs_find_client(args->addr, 4);
-	if (clp == NULL)
+	res = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+	if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
		goto out;
 
 	dprintk("NFS: RECALL callback request from %s\n",
-		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
-
-	do {
-		struct nfs_client *prev = clp;
-
-		inode = nfs_delegation_find_inode(clp, &args->fh);
-		if (inode != NULL) {
-			/* Set up a helper thread to actually return the delegation */
-			switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
-			case 0:
-				res = 0;
-				break;
-			case -ENOENT:
-				if (res != 0)
-					res = htonl(NFS4ERR_BAD_STATEID);
-				break;
-			default:
-				res = htonl(NFS4ERR_RESOURCE);
-			}
-			iput(inode);
-		}
-		clp = nfs_find_client_next(prev);
-		nfs_put_client(prev);
-	} while (clp != NULL);
+		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+	res = htonl(NFS4ERR_BADHANDLE);
+	inode = nfs_delegation_find_inode(cps->clp, &args->fh);
+	if (inode == NULL)
+		goto out;
+	/* Set up a helper thread to actually return the delegation */
+	switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
+	case 0:
+		res = 0;
+		break;
+	case -ENOENT:
+		if (res != 0)
+			res = htonl(NFS4ERR_BAD_STATEID);
+		break;
+	default:
+		res = htonl(NFS4ERR_RESOURCE);
+	}
+	iput(inode);
 out:
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
 	return res;
@@ -113,6 +108,139 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
 
 #if defined(CONFIG_NFS_V4_1)
 
+static u32 initiate_file_draining(struct nfs_client *clp,
+				  struct cb_layoutrecallargs *args)
+{
+	struct pnfs_layout_hdr *lo;
+	struct inode *ino;
+	bool found = false;
+	u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+	LIST_HEAD(free_me_list);
+
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
+		if (nfs_compare_fh(&args->cbl_fh,
+				   &NFS_I(lo->plh_inode)->fh))
+			continue;
+		ino = igrab(lo->plh_inode);
+		if (!ino)
+			continue;
+		found = true;
+		/* Without this, layout can be freed as soon
+		 * as we release cl_lock.
+		 */
+		get_layout_hdr(lo);
+		break;
+	}
+	spin_unlock(&clp->cl_lock);
+	if (!found)
+		return NFS4ERR_NOMATCHING_LAYOUT;
+
+	spin_lock(&ino->i_lock);
+	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+	    mark_matching_lsegs_invalid(lo, &free_me_list,
+					args->cbl_range.iomode))
+		rv = NFS4ERR_DELAY;
+	else
+		rv = NFS4ERR_NOMATCHING_LAYOUT;
+	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+	spin_unlock(&ino->i_lock);
+	pnfs_free_lseg_list(&free_me_list);
+	put_layout_hdr(lo);
+	iput(ino);
+	return rv;
+}
+
+static u32 initiate_bulk_draining(struct nfs_client *clp,
+				  struct cb_layoutrecallargs *args)
+{
+	struct pnfs_layout_hdr *lo;
+	struct inode *ino;
+	u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+	struct pnfs_layout_hdr *tmp;
+	LIST_HEAD(recall_list);
+	LIST_HEAD(free_me_list);
+	struct pnfs_layout_range range = {
+		.iomode = IOMODE_ANY,
+		.offset = 0,
+		.length = NFS4_MAX_UINT64,
+	};
+
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
+		if ((args->cbl_recall_type == RETURN_FSID) &&
+		    memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
+			   &args->cbl_fsid, sizeof(struct nfs_fsid)))
+			continue;
+		if (!igrab(lo->plh_inode))
+			continue;
+		get_layout_hdr(lo);
+		BUG_ON(!list_empty(&lo->plh_bulk_recall));
+		list_add(&lo->plh_bulk_recall, &recall_list);
+	}
+	spin_unlock(&clp->cl_lock);
+	list_for_each_entry_safe(lo, tmp,
+				 &recall_list, plh_bulk_recall) {
+		ino = lo->plh_inode;
+		spin_lock(&ino->i_lock);
+		set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+		if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
+			rv = NFS4ERR_DELAY;
+		list_del_init(&lo->plh_bulk_recall);
+		spin_unlock(&ino->i_lock);
+		put_layout_hdr(lo);
+		iput(ino);
+	}
+	pnfs_free_lseg_list(&free_me_list);
+	return rv;
+}
+
+static u32 do_callback_layoutrecall(struct nfs_client *clp,
+				    struct cb_layoutrecallargs *args)
+{
+	u32 res = NFS4ERR_DELAY;
+
+	dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
+	if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
+		goto out;
+	if (args->cbl_recall_type == RETURN_FILE)
+		res = initiate_file_draining(clp, args);
+	else
+		res = initiate_bulk_draining(clp, args);
+	clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
+out:
+	dprintk("%s returning %i\n", __func__, res);
+	return res;
+
+}
+
+__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
+				  void *dummy, struct cb_process_state *cps)
+{
+	u32 res;
+
+	dprintk("%s: -->\n", __func__);
+
+	if (cps->clp)
+		res = do_callback_layoutrecall(cps->clp, args);
+	else
+		res = NFS4ERR_OP_NOT_IN_SESSION;
+
+	dprintk("%s: exit with status = %d\n", __func__, res);
+	return cpu_to_be32(res);
+}
+
+static void pnfs_recall_all_layouts(struct nfs_client *clp)
+{
+	struct cb_layoutrecallargs args;
+
+	/* Pretend we got a CB_LAYOUTRECALL(ALL) */
+	memset(&args, 0, sizeof(args));
+	args.cbl_recall_type = RETURN_ALL;
+	/* FIXME we ignore errors, what should we do? */
+	do_callback_layoutrecall(clp, &args);
+}
+
 int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
 {
 	if (delegation == NULL)
@@ -185,42 +313,6 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
 }
 
 /*
- * Returns a pointer to a held 'struct nfs_client' that matches the server's
- * address, major version number, and session ID.  It is the caller's
- * responsibility to release the returned reference.
- *
- * Returns NULL if there are no connections with sessions, or if no session
- * matches the one of interest.
- */
- static struct nfs_client *find_client_with_session(
-	const struct sockaddr *addr, u32 nfsversion,
-	struct nfs4_sessionid *sessionid)
-{
-	struct nfs_client *clp;
-
-	clp = nfs_find_client(addr, 4);
-	if (clp == NULL)
-		return NULL;
-
-	do {
-		struct nfs_client *prev = clp;
-
-		if (clp->cl_session != NULL) {
-			if (memcmp(clp->cl_session->sess_id.data,
-					sessionid->data,
-					NFS4_MAX_SESSIONID_LEN) == 0) {
-				/* Returns a held reference to clp */
-				return clp;
-			}
-		}
-		clp = nfs_find_client_next(prev);
-		nfs_put_client(prev);
-	} while (clp != NULL);
-
-	return NULL;
-}
-
-/*
  * For each referring call triple, check the session's slot table for
  * a match.  If the slot is in use and the sequence numbers match, the
  * client is still waiting for a response to the original request.
@@ -276,20 +368,34 @@ out:
 }
 
 __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
-			      struct cb_sequenceres *res)
+			      struct cb_sequenceres *res,
+			      struct cb_process_state *cps)
 {
 	struct nfs_client *clp;
 	int i;
 	__be32 status;
 
+	cps->clp = NULL;
+
 	status = htonl(NFS4ERR_BADSESSION);
-	clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid);
+	/* Incoming session must match the callback session */
+	if (memcmp(&args->csa_sessionid, cps->svc_sid, NFS4_MAX_SESSIONID_LEN))
+		goto out;
+
+	clp = nfs4_find_client_sessionid(args->csa_addr,
+					 &args->csa_sessionid, 1);
 	if (clp == NULL)
 		goto out;
 
+	/* state manager is resetting the session */
+	if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
+		status = NFS4ERR_DELAY;
+		goto out;
+	}
+
 	status = validate_seqid(&clp->cl_session->bc_slot_table, args);
 	if (status)
-		goto out_putclient;
+		goto out;
 
 	/*
 	 * Check for pending referring calls.  If a match is found, a
@@ -298,7 +404,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 	 */
 	if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
 		status = htonl(NFS4ERR_DELAY);
-		goto out_putclient;
+		goto out;
 	}
 
 	memcpy(&res->csr_sessionid, &args->csa_sessionid,
@@ -307,83 +413,93 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 	res->csr_slotid = args->csa_slotid;
 	res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
 	res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+	nfs4_cb_take_slot(clp);
+	cps->clp = clp; /* put in nfs4_callback_compound */
 
-out_putclient:
-	nfs_put_client(clp);
 out:
 	for (i = 0; i < args->csa_nrclists; i++)
 		kfree(args->csa_rclists[i].rcl_refcalls);
 	kfree(args->csa_rclists);
 
-	if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP))
-		res->csr_status = 0;
-	else
+	if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
+		cps->drc_status = status;
+		status = 0;
+	} else
 		res->csr_status = status;
+
 	dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
 		ntohl(status), ntohl(res->csr_status));
 	return status;
 }
 
-__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
+static bool
+validate_bitmap_values(unsigned long mask)
+{
+	return (mask & ~RCA4_TYPE_MASK_ALL) == 0;
+}
+
+__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
+			       struct cb_process_state *cps)
 {
-	struct nfs_client *clp;
 	__be32 status;
 	fmode_t flags = 0;
 
-	status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
-	clp = nfs_find_client(args->craa_addr, 4);
-	if (clp == NULL)
+	status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+	if (!cps->clp) /* set in cb_sequence */
 		goto out;
 
 	dprintk("NFS: RECALL_ANY callback request from %s\n",
-		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+	status = cpu_to_be32(NFS4ERR_INVAL);
+	if (!validate_bitmap_values(args->craa_type_mask))
+		goto out;
 
+	status = cpu_to_be32(NFS4_OK);
 	if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
 		     &args->craa_type_mask))
 		flags = FMODE_READ;
 	if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
 		     &args->craa_type_mask))
 		flags |= FMODE_WRITE;
-
+	if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
+		     &args->craa_type_mask))
+		pnfs_recall_all_layouts(cps->clp);
 	if (flags)
-		nfs_expire_all_delegation_types(clp, flags);
-	status = htonl(NFS4_OK);
+		nfs_expire_all_delegation_types(cps->clp, flags);
 out:
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
 	return status;
 }
 
 /* Reduce the fore channel's max_slots to the target value */
-__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy)
+__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
+				struct cb_process_state *cps)
 {
-	struct nfs_client *clp;
 	struct nfs4_slot_table *fc_tbl;
 	__be32 status;
 
 	status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
-	clp = nfs_find_client(args->crsa_addr, 4);
-	if (clp == NULL)
+	if (!cps->clp) /* set in cb_sequence */
 		goto out;
 
 	dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
-		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
+		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
 		args->crsa_target_max_slots);
 
-	fc_tbl = &clp->cl_session->fc_slot_table;
+	fc_tbl = &cps->clp->cl_session->fc_slot_table;
 
 	status = htonl(NFS4ERR_BAD_HIGH_SLOT);
 	if (args->crsa_target_max_slots > fc_tbl->max_slots ||
 	    args->crsa_target_max_slots < 1)
-		goto out_putclient;
+		goto out;
 
 	status = htonl(NFS4_OK);
 	if (args->crsa_target_max_slots == fc_tbl->max_slots)
-		goto out_putclient;
+		goto out;
 
 	fc_tbl->target_max_slots = args->crsa_target_max_slots;
-	nfs41_handle_recall_slot(clp);
-out_putclient:
-	nfs_put_client(clp); /* balance nfs_find_client */
+	nfs41_handle_recall_slot(cps->clp);
 out:
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
 	return status;
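
validate_bitmap_values() above rejects any CB_RECALL_ANY bit outside the set defined in callback.h. The RCA4_TYPE_MASK_ALL constant 0xf31f is just those bits OR-ed together: bits 0 through 4, bits 8 and 9, and bits 12 through 15. A quick self-checking sketch of the arithmetic:

	#include <assert.h>

	#define RCA4_TYPE_MASK_RDATA_DLG	0
	#define RCA4_TYPE_MASK_BLK_LAYOUT	4
	#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN	8
	#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX	9
	#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN	12
	#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX	15

	int main(void)
	{
		unsigned long mask = 0;
		int bit;

		for (bit = RCA4_TYPE_MASK_RDATA_DLG; bit <= RCA4_TYPE_MASK_BLK_LAYOUT; bit++)
			mask |= 1UL << bit;	/* 0x0000001f */
		for (bit = RCA4_TYPE_MASK_OBJ_LAYOUT_MIN; bit <= RCA4_TYPE_MASK_OBJ_LAYOUT_MAX; bit++)
			mask |= 1UL << bit;	/* adds 0x300 */
		for (bit = RCA4_TYPE_MASK_OTHER_LAYOUT_MIN; bit <= RCA4_TYPE_MASK_OTHER_LAYOUT_MAX; bit++)
			mask |= 1UL << bit;	/* adds 0xf000 */
		assert(mask == 0xf31f);		/* RCA4_TYPE_MASK_ALL */
		return 0;
	}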
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 05af212f0edf..23112c263f81 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -10,8 +10,10 @@
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
 #include <linux/slab.h>
+#include <linux/sunrpc/bc_xprt.h>
 #include "nfs4_fs.h"
 #include "callback.h"
+#include "internal.h"
 
 #define CB_OP_TAGLEN_MAXSZ	(512)
 #define CB_OP_HDR_RES_MAXSZ	(2 + CB_OP_TAGLEN_MAXSZ)
@@ -22,6 +24,7 @@
 #define CB_OP_RECALL_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
 
 #if defined(CONFIG_NFS_V4_1)
+#define CB_OP_LAYOUTRECALL_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
 #define CB_OP_SEQUENCE_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ + \
 					 4 + 1 + 3)
 #define CB_OP_RECALLANY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
@@ -33,7 +36,8 @@
 /* Internal error code */
 #define NFS4ERR_RESOURCE_HDR	11050
 
-typedef __be32 (*callback_process_op_t)(void *, void *);
+typedef __be32 (*callback_process_op_t)(void *, void *,
+					struct cb_process_state *);
 typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
 typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
 
@@ -160,7 +164,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
 	hdr->minorversion = ntohl(*p++);
 	/* Check minor version is zero or one. */
 	if (hdr->minorversion <= 1) {
-		p++;	/* skip callback_ident */
+		hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
 	} else {
 		printk(KERN_WARNING "%s: NFSv4 server callback with "
 			"illegal minor version %u!\n",
@@ -220,6 +224,66 @@ out:
220 224
221#if defined(CONFIG_NFS_V4_1) 225#if defined(CONFIG_NFS_V4_1)
222 226
227static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
228 struct xdr_stream *xdr,
229 struct cb_layoutrecallargs *args)
230{
231 __be32 *p;
232 __be32 status = 0;
233 uint32_t iomode;
234
235 args->cbl_addr = svc_addr(rqstp);
236 p = read_buf(xdr, 4 * sizeof(uint32_t));
237 if (unlikely(p == NULL)) {
238 status = htonl(NFS4ERR_BADXDR);
239 goto out;
240 }
241
242 args->cbl_layout_type = ntohl(*p++);
243 /* Depite the spec's xdr, iomode really belongs in the FILE switch,
244 * as it is unuseable and ignored with the other types.
245 */
246 iomode = ntohl(*p++);
247 args->cbl_layoutchanged = ntohl(*p++);
248 args->cbl_recall_type = ntohl(*p++);
249
250 if (args->cbl_recall_type == RETURN_FILE) {
251 args->cbl_range.iomode = iomode;
252 status = decode_fh(xdr, &args->cbl_fh);
253 if (unlikely(status != 0))
254 goto out;
255
256 p = read_buf(xdr, 2 * sizeof(uint64_t));
257 if (unlikely(p == NULL)) {
258 status = htonl(NFS4ERR_BADXDR);
259 goto out;
260 }
261 p = xdr_decode_hyper(p, &args->cbl_range.offset);
262 p = xdr_decode_hyper(p, &args->cbl_range.length);
263 status = decode_stateid(xdr, &args->cbl_stateid);
264 if (unlikely(status != 0))
265 goto out;
266 } else if (args->cbl_recall_type == RETURN_FSID) {
267 p = read_buf(xdr, 2 * sizeof(uint64_t));
268 if (unlikely(p == NULL)) {
269 status = htonl(NFS4ERR_BADXDR);
270 goto out;
271 }
272 p = xdr_decode_hyper(p, &args->cbl_fsid.major);
273 p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
274 } else if (args->cbl_recall_type != RETURN_ALL) {
275 status = htonl(NFS4ERR_BADXDR);
276 goto out;
277 }
278 dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n",
279 __func__,
280 args->cbl_layout_type, iomode,
281 args->cbl_layoutchanged, args->cbl_recall_type);
282out:
283 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
284 return status;
285}
286
223static __be32 decode_sessionid(struct xdr_stream *xdr, 287static __be32 decode_sessionid(struct xdr_stream *xdr,
224 struct nfs4_sessionid *sid) 288 struct nfs4_sessionid *sid)
225{ 289{
@@ -574,10 +638,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
574 case OP_CB_SEQUENCE: 638 case OP_CB_SEQUENCE:
575 case OP_CB_RECALL_ANY: 639 case OP_CB_RECALL_ANY:
576 case OP_CB_RECALL_SLOT: 640 case OP_CB_RECALL_SLOT:
641 case OP_CB_LAYOUTRECALL:
577 *op = &callback_ops[op_nr]; 642 *op = &callback_ops[op_nr];
578 break; 643 break;
579 644
580 case OP_CB_LAYOUTRECALL:
581 case OP_CB_NOTIFY_DEVICEID: 645 case OP_CB_NOTIFY_DEVICEID:
582 case OP_CB_NOTIFY: 646 case OP_CB_NOTIFY:
583 case OP_CB_PUSH_DELEG: 647 case OP_CB_PUSH_DELEG:
@@ -593,6 +657,37 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
593 return htonl(NFS_OK); 657 return htonl(NFS_OK);
594} 658}
595 659
660static void nfs4_callback_free_slot(struct nfs4_session *session)
661{
662 struct nfs4_slot_table *tbl = &session->bc_slot_table;
663
664 spin_lock(&tbl->slot_tbl_lock);
665 /*
 666 * Let the state manager know that callback processing is done.
667 * A single slot, so highest used slotid is either 0 or -1
668 */
669 tbl->highest_used_slotid--;
670 nfs4_check_drain_bc_complete(session);
671 spin_unlock(&tbl->slot_tbl_lock);
672}
673
674static void nfs4_cb_free_slot(struct nfs_client *clp)
675{
676 if (clp && clp->cl_session)
677 nfs4_callback_free_slot(clp->cl_session);
678}
679
680/* A single slot, so highest used slotid is either 0 or -1 */
681void nfs4_cb_take_slot(struct nfs_client *clp)
682{
683 struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table;
684
685 spin_lock(&tbl->slot_tbl_lock);
686 tbl->highest_used_slotid++;
687 BUG_ON(tbl->highest_used_slotid != 0);
688 spin_unlock(&tbl->slot_tbl_lock);
689}
690
596#else /* CONFIG_NFS_V4_1 */ 691#else /* CONFIG_NFS_V4_1 */
597 692
598static __be32 693static __be32
@@ -601,6 +696,9 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
601 return htonl(NFS4ERR_MINOR_VERS_MISMATCH); 696 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
602} 697}
603 698
699static void nfs4_cb_free_slot(struct nfs_client *clp)
700{
701}
604#endif /* CONFIG_NFS_V4_1 */ 702#endif /* CONFIG_NFS_V4_1 */
605 703
606static __be32 704static __be32
@@ -621,7 +719,8 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
621static __be32 process_op(uint32_t minorversion, int nop, 719static __be32 process_op(uint32_t minorversion, int nop,
622 struct svc_rqst *rqstp, 720 struct svc_rqst *rqstp,
623 struct xdr_stream *xdr_in, void *argp, 721 struct xdr_stream *xdr_in, void *argp,
624 struct xdr_stream *xdr_out, void *resp, int* drc_status) 722 struct xdr_stream *xdr_out, void *resp,
723 struct cb_process_state *cps)
625{ 724{
626 struct callback_op *op = &callback_ops[0]; 725 struct callback_op *op = &callback_ops[0];
627 unsigned int op_nr; 726 unsigned int op_nr;
@@ -644,8 +743,8 @@ static __be32 process_op(uint32_t minorversion, int nop,
644 if (status) 743 if (status)
645 goto encode_hdr; 744 goto encode_hdr;
646 745
647 if (*drc_status) { 746 if (cps->drc_status) {
648 status = *drc_status; 747 status = cps->drc_status;
649 goto encode_hdr; 748 goto encode_hdr;
650 } 749 }
651 750
@@ -653,16 +752,10 @@ static __be32 process_op(uint32_t minorversion, int nop,
653 if (maxlen > 0 && maxlen < PAGE_SIZE) { 752 if (maxlen > 0 && maxlen < PAGE_SIZE) {
654 status = op->decode_args(rqstp, xdr_in, argp); 753 status = op->decode_args(rqstp, xdr_in, argp);
655 if (likely(status == 0)) 754 if (likely(status == 0))
656 status = op->process_op(argp, resp); 755 status = op->process_op(argp, resp, cps);
657 } else 756 } else
658 status = htonl(NFS4ERR_RESOURCE); 757 status = htonl(NFS4ERR_RESOURCE);
659 758
660 /* Only set by OP_CB_SEQUENCE processing */
661 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
662 *drc_status = status;
663 status = 0;
664 }
665
666encode_hdr: 759encode_hdr:
667 res = encode_op_hdr(xdr_out, op_nr, status); 760 res = encode_op_hdr(xdr_out, op_nr, status);
668 if (unlikely(res)) 761 if (unlikely(res))
@@ -681,8 +774,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
681 struct cb_compound_hdr_arg hdr_arg = { 0 }; 774 struct cb_compound_hdr_arg hdr_arg = { 0 };
682 struct cb_compound_hdr_res hdr_res = { NULL }; 775 struct cb_compound_hdr_res hdr_res = { NULL };
683 struct xdr_stream xdr_in, xdr_out; 776 struct xdr_stream xdr_in, xdr_out;
684 __be32 *p; 777 __be32 *p, status;
685 __be32 status, drc_status = 0; 778 struct cb_process_state cps = {
779 .drc_status = 0,
780 .clp = NULL,
781 };
686 unsigned int nops = 0; 782 unsigned int nops = 0;
687 783
688 dprintk("%s: start\n", __func__); 784 dprintk("%s: start\n", __func__);
@@ -696,6 +792,13 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
696 if (status == __constant_htonl(NFS4ERR_RESOURCE)) 792 if (status == __constant_htonl(NFS4ERR_RESOURCE))
697 return rpc_garbage_args; 793 return rpc_garbage_args;
698 794
795 if (hdr_arg.minorversion == 0) {
796 cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident);
797 if (!cps.clp)
798 return rpc_drop_reply;
799 } else
800 cps.svc_sid = bc_xprt_sid(rqstp);
801
699 hdr_res.taglen = hdr_arg.taglen; 802 hdr_res.taglen = hdr_arg.taglen;
700 hdr_res.tag = hdr_arg.tag; 803 hdr_res.tag = hdr_arg.tag;
701 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) 804 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
@@ -703,7 +806,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
703 806
704 while (status == 0 && nops != hdr_arg.nops) { 807 while (status == 0 && nops != hdr_arg.nops) {
705 status = process_op(hdr_arg.minorversion, nops, rqstp, 808 status = process_op(hdr_arg.minorversion, nops, rqstp,
706 &xdr_in, argp, &xdr_out, resp, &drc_status); 809 &xdr_in, argp, &xdr_out, resp, &cps);
707 nops++; 810 nops++;
708 } 811 }
709 812
@@ -716,6 +819,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
716 819
717 *hdr_res.status = status; 820 *hdr_res.status = status;
718 *hdr_res.nops = htonl(nops); 821 *hdr_res.nops = htonl(nops);
822 nfs4_cb_free_slot(cps.clp);
823 nfs_put_client(cps.clp);
719 dprintk("%s: done, status = %u\n", __func__, ntohl(status)); 824 dprintk("%s: done, status = %u\n", __func__, ntohl(status));
720 return rpc_success; 825 return rpc_success;
721} 826}
@@ -739,6 +844,12 @@ static struct callback_op callback_ops[] = {
739 .res_maxsize = CB_OP_RECALL_RES_MAXSZ, 844 .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
740 }, 845 },
741#if defined(CONFIG_NFS_V4_1) 846#if defined(CONFIG_NFS_V4_1)
847 [OP_CB_LAYOUTRECALL] = {
848 .process_op = (callback_process_op_t)nfs4_callback_layoutrecall,
849 .decode_args =
850 (callback_decode_arg_t)decode_layoutrecall_args,
851 .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
852 },
742 [OP_CB_SEQUENCE] = { 853 [OP_CB_SEQUENCE] = {
743 .process_op = (callback_process_op_t)nfs4_callback_sequence, 854 .process_op = (callback_process_op_t)nfs4_callback_sequence,
744 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, 855 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
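
The callback_xdr.c changes above thread one struct cb_process_state through the whole compound: the v4.0 client is looked up once from the decoded cb_ident, the v4.1 session ID is stashed for CB_SEQUENCE to resolve, drc_status replaces the old drc_status pointer argument, and the back-channel slot plus client reference are released once at the end. A schematic userspace model of that threading (types and names are illustrative, not the kernel's):

#include <stdio.h>

struct cb_process_state {
	int drc_status;	/* sticky status from a CB_SEQUENCE replay */
	void *clp;	/* client reference, established once per compound */
};

typedef int (*process_op_t)(void *argp, void *resp,
			    struct cb_process_state *cps);

static int demo_op(void *argp, void *resp, struct cb_process_state *cps)
{
	(void)argp;
	(void)resp;
	return cps->clp != NULL ? 0 : -1; /* every op sees the same client */
}

int main(void)
{
	struct cb_process_state cps = { .drc_status = 0, .clp = (void *)1 };
	process_op_t ops[] = { demo_op, demo_op };
	unsigned int i;

	for (i = 0; i < sizeof(ops) / sizeof(ops[0]); i++) {
		if (cps.drc_status) /* a cached reply short-circuits later ops */
			break;
		printf("op %u -> %d\n", i, ops[i](NULL, NULL, &cps));
	}
	/* the kernel frees the back-channel slot and puts the client here */
	return 0;
}
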
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 0870d0d4efc0..192f2f860265 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -56,6 +56,30 @@ static DEFINE_SPINLOCK(nfs_client_lock);
56static LIST_HEAD(nfs_client_list); 56static LIST_HEAD(nfs_client_list);
57static LIST_HEAD(nfs_volume_list); 57static LIST_HEAD(nfs_volume_list);
58static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); 58static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
59#ifdef CONFIG_NFS_V4
60static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */
61
62/*
63 * Get a unique NFSv4.0 callback identifier which will be used
64 * by the V4.0 callback service to lookup the nfs_client struct
65 */
66static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
67{
68 int ret = 0;
69
70 if (clp->rpc_ops->version != 4 || minorversion != 0)
71 return ret;
72retry:
73 if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL))
74 return -ENOMEM;
75 spin_lock(&nfs_client_lock);
76 ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident);
77 spin_unlock(&nfs_client_lock);
78 if (ret == -EAGAIN)
79 goto retry;
80 return ret;
81}
82#endif /* CONFIG_NFS_V4 */
59 83
60/* 84/*
61 * RPC cruft for NFS 85 * RPC cruft for NFS
@@ -144,7 +168,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
144 clp->cl_proto = cl_init->proto; 168 clp->cl_proto = cl_init->proto;
145 169
146#ifdef CONFIG_NFS_V4 170#ifdef CONFIG_NFS_V4
147 INIT_LIST_HEAD(&clp->cl_delegations); 171 err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
172 if (err)
173 goto error_cleanup;
174
148 spin_lock_init(&clp->cl_lock); 175 spin_lock_init(&clp->cl_lock);
149 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 176 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
150 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); 177 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -170,21 +197,17 @@ error_0:
170} 197}
171 198
172#ifdef CONFIG_NFS_V4 199#ifdef CONFIG_NFS_V4
173/*
174 * Clears/puts all minor version specific parts from an nfs_client struct
175 * reverting it to minorversion 0.
176 */
177static void nfs4_clear_client_minor_version(struct nfs_client *clp)
178{
179#ifdef CONFIG_NFS_V4_1 200#ifdef CONFIG_NFS_V4_1
180 if (nfs4_has_session(clp)) { 201static void nfs4_shutdown_session(struct nfs_client *clp)
202{
203 if (nfs4_has_session(clp))
181 nfs4_destroy_session(clp->cl_session); 204 nfs4_destroy_session(clp->cl_session);
182 clp->cl_session = NULL;
183 }
184
185 clp->cl_mvops = nfs_v4_minor_ops[0];
186#endif /* CONFIG_NFS_V4_1 */
187} 205}
206#else /* CONFIG_NFS_V4_1 */
207static void nfs4_shutdown_session(struct nfs_client *clp)
208{
209}
210#endif /* CONFIG_NFS_V4_1 */
188 211
189/* 212/*
190 * Destroy the NFS4 callback service 213 * Destroy the NFS4 callback service
@@ -199,17 +222,49 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
199{ 222{
200 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) 223 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
201 nfs4_kill_renewd(clp); 224 nfs4_kill_renewd(clp);
202 nfs4_clear_client_minor_version(clp); 225 nfs4_shutdown_session(clp);
203 nfs4_destroy_callback(clp); 226 nfs4_destroy_callback(clp);
204 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) 227 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
205 nfs_idmap_delete(clp); 228 nfs_idmap_delete(clp);
206 229
207 rpc_destroy_wait_queue(&clp->cl_rpcwaitq); 230 rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
208} 231}
232
233/* idr_remove_all is not needed as all ids are removed by nfs_put_client */
234void nfs_cleanup_cb_ident_idr(void)
235{
236 idr_destroy(&cb_ident_idr);
237}
238
239/* nfs_client_lock held */
240static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
241{
242 if (clp->cl_cb_ident)
243 idr_remove(&cb_ident_idr, clp->cl_cb_ident);
244}
245
246static void pnfs_init_server(struct nfs_server *server)
247{
248 rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
249}
250
209#else 251#else
210static void nfs4_shutdown_client(struct nfs_client *clp) 252static void nfs4_shutdown_client(struct nfs_client *clp)
211{ 253{
212} 254}
255
256void nfs_cleanup_cb_ident_idr(void)
257{
258}
259
260static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
261{
262}
263
264static void pnfs_init_server(struct nfs_server *server)
265{
266}
267
213#endif /* CONFIG_NFS_V4 */ 268#endif /* CONFIG_NFS_V4 */
214 269
215/* 270/*
@@ -248,6 +303,7 @@ void nfs_put_client(struct nfs_client *clp)
248 303
249 if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { 304 if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) {
250 list_del(&clp->cl_share_link); 305 list_del(&clp->cl_share_link);
306 nfs_cb_idr_remove_locked(clp);
251 spin_unlock(&nfs_client_lock); 307 spin_unlock(&nfs_client_lock);
252 308
253 BUG_ON(!list_empty(&clp->cl_superblocks)); 309 BUG_ON(!list_empty(&clp->cl_superblocks));
@@ -363,70 +419,28 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
363 return 0; 419 return 0;
364} 420}
365 421
366/* 422/* Common match routine for v4.0 and v4.1 callback services */
367 * Find a client by IP address and protocol version 423bool
368 * - returns NULL if no such client 424nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp,
369 */ 425 u32 minorversion)
370struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
371{
372 struct nfs_client *clp;
373
374 spin_lock(&nfs_client_lock);
375 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
376 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
377
378 /* Don't match clients that failed to initialise properly */
379 if (!(clp->cl_cons_state == NFS_CS_READY ||
380 clp->cl_cons_state == NFS_CS_SESSION_INITING))
381 continue;
382
383 /* Different NFS versions cannot share the same nfs_client */
384 if (clp->rpc_ops->version != nfsversion)
385 continue;
386
387 /* Match only the IP address, not the port number */
388 if (!nfs_sockaddr_match_ipaddr(addr, clap))
389 continue;
390
391 atomic_inc(&clp->cl_count);
392 spin_unlock(&nfs_client_lock);
393 return clp;
394 }
395 spin_unlock(&nfs_client_lock);
396 return NULL;
397}
398
399/*
400 * Find a client by IP address and protocol version
401 * - returns NULL if no such client
402 */
403struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
404{ 426{
405 struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr; 427 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
406 u32 nfsvers = clp->rpc_ops->version;
407 428
408 spin_lock(&nfs_client_lock); 429 /* Don't match clients that failed to initialise */
409 list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) { 430 if (!(clp->cl_cons_state == NFS_CS_READY ||
410 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; 431 clp->cl_cons_state == NFS_CS_SESSION_INITING))
432 return false;
411 433
412 /* Don't match clients that failed to initialise properly */ 434 /* Match the version and minorversion */
413 if (clp->cl_cons_state != NFS_CS_READY) 435 if (clp->rpc_ops->version != 4 ||
414 continue; 436 clp->cl_minorversion != minorversion)
437 return false;
415 438
416 /* Different NFS versions cannot share the same nfs_client */ 439 /* Match only the IP address, not the port number */
417 if (clp->rpc_ops->version != nfsvers) 440 if (!nfs_sockaddr_match_ipaddr(addr, clap))
418 continue; 441 return false;
419 442
420 /* Match only the IP address, not the port number */ 443 return true;
421 if (!nfs_sockaddr_match_ipaddr(sap, clap))
422 continue;
423
424 atomic_inc(&clp->cl_count);
425 spin_unlock(&nfs_client_lock);
426 return clp;
427 }
428 spin_unlock(&nfs_client_lock);
429 return NULL;
430} 444}
431 445
432/* 446/*
@@ -988,6 +1002,27 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
988 target->options = source->options; 1002 target->options = source->options;
989} 1003}
990 1004
1005static void nfs_server_insert_lists(struct nfs_server *server)
1006{
1007 struct nfs_client *clp = server->nfs_client;
1008
1009 spin_lock(&nfs_client_lock);
1010 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
1011 list_add_tail(&server->master_link, &nfs_volume_list);
1012 spin_unlock(&nfs_client_lock);
1013
1014}
1015
1016static void nfs_server_remove_lists(struct nfs_server *server)
1017{
1018 spin_lock(&nfs_client_lock);
1019 list_del_rcu(&server->client_link);
1020 list_del(&server->master_link);
1021 spin_unlock(&nfs_client_lock);
1022
1023 synchronize_rcu();
1024}
1025
991/* 1026/*
992 * Allocate and initialise a server record 1027 * Allocate and initialise a server record
993 */ 1028 */
@@ -1004,6 +1039,7 @@ static struct nfs_server *nfs_alloc_server(void)
1004 /* Zero out the NFS state stuff */ 1039 /* Zero out the NFS state stuff */
1005 INIT_LIST_HEAD(&server->client_link); 1040 INIT_LIST_HEAD(&server->client_link);
1006 INIT_LIST_HEAD(&server->master_link); 1041 INIT_LIST_HEAD(&server->master_link);
1042 INIT_LIST_HEAD(&server->delegations);
1007 1043
1008 atomic_set(&server->active, 0); 1044 atomic_set(&server->active, 0);
1009 1045
@@ -1019,6 +1055,8 @@ static struct nfs_server *nfs_alloc_server(void)
1019 return NULL; 1055 return NULL;
1020 } 1056 }
1021 1057
1058 pnfs_init_server(server);
1059
1022 return server; 1060 return server;
1023} 1061}
1024 1062
@@ -1029,11 +1067,8 @@ void nfs_free_server(struct nfs_server *server)
1029{ 1067{
1030 dprintk("--> nfs_free_server()\n"); 1068 dprintk("--> nfs_free_server()\n");
1031 1069
1070 nfs_server_remove_lists(server);
1032 unset_pnfs_layoutdriver(server); 1071 unset_pnfs_layoutdriver(server);
1033 spin_lock(&nfs_client_lock);
1034 list_del(&server->client_link);
1035 list_del(&server->master_link);
1036 spin_unlock(&nfs_client_lock);
1037 1072
1038 if (server->destroy != NULL) 1073 if (server->destroy != NULL)
1039 server->destroy(server); 1074 server->destroy(server);
@@ -1108,11 +1143,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1108 (unsigned long long) server->fsid.major, 1143 (unsigned long long) server->fsid.major,
1109 (unsigned long long) server->fsid.minor); 1144 (unsigned long long) server->fsid.minor);
1110 1145
1111 spin_lock(&nfs_client_lock); 1146 nfs_server_insert_lists(server);
1112 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1113 list_add_tail(&server->master_link, &nfs_volume_list);
1114 spin_unlock(&nfs_client_lock);
1115
1116 server->mount_time = jiffies; 1147 server->mount_time = jiffies;
1117 nfs_free_fattr(fattr); 1148 nfs_free_fattr(fattr);
1118 return server; 1149 return server;
@@ -1125,6 +1156,101 @@ error:
1125 1156
1126#ifdef CONFIG_NFS_V4 1157#ifdef CONFIG_NFS_V4
1127/* 1158/*
1159 * NFSv4.0 callback thread helper
1160 *
1161 * Find a client by IP address, protocol version, and minorversion
1162 *
1163 * Called from the pg_authenticate method. The callback identifier
1164 * is not used as it has not been decoded.
1165 *
1166 * Returns NULL if no such client
1167 */
1168struct nfs_client *
1169nfs4_find_client_no_ident(const struct sockaddr *addr)
1170{
1171 struct nfs_client *clp;
1172
1173 spin_lock(&nfs_client_lock);
1174 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
1175 if (nfs4_cb_match_client(addr, clp, 0) == false)
1176 continue;
1177 atomic_inc(&clp->cl_count);
1178 spin_unlock(&nfs_client_lock);
1179 return clp;
1180 }
1181 spin_unlock(&nfs_client_lock);
1182 return NULL;
1183}
1184
1185/*
1186 * NFSv4.0 callback thread helper
1187 *
1188 * Find a client by callback identifier
1189 */
1190struct nfs_client *
1191nfs4_find_client_ident(int cb_ident)
1192{
1193 struct nfs_client *clp;
1194
1195 spin_lock(&nfs_client_lock);
1196 clp = idr_find(&cb_ident_idr, cb_ident);
1197 if (clp)
1198 atomic_inc(&clp->cl_count);
1199 spin_unlock(&nfs_client_lock);
1200 return clp;
1201}
1202
1203#if defined(CONFIG_NFS_V4_1)
1204/*
1205 * NFSv4.1 callback thread helper
1206 * For CB_COMPOUND calls, find a client by IP address, protocol version,
1207 * minorversion, and sessionID
1208 *
1209 * CREATE_SESSION triggers a CB_NULL ping from servers. The callback service
1210 * sessionid can only be set after the CREATE_SESSION return, so a CB_NULL
1211 * can arrive before the callback sessionid is set. For CB_NULL calls,
 1212 * find a client by IP address, protocol version, and minorversion.
1213 *
1214 * Returns NULL if no such client
1215 */
1216struct nfs_client *
1217nfs4_find_client_sessionid(const struct sockaddr *addr,
1218 struct nfs4_sessionid *sid, int is_cb_compound)
1219{
1220 struct nfs_client *clp;
1221
1222 spin_lock(&nfs_client_lock);
1223 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
1224 if (nfs4_cb_match_client(addr, clp, 1) == false)
1225 continue;
1226
1227 if (!nfs4_has_session(clp))
1228 continue;
1229
 1230 /* Match sessionid unless cb_null call */
1231 if (is_cb_compound && (memcmp(clp->cl_session->sess_id.data,
1232 sid->data, NFS4_MAX_SESSIONID_LEN) != 0))
1233 continue;
1234
1235 atomic_inc(&clp->cl_count);
1236 spin_unlock(&nfs_client_lock);
1237 return clp;
1238 }
1239 spin_unlock(&nfs_client_lock);
1240 return NULL;
1241}
1242
1243#else /* CONFIG_NFS_V4_1 */
1244
1245struct nfs_client *
1246nfs4_find_client_sessionid(const struct sockaddr *addr,
1247 struct nfs4_sessionid *sid, int is_cb_compound)
1248{
1249 return NULL;
1250}
1251#endif /* CONFIG_NFS_V4_1 */
1252
1253/*
1128 * Initialize the NFS4 callback service 1254 * Initialize the NFS4 callback service
1129 */ 1255 */
1130static int nfs4_init_callback(struct nfs_client *clp) 1256static int nfs4_init_callback(struct nfs_client *clp)
@@ -1342,11 +1468,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
1342 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) 1468 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
1343 server->namelen = NFS4_MAXNAMLEN; 1469 server->namelen = NFS4_MAXNAMLEN;
1344 1470
1345 spin_lock(&nfs_client_lock); 1471 nfs_server_insert_lists(server);
1346 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1347 list_add_tail(&server->master_link, &nfs_volume_list);
1348 spin_unlock(&nfs_client_lock);
1349
1350 server->mount_time = jiffies; 1472 server->mount_time = jiffies;
1351out: 1473out:
1352 nfs_free_fattr(fattr); 1474 nfs_free_fattr(fattr);
@@ -1551,11 +1673,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1551 if (error < 0) 1673 if (error < 0)
1552 goto out_free_server; 1674 goto out_free_server;
1553 1675
1554 spin_lock(&nfs_client_lock); 1676 nfs_server_insert_lists(server);
1555 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1556 list_add_tail(&server->master_link, &nfs_volume_list);
1557 spin_unlock(&nfs_client_lock);
1558
1559 server->mount_time = jiffies; 1677 server->mount_time = jiffies;
1560 1678
1561 nfs_free_fattr(fattr_fsinfo); 1679 nfs_free_fattr(fattr_fsinfo);
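
The cb_ident allocation above uses the classic two-step idr protocol of this era: idr_pre_get() reserves memory outside the spinlock, idr_get_new() inserts under it, and -EAGAIN (the reservation was consumed by another caller) loops back to preallocate again. A toy userspace model of that retry discipline; the fixed table and helpers are stand-ins, not the kernel idr:

#include <errno.h>
#include <stdio.h>

#define TABLE_SIZE 8

static void *table[TABLE_SIZE];	/* toy stand-in for the idr tree */
static int preallocated;	/* models idr_pre_get()'s reserved node */

static int toy_idr_pre_get(void)
{
	preallocated = 1;	/* pretend the allocation succeeded */
	return 1;
}

static int toy_idr_get_new(void *ptr, int *id)
{
	int i;

	if (!preallocated)
		return -EAGAIN;	/* reservation gone: caller must retry */
	preallocated = 0;	/* consume the reservation */
	for (i = 1; i < TABLE_SIZE; i++) {
		if (table[i] == NULL) {
			table[i] = ptr;
			*id = i;
			return 0;
		}
	}
	return -ENOSPC;
}

static int get_cb_ident(void *clp, int *cb_ident)
{
	int ret;
retry:
	if (!toy_idr_pre_get())	/* preallocate outside the lock */
		return -ENOMEM;
	/* the kernel takes nfs_client_lock around this insert */
	ret = toy_idr_get_new(clp, cb_ident);
	if (ret == -EAGAIN)
		goto retry;
	return ret;
}

int main(void)
{
	int id;

	if (get_cb_ident((void *)0x1, &id) == 0)
		printf("cb_ident = %d\n", id);
	return 0;
}
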
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 1fd62fc49be3..364e4328f392 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -40,11 +40,23 @@ static void nfs_free_delegation(struct nfs_delegation *delegation)
40 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 40 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
41} 41}
42 42
43/**
44 * nfs_mark_delegation_referenced - set delegation's REFERENCED flag
45 * @delegation: delegation to process
46 *
47 */
43void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) 48void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
44{ 49{
45 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); 50 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
46} 51}
47 52
53/**
54 * nfs_have_delegation - check if inode has a delegation
55 * @inode: inode to check
56 * @flags: delegation types to check for
57 *
58 * Returns one if inode has the indicated delegation, otherwise zero.
59 */
48int nfs_have_delegation(struct inode *inode, fmode_t flags) 60int nfs_have_delegation(struct inode *inode, fmode_t flags)
49{ 61{
50 struct nfs_delegation *delegation; 62 struct nfs_delegation *delegation;
@@ -119,10 +131,15 @@ again:
119 return 0; 131 return 0;
120} 132}
121 133
122/* 134/**
123 * Set up a delegation on an inode 135 * nfs_inode_reclaim_delegation - process a delegation reclaim request
136 * @inode: inode to process
137 * @cred: credential to use for request
138 * @res: new delegation state from server
139 *
124 */ 140 */
125void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 141void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
142 struct nfs_openres *res)
126{ 143{
127 struct nfs_delegation *delegation; 144 struct nfs_delegation *delegation;
128 struct rpc_cred *oldcred = NULL; 145 struct rpc_cred *oldcred = NULL;
@@ -175,38 +192,52 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
175 return inode; 192 return inode;
176} 193}
177 194
178static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, 195static struct nfs_delegation *
179 const nfs4_stateid *stateid, 196nfs_detach_delegation_locked(struct nfs_inode *nfsi,
180 struct nfs_client *clp) 197 struct nfs_server *server)
181{ 198{
182 struct nfs_delegation *delegation = 199 struct nfs_delegation *delegation =
183 rcu_dereference_protected(nfsi->delegation, 200 rcu_dereference_protected(nfsi->delegation,
184 lockdep_is_held(&clp->cl_lock)); 201 lockdep_is_held(&server->nfs_client->cl_lock));
185 202
186 if (delegation == NULL) 203 if (delegation == NULL)
187 goto nomatch; 204 goto nomatch;
205
188 spin_lock(&delegation->lock); 206 spin_lock(&delegation->lock);
189 if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
190 sizeof(delegation->stateid.data)) != 0)
191 goto nomatch_unlock;
192 list_del_rcu(&delegation->super_list); 207 list_del_rcu(&delegation->super_list);
193 delegation->inode = NULL; 208 delegation->inode = NULL;
194 nfsi->delegation_state = 0; 209 nfsi->delegation_state = 0;
195 rcu_assign_pointer(nfsi->delegation, NULL); 210 rcu_assign_pointer(nfsi->delegation, NULL);
196 spin_unlock(&delegation->lock); 211 spin_unlock(&delegation->lock);
197 return delegation; 212 return delegation;
198nomatch_unlock:
199 spin_unlock(&delegation->lock);
200nomatch: 213nomatch:
201 return NULL; 214 return NULL;
202} 215}
203 216
204/* 217static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi,
205 * Set up a delegation on an inode 218 struct nfs_server *server)
219{
220 struct nfs_client *clp = server->nfs_client;
221 struct nfs_delegation *delegation;
222
223 spin_lock(&clp->cl_lock);
224 delegation = nfs_detach_delegation_locked(nfsi, server);
225 spin_unlock(&clp->cl_lock);
226 return delegation;
227}
228
229/**
230 * nfs_inode_set_delegation - set up a delegation on an inode
231 * @inode: inode to which delegation applies
232 * @cred: cred to use for subsequent delegation processing
233 * @res: new delegation state from server
234 *
235 * Returns zero on success, or a negative errno value.
206 */ 236 */
207int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 237int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
208{ 238{
209 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 239 struct nfs_server *server = NFS_SERVER(inode);
240 struct nfs_client *clp = server->nfs_client;
210 struct nfs_inode *nfsi = NFS_I(inode); 241 struct nfs_inode *nfsi = NFS_I(inode);
211 struct nfs_delegation *delegation, *old_delegation; 242 struct nfs_delegation *delegation, *old_delegation;
212 struct nfs_delegation *freeme = NULL; 243 struct nfs_delegation *freeme = NULL;
@@ -227,7 +258,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
227 258
228 spin_lock(&clp->cl_lock); 259 spin_lock(&clp->cl_lock);
229 old_delegation = rcu_dereference_protected(nfsi->delegation, 260 old_delegation = rcu_dereference_protected(nfsi->delegation,
230 lockdep_is_held(&clp->cl_lock)); 261 lockdep_is_held(&clp->cl_lock));
231 if (old_delegation != NULL) { 262 if (old_delegation != NULL) {
232 if (memcmp(&delegation->stateid, &old_delegation->stateid, 263 if (memcmp(&delegation->stateid, &old_delegation->stateid,
233 sizeof(old_delegation->stateid)) == 0 && 264 sizeof(old_delegation->stateid)) == 0 &&
@@ -246,9 +277,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
246 delegation = NULL; 277 delegation = NULL;
247 goto out; 278 goto out;
248 } 279 }
249 freeme = nfs_detach_delegation_locked(nfsi, NULL, clp); 280 freeme = nfs_detach_delegation_locked(nfsi, server);
250 } 281 }
251 list_add_rcu(&delegation->super_list, &clp->cl_delegations); 282 list_add_rcu(&delegation->super_list, &server->delegations);
252 nfsi->delegation_state = delegation->type; 283 nfsi->delegation_state = delegation->type;
253 rcu_assign_pointer(nfsi->delegation, delegation); 284 rcu_assign_pointer(nfsi->delegation, delegation);
254 delegation = NULL; 285 delegation = NULL;
@@ -290,73 +321,85 @@ out:
290 return err; 321 return err;
291} 322}
292 323
293/* 324/**
294 * Return all delegations that have been marked for return 325 * nfs_client_return_marked_delegations - return previously marked delegations
326 * @clp: nfs_client to process
327 *
328 * Returns zero on success, or a negative errno value.
295 */ 329 */
296int nfs_client_return_marked_delegations(struct nfs_client *clp) 330int nfs_client_return_marked_delegations(struct nfs_client *clp)
297{ 331{
298 struct nfs_delegation *delegation; 332 struct nfs_delegation *delegation;
333 struct nfs_server *server;
299 struct inode *inode; 334 struct inode *inode;
300 int err = 0; 335 int err = 0;
301 336
302restart: 337restart:
303 rcu_read_lock(); 338 rcu_read_lock();
304 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 339 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
305 if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) 340 list_for_each_entry_rcu(delegation, &server->delegations,
306 continue; 341 super_list) {
307 inode = nfs_delegation_grab_inode(delegation); 342 if (!test_and_clear_bit(NFS_DELEGATION_RETURN,
308 if (inode == NULL) 343 &delegation->flags))
309 continue; 344 continue;
310 spin_lock(&clp->cl_lock); 345 inode = nfs_delegation_grab_inode(delegation);
311 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); 346 if (inode == NULL)
312 spin_unlock(&clp->cl_lock); 347 continue;
313 rcu_read_unlock(); 348 delegation = nfs_detach_delegation(NFS_I(inode),
314 if (delegation != NULL) { 349 server);
315 filemap_flush(inode->i_mapping); 350 rcu_read_unlock();
316 err = __nfs_inode_return_delegation(inode, delegation, 0); 351
352 if (delegation != NULL) {
353 filemap_flush(inode->i_mapping);
354 err = __nfs_inode_return_delegation(inode,
355 delegation, 0);
356 }
357 iput(inode);
358 if (!err)
359 goto restart;
360 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
361 return err;
317 } 362 }
318 iput(inode);
319 if (!err)
320 goto restart;
321 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
322 return err;
323 } 363 }
324 rcu_read_unlock(); 364 rcu_read_unlock();
325 return 0; 365 return 0;
326} 366}
327 367
328/* 368/**
329 * This function returns the delegation without reclaiming opens 369 * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens
330 * or protecting against delegation reclaims. 370 * @inode: inode to process
331 * It is therefore really only safe to be called from 371 *
332 * nfs4_clear_inode() 372 * Does not protect against delegation reclaims, therefore really only safe
373 * to be called from nfs4_clear_inode().
333 */ 374 */
334void nfs_inode_return_delegation_noreclaim(struct inode *inode) 375void nfs_inode_return_delegation_noreclaim(struct inode *inode)
335{ 376{
336 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 377 struct nfs_server *server = NFS_SERVER(inode);
337 struct nfs_inode *nfsi = NFS_I(inode); 378 struct nfs_inode *nfsi = NFS_I(inode);
338 struct nfs_delegation *delegation; 379 struct nfs_delegation *delegation;
339 380
340 if (rcu_access_pointer(nfsi->delegation) != NULL) { 381 if (rcu_access_pointer(nfsi->delegation) != NULL) {
341 spin_lock(&clp->cl_lock); 382 delegation = nfs_detach_delegation(nfsi, server);
342 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
343 spin_unlock(&clp->cl_lock);
344 if (delegation != NULL) 383 if (delegation != NULL)
345 nfs_do_return_delegation(inode, delegation, 0); 384 nfs_do_return_delegation(inode, delegation, 0);
346 } 385 }
347} 386}
348 387
388/**
389 * nfs_inode_return_delegation - synchronously return a delegation
390 * @inode: inode to process
391 *
392 * Returns zero on success, or a negative errno value.
393 */
349int nfs_inode_return_delegation(struct inode *inode) 394int nfs_inode_return_delegation(struct inode *inode)
350{ 395{
351 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 396 struct nfs_server *server = NFS_SERVER(inode);
352 struct nfs_inode *nfsi = NFS_I(inode); 397 struct nfs_inode *nfsi = NFS_I(inode);
353 struct nfs_delegation *delegation; 398 struct nfs_delegation *delegation;
354 int err = 0; 399 int err = 0;
355 400
356 if (rcu_access_pointer(nfsi->delegation) != NULL) { 401 if (rcu_access_pointer(nfsi->delegation) != NULL) {
357 spin_lock(&clp->cl_lock); 402 delegation = nfs_detach_delegation(nfsi, server);
358 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
359 spin_unlock(&clp->cl_lock);
360 if (delegation != NULL) { 403 if (delegation != NULL) {
361 nfs_wb_all(inode); 404 nfs_wb_all(inode);
362 err = __nfs_inode_return_delegation(inode, delegation, 1); 405 err = __nfs_inode_return_delegation(inode, delegation, 1);
@@ -365,46 +408,61 @@ int nfs_inode_return_delegation(struct inode *inode)
365 return err; 408 return err;
366} 409}
367 410
368static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation) 411static void nfs_mark_return_delegation(struct nfs_delegation *delegation)
369{ 412{
413 struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client;
414
370 set_bit(NFS_DELEGATION_RETURN, &delegation->flags); 415 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
371 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); 416 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
372} 417}
373 418
374/* 419/**
375 * Return all delegations associated to a super block 420 * nfs_super_return_all_delegations - return delegations for one superblock
421 * @sb: sb to process
422 *
376 */ 423 */
377void nfs_super_return_all_delegations(struct super_block *sb) 424void nfs_super_return_all_delegations(struct super_block *sb)
378{ 425{
379 struct nfs_client *clp = NFS_SB(sb)->nfs_client; 426 struct nfs_server *server = NFS_SB(sb);
427 struct nfs_client *clp = server->nfs_client;
380 struct nfs_delegation *delegation; 428 struct nfs_delegation *delegation;
381 429
382 if (clp == NULL) 430 if (clp == NULL)
383 return; 431 return;
432
384 rcu_read_lock(); 433 rcu_read_lock();
385 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 434 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
386 spin_lock(&delegation->lock); 435 spin_lock(&delegation->lock);
387 if (delegation->inode != NULL && delegation->inode->i_sb == sb) 436 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
388 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
389 spin_unlock(&delegation->lock); 437 spin_unlock(&delegation->lock);
390 } 438 }
391 rcu_read_unlock(); 439 rcu_read_unlock();
440
392 if (nfs_client_return_marked_delegations(clp) != 0) 441 if (nfs_client_return_marked_delegations(clp) != 0)
393 nfs4_schedule_state_manager(clp); 442 nfs4_schedule_state_manager(clp);
394} 443}
395 444
396static 445static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
397void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags) 446 fmode_t flags)
398{ 447{
399 struct nfs_delegation *delegation; 448 struct nfs_delegation *delegation;
400 449
401 rcu_read_lock(); 450 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
402 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
403 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) 451 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
404 continue; 452 continue;
405 if (delegation->type & flags) 453 if (delegation->type & flags)
406 nfs_mark_return_delegation(clp, delegation); 454 nfs_mark_return_delegation(delegation);
407 } 455 }
456}
457
458static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp,
459 fmode_t flags)
460{
461 struct nfs_server *server;
462
463 rcu_read_lock();
464 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
465 nfs_mark_return_all_delegation_types(server, flags);
408 rcu_read_unlock(); 466 rcu_read_unlock();
409} 467}
410 468
@@ -419,19 +477,32 @@ static void nfs_delegation_run_state_manager(struct nfs_client *clp)
419 nfs4_schedule_state_manager(clp); 477 nfs4_schedule_state_manager(clp);
420} 478}
421 479
480/**
 481 * nfs_expire_all_delegation_types - return all delegations of the given types
482 * @clp: client to process
483 * @flags: delegation types to expire
484 *
485 */
422void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) 486void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags)
423{ 487{
424 nfs_client_mark_return_all_delegation_types(clp, flags); 488 nfs_client_mark_return_all_delegation_types(clp, flags);
425 nfs_delegation_run_state_manager(clp); 489 nfs_delegation_run_state_manager(clp);
426} 490}
427 491
492/**
 493 * nfs_expire_all_delegations - return all of a client's delegations
494 * @clp: client to process
495 *
496 */
428void nfs_expire_all_delegations(struct nfs_client *clp) 497void nfs_expire_all_delegations(struct nfs_client *clp)
429{ 498{
430 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); 499 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
431} 500}
432 501
433/* 502/**
434 * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. 503 * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
504 * @clp: client to process
505 *
435 */ 506 */
436void nfs_handle_cb_pathdown(struct nfs_client *clp) 507void nfs_handle_cb_pathdown(struct nfs_client *clp)
437{ 508{
@@ -440,29 +511,43 @@ void nfs_handle_cb_pathdown(struct nfs_client *clp)
440 nfs_client_mark_return_all_delegations(clp); 511 nfs_client_mark_return_all_delegations(clp);
441} 512}
442 513
443static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp) 514static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
444{ 515{
445 struct nfs_delegation *delegation; 516 struct nfs_delegation *delegation;
446 517
447 rcu_read_lock(); 518 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
448 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
449 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) 519 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
450 continue; 520 continue;
451 nfs_mark_return_delegation(clp, delegation); 521 nfs_mark_return_delegation(delegation);
452 } 522 }
453 rcu_read_unlock();
454} 523}
455 524
525/**
526 * nfs_expire_unreferenced_delegations - Eliminate unused delegations
527 * @clp: nfs_client to process
528 *
529 */
456void nfs_expire_unreferenced_delegations(struct nfs_client *clp) 530void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
457{ 531{
458 nfs_client_mark_return_unreferenced_delegations(clp); 532 struct nfs_server *server;
533
534 rcu_read_lock();
535 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
536 nfs_mark_return_unreferenced_delegations(server);
537 rcu_read_unlock();
538
459 nfs_delegation_run_state_manager(clp); 539 nfs_delegation_run_state_manager(clp);
460} 540}
461 541
462/* 542/**
463 * Asynchronous delegation recall! 543 * nfs_async_inode_return_delegation - asynchronously return a delegation
544 * @inode: inode to process
545 * @stateid: state ID information from CB_RECALL arguments
546 *
547 * Returns zero on success, or a negative errno value.
464 */ 548 */
465int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) 549int nfs_async_inode_return_delegation(struct inode *inode,
550 const nfs4_stateid *stateid)
466{ 551{
467 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 552 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
468 struct nfs_delegation *delegation; 553 struct nfs_delegation *delegation;
@@ -474,22 +559,21 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
474 rcu_read_unlock(); 559 rcu_read_unlock();
475 return -ENOENT; 560 return -ENOENT;
476 } 561 }
477 562 nfs_mark_return_delegation(delegation);
478 nfs_mark_return_delegation(clp, delegation);
479 rcu_read_unlock(); 563 rcu_read_unlock();
564
480 nfs_delegation_run_state_manager(clp); 565 nfs_delegation_run_state_manager(clp);
481 return 0; 566 return 0;
482} 567}
483 568
484/* 569static struct inode *
485 * Retrieve the inode associated with a delegation 570nfs_delegation_find_inode_server(struct nfs_server *server,
486 */ 571 const struct nfs_fh *fhandle)
487struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle)
488{ 572{
489 struct nfs_delegation *delegation; 573 struct nfs_delegation *delegation;
490 struct inode *res = NULL; 574 struct inode *res = NULL;
491 rcu_read_lock(); 575
492 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 576 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
493 spin_lock(&delegation->lock); 577 spin_lock(&delegation->lock);
494 if (delegation->inode != NULL && 578 if (delegation->inode != NULL &&
495 nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { 579 nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
@@ -499,49 +583,121 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs
499 if (res != NULL) 583 if (res != NULL)
500 break; 584 break;
501 } 585 }
586 return res;
587}
588
589/**
590 * nfs_delegation_find_inode - retrieve the inode associated with a delegation
591 * @clp: client state handle
592 * @fhandle: filehandle from a delegation recall
593 *
594 * Returns pointer to inode matching "fhandle," or NULL if a matching inode
595 * cannot be found.
596 */
597struct inode *nfs_delegation_find_inode(struct nfs_client *clp,
598 const struct nfs_fh *fhandle)
599{
600 struct nfs_server *server;
601 struct inode *res = NULL;
602
603 rcu_read_lock();
604 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
605 res = nfs_delegation_find_inode_server(server, fhandle);
606 if (res != NULL)
607 break;
608 }
502 rcu_read_unlock(); 609 rcu_read_unlock();
503 return res; 610 return res;
504} 611}
505 612
506/* 613static void nfs_delegation_mark_reclaim_server(struct nfs_server *server)
507 * Mark all delegations as needing to be reclaimed 614{
615 struct nfs_delegation *delegation;
616
617 list_for_each_entry_rcu(delegation, &server->delegations, super_list)
618 set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
619}
620
621/**
622 * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed
623 * @clp: nfs_client to process
624 *
508 */ 625 */
509void nfs_delegation_mark_reclaim(struct nfs_client *clp) 626void nfs_delegation_mark_reclaim(struct nfs_client *clp)
510{ 627{
511 struct nfs_delegation *delegation; 628 struct nfs_server *server;
629
512 rcu_read_lock(); 630 rcu_read_lock();
513 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) 631 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
514 set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); 632 nfs_delegation_mark_reclaim_server(server);
515 rcu_read_unlock(); 633 rcu_read_unlock();
516} 634}
517 635
518/* 636/**
519 * Reap all unclaimed delegations after reboot recovery is done 637 * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
638 * @clp: nfs_client to process
639 *
520 */ 640 */
521void nfs_delegation_reap_unclaimed(struct nfs_client *clp) 641void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
522{ 642{
523 struct nfs_delegation *delegation; 643 struct nfs_delegation *delegation;
644 struct nfs_server *server;
524 struct inode *inode; 645 struct inode *inode;
646
525restart: 647restart:
526 rcu_read_lock(); 648 rcu_read_lock();
527 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 649 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
528 if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) 650 list_for_each_entry_rcu(delegation, &server->delegations,
529 continue; 651 super_list) {
530 inode = nfs_delegation_grab_inode(delegation); 652 if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
531 if (inode == NULL) 653 &delegation->flags) == 0)
532 continue; 654 continue;
533 spin_lock(&clp->cl_lock); 655 inode = nfs_delegation_grab_inode(delegation);
534 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); 656 if (inode == NULL)
535 spin_unlock(&clp->cl_lock); 657 continue;
536 rcu_read_unlock(); 658 delegation = nfs_detach_delegation(NFS_I(inode),
537 if (delegation != NULL) 659 server);
538 nfs_free_delegation(delegation); 660 rcu_read_unlock();
539 iput(inode); 661
540 goto restart; 662 if (delegation != NULL)
663 nfs_free_delegation(delegation);
664 iput(inode);
665 goto restart;
666 }
541 } 667 }
542 rcu_read_unlock(); 668 rcu_read_unlock();
543} 669}
544 670
671/**
672 * nfs_delegations_present - check for existence of delegations
673 * @clp: client state handle
674 *
675 * Returns one if there are any nfs_delegation structures attached
676 * to this nfs_client.
677 */
678int nfs_delegations_present(struct nfs_client *clp)
679{
680 struct nfs_server *server;
681 int ret = 0;
682
683 rcu_read_lock();
684 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
685 if (!list_empty(&server->delegations)) {
686 ret = 1;
687 break;
688 }
689 rcu_read_unlock();
690 return ret;
691}
692
693/**
694 * nfs4_copy_delegation_stateid - Copy inode's state ID information
695 * @dst: stateid data structure to fill in
696 * @inode: inode to check
697 *
 698 * Returns one and fills in "dst->data" if inode had a delegation,
699 * otherwise zero is returned.
700 */
545int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) 701int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
546{ 702{
547 struct nfs_inode *nfsi = NFS_I(inode); 703 struct nfs_inode *nfsi = NFS_I(inode);
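
The structural change running through delegation.c above: delegations move from the single cl_delegations list on the nfs_client to a per-superblock server->delegations list, so every client-wide operation becomes a nested RCU walk over cl_superblocks and then each server's list, with the detach helpers split into locked and locking variants. A toy model of the new walk shape, using plain arrays where the kernel uses RCU-protected lists:

#include <stdio.h>

struct toy_server {
	int ndelegations;	/* stands in for server->delegations */
};

/* Walk every per-server list, as nfs_delegations_present() now does. */
static int delegations_present(const struct toy_server *servers, int n)
{
	int i;

	/* rcu_read_lock() in the kernel version */
	for (i = 0; i < n; i++)
		if (servers[i].ndelegations > 0)
			return 1;	/* first non-empty list wins */
	/* rcu_read_unlock() */
	return 0;
}

int main(void)
{
	const struct toy_server servers[] = { { 0 }, { 3 }, { 0 } };

	printf("%d\n", delegations_present(servers, 3)); /* prints 1 */
	return 0;
}
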
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 2026304bda19..d9322e490c56 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -44,6 +44,7 @@ void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
44void nfs_expire_unreferenced_delegations(struct nfs_client *clp); 44void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
45void nfs_handle_cb_pathdown(struct nfs_client *clp); 45void nfs_handle_cb_pathdown(struct nfs_client *clp);
46int nfs_client_return_marked_delegations(struct nfs_client *clp); 46int nfs_client_return_marked_delegations(struct nfs_client *clp);
47int nfs_delegations_present(struct nfs_client *clp);
47 48
48void nfs_delegation_mark_reclaim(struct nfs_client *clp); 49void nfs_delegation_mark_reclaim(struct nfs_client *clp);
49void nfs_delegation_reap_unclaimed(struct nfs_client *clp); 50void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d33da530097a..abe4f0c8dc5f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,8 +33,8 @@
33#include <linux/namei.h> 33#include <linux/namei.h>
34#include <linux/mount.h> 34#include <linux/mount.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/vmalloc.h>
37#include <linux/kmemleak.h> 36#include <linux/kmemleak.h>
37#include <linux/xattr.h>
38 38
39#include "delegation.h" 39#include "delegation.h"
40#include "iostat.h" 40#include "iostat.h"
@@ -125,9 +125,10 @@ const struct inode_operations nfs4_dir_inode_operations = {
125 .permission = nfs_permission, 125 .permission = nfs_permission,
126 .getattr = nfs_getattr, 126 .getattr = nfs_getattr,
127 .setattr = nfs_setattr, 127 .setattr = nfs_setattr,
128 .getxattr = nfs4_getxattr, 128 .getxattr = generic_getxattr,
129 .setxattr = nfs4_setxattr, 129 .setxattr = generic_setxattr,
130 .listxattr = nfs4_listxattr, 130 .listxattr = generic_listxattr,
131 .removexattr = generic_removexattr,
131}; 132};
132 133
133#endif /* CONFIG_NFS_V4 */ 134#endif /* CONFIG_NFS_V4 */
@@ -172,7 +173,7 @@ struct nfs_cache_array {
172 struct nfs_cache_array_entry array[0]; 173 struct nfs_cache_array_entry array[0];
173}; 174};
174 175
175typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); 176typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
176typedef struct { 177typedef struct {
177 struct file *file; 178 struct file *file;
178 struct page *page; 179 struct page *page;
@@ -378,14 +379,14 @@ error:
378 return error; 379 return error;
379} 380}
380 381
381/* Fill in an entry based on the xdr code stored in desc->page */ 382static int xdr_decode(nfs_readdir_descriptor_t *desc,
382static 383 struct nfs_entry *entry, struct xdr_stream *xdr)
383int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
384{ 384{
385 __be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus); 385 int error;
386 if (IS_ERR(p))
387 return PTR_ERR(p);
388 386
387 error = desc->decode(xdr, entry, desc->plus);
388 if (error)
389 return error;
389 entry->fattr->time_start = desc->timestamp; 390 entry->fattr->time_start = desc->timestamp;
390 entry->fattr->gencount = desc->gencount; 391 entry->fattr->gencount = desc->gencount;
391 return 0; 392 return 0;
@@ -459,25 +460,26 @@ out:
459/* Perform conversion from xdr to cache array */ 460/* Perform conversion from xdr to cache array */
460static 461static
461int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, 462int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
462 void *xdr_page, struct page *page, unsigned int buflen) 463 struct page **xdr_pages, struct page *page, unsigned int buflen)
463{ 464{
464 struct xdr_stream stream; 465 struct xdr_stream stream;
465 struct xdr_buf buf; 466 struct xdr_buf buf = {
466 __be32 *ptr = xdr_page; 467 .pages = xdr_pages,
468 .page_len = buflen,
469 .buflen = buflen,
470 .len = buflen,
471 };
472 struct page *scratch;
467 struct nfs_cache_array *array; 473 struct nfs_cache_array *array;
468 unsigned int count = 0; 474 unsigned int count = 0;
469 int status; 475 int status;
470 476
471 buf.head->iov_base = xdr_page; 477 scratch = alloc_page(GFP_KERNEL);
472 buf.head->iov_len = buflen; 478 if (scratch == NULL)
473 buf.tail->iov_len = 0; 479 return -ENOMEM;
474 buf.page_base = 0;
475 buf.page_len = 0;
476 buf.buflen = buf.head->iov_len;
477 buf.len = buf.head->iov_len;
478
479 xdr_init_decode(&stream, &buf, ptr);
480 480
481 xdr_init_decode(&stream, &buf, NULL);
482 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
481 483
482 do { 484 do {
483 status = xdr_decode(desc, entry, &stream); 485 status = xdr_decode(desc, entry, &stream);
@@ -506,6 +508,8 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
506 } else 508 } else
507 status = PTR_ERR(array); 509 status = PTR_ERR(array);
508 } 510 }
511
512 put_page(scratch);
509 return status; 513 return status;
510} 514}
511 515
@@ -521,7 +525,6 @@ static
 void nfs_readdir_free_large_page(void *ptr, struct page **pages,
 		unsigned int npages)
 {
-	vm_unmap_ram(ptr, npages);
 	nfs_readdir_free_pagearray(pages, npages);
 }
 
@@ -530,9 +533,8 @@ void nfs_readdir_free_large_page(void *ptr, struct page **pages,
  * to nfs_readdir_free_large_page
  */
 static
-void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
+int nfs_readdir_large_page(struct page **pages, unsigned int npages)
 {
-	void *ptr;
 	unsigned int i;
 
 	for (i = 0; i < npages; i++) {
@@ -541,13 +543,11 @@ void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
 			goto out_freepages;
 		pages[i] = page;
 	}
+	return 0;
 
-	ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
-	if (!IS_ERR_OR_NULL(ptr))
-		return ptr;
 out_freepages:
 	nfs_readdir_free_pagearray(pages, i);
-	return NULL;
+	return -ENOMEM;
 }
 
 static
@@ -566,6 +566,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 	entry.eof = 0;
 	entry.fh = nfs_alloc_fhandle();
 	entry.fattr = nfs_alloc_fattr();
+	entry.server = NFS_SERVER(inode);
 	if (entry.fh == NULL || entry.fattr == NULL)
 		goto out;
 
@@ -577,8 +578,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 	memset(array, 0, sizeof(struct nfs_cache_array));
 	array->eof_index = -1;
 
-	pages_ptr = nfs_readdir_large_page(pages, array_size);
-	if (!pages_ptr)
+	status = nfs_readdir_large_page(pages, array_size);
+	if (status < 0)
 		goto out_release_array;
 	do {
 		unsigned int pglen;
@@ -587,7 +588,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 		if (status < 0)
 			break;
 		pglen = status;
-		status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen);
+		status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
 		if (status < 0) {
 			if (status == -ENOSPC)
 				status = 0;
@@ -1221,7 +1222,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 		goto out_unblock_sillyrename;
 	}
 	inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
-	res = (struct dentry *)inode;
+	res = ERR_CAST(inode);
 	if (IS_ERR(res))
 		goto out_unblock_sillyrename;
 
@@ -1355,8 +1356,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 	if (nd->flags & LOOKUP_CREATE) {
 		attr.ia_mode = nd->intent.open.create_mode;
 		attr.ia_valid = ATTR_MODE;
-		if (!IS_POSIXACL(dir))
-			attr.ia_mode &= ~current_umask();
+		attr.ia_mode &= ~current_umask();
 	} else {
 		open_flags &= ~(O_EXCL | O_CREAT);
 		attr.ia_valid = 0;
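
The fs/nfs/dir.c hunks above drop the vm_map_ram() mapping of the readdir page array: nfs_readdir_large_page() now reports plain success or failure (0 or -ENOMEM) instead of returning a mapped pointer, and the page filler consumes the page array directly. A minimal userspace sketch of the same allocate-or-unwind idiom, under the assumption that partial allocations must be released on failure (all names here are hypothetical, not kernel API):

    /* Sketch: fill bufs[0..n-1]; on failure free what was allocated. */
    #include <errno.h>
    #include <stdlib.h>

    static void free_buffer_array(void *bufs[], unsigned int n)
    {
    	while (n-- > 0)
    		free(bufs[n]);
    }

    static int alloc_buffer_array(void *bufs[], unsigned int n, size_t size)
    {
    	unsigned int i;

    	for (i = 0; i < n; i++) {
    		bufs[i] = malloc(size);
    		if (bufs[i] == NULL)
    			goto out_free;
    	}
    	return 0;		/* caller checks status < 0, as above */
    out_free:
    	free_buffer_array(bufs, i);	/* only the i already allocated */
    	return -ENOMEM;
    }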
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 4e2d9b6b1380..18696882f1c6 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -238,7 +238,7 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu
 	return nfs_idmap_lookup_name(gid, "group", buf, buflen);
 }
 
-#else  /* CONFIG_NFS_USE_IDMAPPER not defined */
+#else  /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
 
 #include <linux/module.h>
 #include <linux/mutex.h>
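
This one-line fix keeps the #else annotation in sync with the actual guard symbol, CONFIG_NFS_USE_NEW_IDMAPPER. A small sketch of that labeling convention (the config symbol below is hypothetical):

    #ifdef CONFIG_EXAMPLE_FEATURE
    static int feature_init(void) { return 0; }
    #else /* CONFIG_EXAMPLE_FEATURE not defined */
    static int feature_init(void) { return -1; }
    #endif /* CONFIG_EXAMPLE_FEATURE */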
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 017daa3bed38..ce00b704452c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1410,9 +1410,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
  */
 void nfs4_evict_inode(struct inode *inode)
 {
+	pnfs_destroy_layout(NFS_I(inode));
 	truncate_inode_pages(&inode->i_data, 0);
 	end_writeback(inode);
-	pnfs_destroy_layout(NFS_I(inode));
 	/* If we are holding a delegation, return it! */
 	nfs_inode_return_delegation_noreclaim(inode);
 	/* First call standard NFS clear_inode() code */
@@ -1619,6 +1619,7 @@ static void __exit exit_nfs_fs(void)
 #ifdef CONFIG_PROC_FS
 	rpc_proc_unregister("nfs");
 #endif
+	nfs_cleanup_cb_ident_idr();
 	unregister_nfs_fs();
 	nfs_fs_proc_exit();
 	nfsiod_stop();
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e6356b750b77..bfa3a34af801 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -128,9 +128,13 @@ extern void nfs_umount(const struct nfs_mount_request *info);
 /* client.c */
 extern struct rpc_program nfs_program;
 
+extern void nfs_cleanup_cb_ident_idr(void);
 extern void nfs_put_client(struct nfs_client *);
-extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32);
-extern struct nfs_client *nfs_find_client_next(struct nfs_client *);
+extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *);
+extern struct nfs_client *nfs4_find_client_ident(int);
+extern struct nfs_client *
+nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *,
+			   int);
 extern struct nfs_server *nfs_create_server(
 					const struct nfs_parsed_mount_data *,
 					struct nfs_fh *);
@@ -185,17 +189,20 @@ extern int __init nfs_init_directcache(void);
 extern void nfs_destroy_directcache(void);
 
 /* nfs2xdr.c */
-extern int nfs_stat_to_errno(int);
+extern int nfs_stat_to_errno(enum nfs_stat);
 extern struct rpc_procinfo nfs_procedures[];
-extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
+extern int nfs2_decode_dirent(struct xdr_stream *,
+			      struct nfs_entry *, int);
 
 /* nfs3xdr.c */
 extern struct rpc_procinfo nfs3_procedures[];
-extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
+extern int nfs3_decode_dirent(struct xdr_stream *,
+			      struct nfs_entry *, int);
 
 /* nfs4xdr.c */
 #ifdef CONFIG_NFS_V4
-extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
+extern int nfs4_decode_dirent(struct xdr_stream *,
+			      struct nfs_entry *, int);
 #endif
 #ifdef CONFIG_NFS_V4_1
 extern const u32 nfs41_maxread_overhead;
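
Note the calling-convention change in these prototypes: the per-version dirent decoders now return an int errno instead of a `__be32 *` (pointer-or-ERR_PTR), and the nfs_server argument is gone. A sketch of why the int convention is simpler for callers, using hypothetical userspace types:

    /* Sketch: an int return lets callers test `< 0` directly instead of
     * unwrapping ERR_PTR-style pointers. Types here are illustrative. */
    #include <errno.h>
    #include <stdint.h>

    struct entry  { uint32_t ino; };
    struct stream { const uint32_t *pos, *end; };

    static int decode_entry(struct stream *s, struct entry *e)
    {
    	if (s->pos >= s->end)
    		return -EAGAIN;	/* ran out of buffered data */
    	e->ino = *s->pos++;
    	return 0;		/* success */
    }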
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 4f981f1f6689..d4c2d6b7507e 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -236,10 +236,8 @@ void nfs_umount(const struct nfs_mount_request *info)
 		.authflavor	= RPC_AUTH_UNIX,
 		.flags		= RPC_CLNT_CREATE_NOPING,
 	};
-	struct mountres	result;
 	struct rpc_message msg	= {
 		.rpc_argp	= info->dirpath,
-		.rpc_resp	= &result,
 	};
 	struct rpc_clnt *clnt;
 	int status;
@@ -248,7 +246,7 @@ void nfs_umount(const struct nfs_mount_request *info)
 		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
 
 	clnt = rpc_create(&args);
-	if (unlikely(IS_ERR(clnt)))
+	if (IS_ERR(clnt))
 		goto out_clnt_err;
 
 	dprintk("NFS: sending UMNT request for %s:%s\n",
@@ -280,29 +278,20 @@ out_call_err:
  * XDR encode/decode functions for MOUNT
  */
 
-static int encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
+static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
 {
 	const u32 pathname_len = strlen(pathname);
 	__be32 *p;
 
-	if (unlikely(pathname_len > MNTPATHLEN))
-		return -EIO;
-
-	p = xdr_reserve_space(xdr, sizeof(u32) + pathname_len);
-	if (unlikely(p == NULL))
-		return -EIO;
+	BUG_ON(pathname_len > MNTPATHLEN);
+	p = xdr_reserve_space(xdr, 4 + pathname_len);
 	xdr_encode_opaque(p, pathname, pathname_len);
-
-	return 0;
 }
 
-static int mnt_enc_dirpath(struct rpc_rqst *req, __be32 *p,
-			   const char *dirpath)
+static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const char *dirpath)
 {
-	struct xdr_stream xdr;
-
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	return encode_mntdirpath(&xdr, dirpath);
+	encode_mntdirpath(xdr, dirpath);
 }
 
 /*
@@ -320,10 +309,10 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)
 	u32 status;
 	__be32 *p;
 
-	p = xdr_inline_decode(xdr, sizeof(status));
+	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(p == NULL))
 		return -EIO;
-	status = ntohl(*p);
+	status = be32_to_cpup(p);
 
 	for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
 		if (mnt_errtbl[i].status == status) {
@@ -351,18 +340,16 @@ static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)
 	return 0;
 }
 
-static int mnt_dec_mountres(struct rpc_rqst *req, __be32 *p,
-			    struct mountres *res)
+static int mnt_xdr_dec_mountres(struct rpc_rqst *req,
+				struct xdr_stream *xdr,
+				struct mountres *res)
 {
-	struct xdr_stream xdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-
-	status = decode_status(&xdr, res);
+	status = decode_status(xdr, res);
 	if (unlikely(status != 0 || res->errno != 0))
 		return status;
-	return decode_fhandle(&xdr, res);
+	return decode_fhandle(xdr, res);
 }
 
 static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
@@ -371,10 +358,10 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
 	u32 status;
 	__be32 *p;
 
-	p = xdr_inline_decode(xdr, sizeof(status));
+	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(p == NULL))
 		return -EIO;
-	status = ntohl(*p);
+	status = be32_to_cpup(p);
 
 	for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
 		if (mnt3_errtbl[i].status == status) {
@@ -394,11 +381,11 @@ static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)
 	u32 size;
 	__be32 *p;
 
-	p = xdr_inline_decode(xdr, sizeof(size));
+	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(p == NULL))
 		return -EIO;
 
-	size = ntohl(*p++);
+	size = be32_to_cpup(p);
 	if (size > NFS3_FHSIZE || size == 0)
 		return -EIO;
 
@@ -421,15 +408,15 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
 	if (*count == 0)
 		return 0;
 
-	p = xdr_inline_decode(xdr, sizeof(entries));
+	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(p == NULL))
 		return -EIO;
-	entries = ntohl(*p);
+	entries = be32_to_cpup(p);
 	dprintk("NFS: received %u auth flavors\n", entries);
 	if (entries > NFS_MAX_SECFLAVORS)
 		entries = NFS_MAX_SECFLAVORS;
 
-	p = xdr_inline_decode(xdr, sizeof(u32) * entries);
+	p = xdr_inline_decode(xdr, 4 * entries);
 	if (unlikely(p == NULL))
 		return -EIO;
 
@@ -437,7 +424,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
 		entries = *count;
 
 	for (i = 0; i < entries; i++) {
-		flavors[i] = ntohl(*p++);
+		flavors[i] = be32_to_cpup(p++);
 		dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]);
 	}
 	*count = i;
@@ -445,30 +432,28 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
 	return 0;
 }
 
-static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p,
-			     struct mountres *res)
+static int mnt_xdr_dec_mountres3(struct rpc_rqst *req,
+				 struct xdr_stream *xdr,
+				 struct mountres *res)
 {
-	struct xdr_stream xdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-
-	status = decode_fhs_status(&xdr, res);
+	status = decode_fhs_status(xdr, res);
 	if (unlikely(status != 0 || res->errno != 0))
 		return status;
-	status = decode_fhandle3(&xdr, res);
+	status = decode_fhandle3(xdr, res);
 	if (unlikely(status != 0)) {
 		res->errno = -EBADHANDLE;
 		return 0;
 	}
-	return decode_auth_flavors(&xdr, res);
+	return decode_auth_flavors(xdr, res);
 }
 
 static struct rpc_procinfo mnt_procedures[] = {
 	[MOUNTPROC_MNT] = {
 		.p_proc		= MOUNTPROC_MNT,
-		.p_encode	= (kxdrproc_t)mnt_enc_dirpath,
-		.p_decode	= (kxdrproc_t)mnt_dec_mountres,
+		.p_encode	= (kxdreproc_t)mnt_xdr_enc_dirpath,
+		.p_decode	= (kxdrdproc_t)mnt_xdr_dec_mountres,
 		.p_arglen	= MNT_enc_dirpath_sz,
 		.p_replen	= MNT_dec_mountres_sz,
 		.p_statidx	= MOUNTPROC_MNT,
@@ -476,7 +461,7 @@ static struct rpc_procinfo mnt_procedures[] = {
 	},
 	[MOUNTPROC_UMNT] = {
 		.p_proc		= MOUNTPROC_UMNT,
-		.p_encode	= (kxdrproc_t)mnt_enc_dirpath,
+		.p_encode	= (kxdreproc_t)mnt_xdr_enc_dirpath,
 		.p_arglen	= MNT_enc_dirpath_sz,
 		.p_statidx	= MOUNTPROC_UMNT,
 		.p_name		= "UMOUNT",
@@ -486,8 +471,8 @@ static struct rpc_procinfo mnt_procedures[] = {
 static struct rpc_procinfo mnt3_procedures[] = {
 	[MOUNTPROC3_MNT] = {
 		.p_proc		= MOUNTPROC3_MNT,
-		.p_encode	= (kxdrproc_t)mnt_enc_dirpath,
-		.p_decode	= (kxdrproc_t)mnt_dec_mountres3,
+		.p_encode	= (kxdreproc_t)mnt_xdr_enc_dirpath,
+		.p_decode	= (kxdrdproc_t)mnt_xdr_dec_mountres3,
 		.p_arglen	= MNT_enc_dirpath_sz,
 		.p_replen	= MNT_dec_mountres3_sz,
 		.p_statidx	= MOUNTPROC3_MNT,
@@ -495,7 +480,7 @@ static struct rpc_procinfo mnt3_procedures[] = {
 	},
 	[MOUNTPROC3_UMNT] = {
 		.p_proc		= MOUNTPROC3_UMNT,
-		.p_encode	= (kxdrproc_t)mnt_enc_dirpath,
+		.p_encode	= (kxdreproc_t)mnt_xdr_enc_dirpath,
 		.p_arglen	= MNT_enc_dirpath_sz,
 		.p_statidx	= MOUNTPROC3_UMNT,
 		.p_name		= "UMOUNT",
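
The mount_clnt.c conversion illustrates the new-style XDR entry points: old kxdrproc_t routines received a raw `__be32 *` and had to set up their own xdr_stream, while the new kxdreproc_t/kxdrdproc_t routines receive a stream already initialized by the RPC layer. A userspace analogue of such a procedure table, with hypothetical names (this is a sketch, not the kernel's types):

    #include <stddef.h>

    struct stream { unsigned char *p, *end; };

    typedef void (*encode_fn)(struct stream *, const void *arg);
    typedef int  (*decode_fn)(struct stream *, void *res);

    struct proc_info {
    	const char *name;
    	encode_fn   encode;	/* stream already initialized by caller */
    	decode_fn   decode;	/* returns 0 or a negative errno */
    };

    static void enc_dirpath(struct stream *s, const void *arg)
    {
    	(void)s; (void)arg;	/* would reserve space and copy the path */
    }

    static int dec_mountres(struct stream *s, void *res)
    {
    	(void)s; (void)res;	/* would decode status, then file handle */
    	return 0;
    }

    static const struct proc_info mnt_procs[] = {
    	{ "MOUNT",  enc_dirpath, dec_mountres },
    	{ "UMOUNT", enc_dirpath, NULL },	/* one-way: no reply body */
    };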
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5914a1911c95..792cb13a4304 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -61,584 +61,1008 @@
 #define NFS_readdirres_sz	(1)
 #define NFS_statfsres_sz	(1+NFS_info_sz)
 
+
 /*
- * Common NFS XDR functions as inlines
+ * While encoding arguments, set up the reply buffer in advance to
+ * receive reply data directly into the page cache.
  */
-static inline __be32 *
-xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fhandle)
+static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
+				 unsigned int base, unsigned int len,
+				 unsigned int bufsize)
 {
-	memcpy(p, fhandle->data, NFS2_FHSIZE);
-	return p + XDR_QUADLEN(NFS2_FHSIZE);
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
+	unsigned int replen;
+
+	replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
+	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
 }
 
-static inline __be32 *
-xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle)
+/*
+ * Handle decode buffer overflows out-of-line.
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
 {
-	/* NFSv2 handles have a fixed length */
-	fhandle->size = NFS2_FHSIZE;
-	memcpy(fhandle->data, p, NFS2_FHSIZE);
-	return p + XDR_QUADLEN(NFS2_FHSIZE);
+	dprintk("NFS: %s prematurely hit the end of our receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
+}
+
+
+/*
+ * Encode/decode NFSv2 basic data types
+ *
+ * Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+
+/*
+ *	typedef opaque nfsdata<>;
+ */
+static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result)
+{
+	u32 recvd, count;
+	size_t hdrlen;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p);
+	hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+	recvd = xdr->buf->len - hdrlen;
+	if (unlikely(count > recvd))
+		goto out_cheating;
+out:
+	xdr_read_pages(xdr, count);
+	result->eof = 0;	/* NFSv2 does not pass EOF flag on the wire. */
+	result->count = count;
+	return count;
+out_cheating:
+	dprintk("NFS: server cheating in read result: "
+		"count %u > recvd %u\n", count, recvd);
+	count = recvd;
+	goto out;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ *	enum stat {
+ *		NFS_OK = 0,
+ *		NFSERR_PERM = 1,
+ *		NFSERR_NOENT = 2,
+ *		NFSERR_IO = 5,
+ *		NFSERR_NXIO = 6,
+ *		NFSERR_ACCES = 13,
+ *		NFSERR_EXIST = 17,
+ *		NFSERR_NODEV = 19,
+ *		NFSERR_NOTDIR = 20,
+ *		NFSERR_ISDIR = 21,
+ *		NFSERR_FBIG = 27,
+ *		NFSERR_NOSPC = 28,
+ *		NFSERR_ROFS = 30,
+ *		NFSERR_NAMETOOLONG = 63,
+ *		NFSERR_NOTEMPTY = 66,
+ *		NFSERR_DQUOT = 69,
+ *		NFSERR_STALE = 70,
+ *		NFSERR_WFLUSH = 99
+ *	};
+ */
+static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*status = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
-static inline __be32*
-xdr_encode_time(__be32 *p, struct timespec *timep)
+/*
+ * 2.3.2.  ftype
+ *
+ *	enum ftype {
+ *		NFNON = 0,
+ *		NFREG = 1,
+ *		NFDIR = 2,
+ *		NFBLK = 3,
+ *		NFCHR = 4,
+ *		NFLNK = 5
+ *	};
+ *
+ */
+static __be32 *xdr_decode_ftype(__be32 *p, u32 *type)
 {
-	*p++ = htonl(timep->tv_sec);
-	/* Convert nanoseconds into microseconds */
-	*p++ = htonl(timep->tv_nsec ? timep->tv_nsec / 1000 : 0);
+	*type = be32_to_cpup(p++);
+	if (unlikely(*type > NF2FIFO))
+		*type = NFBAD;
 	return p;
 }
 
-static inline __be32*
-xdr_encode_current_server_time(__be32 *p, struct timespec *timep)
+/*
+ * 2.3.3.  fhandle
+ *
+ *	typedef opaque fhandle[FHSIZE];
+ */
+static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)
 {
-	/*
-	 * Passing the invalid value useconds=1000000 is a
-	 * Sun convention for "set to current server time".
-	 * It's needed to make permissions checks for the
-	 * "touch" program across v2 mounts to Solaris and
-	 * Irix boxes work correctly. See description of
-	 * sattr in section 6.1 of "NFS Illustrated" by
-	 * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5
-	 */
-	*p++ = htonl(timep->tv_sec);
-	*p++ = htonl(1000000);
+	__be32 *p;
+
+	BUG_ON(fh->size != NFS2_FHSIZE);
+	p = xdr_reserve_space(xdr, NFS2_FHSIZE);
+	memcpy(p, fh->data, NFS2_FHSIZE);
+}
+
+static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS2_FHSIZE);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	fh->size = NFS2_FHSIZE;
+	memcpy(fh->data, p, NFS2_FHSIZE);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * 2.3.4.  timeval
+ *
+ *	struct timeval {
+ *		unsigned int seconds;
+ *		unsigned int useconds;
+ *	};
+ */
+static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep)
+{
+	*p++ = cpu_to_be32(timep->tv_sec);
+	if (timep->tv_nsec != 0)
+		*p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC);
+	else
+		*p++ = cpu_to_be32(0);
 	return p;
 }
 
-static inline __be32*
-xdr_decode_time(__be32 *p, struct timespec *timep)
+/*
+ * Passing the invalid value useconds=1000000 is a Sun convention for
+ * "set to current server time".  It's needed to make permissions checks
+ * for the "touch" program across v2 mounts to Solaris and Irix servers
+ * work correctly.  See description of sattr in section 6.1 of "NFS
+ * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5.
+ */
+static __be32 *xdr_encode_current_server_time(__be32 *p,
+					      const struct timespec *timep)
 {
-	timep->tv_sec = ntohl(*p++);
-	/* Convert microseconds into nanoseconds */
-	timep->tv_nsec = ntohl(*p++) * 1000;
+	*p++ = cpu_to_be32(timep->tv_sec);
+	*p++ = cpu_to_be32(1000000);
 	return p;
 }
 
-static __be32 *
-xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
+static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep)
+{
+	timep->tv_sec = be32_to_cpup(p++);
+	timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC;
+	return p;
+}
+
+/*
+ * 2.3.5.  fattr
+ *
+ *	struct fattr {
+ *		ftype		type;
+ *		unsigned int	mode;
+ *		unsigned int	nlink;
+ *		unsigned int	uid;
+ *		unsigned int	gid;
+ *		unsigned int	size;
+ *		unsigned int	blocksize;
+ *		unsigned int	rdev;
+ *		unsigned int	blocks;
+ *		unsigned int	fsid;
+ *		unsigned int	fileid;
+ *		timeval		atime;
+ *		timeval		mtime;
+ *		timeval		ctime;
+ *	};
+ *
+ */
+static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
 	u32 rdev, type;
-	type = ntohl(*p++);
-	fattr->mode = ntohl(*p++);
-	fattr->nlink = ntohl(*p++);
-	fattr->uid = ntohl(*p++);
-	fattr->gid = ntohl(*p++);
-	fattr->size = ntohl(*p++);
-	fattr->du.nfs2.blocksize = ntohl(*p++);
-	rdev = ntohl(*p++);
-	fattr->du.nfs2.blocks = ntohl(*p++);
-	fattr->fsid.major = ntohl(*p++);
-	fattr->fsid.minor = 0;
-	fattr->fileid = ntohl(*p++);
-	p = xdr_decode_time(p, &fattr->atime);
-	p = xdr_decode_time(p, &fattr->mtime);
-	p = xdr_decode_time(p, &fattr->ctime);
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+
 	fattr->valid |= NFS_ATTR_FATTR_V2;
+
+	p = xdr_decode_ftype(p, &type);
+
+	fattr->mode = be32_to_cpup(p++);
+	fattr->nlink = be32_to_cpup(p++);
+	fattr->uid = be32_to_cpup(p++);
+	fattr->gid = be32_to_cpup(p++);
+	fattr->size = be32_to_cpup(p++);
+	fattr->du.nfs2.blocksize = be32_to_cpup(p++);
+
+	rdev = be32_to_cpup(p++);
 	fattr->rdev = new_decode_dev(rdev);
-	if (type == NFCHR && rdev == NFS2_FIFO_DEV) {
+	if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) {
 		fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
 		fattr->rdev = 0;
 	}
+
+	fattr->du.nfs2.blocks = be32_to_cpup(p++);
+	fattr->fsid.major = be32_to_cpup(p++);
+	fattr->fsid.minor = 0;
+	fattr->fileid = be32_to_cpup(p++);
+
+	p = xdr_decode_time(p, &fattr->atime);
+	p = xdr_decode_time(p, &fattr->mtime);
+	xdr_decode_time(p, &fattr->ctime);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * 2.3.6.  sattr
+ *
+ *	struct sattr {
+ *		unsigned int	mode;
+ *		unsigned int	uid;
+ *		unsigned int	gid;
+ *		unsigned int	size;
+ *		timeval		atime;
+ *		timeval		mtime;
+ *	};
+ */
+
+#define NFS2_SATTR_NOT_SET	(0xffffffff)
+
+static __be32 *xdr_time_not_set(__be32 *p)
+{
+	*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+	*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
 	return p;
 }
 
-static inline __be32 *
-xdr_encode_sattr(__be32 *p, struct iattr *attr)
+static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr)
 {
-	const __be32 not_set = __constant_htonl(0xFFFFFFFF);
+	__be32 *p;
 
-	*p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set;
-	*p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set;
-	*p++ = (attr->ia_valid & ATTR_GID) ? htonl(attr->ia_gid) : not_set;
-	*p++ = (attr->ia_valid & ATTR_SIZE) ? htonl(attr->ia_size) : not_set;
+	p = xdr_reserve_space(xdr, NFS_sattr_sz << 2);
 
-	if (attr->ia_valid & ATTR_ATIME_SET) {
+	if (attr->ia_valid & ATTR_MODE)
+		*p++ = cpu_to_be32(attr->ia_mode);
+	else
+		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+	if (attr->ia_valid & ATTR_UID)
+		*p++ = cpu_to_be32(attr->ia_uid);
+	else
+		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+	if (attr->ia_valid & ATTR_GID)
+		*p++ = cpu_to_be32(attr->ia_gid);
+	else
+		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+	if (attr->ia_valid & ATTR_SIZE)
+		*p++ = cpu_to_be32((u32)attr->ia_size);
+	else
+		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+
+	if (attr->ia_valid & ATTR_ATIME_SET)
 		p = xdr_encode_time(p, &attr->ia_atime);
-	} else if (attr->ia_valid & ATTR_ATIME) {
+	else if (attr->ia_valid & ATTR_ATIME)
 		p = xdr_encode_current_server_time(p, &attr->ia_atime);
-	} else {
-		*p++ = not_set;
-		*p++ = not_set;
-	}
-
-	if (attr->ia_valid & ATTR_MTIME_SET) {
-		p = xdr_encode_time(p, &attr->ia_mtime);
-	} else if (attr->ia_valid & ATTR_MTIME) {
-		p = xdr_encode_current_server_time(p, &attr->ia_mtime);
-	} else {
-		*p++ = not_set;
-		*p++ = not_set;
-	}
-	return p;
+	else
+		p = xdr_time_not_set(p);
+	if (attr->ia_valid & ATTR_MTIME_SET)
+		xdr_encode_time(p, &attr->ia_mtime);
+	else if (attr->ia_valid & ATTR_MTIME)
+		xdr_encode_current_server_time(p, &attr->ia_mtime);
+	else
+		xdr_time_not_set(p);
 }
 
 /*
- * NFS encode functions
+ * 2.3.7.  filename
+ *
+ *	typedef string filename<MAXNAMLEN>;
  */
+static void encode_filename(struct xdr_stream *xdr,
+			    const char *name, u32 length)
+{
+	__be32 *p;
+
+	BUG_ON(length > NFS2_MAXNAMLEN);
+	p = xdr_reserve_space(xdr, 4 + length);
+	xdr_encode_opaque(p, name, length);
+}
+
+static int decode_filename_inline(struct xdr_stream *xdr,
+				  const char **name, u32 *length)
+{
+	__be32 *p;
+	u32 count;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p);
+	if (count > NFS3_MAXNAMLEN)
+		goto out_nametoolong;
+	p = xdr_inline_decode(xdr, count);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*name = (const char *)p;
+	*length = count;
+	return 0;
+out_nametoolong:
+	dprintk("NFS: returned filename too long: %u\n", count);
+	return -ENAMETOOLONG;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
 /*
- * Encode file handle argument
- * GETATTR, READLINK, STATFS
+ * 2.3.8.  path
+ *
+ *	typedef string path<MAXPATHLEN>;
  */
-static int
-nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
+static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)
 {
-	p = xdr_encode_fhandle(p, fh);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+	__be32 *p;
+
+	BUG_ON(length > NFS2_MAXPATHLEN);
+	p = xdr_reserve_space(xdr, 4);
+	*p = cpu_to_be32(length);
+	xdr_write_pages(xdr, pages, 0, length);
+}
+
+static int decode_path(struct xdr_stream *xdr)
+{
+	u32 length, recvd;
+	size_t hdrlen;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	length = be32_to_cpup(p);
+	if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN))
+		goto out_size;
+	hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+	recvd = xdr->buf->len - hdrlen;
+	if (unlikely(length > recvd))
+		goto out_cheating;
+
+	xdr_read_pages(xdr, length);
+	xdr_terminate_string(xdr->buf, length);
 	return 0;
+out_size:
+	dprintk("NFS: returned pathname too long: %u\n", length);
+	return -ENAMETOOLONG;
+out_cheating:
+	dprintk("NFS: server cheating in pathname result: "
+		"length %u > received %u\n", length, recvd);
+	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 /*
- * Encode SETATTR arguments
+ * 2.3.9.  attrstat
+ *
+ *	union attrstat switch (stat status) {
+ *	case NFS_OK:
+ *		fattr attributes;
+ *	default:
+ *		void;
+ *	};
  */
-static int
-nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args)
+static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_sattr(p, args->sattr);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_fattr(xdr, result);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
 }
 
 /*
- * Encode directory ops argument
- * LOOKUP, RMDIR
+ * 2.3.10.  diropargs
+ *
+ *	struct diropargs {
+ *		fhandle  dir;
+ *		filename name;
+ *	};
  */
-static int
-nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args)
+static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh,
+			     const char *name, u32 length)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_fhandle(xdr, fh);
+	encode_filename(xdr, name, length);
 }
 
 /*
- * Encode REMOVE argument
+ * 2.3.11.  diropres
+ *
+ *	union diropres switch (stat status) {
+ *	case NFS_OK:
+ *		struct {
+ *			fhandle file;
+ *			fattr   attributes;
+ *		} diropok;
+ *	default:
+ *		void;
+ *	};
  */
-static int
-nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
+static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name.name, args->name.len);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	int error;
+
+	error = decode_fhandle(xdr, result->fh);
+	if (unlikely(error))
+		goto out;
+	error = decode_fattr(xdr, result->fattr);
+out:
+	return error;
+}
+
+static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_diropok(xdr, result);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
 }
 
+
 /*
- * Arguments to a READ call. Since we read data directly into the page
- * cache, we also set up the reply iovec here so that iov[1] points
- * exactly to the page we want to fetch.
+ * NFSv2 XDR encode functions
+ *
+ * NFSv2 argument types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
  */
-static int
-nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
+
+static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req,
+				 struct xdr_stream *xdr,
+				 const struct nfs_fh *fh)
 {
-	struct rpc_auth *auth = req->rq_cred->cr_auth;
-	unsigned int replen;
-	u32 offset = (u32)args->offset;
+	encode_fhandle(xdr, fh);
+}
+
+/*
+ * 2.2.3.  sattrargs
+ *
+ *	struct sattrargs {
+ *		fhandle file;
+ *		sattr attributes;
+ *	};
+ */
+static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfs_sattrargs *args)
+{
+	encode_fhandle(xdr, args->fh);
+	encode_sattr(xdr, args->sattr);
+}
+
+static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfs_diropargs *args)
+{
+	encode_diropargs(xdr, args->fh, args->name, args->len);
+}
+
+static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs_readlinkargs *args)
+{
+	encode_fhandle(xdr, args->fh);
+	prepare_reply_buffer(req, args->pages, args->pgbase,
+			     args->pglen, NFS_readlinkres_sz);
+}
+
+/*
+ * 2.2.7.  readargs
+ *
+ *	struct readargs {
+ *		fhandle file;
+ *		unsigned offset;
+ *		unsigned count;
+ *		unsigned totalcount;
+ *	};
+ */
+static void encode_readargs(struct xdr_stream *xdr,
+			    const struct nfs_readargs *args)
+{
+	u32 offset = args->offset;
 	u32 count = args->count;
+	__be32 *p;
 
-	p = xdr_encode_fhandle(p, args->fh);
-	*p++ = htonl(offset);
-	*p++ = htonl(count);
-	*p++ = htonl(count);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+	encode_fhandle(xdr, args->fh);
 
-	/* Inline the page array */
-	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2;
-	xdr_inline_pages(&req->rq_rcv_buf, replen,
-			 args->pages, args->pgbase, count);
+	p = xdr_reserve_space(xdr, 4 + 4 + 4);
+	*p++ = cpu_to_be32(offset);
+	*p++ = cpu_to_be32(count);
+	*p = cpu_to_be32(count);
+}
+
+static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
+				  struct xdr_stream *xdr,
+				  const struct nfs_readargs *args)
+{
+	encode_readargs(xdr, args);
+	prepare_reply_buffer(req, args->pages, args->pgbase,
+			     args->count, NFS_readres_sz);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
-	return 0;
 }
 
 /*
- * Decode READ reply
+ * 2.2.9.  writeargs
+ *
+ *	struct writeargs {
+ *		fhandle file;
+ *		unsigned beginoffset;
+ *		unsigned offset;
+ *		unsigned totalcount;
+ *		nfsdata data;
+ *	};
  */
-static int
-nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
+static void encode_writeargs(struct xdr_stream *xdr,
+			     const struct nfs_writeargs *args)
 {
-	struct kvec *iov = req->rq_rcv_buf.head;
-	size_t hdrlen;
-	u32 count, recvd;
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-	p = xdr_decode_fattr(p, res->fattr);
-
-	count = ntohl(*p++);
-	res->eof = 0;
-	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
-	if (iov->iov_len < hdrlen) {
-		dprintk("NFS: READ reply header overflowed:"
-				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
-		return -errno_NFSERR_IO;
-	} else if (iov->iov_len != hdrlen) {
-		dprintk("NFS: READ header is short. iovec will be shifted.\n");
-		xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen);
-	}
+	u32 offset = args->offset;
+	u32 count = args->count;
+	__be32 *p;
 
-	recvd = req->rq_rcv_buf.len - hdrlen;
-	if (count > recvd) {
-		dprintk("NFS: server cheating in read reply: "
-				"count %u > recvd %u\n", count, recvd);
-		count = recvd;
-	}
+	encode_fhandle(xdr, args->fh);
 
-	dprintk("RPC:      readres OK count %u\n", count);
-	if (count < res->count)
-		res->count = count;
+	p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
+	*p++ = cpu_to_be32(offset);
+	*p++ = cpu_to_be32(offset);
+	*p++ = cpu_to_be32(count);
 
-	return count;
+	/* nfsdata */
+	*p = cpu_to_be32(count);
+	xdr_write_pages(xdr, args->pages, args->pgbase, count);
 }
 
+static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfs_writeargs *args)
+{
+	encode_writeargs(xdr, args);
+	xdr->buf->flags |= XDRBUF_WRITE;
+}
 
 /*
- * Write arguments. Splice the buffer to be written into the iovec.
+ * 2.2.10.  createargs
+ *
+ *	struct createargs {
+ *		diropargs where;
+ *		sattr attributes;
+ *	};
  */
-static int
-nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void nfs2_xdr_enc_createargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_createargs *args)
 {
-	struct xdr_buf *sndbuf = &req->rq_snd_buf;
-	u32 offset = (u32)args->offset;
-	u32 count = args->count;
-
-	p = xdr_encode_fhandle(p, args->fh);
-	*p++ = htonl(offset);
-	*p++ = htonl(offset);
-	*p++ = htonl(count);
-	*p++ = htonl(count);
-	sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
+	encode_diropargs(xdr, args->fh, args->name, args->len);
+	encode_sattr(xdr, args->sattr);
+}
 
-	/* Copy the page array */
-	xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
-	sndbuf->flags |= XDRBUF_WRITE;
-	return 0;
+static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_removeargs *args)
+{
+	encode_diropargs(xdr, args->fh, args->name.name, args->name.len);
 }
 
 /*
- * Encode create arguments
- * CREATE, MKDIR
+ * 2.2.12.  renameargs
+ *
+ *	struct renameargs {
+ *		diropargs from;
+ *		diropargs to;
+ *	};
  */
-static int
-nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
+static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_renameargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-	p = xdr_encode_sattr(p, args->sattr);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	const struct qstr *old = args->old_name;
+	const struct qstr *new = args->new_name;
+
+	encode_diropargs(xdr, args->old_dir, old->name, old->len);
+	encode_diropargs(xdr, args->new_dir, new->name, new->len);
 }
 
 /*
- * Encode RENAME arguments
+ * 2.2.13.  linkargs
+ *
+ *	struct linkargs {
+ *		fhandle from;
+ *		diropargs to;
+ *	};
  */
-static int
-nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
+static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req,
+				  struct xdr_stream *xdr,
+				  const struct nfs_linkargs *args)
 {
-	p = xdr_encode_fhandle(p, args->old_dir);
-	p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
-	p = xdr_encode_fhandle(p, args->new_dir);
-	p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_fhandle(xdr, args->fromfh);
+	encode_diropargs(xdr, args->tofh, args->toname, args->tolen);
 }
 
 /*
- * Encode LINK arguments
+ * 2.2.14.  symlinkargs
+ *
+ *	struct symlinkargs {
+ *		diropargs from;
+ *		path to;
+ *		sattr attributes;
+ *	};
  */
-static int
-nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args)
+static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs_symlinkargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fromfh);
-	p = xdr_encode_fhandle(p, args->tofh);
-	p = xdr_encode_array(p, args->toname, args->tolen);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen);
+	encode_path(xdr, args->pages, args->pathlen);
+	encode_sattr(xdr, args->sattr);
 }
 
 /*
- * Encode SYMLINK arguments
+ * 2.2.17.  readdirargs
+ *
+ *	struct readdirargs {
+ *		fhandle dir;
+ *		nfscookie cookie;
+ *		unsigned count;
+ *	};
  */
-static int
-nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args)
+static void encode_readdirargs(struct xdr_stream *xdr,
+			       const struct nfs_readdirargs *args)
 {
-	struct xdr_buf *sndbuf = &req->rq_snd_buf;
-	size_t pad;
+	__be32 *p;
 
-	p = xdr_encode_fhandle(p, args->fromfh);
-	p = xdr_encode_array(p, args->fromname, args->fromlen);
-	*p++ = htonl(args->pathlen);
-	sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
+	encode_fhandle(xdr, args->fh);
 
-	xdr_encode_pages(sndbuf, args->pages, 0, args->pathlen);
+	p = xdr_reserve_space(xdr, 4 + 4);
+	*p++ = cpu_to_be32(args->cookie);
+	*p = cpu_to_be32(args->count);
+}
 
-	/*
-	 * xdr_encode_pages may have added a few bytes to ensure the
-	 * pathname ends on a 4-byte boundary.  Start encoding the
-	 * attributes after the pad bytes.
-	 */
-	pad = sndbuf->tail->iov_len;
-	if (pad > 0)
-		p++;
-	p = xdr_encode_sattr(p, args->sattr);
-	sndbuf->len += xdr_adjust_iovec(sndbuf->tail, p) - pad;
-	return 0;
+static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs_readdirargs *args)
+{
+	encode_readdirargs(xdr, args);
+	prepare_reply_buffer(req, args->pages, 0,
+			     args->count, NFS_readdirres_sz);
 }
 
 /*
- * Encode arguments to readdir call
+ * NFSv2 XDR decode functions
+ *
+ * NFSv2 result types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
  */
-static int
-nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
+
+static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr,
+			     void *__unused)
 {
-	struct rpc_auth *auth = req->rq_cred->cr_auth;
-	unsigned int replen;
-	u32 count = args->count;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
+}
 
-	p = xdr_encode_fhandle(p, args->fh);
-	*p++ = htonl(args->cookie);
-	*p++ = htonl(count); /* see above */
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_fattr *result)
+{
+	return decode_attrstat(xdr, result);
+}
 
-	/* Inline the page array */
-	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readdirres_sz) << 2;
-	xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count);
-	return 0;
+static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_diropok *result)
+{
+	return decode_diropres(xdr, result);
 }
 
 /*
- * Decode the result of a readdir call.
- * We're not really decoding anymore, we just leave the buffer untouched
- * and only check that it is syntactically correct.
- * The real decoding happens in nfs_decode_entry below, called directly
- * from nfs_readdir for each entry.
+ * 2.2.6.  readlinkres
+ *
+ *	union readlinkres switch (stat status) {
+ *	case NFS_OK:
+ *		path data;
+ *	default:
+ *		void;
+ *	};
  */
-static int
-nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
+static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req,
+				    struct xdr_stream *xdr, void *__unused)
 {
-	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
-	struct kvec *iov = rcvbuf->head;
-	struct page **page;
-	size_t hdrlen;
-	unsigned int pglen, recvd;
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-
-	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
-	if (iov->iov_len < hdrlen) {
-		dprintk("NFS: READDIR reply header overflowed:"
-				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
-		return -errno_NFSERR_IO;
-	} else if (iov->iov_len != hdrlen) {
-		dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
-		xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
-	}
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_path(xdr);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
+}
 
-	pglen = rcvbuf->page_len;
-	recvd = rcvbuf->len - hdrlen;
-	if (pglen > recvd)
-		pglen = recvd;
-	page = rcvbuf->pages;
-	return pglen;
+/*
+ * 2.2.7.  readres
+ *
+ *	union readres switch (stat status) {
+ *	case NFS_OK:
+ *		fattr attributes;
+ *		nfsdata data;
+ *	default:
+ *		void;
+ *	};
+ */
+static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_readres *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_fattr(xdr, result->fattr);
+	if (unlikely(error))
+		goto out;
+	error = decode_nfsdata(xdr, result);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
 }
 
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_writeres *result)
 {
-	dprintk("nfs: %s: prematurely hit end of receive buffer. "
-		"Remaining buffer length is %tu words.\n",
-		func, xdr->end - xdr->p);
+	/* All NFSv2 writes are "file sync" writes */
+	result->verf->committed = NFS_FILE_SYNC;
+	return decode_attrstat(xdr, result->fattr);
 }
 
-__be32 *
-nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
+/**
+ * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in
+ *			the local page cache.
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ *
+ * 2.2.17.  entry
+ *
+ *	struct entry {
+ *		unsigned	fileid;
+ *		filename	name;
+ *		nfscookie	cookie;
+ *		entry		*nextentry;
+ *	};
+ */
+int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+		       int plus)
 {
 	__be32 *p;
+	int error;
+
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
+	if (unlikely(p == NULL))
 		goto out_overflow;
-	if (!ntohl(*p++)) {
+	if (*p++ == xdr_zero) {
 		p = xdr_inline_decode(xdr, 4);
-		if (unlikely(!p))
+		if (unlikely(p == NULL))
 			goto out_overflow;
-		if (!ntohl(*p++))
-			return ERR_PTR(-EAGAIN);
+		if (*p++ == xdr_zero)
+			return -EAGAIN;
 		entry->eof = 1;
-		return ERR_PTR(-EBADCOOKIE);
+		return -EBADCOOKIE;
 	}
 
-	p = xdr_inline_decode(xdr, 8);
-	if (unlikely(!p))
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
 		goto out_overflow;
+	entry->ino = be32_to_cpup(p);
 
-	entry->ino = ntohl(*p++);
-	entry->len = ntohl(*p++);
+	error = decode_filename_inline(xdr, &entry->name, &entry->len);
+	if (unlikely(error))
+		return error;
 
-	p = xdr_inline_decode(xdr, entry->len + 4);
-	if (unlikely(!p))
+	/*
+	 * The type (size and byte order) of nfscookie isn't defined in
+	 * RFC 1094.  This implementation assumes that it's an XDR uint32.
+	 */
+	entry->prev_cookie = entry->cookie;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
 		goto out_overflow;
-	entry->name = (const char *) p;
-	p += XDR_QUADLEN(entry->len);
-	entry->prev_cookie = entry->cookie;
-	entry->cookie = ntohl(*p++);
+	entry->cookie = be32_to_cpup(p);
 
 	entry->d_type = DT_UNKNOWN;
 
-	p = xdr_inline_peek(xdr, 8);
-	if (p != NULL)
-		entry->eof = !p[0] && p[1];
-	else
-		entry->eof = 0;
-
-	return p;
+	return 0;
 
 out_overflow:
 	print_overflow_msg(__func__, xdr);
-	return ERR_PTR(-EAGAIN);
-}
-
-/*
- * NFS XDR decode functions
- */
-/*
- * Decode simple status reply
- */
-static int
-nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy)
-{
-	int status;
-
-	if ((status = ntohl(*p++)) != 0)
-		status = nfs_stat_to_errno(status);
-	return status;
+	return -EAGAIN;
 }
 
 /*
- * Decode attrstat reply
- * GETATTR, SETATTR, WRITE
- */
-static int
-nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
-{
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-	xdr_decode_fattr(p, fattr);
-	return 0;
-}
-
-/*
- * Decode diropres reply
- * LOOKUP, CREATE, MKDIR
+ * 2.2.17.  readdirres
+ *
+ *	union readdirres switch (stat status) {
+ *	case NFS_OK:
+ *		struct {
+ *			entry *entries;
+ *			bool eof;
+ *		} readdirok;
+ *	default:
+ *		void;
+ *	};
+ *
+ * Read the directory contents into the page cache, but don't
+ * touch them.  The actual decoding is done by nfs2_decode_dirent()
+ * during subsequent nfs_readdir() calls.
  */
-static int
-nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
+static int decode_readdirok(struct xdr_stream *xdr)
 {
-	int status;
+	u32 recvd, pglen;
+	size_t hdrlen;
 
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-	p = xdr_decode_fhandle(p, res->fh);
-	xdr_decode_fattr(p, res->fattr);
-	return 0;
+	pglen = xdr->buf->page_len;
+	hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+	recvd = xdr->buf->len - hdrlen;
+	if (unlikely(pglen > recvd))
+		goto out_cheating;
+out:
+	xdr_read_pages(xdr, pglen);
+	return pglen;
+out_cheating:
+	dprintk("NFS: server cheating in readdir result: "
+		"pglen %u > recvd %u\n", pglen, recvd);
+	pglen = recvd;
+	goto out;
 }
 
-/*
- * Encode READLINK args
- */
-static int
-nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
+static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req,
+				   struct xdr_stream *xdr, void *__unused)
 {
-	struct rpc_auth *auth = req->rq_cred->cr_auth;
-	unsigned int replen;
+	enum nfs_stat status;
+	int error;
 
-	p = xdr_encode_fhandle(p, args->fh);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-
-	/* Inline the page array */
-	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readlinkres_sz) << 2;
-	xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen);
-	return 0;
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_readdirok(xdr);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
 }
 
 /*
- * Decode READLINK reply
+ * 2.2.18.  statfsres
+ *
+ *	union statfsres (stat status) {
+ *	case NFS_OK:
+ *		struct {
+ *			unsigned tsize;
+ *			unsigned bsize;
+ *			unsigned blocks;
+ *			unsigned bfree;
+ *			unsigned bavail;
+ *		} info;
+ *	default:
+ *		void;
+ *	};
  */
-static int
-nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
+static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)
 {
-	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
-	struct kvec *iov = rcvbuf->head;
-	size_t hdrlen;
-	u32 len, recvd;
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-	/* Convert length of symlink */
-	len = ntohl(*p++);
-	if (len >= rcvbuf->page_len) {
-		dprintk("nfs: server returned giant symlink!\n");
-		return -ENAMETOOLONG;
-	}
-	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
-	if (iov->iov_len < hdrlen) {
-		dprintk("NFS: READLINK reply header overflowed:"
-				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
-		return -errno_NFSERR_IO;
-	} else if (iov->iov_len != hdrlen) {
-		dprintk("NFS: READLINK header is short. iovec will be shifted.\n");
-		xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
-	}
-	recvd = req->rq_rcv_buf.len - hdrlen;
-	if (recvd < len) {
-		dprintk("NFS: server cheating in readlink reply: "
-				"count %u > recvd %u\n", len, recvd);
-		return -EIO;
-	}
+	__be32 *p;
 
-	xdr_terminate_string(rcvbuf, len);
+	p = xdr_inline_decode(xdr, NFS_info_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	result->tsize = be32_to_cpup(p++);
+	result->bsize = be32_to_cpup(p++);
+	result->blocks = be32_to_cpup(p++);
+	result->bfree = be32_to_cpup(p++);
+	result->bavail = be32_to_cpup(p);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
-/*
- * Decode WRITE reply
- */
-static int
-nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
+static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  struct nfs2_fsstat *result)
 {
-	res->verf->committed = NFS_FILE_SYNC;
-	return nfs_xdr_attrstat(req, p, res->fattr);
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		goto out;
+	if (status != NFS_OK)
+		goto out_default;
+	error = decode_info(xdr, result);
+out:
+	return error;
+out_default:
+	return nfs_stat_to_errno(status);
 }
 
-/*
- * Decode STATFS reply
- */
-static int
-nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res)
-{
-	int status;
-
-	if ((status = ntohl(*p++)))
-		return nfs_stat_to_errno(status);
-
-	res->tsize = ntohl(*p++);
-	res->bsize = ntohl(*p++);
-	res->blocks = ntohl(*p++);
-	res->bfree = ntohl(*p++);
-	res->bavail = ntohl(*p++);
-	return 0;
-}
 
 /*
  * We need to translate between nfs status return values and
  * the local errno values which may not be the same.
  */
-static struct {
+static const struct {
 	int stat;
 	int errno;
 } nfs_errtbl[] = {
@@ -678,28 +1102,30 @@ static struct {
678 { -1, -EIO } 1102 { -1, -EIO }
679}; 1103};
680 1104
681/* 1105/**
682 * Convert an NFS error code to a local one. 1106 * nfs_stat_to_errno - convert an NFS status code to a local errno
683 * This one is used jointly by NFSv2 and NFSv3. 1107 * @status: NFS status code to convert
1108 *
1109 * Returns a local errno value, or -EIO if the NFS status code is
1110 * not recognized. This function is used jointly by NFSv2 and NFSv3.
684 */ 1111 */
685int 1112int nfs_stat_to_errno(enum nfs_stat status)
686nfs_stat_to_errno(int stat)
687{ 1113{
688 int i; 1114 int i;
689 1115
690 for (i = 0; nfs_errtbl[i].stat != -1; i++) { 1116 for (i = 0; nfs_errtbl[i].stat != -1; i++) {
691 if (nfs_errtbl[i].stat == stat) 1117 if (nfs_errtbl[i].stat == (int)status)
692 return nfs_errtbl[i].errno; 1118 return nfs_errtbl[i].errno;
693 } 1119 }
694 dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat); 1120 dprintk("NFS: Unrecognized nfs status value: %u\n", status);
695 return nfs_errtbl[i].errno; 1121 return nfs_errtbl[i].errno;
696} 1122}
697 1123
698#define PROC(proc, argtype, restype, timer) \ 1124#define PROC(proc, argtype, restype, timer) \
699[NFSPROC_##proc] = { \ 1125[NFSPROC_##proc] = { \
700 .p_proc = NFSPROC_##proc, \ 1126 .p_proc = NFSPROC_##proc, \
701 .p_encode = (kxdrproc_t) nfs_xdr_##argtype, \ 1127 .p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype, \
702 .p_decode = (kxdrproc_t) nfs_xdr_##restype, \ 1128 .p_decode = (kxdrdproc_t)nfs2_xdr_dec_##restype, \
703 .p_arglen = NFS_##argtype##_sz, \ 1129 .p_arglen = NFS_##argtype##_sz, \
704 .p_replen = NFS_##restype##_sz, \ 1130 .p_replen = NFS_##restype##_sz, \
705 .p_timer = timer, \ 1131 .p_timer = timer, \
@@ -707,21 +1133,21 @@ nfs_stat_to_errno(int stat)
 	.p_name     =  #proc,						\
 	}
 struct rpc_procinfo	nfs_procedures[] = {
 	PROC(GETATTR,	fhandle,	attrstat,	1),
 	PROC(SETATTR,	sattrargs,	attrstat,	0),
 	PROC(LOOKUP,	diropargs,	diropres,	2),
 	PROC(READLINK,	readlinkargs,	readlinkres,	3),
 	PROC(READ,	readargs,	readres,	3),
 	PROC(WRITE,	writeargs,	writeres,	4),
 	PROC(CREATE,	createargs,	diropres,	0),
 	PROC(REMOVE,	removeargs,	stat,		0),
 	PROC(RENAME,	renameargs,	stat,		0),
 	PROC(LINK,	linkargs,	stat,		0),
 	PROC(SYMLINK,	symlinkargs,	stat,		0),
 	PROC(MKDIR,	createargs,	diropres,	0),
 	PROC(RMDIR,	diropargs,	stat,		0),
 	PROC(READDIR,	readdirargs,	readdirres,	3),
 	PROC(STATFS,	fhandle,	statfsres,	0),
 };
 
 struct rpc_version	nfs_version2 = {
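Each PROC() entry above binds one encoder/decoder pair plus fixed argument and reply sizes to an NFSv2 procedure number. For orientation, the fragment below shows roughly how a caller drives such an entry through the SUNRPC layer; it is a sketch following the usual rpc_call_sync() pattern from fs/nfs/proc.c, with error handling and surrounding context trimmed, and the local variable names are illustrative:

	struct rpc_message msg = {
		.rpc_proc	= &nfs_procedures[NFSPROC_GETATTR],
		.rpc_argp	= fhandle,	/* handed to nfs2_xdr_enc_fhandle()   */
		.rpc_resp	= fattr,	/* filled in by nfs2_xdr_dec_attrstat() */
	};
	int status = rpc_call_sync(server->client, &msg, 0);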
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index f6cc60f06dac..01c5e8b1941d 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -37,18 +37,16 @@
 #define NFS3_filename_sz	(1+(NFS3_MAXNAMLEN>>2))
 #define NFS3_path_sz		(1+(NFS3_MAXPATHLEN>>2))
 #define NFS3_fattr_sz		(21)
+#define NFS3_cookieverf_sz	(NFS3_COOKIEVERFSIZE>>2)
 #define NFS3_wcc_attr_sz	(6)
 #define NFS3_pre_op_attr_sz	(1+NFS3_wcc_attr_sz)
 #define NFS3_post_op_attr_sz	(1+NFS3_fattr_sz)
 #define NFS3_wcc_data_sz	(NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz)
-#define NFS3_fsstat_sz
-#define NFS3_fsinfo_sz
-#define NFS3_pathconf_sz
-#define NFS3_entry_sz		(NFS3_filename_sz+3)
-
-#define NFS3_sattrargs_sz	(NFS3_fh_sz+NFS3_sattr_sz+3)
 #define NFS3_diropargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
-#define NFS3_removeargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
+
+#define NFS3_getattrargs_sz	(NFS3_fh_sz)
+#define NFS3_setattrargs_sz	(NFS3_fh_sz+NFS3_sattr_sz+3)
+#define NFS3_lookupargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
 #define NFS3_accessargs_sz	(NFS3_fh_sz+1)
 #define NFS3_readlinkargs_sz	(NFS3_fh_sz)
 #define NFS3_readargs_sz	(NFS3_fh_sz+3)
@@ -57,14 +55,16 @@
 #define NFS3_mkdirargs_sz	(NFS3_diropargs_sz+NFS3_sattr_sz)
 #define NFS3_symlinkargs_sz	(NFS3_diropargs_sz+1+NFS3_sattr_sz)
 #define NFS3_mknodargs_sz	(NFS3_diropargs_sz+2+NFS3_sattr_sz)
+#define NFS3_removeargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
 #define NFS3_renameargs_sz	(NFS3_diropargs_sz+NFS3_diropargs_sz)
 #define NFS3_linkargs_sz	(NFS3_fh_sz+NFS3_diropargs_sz)
-#define NFS3_readdirargs_sz	(NFS3_fh_sz+2)
+#define NFS3_readdirargs_sz	(NFS3_fh_sz+NFS3_cookieverf_sz+3)
+#define NFS3_readdirplusargs_sz	(NFS3_fh_sz+NFS3_cookieverf_sz+4)
 #define NFS3_commitargs_sz	(NFS3_fh_sz+3)
 
-#define NFS3_attrstat_sz	(1+NFS3_fattr_sz)
-#define NFS3_wccstat_sz		(1+NFS3_wcc_data_sz)
-#define NFS3_removeres_sz	(NFS3_wccstat_sz)
+#define NFS3_getattrres_sz	(1+NFS3_fattr_sz)
+#define NFS3_setattrres_sz	(1+NFS3_wcc_data_sz)
+#define NFS3_removeres_sz	(NFS3_setattrres_sz)
 #define NFS3_lookupres_sz	(1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
 #define NFS3_accessres_sz	(1+NFS3_post_op_attr_sz+1)
 #define NFS3_readlinkres_sz	(1+NFS3_post_op_attr_sz+1)
@@ -100,1079 +100,2362 @@ static const umode_t nfs_type2fmt[] = {
 	[NF3FIFO] = S_IFIFO,
 };
 
+/*
+ * While encoding arguments, set up the reply buffer in advance to
+ * receive reply data directly into the page cache.
+ */
+static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
+				 unsigned int base, unsigned int len,
+				 unsigned int bufsize)
+{
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
+	unsigned int replen;
+
+	replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
+	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
+}
+
+/*
+ * Handle decode buffer overflows out-of-line.
+ */
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
 {
-	dprintk("nfs: %s: prematurely hit end of receive buffer. "
+	dprintk("NFS: %s prematurely hit the end of our receive buffer. "
 		"Remaining buffer length is %tu words.\n",
 		func, xdr->end - xdr->p);
 }
 
+
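prepare_reply_buffer() counts in 32-bit XDR words — the RPC reply header, the per-flavor verifier slack, and the fixed part of the reply — then converts to a byte offset (replen << 2) at which xdr_inline_pages() splices the page array into the receive buffer. A standalone sketch of that arithmetic; the header and slack word counts below are assumptions for illustration, and only the NFS3_* macros mirror the definitions shown earlier:

	#include <stdio.h>

	#define NFS3_fattr_sz		(21)
	#define NFS3_post_op_attr_sz	(1 + NFS3_fattr_sz)
	#define NFS3_readlinkres_sz	(1 + NFS3_post_op_attr_sz + 1)	/* 24 words */

	int main(void)
	{
		unsigned int rephdr = 3;	/* assumed RPC reply header words  */
		unsigned int rslack = 2;	/* assumed per-flavor verifier slack */
		unsigned int replen = rephdr + rslack + NFS3_readlinkres_sz;

		/* XDR units are 32-bit words, so bytes = words << 2 */
		printf("replen = %u words = %u bytes\n", replen, replen << 2);
		return 0;
	}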
 /*
- * Common NFS XDR functions as inlines
+ * Encode/decode NFSv3 basic data types
+ *
+ * Basic NFSv3 data types are defined in section 2.5 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions.  For run-time efficiency, some data types are encoded
+ * or decoded inline.
  */
-static inline __be32 *
-xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fh)
+
+static void encode_uint32(struct xdr_stream *xdr, u32 value)
 {
-	return xdr_encode_array(p, fh->data, fh->size);
+	__be32 *p = xdr_reserve_space(xdr, 4);
+	*p = cpu_to_be32(value);
 }
 
-static inline __be32 *
-xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
+static int decode_uint32(struct xdr_stream *xdr, u32 *value)
 {
-	if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) {
-		memcpy(fh->data, p, fh->size);
-		return p + XDR_QUADLEN(fh->size);
-	}
-	return NULL;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*value = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_uint64(struct xdr_stream *xdr, u64 *value)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	xdr_decode_hyper(p, value);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * fileid3
+ *
+ * typedef uint64 fileid3;
+ */
+static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid)
+{
+	return xdr_decode_hyper(p, fileid);
+}
+
+static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid)
+{
+	return decode_uint64(xdr, fileid);
+}
+
+/*
+ * filename3
+ *
+ * typedef string filename3<>;
+ */
+static void encode_filename3(struct xdr_stream *xdr,
+			     const char *name, u32 length)
+{
+	__be32 *p;
+
+	BUG_ON(length > NFS3_MAXNAMLEN);
+	p = xdr_reserve_space(xdr, 4 + length);
+	xdr_encode_opaque(p, name, length);
 }
 
-static inline __be32 *
-xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh)
+static int decode_inline_filename3(struct xdr_stream *xdr,
+				   const char **name, u32 *length)
 {
 	__be32 *p;
+	u32 count;
+
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p);
+	if (count > NFS3_MAXNAMLEN)
+		goto out_nametoolong;
+	p = xdr_inline_decode(xdr, count);
+	if (unlikely(p == NULL))
 		goto out_overflow;
-	fh->size = ntohl(*p++);
+	*name = (const char *)p;
+	*length = count;
+	return 0;
 
-	if (fh->size <= NFS3_FHSIZE) {
-		p = xdr_inline_decode(xdr, fh->size);
-		if (unlikely(!p))
-			goto out_overflow;
-		memcpy(fh->data, p, fh->size);
-		return p + XDR_QUADLEN(fh->size);
-	}
-	return NULL;
+out_nametoolong:
+	dprintk("NFS: returned filename too long: %u\n", count);
+	return -ENAMETOOLONG;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * nfspath3
+ *
+ * typedef string nfspath3<>;
+ */
+static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages,
+			    const u32 length)
+{
+	BUG_ON(length > NFS3_MAXPATHLEN);
+	encode_uint32(xdr, length);
+	xdr_write_pages(xdr, pages, 0, length);
+}
 
+static int decode_nfspath3(struct xdr_stream *xdr)
+{
+	u32 recvd, count;
+	size_t hdrlen;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p);
+	if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN))
+		goto out_nametoolong;
+	hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+	recvd = xdr->buf->len - hdrlen;
+	if (unlikely(count > recvd))
+		goto out_cheating;
+
+	xdr_read_pages(xdr, count);
+	xdr_terminate_string(xdr->buf, count);
+	return 0;
+
+out_nametoolong:
+	dprintk("NFS: returned pathname too long: %u\n", count);
+	return -ENAMETOOLONG;
+out_cheating:
+	dprintk("NFS: server cheating in pathname result: "
+		"count %u > recvd %u\n", count, recvd);
+	return -EIO;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
-	return ERR_PTR(-EIO);
+	return -EIO;
 }
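filename3 and nfspath3 are XDR variable-length opaques: a 4-byte big-endian length word followed by the bytes, padded out to a 4-byte boundary on the wire. A self-contained userspace sketch of that layout (not kernel code):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <arpa/inet.h>

	/* Encode a name as XDR represents a variable-length string:
	 * 4-byte big-endian length, the bytes, then zero padding up to
	 * the next 4-byte boundary. Returns the number of bytes used.
	 */
	static size_t xdr_encode_string(uint8_t *buf, const char *name)
	{
		uint32_t len = strlen(name);
		uint32_t be = htonl(len);
		size_t padded = (len + 3) & ~3u;	/* round up to multiple of 4 */

		memcpy(buf, &be, 4);
		memcpy(buf + 4, name, len);
		memset(buf + 4 + len, 0, padded - len);
		return 4 + padded;
	}

	int main(void)
	{
		uint8_t buf[64];
		size_t n = xdr_encode_string(buf, "hello");	/* 4 + 8 = 12 bytes */
		printf("%zu\n", n);
		return 0;
	}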
 
 /*
- * Encode/decode time.
+ * cookie3
+ *
+ * typedef uint64 cookie3
  */
-static inline __be32 *
-xdr_encode_time3(__be32 *p, struct timespec *timep)
+static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie)
 {
-	*p++ = htonl(timep->tv_sec);
-	*p++ = htonl(timep->tv_nsec);
-	return p;
+	return xdr_encode_hyper(p, cookie);
 }
 
-static inline __be32 *
-xdr_decode_time3(__be32 *p, struct timespec *timep)
+static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie)
 {
-	timep->tv_sec = ntohl(*p++);
-	timep->tv_nsec = ntohl(*p++);
-	return p;
+	return decode_uint64(xdr, cookie);
+}
+
+/*
+ * cookieverf3
+ *
+ * typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE];
+ */
+static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier)
+{
+	memcpy(p, verifier, NFS3_COOKIEVERFSIZE);
+	return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE);
+}
+
+static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	memcpy(verifier, p, NFS3_COOKIEVERFSIZE);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * createverf3
+ *
+ * typedef opaque createverf3[NFS3_CREATEVERFSIZE];
+ */
+static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE);
+	memcpy(p, verifier, NFS3_CREATEVERFSIZE);
+}
+
+static int decode_writeverf3(struct xdr_stream *xdr, __be32 *verifier)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	memcpy(verifier, p, NFS3_WRITEVERFSIZE);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * size3
+ *
+ * typedef uint64 size3;
+ */
+static __be32 *xdr_decode_size3(__be32 *p, u64 *size)
+{
+	return xdr_decode_hyper(p, size);
+}
+
+/*
+ * nfsstat3
+ *
+ * enum nfsstat3 {
+ *	NFS3_OK = 0,
+ *	...
+ * }
+ */
+#define NFS3_OK		NFS_OK
+
+static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*status = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * ftype3
+ *
+ * enum ftype3 {
+ *	NF3REG	= 1,
+ *	NF3DIR	= 2,
+ *	NF3BLK	= 3,
+ *	NF3CHR	= 4,
+ *	NF3LNK	= 5,
+ *	NF3SOCK	= 6,
+ *	NF3FIFO	= 7
+ * };
+ */
+static void encode_ftype3(struct xdr_stream *xdr, const u32 type)
+{
+	BUG_ON(type > NF3FIFO);
+	encode_uint32(xdr, type);
 }
 
-static __be32 *
-xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
+static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode)
 {
-	unsigned int type, major, minor;
-	umode_t fmode;
+	u32 type;
 
-	type = ntohl(*p++);
+	type = be32_to_cpup(p++);
 	if (type > NF3FIFO)
 		type = NF3NON;
-	fmode = nfs_type2fmt[type];
-	fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode;
-	fattr->nlink = ntohl(*p++);
-	fattr->uid = ntohl(*p++);
-	fattr->gid = ntohl(*p++);
-	p = xdr_decode_hyper(p, &fattr->size);
-	p = xdr_decode_hyper(p, &fattr->du.nfs3.used);
-
-	/* Turn remote device info into Linux-specific dev_t */
-	major = ntohl(*p++);
-	minor = ntohl(*p++);
-	fattr->rdev = MKDEV(major, minor);
-	if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor)
-		fattr->rdev = 0;
+	*mode = nfs_type2fmt[type];
+	return p;
+}
 
-	p = xdr_decode_hyper(p, &fattr->fsid.major);
-	fattr->fsid.minor = 0;
-	p = xdr_decode_hyper(p, &fattr->fileid);
-	p = xdr_decode_time3(p, &fattr->atime);
-	p = xdr_decode_time3(p, &fattr->mtime);
-	p = xdr_decode_time3(p, &fattr->ctime);
+/*
+ * specdata3
+ *
+ * struct specdata3 {
+ *	uint32	specdata1;
+ *	uint32	specdata2;
+ * };
+ */
+static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev)
+{
+	__be32 *p;
 
-	/* Update the mode bits */
-	fattr->valid |= NFS_ATTR_FATTR_V3;
+	p = xdr_reserve_space(xdr, 8);
+	*p++ = cpu_to_be32(MAJOR(rdev));
+	*p = cpu_to_be32(MINOR(rdev));
+}
+
+static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev)
+{
+	unsigned int major, minor;
+
+	major = be32_to_cpup(p++);
+	minor = be32_to_cpup(p++);
+	*rdev = MKDEV(major, minor);
+	if (MAJOR(*rdev) != major || MINOR(*rdev) != minor)
+		*rdev = 0;
+	return p;
+}
+
+/*
+ * nfs_fh3
+ *
+ * struct nfs_fh3 {
+ *	opaque	data<NFS3_FHSIZE>;
+ * };
+ */
+static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+	__be32 *p;
+
+	BUG_ON(fh->size > NFS3_FHSIZE);
+	p = xdr_reserve_space(xdr, 4 + fh->size);
+	xdr_encode_opaque(p, fh->data, fh->size);
+}
+
+static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	u32 length;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	length = be32_to_cpup(p++);
+	if (unlikely(length > NFS3_FHSIZE))
+		goto out_toobig;
+	p = xdr_inline_decode(xdr, length);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	fh->size = length;
+	memcpy(fh->data, p, length);
+	return 0;
+out_toobig:
+	dprintk("NFS: file handle size (%u) too big\n", length);
+	return -E2BIG;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static void zero_nfs_fh3(struct nfs_fh *fh)
+{
+	memset(fh, 0, sizeof(*fh));
+}
+
+/*
+ * nfstime3
+ *
+ * struct nfstime3 {
+ *	uint32	seconds;
+ *	uint32	nseconds;
+ * };
+ */
+static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep)
+{
+	*p++ = cpu_to_be32(timep->tv_sec);
+	*p++ = cpu_to_be32(timep->tv_nsec);
 	return p;
 }
 
-static inline __be32 *
-xdr_encode_sattr(__be32 *p, struct iattr *attr)
+static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep)
 {
+	timep->tv_sec = be32_to_cpup(p++);
+	timep->tv_nsec = be32_to_cpup(p++);
+	return p;
+}
+
+/*
+ * sattr3
+ *
+ * enum time_how {
+ *	DONT_CHANGE		= 0,
+ *	SET_TO_SERVER_TIME	= 1,
+ *	SET_TO_CLIENT_TIME	= 2
+ * };
+ *
+ * union set_mode3 switch (bool set_it) {
+ * case TRUE:
+ *	mode3	mode;
+ * default:
+ *	void;
+ * };
+ *
+ * union set_uid3 switch (bool set_it) {
+ * case TRUE:
+ *	uid3	uid;
+ * default:
+ *	void;
+ * };
+ *
+ * union set_gid3 switch (bool set_it) {
+ * case TRUE:
+ *	gid3	gid;
+ * default:
+ *	void;
+ * };
+ *
+ * union set_size3 switch (bool set_it) {
+ * case TRUE:
+ *	size3	size;
+ * default:
+ *	void;
+ * };
+ *
+ * union set_atime switch (time_how set_it) {
+ * case SET_TO_CLIENT_TIME:
+ *	nfstime3	atime;
+ * default:
+ *	void;
+ * };
+ *
+ * union set_mtime switch (time_how set_it) {
+ * case SET_TO_CLIENT_TIME:
+ *	nfstime3	mtime;
+ * default:
+ *	void;
+ * };
+ *
+ * struct sattr3 {
+ *	set_mode3	mode;
+ *	set_uid3	uid;
+ *	set_gid3	gid;
+ *	set_size3	size;
+ *	set_atime	atime;
+ *	set_mtime	mtime;
+ * };
+ */
+static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr)
+{
+	u32 nbytes;
+	__be32 *p;
+
+	/*
+	 * In order to make only a single xdr_reserve_space() call,
+	 * pre-compute the total number of bytes to be reserved.
+	 * Six boolean values, one for each set_foo field, are always
+	 * present in the encoded result, so start there.
+	 */
+	nbytes = 6 * 4;
+	if (attr->ia_valid & ATTR_MODE)
+		nbytes += 4;
+	if (attr->ia_valid & ATTR_UID)
+		nbytes += 4;
+	if (attr->ia_valid & ATTR_GID)
+		nbytes += 4;
+	if (attr->ia_valid & ATTR_SIZE)
+		nbytes += 8;
+	if (attr->ia_valid & ATTR_ATIME_SET)
+		nbytes += 8;
+	if (attr->ia_valid & ATTR_MTIME_SET)
+		nbytes += 8;
+	p = xdr_reserve_space(xdr, nbytes);
+
 	if (attr->ia_valid & ATTR_MODE) {
 		*p++ = xdr_one;
-		*p++ = htonl(attr->ia_mode & S_IALLUGO);
-	} else {
+		*p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO);
+	} else
 		*p++ = xdr_zero;
-	}
+
 	if (attr->ia_valid & ATTR_UID) {
 		*p++ = xdr_one;
-		*p++ = htonl(attr->ia_uid);
-	} else {
+		*p++ = cpu_to_be32(attr->ia_uid);
+	} else
 		*p++ = xdr_zero;
-	}
+
 	if (attr->ia_valid & ATTR_GID) {
 		*p++ = xdr_one;
-		*p++ = htonl(attr->ia_gid);
-	} else {
+		*p++ = cpu_to_be32(attr->ia_gid);
+	} else
 		*p++ = xdr_zero;
-	}
+
 	if (attr->ia_valid & ATTR_SIZE) {
 		*p++ = xdr_one;
-		p = xdr_encode_hyper(p, (__u64) attr->ia_size);
-	} else {
+		p = xdr_encode_hyper(p, (u64)attr->ia_size);
+	} else
 		*p++ = xdr_zero;
-	}
+
 	if (attr->ia_valid & ATTR_ATIME_SET) {
 		*p++ = xdr_two;
-		p = xdr_encode_time3(p, &attr->ia_atime);
+		p = xdr_encode_nfstime3(p, &attr->ia_atime);
 	} else if (attr->ia_valid & ATTR_ATIME) {
 		*p++ = xdr_one;
-	} else {
+	} else
 		*p++ = xdr_zero;
-	}
+
 	if (attr->ia_valid & ATTR_MTIME_SET) {
 		*p++ = xdr_two;
-		p = xdr_encode_time3(p, &attr->ia_mtime);
+		xdr_encode_nfstime3(p, &attr->ia_mtime);
 	} else if (attr->ia_valid & ATTR_MTIME) {
-		*p++ = xdr_one;
-	} else {
-		*p++ = xdr_zero;
-	}
-	return p;
+		*p = xdr_one;
+	} else
+		*p = xdr_zero;
+}
+
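The size pre-computation behind that single xdr_reserve_space() call can be checked in isolation. A hedged userspace sketch; the ATTR_* values below are stand-ins for illustration, not the kernel's flag bits:

	#include <stdio.h>

	/* Stand-in attribute flags, illustration only. */
	#define ATTR_MODE	0x1
	#define ATTR_SIZE	0x2

	static unsigned int sattr3_encoded_size(unsigned int ia_valid)
	{
		unsigned int nbytes = 6 * 4;	/* six 4-byte discriminants, always present */

		if (ia_valid & ATTR_MODE)
			nbytes += 4;		/* mode3 */
		if (ia_valid & ATTR_SIZE)
			nbytes += 8;		/* size3 is a 64-bit hyper */
		/* uid/gid add 4 bytes each, client-supplied times 8 each (elided) */
		return nbytes;
	}

	int main(void)
	{
		/* e.g. chmod + truncate: 24 + 4 + 8 = 36 bytes on the wire */
		printf("%u\n", sattr3_encoded_size(ATTR_MODE | ATTR_SIZE));
		return 0;
	}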
+/*
+ * fattr3
+ *
+ * struct fattr3 {
+ *	ftype3		type;
+ *	mode3		mode;
+ *	uint32		nlink;
+ *	uid3		uid;
+ *	gid3		gid;
+ *	size3		size;
+ *	size3		used;
+ *	specdata3	rdev;
+ *	uint64		fsid;
+ *	fileid3		fileid;
+ *	nfstime3	atime;
+ *	nfstime3	mtime;
+ *	nfstime3	ctime;
+ * };
+ */
+static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	umode_t fmode;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+
+	p = xdr_decode_ftype3(p, &fmode);
+
+	fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode;
+	fattr->nlink = be32_to_cpup(p++);
+	fattr->uid = be32_to_cpup(p++);
+	fattr->gid = be32_to_cpup(p++);
+
+	p = xdr_decode_size3(p, &fattr->size);
+	p = xdr_decode_size3(p, &fattr->du.nfs3.used);
+	p = xdr_decode_specdata3(p, &fattr->rdev);
+
+	p = xdr_decode_hyper(p, &fattr->fsid.major);
+	fattr->fsid.minor = 0;
+
+	p = xdr_decode_fileid3(p, &fattr->fileid);
+	p = xdr_decode_nfstime3(p, &fattr->atime);
+	p = xdr_decode_nfstime3(p, &fattr->mtime);
+	xdr_decode_nfstime3(p, &fattr->ctime);
+
+	fattr->valid |= NFS_ATTR_FATTR_V3;
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
-static inline __be32 *
-xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
+/*
+ * post_op_attr
+ *
+ * union post_op_attr switch (bool attributes_follow) {
+ * case TRUE:
+ *	fattr3	attributes;
+ * case FALSE:
+ *	void;
+ * };
+ */
+static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
-	p = xdr_decode_hyper(p, &fattr->pre_size);
-	p = xdr_decode_time3(p, &fattr->pre_mtime);
-	p = xdr_decode_time3(p, &fattr->pre_ctime);
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	if (*p != xdr_zero)
+		return decode_fattr3(xdr, fattr);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * wcc_attr
+ * struct wcc_attr {
+ *	size3		size;
+ *	nfstime3	mtime;
+ *	nfstime3	ctime;
+ * };
+ */
+static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+
 	fattr->valid |= NFS_ATTR_FATTR_PRESIZE
 		| NFS_ATTR_FATTR_PREMTIME
 		| NFS_ATTR_FATTR_PRECTIME;
-	return p;
-}
 
-static inline __be32 *
-xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
-{
-	if (*p++)
-		p = xdr_decode_fattr(p, fattr);
-	return p;
+	p = xdr_decode_size3(p, &fattr->pre_size);
+	p = xdr_decode_nfstime3(p, &fattr->pre_mtime);
+	xdr_decode_nfstime3(p, &fattr->pre_ctime);
+
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
-static inline __be32 *
-xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+/*
+ * pre_op_attr
+ * union pre_op_attr switch (bool attributes_follow) {
+ * case TRUE:
+ *	wcc_attr	attributes;
+ * case FALSE:
+ *	void;
+ * };
+ *
+ * wcc_data
+ *
+ * struct wcc_data {
+ *	pre_op_attr	before;
+ *	post_op_attr	after;
+ * };
+ */
+static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
+	if (unlikely(p == NULL))
 		goto out_overflow;
-	if (ntohl(*p++)) {
-		p = xdr_inline_decode(xdr, 84);
-		if (unlikely(!p))
-			goto out_overflow;
-		p = xdr_decode_fattr(p, fattr);
-	}
-	return p;
+	if (*p != xdr_zero)
+		return decode_wcc_attr(xdr, fattr);
+	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
-	return ERR_PTR(-EIO);
+	return -EIO;
 }
 
-static inline __be32 *
-xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
+static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 {
-	if (*p++)
-		return xdr_decode_wcc_attr(p, fattr);
-	return p;
+	int error;
+
+	error = decode_pre_op_attr(xdr, fattr);
+	if (unlikely(error))
+		goto out;
+	error = decode_post_op_attr(xdr, fattr);
+out:
+	return error;
 }
 
+/*
+ * post_op_fh3
+ *
+ * union post_op_fh3 switch (bool handle_follows) {
+ * case TRUE:
+ *	nfs_fh3  handle;
+ * case FALSE:
+ *	void;
+ * };
+ */
+static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	if (*p != xdr_zero)
+		return decode_nfs_fh3(xdr, fh);
+	zero_nfs_fh3(fh);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
 
-static inline __be32 *
-xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr)
+/*
+ * diropargs3
+ *
+ * struct diropargs3 {
+ *	nfs_fh3		dir;
+ *	filename3	name;
+ * };
+ */
+static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh,
+			      const char *name, u32 length)
 {
-	p = xdr_decode_pre_op_attr(p, fattr);
-	return xdr_decode_post_op_attr(p, fattr);
+	encode_nfs_fh3(xdr, fh);
+	encode_filename3(xdr, name, length);
 }
 
+
 /*
- * NFS encode functions
+ * NFSv3 XDR encode functions
+ *
+ * NFSv3 argument types are defined in section 3.3 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
  */
 
 /*
- * Encode file handle argument
+ * 3.3.1  GETATTR3args
+ *
+ *	struct GETATTR3args {
+ *		nfs_fh3  object;
+ *	};
  */
-static int
-nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
+static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs_fh *fh)
 {
-	p = xdr_encode_fhandle(p, fh);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_nfs_fh3(xdr, fh);
 }
 
 /*
- * Encode SETATTR arguments
+ * 3.3.2  SETATTR3args
+ *
+ *	union sattrguard3 switch (bool check) {
+ *	case TRUE:
+ *		nfstime3  obj_ctime;
+ *	case FALSE:
+ *		void;
+ *	};
+ *
+ *	struct SETATTR3args {
+ *		nfs_fh3		object;
+ *		sattr3		new_attributes;
+ *		sattrguard3	guard;
+ *	};
  */
-static int
-nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args)
+static void encode_sattrguard3(struct xdr_stream *xdr,
+			       const struct nfs3_sattrargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_sattr(p, args->sattr);
-	*p++ = htonl(args->guard);
-	if (args->guard)
-		p = xdr_encode_time3(p, &args->guardtime);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	__be32 *p;
+
+	if (args->guard) {
+		p = xdr_reserve_space(xdr, 4 + 8);
+		*p++ = xdr_one;
+		xdr_encode_nfstime3(p, &args->guardtime);
+	} else {
+		p = xdr_reserve_space(xdr, 4);
+		*p = xdr_zero;
+	}
+}
+
+static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs3_sattrargs *args)
+{
+	encode_nfs_fh3(xdr, args->fh);
+	encode_sattr3(xdr, args->sattr);
+	encode_sattrguard3(xdr, args);
 }
 
 /*
- * Encode directory ops argument
+ * 3.3.3  LOOKUP3args
+ *
+ *	struct LOOKUP3args {
+ *		diropargs3  what;
+ *	};
  */
-static int
-nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args)
+static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_diropargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
 }
 
 /*
- * Encode REMOVE argument
+ * 3.3.4  ACCESS3args
+ *
+ *	struct ACCESS3args {
+ *		nfs_fh3		object;
+ *		uint32		access;
+ *	};
  */
-static int
-nfs3_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
+static void encode_access3args(struct xdr_stream *xdr,
+			       const struct nfs3_accessargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name.name, args->name.len);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_nfs_fh3(xdr, args->fh);
+	encode_uint32(xdr, args->access);
+}
+
+static void nfs3_xdr_enc_access3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_accessargs *args)
+{
+	encode_access3args(xdr, args);
 }
 
 /*
- * Encode access() argument
+ * 3.3.5  READLINK3args
+ *
+ *	struct READLINK3args {
+ *		nfs_fh3	symlink;
+ *	};
  */
-static int
-nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args)
+static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
+				       struct xdr_stream *xdr,
+				       const struct nfs3_readlinkargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	*p++ = htonl(args->access);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_nfs_fh3(xdr, args->fh);
+	prepare_reply_buffer(req, args->pages, args->pgbase,
+					args->pglen, NFS3_readlinkres_sz);
 }
 
 /*
- * Arguments to a READ call. Since we read data directly into the page
- * cache, we also set up the reply iovec here so that iov[1] points
- * exactly to the page we want to fetch.
+ * 3.3.6  READ3args
+ *
+ *	struct READ3args {
+ *		nfs_fh3		file;
+ *		offset3		offset;
+ *		count3		count;
+ *	};
  */
-static int
-nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
+static void encode_read3args(struct xdr_stream *xdr,
+			     const struct nfs_readargs *args)
 {
-	struct rpc_auth	*auth = req->rq_cred->cr_auth;
-	unsigned int replen;
-	u32 count = args->count;
+	__be32 *p;
+
+	encode_nfs_fh3(xdr, args->fh);
 
-	p = xdr_encode_fhandle(p, args->fh);
+	p = xdr_reserve_space(xdr, 8 + 4);
 	p = xdr_encode_hyper(p, args->offset);
-	*p++ = htonl(count);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+	*p = cpu_to_be32(args->count);
+}
 
-	/* Inline the page array */
-	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2;
-	xdr_inline_pages(&req->rq_rcv_buf, replen,
-			 args->pages, args->pgbase, count);
+static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfs_readargs *args)
+{
+	encode_read3args(xdr, args);
+	prepare_reply_buffer(req, args->pages, args->pgbase,
+					args->count, NFS3_readres_sz);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
-	return 0;
 }
 
 /*
- * Write arguments. Splice the buffer to be written into the iovec.
+ * 3.3.7  WRITE3args
+ *
+ *	enum stable_how {
+ *		UNSTABLE  = 0,
+ *		DATA_SYNC = 1,
+ *		FILE_SYNC = 2
+ *	};
+ *
+ *	struct WRITE3args {
+ *		nfs_fh3		file;
+ *		offset3		offset;
+ *		count3		count;
+ *		stable_how	stable;
+ *		opaque		data<>;
+ *	};
  */
-static int
-nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void encode_write3args(struct xdr_stream *xdr,
+			      const struct nfs_writeargs *args)
 {
-	struct xdr_buf *sndbuf = &req->rq_snd_buf;
-	u32 count = args->count;
+	__be32 *p;
+
+	encode_nfs_fh3(xdr, args->fh);
 
-	p = xdr_encode_fhandle(p, args->fh);
+	p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4);
 	p = xdr_encode_hyper(p, args->offset);
-	*p++ = htonl(count);
-	*p++ = htonl(args->stable);
-	*p++ = htonl(count);
-	sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
-
-	/* Copy the page array */
-	xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
-	sndbuf->flags |= XDRBUF_WRITE;
-	return 0;
+	*p++ = cpu_to_be32(args->count);
+	*p++ = cpu_to_be32(args->stable);
+	*p = cpu_to_be32(args->count);
+	xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
+}
+
+static void nfs3_xdr_enc_write3args(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_writeargs *args)
+{
+	encode_write3args(xdr, args);
+	xdr->buf->flags |= XDRBUF_WRITE;
 }
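The reserve size in encode_write3args() is the fixed tail of WRITE3args after the file handle: an 8-byte offset3 plus three 4-byte words (count3, stable_how, and the opaque byte count, which repeats count); the page data itself is spliced in by xdr_write_pages() rather than copied. A trivial standalone check of the arithmetic:

	#include <stdio.h>

	int main(void)
	{
		/* offset3 is an XDR hyper (8 bytes); count3, stable_how and
		 * the opaque length are 4-byte XDR words.
		 */
		unsigned int fixed = 8 + 4 + 4 + 4;

		printf("%u\n", fixed);	/* 20, matching xdr_reserve_space(xdr, 8 + 4 + 4 + 4) */
		return 0;
	}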
 
 /*
- * Encode CREATE arguments
+ * 3.3.8  CREATE3args
+ *
+ *	enum createmode3 {
+ *		UNCHECKED = 0,
+ *		GUARDED   = 1,
+ *		EXCLUSIVE = 2
+ *	};
+ *
+ *	union createhow3 switch (createmode3 mode) {
+ *	case UNCHECKED:
+ *	case GUARDED:
+ *		sattr3       obj_attributes;
+ *	case EXCLUSIVE:
+ *		createverf3  verf;
+ *	};
+ *
+ *	struct CREATE3args {
+ *		diropargs3	where;
+ *		createhow3	how;
+ *	};
  */
-static int
-nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args)
+static void encode_createhow3(struct xdr_stream *xdr,
+			      const struct nfs3_createargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-
-	*p++ = htonl(args->createmode);
-	if (args->createmode == NFS3_CREATE_EXCLUSIVE) {
-		*p++ = args->verifier[0];
-		*p++ = args->verifier[1];
-	} else
-		p = xdr_encode_sattr(p, args->sattr);
+	encode_uint32(xdr, args->createmode);
+	switch (args->createmode) {
+	case NFS3_CREATE_UNCHECKED:
+	case NFS3_CREATE_GUARDED:
+		encode_sattr3(xdr, args->sattr);
+		break;
+	case NFS3_CREATE_EXCLUSIVE:
+		encode_createverf3(xdr, args->verifier);
+		break;
+	default:
+		BUG();
+	}
+}
 
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+static void nfs3_xdr_enc_create3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_createargs *args)
+{
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
+	encode_createhow3(xdr, args);
 }
 
 /*
- * Encode MKDIR arguments
+ * 3.3.9  MKDIR3args
+ *
+ *	struct MKDIR3args {
+ *		diropargs3	where;
+ *		sattr3		attributes;
+ *	};
  */
-static int
-nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args)
+static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs3_mkdirargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-	p = xdr_encode_sattr(p, args->sattr);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
+	encode_sattr3(xdr, args->sattr);
 }
 
 /*
- * Encode SYMLINK arguments
+ * 3.3.10  SYMLINK3args
+ *
+ *	struct symlinkdata3 {
+ *		sattr3		symlink_attributes;
+ *		nfspath3	symlink_data;
+ *	};
+ *
+ *	struct SYMLINK3args {
+ *		diropargs3	where;
+ *		symlinkdata3	symlink;
+ *	};
  */
-static int
-nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args)
+static void encode_symlinkdata3(struct xdr_stream *xdr,
+				const struct nfs3_symlinkargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fromfh);
-	p = xdr_encode_array(p, args->fromname, args->fromlen);
-	p = xdr_encode_sattr(p, args->sattr);
-	*p++ = htonl(args->pathlen);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+	encode_sattr3(xdr, args->sattr);
+	encode_nfspath3(xdr, args->pages, args->pathlen);
+}
 
-	/* Copy the page */
-	xdr_encode_pages(&req->rq_snd_buf, args->pages, 0, args->pathlen);
-	return 0;
+static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs3_symlinkargs *args)
+{
+	encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
+	encode_symlinkdata3(xdr, args);
 }
 
 /*
- * Encode MKNOD arguments
+ * 3.3.11  MKNOD3args
+ *
+ *	struct devicedata3 {
+ *		sattr3		dev_attributes;
+ *		specdata3	spec;
+ *	};
+ *
+ *	union mknoddata3 switch (ftype3 type) {
+ *	case NF3CHR:
+ *	case NF3BLK:
+ *		devicedata3	device;
+ *	case NF3SOCK:
+ *	case NF3FIFO:
+ *		sattr3		pipe_attributes;
+ *	default:
+ *		void;
+ *	};
+ *
+ *	struct MKNOD3args {
+ *		diropargs3	where;
+ *		mknoddata3	what;
+ *	};
  */
-static int
-nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
+static void encode_devicedata3(struct xdr_stream *xdr,
+			       const struct nfs3_mknodargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_array(p, args->name, args->len);
-	*p++ = htonl(args->type);
-	p = xdr_encode_sattr(p, args->sattr);
-	if (args->type == NF3CHR || args->type == NF3BLK) {
-		*p++ = htonl(MAJOR(args->rdev));
-		*p++ = htonl(MINOR(args->rdev));
+	encode_sattr3(xdr, args->sattr);
+	encode_specdata3(xdr, args->rdev);
+}
+
+static void encode_mknoddata3(struct xdr_stream *xdr,
+			      const struct nfs3_mknodargs *args)
+{
+	encode_ftype3(xdr, args->type);
+	switch (args->type) {
+	case NF3CHR:
+	case NF3BLK:
+		encode_devicedata3(xdr, args);
+		break;
+	case NF3SOCK:
+	case NF3FIFO:
+		encode_sattr3(xdr, args->sattr);
+		break;
+	case NF3REG:
+	case NF3DIR:
+		break;
+	default:
+		BUG();
 	}
+}
 
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs3_mknodargs *args)
+{
+	encode_diropargs3(xdr, args->fh, args->name, args->len);
+	encode_mknoddata3(xdr, args);
 }
 
 /*
- * Encode RENAME arguments
+ * 3.3.12  REMOVE3args
+ *
+ *	struct REMOVE3args {
+ *		diropargs3  object;
+ *	};
  */
-static int
-nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
-{
-	p = xdr_encode_fhandle(p, args->old_dir);
-	p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
-	p = xdr_encode_fhandle(p, args->new_dir);
-	p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs_removeargs *args)
+{
+	encode_diropargs3(xdr, args->fh, args->name.name, args->name.len);
 }
 
 /*
- * Encode LINK arguments
+ * 3.3.14  RENAME3args
+ *
+ *	struct RENAME3args {
+ *		diropargs3	from;
+ *		diropargs3	to;
+ *	};
  */
-static int
-nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
+static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs_renameargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fromfh);
-	p = xdr_encode_fhandle(p, args->tofh);
-	p = xdr_encode_array(p, args->toname, args->tolen);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	const struct qstr *old = args->old_name;
+	const struct qstr *new = args->new_name;
+
+	encode_diropargs3(xdr, args->old_dir, old->name, old->len);
+	encode_diropargs3(xdr, args->new_dir, new->name, new->len);
 }
 
 /*
- * Encode arguments to readdir call
+ * 3.3.15  LINK3args
+ *
+ *	struct LINK3args {
+ *		nfs_fh3		file;
+ *		diropargs3	link;
+ *	};
  */
-static int
-nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
+static void nfs3_xdr_enc_link3args(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfs3_linkargs *args)
 {
-	struct rpc_auth	*auth = req->rq_cred->cr_auth;
-	unsigned int replen;
-	u32 count = args->count;
-
-	p = xdr_encode_fhandle(p, args->fh);
-	p = xdr_encode_hyper(p, args->cookie);
-	*p++ = args->verf[0];
-	*p++ = args->verf[1];
-	if (args->plus) {
-		/* readdirplus: need dircount + buffer size.
-		 * We just make sure we make dircount big enough */
-		*p++ = htonl(count >> 3);
-	}
-	*p++ = htonl(count);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-
-	/* Inline the page array */
-	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readdirres_sz) << 2;
-	xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count);
-	return 0;
+	encode_nfs_fh3(xdr, args->fromfh);
+	encode_diropargs3(xdr, args->tofh, args->toname, args->tolen);
 }
 
 /*
- * Decode the result of a readdir call.
- * We just check for syntactical correctness.
+ * 3.3.16  READDIR3args
+ *
+ *	struct READDIR3args {
+ *		nfs_fh3		dir;
+ *		cookie3		cookie;
+ *		cookieverf3	cookieverf;
+ *		count3		count;
+ *	};
  */
-static int
-nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res)
+static void encode_readdir3args(struct xdr_stream *xdr,
+				const struct nfs3_readdirargs *args)
 {
-	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
-	struct kvec *iov = rcvbuf->head;
-	struct page **page;
-	size_t hdrlen;
-	u32 recvd, pglen;
-	int status;
-
-	status = ntohl(*p++);
-	/* Decode post_op_attrs */
-	p = xdr_decode_post_op_attr(p, res->dir_attr);
-	if (status)
-		return nfs_stat_to_errno(status);
-	/* Decode verifier cookie */
-	if (res->verf) {
-		res->verf[0] = *p++;
-		res->verf[1] = *p++;
-	} else {
-		p += 2;
-	}
+	__be32 *p;
 
-	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
-	if (iov->iov_len < hdrlen) {
-		dprintk("NFS: READDIR reply header overflowed:"
-				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
-		return -errno_NFSERR_IO;
-	} else if (iov->iov_len != hdrlen) {
-		dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
-		xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
-	}
+	encode_nfs_fh3(xdr, args->fh);
 
-	pglen = rcvbuf->page_len;
-	recvd = rcvbuf->len - hdrlen;
-	if (pglen > recvd)
-		pglen = recvd;
-	page = rcvbuf->pages;
-
-	return pglen;
+	p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4);
+	p = xdr_encode_cookie3(p, args->cookie);
+	p = xdr_encode_cookieverf3(p, args->verf);
+	*p = cpu_to_be32(args->count);
+}
+
+static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const struct nfs3_readdirargs *args)
+{
+	encode_readdir3args(xdr, args);
+	prepare_reply_buffer(req, args->pages, 0,
+				args->count, NFS3_readdirres_sz);
 }
 
-__be32 *
-nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
+/*
+ * 3.3.17  READDIRPLUS3args
+ *
+ *	struct READDIRPLUS3args {
+ *		nfs_fh3		dir;
+ *		cookie3		cookie;
+ *		cookieverf3	cookieverf;
+ *		count3		dircount;
+ *		count3		maxcount;
+ *	};
+ */
+static void encode_readdirplus3args(struct xdr_stream *xdr,
+				    const struct nfs3_readdirargs *args)
 {
 	__be32 *p;
-	struct nfs_entry old = *entry;
-
-	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
-		goto out_overflow;
-	if (!ntohl(*p++)) {
-		p = xdr_inline_decode(xdr, 4);
-		if (unlikely(!p))
-			goto out_overflow;
-		if (!ntohl(*p++))
-			return ERR_PTR(-EAGAIN);
-		entry->eof = 1;
-		return ERR_PTR(-EBADCOOKIE);
-	}
 
-	p = xdr_inline_decode(xdr, 12);
-	if (unlikely(!p))
-		goto out_overflow;
-	p = xdr_decode_hyper(p, &entry->ino);
-	entry->len = ntohl(*p++);
+	encode_nfs_fh3(xdr, args->fh);
 
-	p = xdr_inline_decode(xdr, entry->len + 8);
-	if (unlikely(!p))
-		goto out_overflow;
-	entry->name = (const char *) p;
-	p += XDR_QUADLEN(entry->len);
-	entry->prev_cookie = entry->cookie;
-	p = xdr_decode_hyper(p, &entry->cookie);
-
-	entry->d_type = DT_UNKNOWN;
-	if (plus) {
-		entry->fattr->valid = 0;
-		p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
-		if (IS_ERR(p))
-			goto out_overflow_exit;
-		entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
-		/* In fact, a post_op_fh3: */
-		p = xdr_inline_decode(xdr, 4);
-		if (unlikely(!p))
-			goto out_overflow;
-		if (*p++) {
-			p = xdr_decode_fhandle_stream(xdr, entry->fh);
-			if (IS_ERR(p))
-				goto out_overflow_exit;
-			/* Ugh -- server reply was truncated */
-			if (p == NULL) {
-				dprintk("NFS: FH truncated\n");
-				*entry = old;
-				return ERR_PTR(-EAGAIN);
-			}
-		} else
-			memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
-	}
+	p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4);
+	p = xdr_encode_cookie3(p, args->cookie);
+	p = xdr_encode_cookieverf3(p, args->verf);
 
-	p = xdr_inline_peek(xdr, 8);
-	if (p != NULL)
-		entry->eof = !p[0] && p[1];
-	else
-		entry->eof = 0;
-
-	return p;
+	/*
+	 * readdirplus: need dircount + buffer size.
+	 * We just make sure we make dircount big enough
+	 */
+	*p++ = cpu_to_be32(args->count >> 3);
 
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-out_overflow_exit:
-	return ERR_PTR(-EAGAIN);
+	*p = cpu_to_be32(args->count);
+}
+
+static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
+					  struct xdr_stream *xdr,
+					  const struct nfs3_readdirargs *args)
+{
+	encode_readdirplus3args(xdr, args);
+	prepare_reply_buffer(req, args->pages, 0,
+				args->count, NFS3_readdirres_sz);
 }
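READDIRPLUS carries two limits: dircount (bytes of directory entry data alone) and maxcount (the total reply size). The encoder derives dircount as count >> 3, one eighth of the reply buffer, per the in-line comment above. A small standalone illustration:

	#include <stdio.h>

	int main(void)
	{
		unsigned int count = 32768;		/* illustrative reply buffer size */
		unsigned int dircount = count >> 3;	/* heuristic: 1/8 of the buffer */

		printf("dircount=%u maxcount=%u\n", dircount, count);	/* 4096, 32768 */
		return 0;
	}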
 
 /*
- * Encode COMMIT arguments
+ * 3.3.21  COMMIT3args
+ *
+ *	struct COMMIT3args {
+ *		nfs_fh3		file;
+ *		offset3		offset;
+ *		count3		count;
+ *	};
  */
-static int
-nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void encode_commit3args(struct xdr_stream *xdr,
+			       const struct nfs_writeargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fh);
+	__be32 *p;
+
+	encode_nfs_fh3(xdr, args->fh);
+
+	p = xdr_reserve_space(xdr, 8 + 4);
 	p = xdr_encode_hyper(p, args->offset);
-	*p++ = htonl(args->count);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	return 0;
+	*p = cpu_to_be32(args->count);
 }
 
-#ifdef CONFIG_NFS_V3_ACL
-/*
- * Encode GETACL arguments
- */
-static int
-nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
-		    struct nfs3_getaclargs *args)
+static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs_writeargs *args)
 {
-	struct rpc_auth *auth = req->rq_cred->cr_auth;
-	unsigned int replen;
+	encode_commit3args(xdr, args);
+}
 
-	p = xdr_encode_fhandle(p, args->fh);
-	*p++ = htonl(args->mask);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+#ifdef CONFIG_NFS_V3_ACL
 
-	if (args->mask & (NFS_ACL | NFS_DFACL)) {
-		/* Inline the page array */
-		replen = (RPC_REPHDRSIZE + auth->au_rslack +
-			  ACL3_getaclres_sz) << 2;
-		xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0,
-				 NFSACL_MAXPAGES << PAGE_SHIFT);
-	}
-	return 0;
+static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_getaclargs *args)
+{
+	encode_nfs_fh3(xdr, args->fh);
+	encode_uint32(xdr, args->mask);
+	if (args->mask & (NFS_ACL | NFS_DFACL))
+		prepare_reply_buffer(req, args->pages, 0,
+					NFSACL_MAXPAGES << PAGE_SHIFT,
+					ACL3_getaclres_sz);
 }
 
-/*
- * Encode SETACL arguments
- */
-static int
-nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
-		    struct nfs3_setaclargs *args)
+static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs3_setaclargs *args)
 {
-	struct xdr_buf *buf = &req->rq_snd_buf;
 	unsigned int base;
-	int err;
-
-	p = xdr_encode_fhandle(p, NFS_FH(args->inode));
-	*p++ = htonl(args->mask);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
-	base = req->rq_slen;
+	int error;
 
+	encode_nfs_fh3(xdr, NFS_FH(args->inode));
+	encode_uint32(xdr, args->mask);
 	if (args->npages != 0)
-		xdr_encode_pages(buf, args->pages, 0, args->len);
-	else
-		req->rq_slen = xdr_adjust_iovec(req->rq_svec,
-						p + XDR_QUADLEN(args->len));
+		xdr_write_pages(xdr, args->pages, 0, args->len);
 
-	err = nfsacl_encode(buf, base, args->inode,
+	base = req->rq_slen;
+	error = nfsacl_encode(xdr->buf, base, args->inode,
 			    (args->mask & NFS_ACL) ?
 			    args->acl_access : NULL, 1, 0);
-	if (err > 0)
-		err = nfsacl_encode(buf, base + err, args->inode,
-				    (args->mask & NFS_DFACL) ?
-				    args->acl_default : NULL, 1,
-				    NFS_ACL_DEFAULT);
-	return (err > 0) ? 0 : err;
+	BUG_ON(error < 0);
+	error = nfsacl_encode(xdr->buf, base + error, args->inode,
+			    (args->mask & NFS_DFACL) ?
+			    args->acl_default : NULL, 1,
+			    NFS_ACL_DEFAULT);
+	BUG_ON(error < 0);
 }
+
 #endif /* CONFIG_NFS_V3_ACL */
735 1347
736/* 1348/*
737 * NFS XDR decode functions 1349 * NFSv3 XDR decode functions
1350 *
1351 * NFSv3 result types are defined in section 3.3 of RFC 1813:
1352 * "NFS Version 3 Protocol Specification".
738 */ 1353 */
739 1354
740/* 1355/*
741 * Decode attrstat reply. 1356 * 3.3.1 GETATTR3res
1357 *
1358 * struct GETATTR3resok {
1359 * fattr3 obj_attributes;
1360 * };
1361 *
1362 * union GETATTR3res switch (nfsstat3 status) {
1363 * case NFS3_OK:
1364 * GETATTR3resok resok;
1365 * default:
1366 * void;
1367 * };
742 */ 1368 */
743static int 1369static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req,
744nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1370 struct xdr_stream *xdr,
1371 struct nfs_fattr *result)
745{ 1372{
746 int status; 1373 enum nfs_stat status;
747 1374 int error;
748 if ((status = ntohl(*p++))) 1375
749 return nfs_stat_to_errno(status); 1376 error = decode_nfsstat3(xdr, &status);
750 xdr_decode_fattr(p, fattr); 1377 if (unlikely(error))
751 return 0; 1378 goto out;
1379 if (status != NFS3_OK)
1380 goto out_default;
1381 error = decode_fattr3(xdr, result);
1382out:
1383 return error;
1384out_default:
1385 return nfs_stat_to_errno(status);
752} 1386}
753 1387
754/* 1388/*
755 * Decode status+wcc_data reply 1389 * 3.3.2 SETATTR3res
756 * SATTR, REMOVE, RMDIR 1390 *
1391 * struct SETATTR3resok {
1392 * wcc_data obj_wcc;
1393 * };
1394 *
1395 * struct SETATTR3resfail {
1396 * wcc_data obj_wcc;
1397 * };
1398 *
1399 * union SETATTR3res switch (nfsstat3 status) {
1400 * case NFS3_OK:
1401 * SETATTR3resok resok;
1402 * default:
1403 * SETATTR3resfail resfail;
1404 * };
757 */ 1405 */
758static int 1406static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req,
759nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1407 struct xdr_stream *xdr,
1408 struct nfs_fattr *result)
760{ 1409{
761 int status; 1410 enum nfs_stat status;
762 1411 int error;
763 if ((status = ntohl(*p++))) 1412
764 status = nfs_stat_to_errno(status); 1413 error = decode_nfsstat3(xdr, &status);
765 xdr_decode_wcc_data(p, fattr); 1414 if (unlikely(error))
766 return status; 1415 goto out;
1416 error = decode_wcc_data(xdr, result);
1417 if (unlikely(error))
1418 goto out;
1419 if (status != NFS3_OK)
1420 goto out_status;
1421out:
1422 return error;
1423out_status:
1424 return nfs_stat_to_errno(status);
767} 1425}
768 1426
769static int 1427/*
770nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) 1428 * 3.3.3 LOOKUP3res
1429 *
1430 * struct LOOKUP3resok {
1431 * nfs_fh3 object;
1432 * post_op_attr obj_attributes;
1433 * post_op_attr dir_attributes;
1434 * };
1435 *
1436 * struct LOOKUP3resfail {
1437 * post_op_attr dir_attributes;
1438 * };
1439 *
1440 * union LOOKUP3res switch (nfsstat3 status) {
1441 * case NFS3_OK:
1442 * LOOKUP3resok resok;
1443 * default:
1444 * LOOKUP3resfail resfail;
1445 * };
1446 */
1447static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req,
1448 struct xdr_stream *xdr,
1449 struct nfs3_diropres *result)
771{ 1450{
772 return nfs3_xdr_wccstat(req, p, res->dir_attr); 1451 enum nfs_stat status;
1452 int error;
1453
1454 error = decode_nfsstat3(xdr, &status);
1455 if (unlikely(error))
1456 goto out;
1457 if (status != NFS3_OK)
1458 goto out_default;
1459 error = decode_nfs_fh3(xdr, result->fh);
1460 if (unlikely(error))
1461 goto out;
1462 error = decode_post_op_attr(xdr, result->fattr);
1463 if (unlikely(error))
1464 goto out;
1465 error = decode_post_op_attr(xdr, result->dir_attr);
1466out:
1467 return error;
1468out_default:
1469 error = decode_post_op_attr(xdr, result->dir_attr);
1470 if (unlikely(error))
1471 goto out;
1472 return nfs_stat_to_errno(status);
773} 1473}
774 1474
775/* 1475/*
776 * Decode LOOKUP reply 1476 * 3.3.4 ACCESS3res
1477 *
1478 * struct ACCESS3resok {
1479 * post_op_attr obj_attributes;
1480 * uint32 access;
1481 * };
1482 *
1483 * struct ACCESS3resfail {
1484 * post_op_attr obj_attributes;
1485 * };
1486 *
1487 * union ACCESS3res switch (nfsstat3 status) {
1488 * case NFS3_OK:
1489 * ACCESS3resok resok;
1490 * default:
1491 * ACCESS3resfail resfail;
1492 * };
777 */ 1493 */
778static int 1494static int nfs3_xdr_dec_access3res(struct rpc_rqst *req,
779nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) 1495 struct xdr_stream *xdr,
1496 struct nfs3_accessres *result)
780{ 1497{
781 int status; 1498 enum nfs_stat status;
782 1499 int error;
783 if ((status = ntohl(*p++))) { 1500
784 status = nfs_stat_to_errno(status); 1501 error = decode_nfsstat3(xdr, &status);
785 } else { 1502 if (unlikely(error))
786 if (!(p = xdr_decode_fhandle(p, res->fh))) 1503 goto out;
787 return -errno_NFSERR_IO; 1504 error = decode_post_op_attr(xdr, result->fattr);
788 p = xdr_decode_post_op_attr(p, res->fattr); 1505 if (unlikely(error))
789 } 1506 goto out;
790 xdr_decode_post_op_attr(p, res->dir_attr); 1507 if (status != NFS3_OK)
791 return status; 1508 goto out_default;
1509 error = decode_uint32(xdr, &result->access);
1510out:
1511 return error;
1512out_default:
1513 return nfs_stat_to_errno(status);
792} 1514}
793 1515
794/* 1516/*
795 * Decode ACCESS reply 1517 * 3.3.5 READLINK3res
1518 *
1519 * struct READLINK3resok {
1520 * post_op_attr symlink_attributes;
1521 * nfspath3 data;
1522 * };
1523 *
1524 * struct READLINK3resfail {
1525 * post_op_attr symlink_attributes;
1526 * };
1527 *
1528 * union READLINK3res switch (nfsstat3 status) {
1529 * case NFS3_OK:
1530 * READLINK3resok resok;
1531 * default:
1532 * READLINK3resfail resfail;
1533 * };
796 */ 1534 */
797static int 1535static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req,
798nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) 1536 struct xdr_stream *xdr,
1537 struct nfs_fattr *result)
799{ 1538{
800 int status = ntohl(*p++); 1539 enum nfs_stat status;
801 1540 int error;
802 p = xdr_decode_post_op_attr(p, res->fattr); 1541
803 if (status) 1542 error = decode_nfsstat3(xdr, &status);
804 return nfs_stat_to_errno(status); 1543 if (unlikely(error))
805 res->access = ntohl(*p++); 1544 goto out;
806 return 0; 1545 error = decode_post_op_attr(xdr, result);
1546 if (unlikely(error))
1547 goto out;
1548 if (status != NFS3_OK)
1549 goto out_default;
1550 error = decode_nfspath3(xdr);
1551out:
1552 return error;
1553out_default:
1554 return nfs_stat_to_errno(status);
807} 1555}
808 1556
809static int 1557/*
810nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) 1558 * 3.3.6 READ3res
1559 *
1560 * struct READ3resok {
1561 * post_op_attr file_attributes;
1562 * count3 count;
1563 * bool eof;
1564 * opaque data<>;
1565 * };
1566 *
1567 * struct READ3resfail {
1568 * post_op_attr file_attributes;
1569 * };
1570 *
1571 * union READ3res switch (nfsstat3 status) {
1572 * case NFS3_OK:
1573 * READ3resok resok;
1574 * default:
1575 * READ3resfail resfail;
1576 * };
1577 */
1578static int decode_read3resok(struct xdr_stream *xdr,
1579 struct nfs_readres *result)
811{ 1580{
812 struct rpc_auth *auth = req->rq_cred->cr_auth; 1581 u32 eof, count, ocount, recvd;
813 unsigned int replen; 1582 size_t hdrlen;
1583 __be32 *p;
814 1584
815 p = xdr_encode_fhandle(p, args->fh); 1585 p = xdr_inline_decode(xdr, 4 + 4 + 4);
816 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 1586 if (unlikely(p == NULL))
1587 goto out_overflow;
1588 count = be32_to_cpup(p++);
1589 eof = be32_to_cpup(p++);
1590 ocount = be32_to_cpup(p++);
1591 if (unlikely(ocount != count))
1592 goto out_mismatch;
1593 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
1594 recvd = xdr->buf->len - hdrlen;
1595 if (unlikely(count > recvd))
1596 goto out_cheating;
1597
1598out:
1599 xdr_read_pages(xdr, count);
1600 result->eof = eof;
1601 result->count = count;
1602 return count;
1603out_mismatch:
1604 dprintk("NFS: READ count doesn't match length of opaque: "
1605 "count %u != ocount %u\n", count, ocount);
1606 return -EIO;
1607out_cheating:
1608 dprintk("NFS: server cheating in read result: "
1609 "count %u > recvd %u\n", count, recvd);
1610 count = recvd;
1611 eof = 0;
1612 goto out;
1613out_overflow:
1614 print_overflow_msg(__func__, xdr);
1615 return -EIO;
1616}
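A worked example of the two sanity checks above, with hypothetical numbers: if the RPC and NFS reply headers occupy hdrlen = 96 bytes of a 4192-byte reply, then recvd = 4096. A server claiming count = 8192 exceeds recvd, so the decoder clamps count to 4096 and clears eof rather than trusting a short reply that pretends to be complete; the client will simply issue another READ. By contrast, an ocount that disagrees with count is a malformed reply, since both fields describe the same opaque data, and is rejected outright with -EIO.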
817 1617
818 /* Inline the page array */ 1618static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
819 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readlinkres_sz) << 2; 1619 struct nfs_readres *result)
820 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); 1620{
821 return 0; 1621 enum nfs_stat status;
1622 int error;
1623
1624 error = decode_nfsstat3(xdr, &status);
1625 if (unlikely(error))
1626 goto out;
1627 error = decode_post_op_attr(xdr, result->fattr);
1628 if (unlikely(error))
1629 goto out;
1630 if (status != NFS3_OK)
1631 goto out_status;
1632 error = decode_read3resok(xdr, result);
1633out:
1634 return error;
1635out_status:
1636 return nfs_stat_to_errno(status);
822} 1637}
823 1638
824/* 1639/*
825 * Decode READLINK reply 1640 * 3.3.7 WRITE3res
1641 *
1642 * enum stable_how {
1643 * UNSTABLE = 0,
1644 * DATA_SYNC = 1,
1645 * FILE_SYNC = 2
1646 * };
1647 *
1648 * struct WRITE3resok {
1649 * wcc_data file_wcc;
1650 * count3 count;
1651 * stable_how committed;
1652 * writeverf3 verf;
1653 * };
1654 *
1655 * struct WRITE3resfail {
1656 * wcc_data file_wcc;
1657 * };
1658 *
1659 * union WRITE3res switch (nfsstat3 status) {
1660 * case NFS3_OK:
1661 * WRITE3resok resok;
1662 * default:
1663 * WRITE3resfail resfail;
1664 * };
826 */ 1665 */
827static int 1666static int decode_write3resok(struct xdr_stream *xdr,
828nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1667 struct nfs_writeres *result)
829{ 1668{
830 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 1669 __be32 *p;
831 struct kvec *iov = rcvbuf->head;
832 size_t hdrlen;
833 u32 len, recvd;
834 int status;
835
836 status = ntohl(*p++);
837 p = xdr_decode_post_op_attr(p, fattr);
838
839 if (status != 0)
840 return nfs_stat_to_errno(status);
841
842 /* Convert length of symlink */
843 len = ntohl(*p++);
844 if (len >= rcvbuf->page_len) {
845 dprintk("nfs: server returned giant symlink!\n");
846 return -ENAMETOOLONG;
847 }
848 1670
849 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 1671 p = xdr_inline_decode(xdr, 4 + 4 + NFS3_WRITEVERFSIZE);
850 if (iov->iov_len < hdrlen) { 1672 if (unlikely(p == NULL))
851 dprintk("NFS: READLINK reply header overflowed:" 1673 goto out_overflow;
852 "length %Zu > %Zu\n", hdrlen, iov->iov_len); 1674 result->count = be32_to_cpup(p++);
853 return -errno_NFSERR_IO; 1675 result->verf->committed = be32_to_cpup(p++);
854 } else if (iov->iov_len != hdrlen) { 1676 if (unlikely(result->verf->committed > NFS_FILE_SYNC))
855 dprintk("NFS: READLINK header is short. " 1677 goto out_badvalue;
856 "iovec will be shifted.\n"); 1678 memcpy(result->verf->verifier, p, NFS3_WRITEVERFSIZE);
857 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); 1679 return result->count;
858 } 1680out_badvalue:
859 recvd = req->rq_rcv_buf.len - hdrlen; 1681 dprintk("NFS: bad stable_how value: %u\n", result->verf->committed);
860 if (recvd < len) { 1682 return -EIO;
861 dprintk("NFS: server cheating in readlink reply: " 1683out_overflow:
862 "count %u > recvd %u\n", len, recvd); 1684 print_overflow_msg(__func__, xdr);
863 return -EIO; 1685 return -EIO;
864 } 1686}
865 1687
866 xdr_terminate_string(rcvbuf, len); 1688static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
867 return 0; 1689 struct nfs_writeres *result)
1690{
1691 enum nfs_stat status;
1692 int error;
1693
1694 error = decode_nfsstat3(xdr, &status);
1695 if (unlikely(error))
1696 goto out;
1697 error = decode_wcc_data(xdr, result->fattr);
1698 if (unlikely(error))
1699 goto out;
1700 if (status != NFS3_OK)
1701 goto out_status;
1702 error = decode_write3resok(xdr, result);
1703out:
1704 return error;
1705out_status:
1706 return nfs_stat_to_errno(status);
868} 1707}
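The write verifier copied out above is opaque to the client; its only use is comparison across calls. A sketch of the consumer side, not part of this patch (saved_verf and resend_unstable_writes() are illustrative names):

	/* If the verifier changes between WRITE and COMMIT, the server
	 * rebooted and may have dropped UNSTABLE data: resend it. */
	if (memcmp(saved_verf.verifier, result->verf->verifier,
		   NFS3_WRITEVERFSIZE) != 0)
		resend_unstable_writes();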
869 1708
870/* 1709/*
871 * Decode READ reply 1710 * 3.3.8 CREATE3res
1711 *
1712 * struct CREATE3resok {
1713 * post_op_fh3 obj;
1714 * post_op_attr obj_attributes;
1715 * wcc_data dir_wcc;
1716 * };
1717 *
1718 * struct CREATE3resfail {
1719 * wcc_data dir_wcc;
1720 * };
1721 *
1722 * union CREATE3res switch (nfsstat3 status) {
1723 * case NFS3_OK:
1724 * CREATE3resok resok;
1725 * default:
1726 * CREATE3resfail resfail;
1727 * };
872 */ 1728 */
873static int 1729static int decode_create3resok(struct xdr_stream *xdr,
874nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) 1730 struct nfs3_diropres *result)
875{ 1731{
876 struct kvec *iov = req->rq_rcv_buf.head; 1732 int error;
877 size_t hdrlen; 1733
878 u32 count, ocount, recvd; 1734 error = decode_post_op_fh3(xdr, result->fh);
879 int status; 1735 if (unlikely(error))
1736 goto out;
1737 error = decode_post_op_attr(xdr, result->fattr);
1738 if (unlikely(error))
1739 goto out;
1740 /* The server isn't required to return a file handle.
1741 * If it didn't, force the client to perform a LOOKUP
1742 * to determine the correct file handle and attribute
1743 * values for the new object. */
1744 if (result->fh->size == 0)
1745 result->fattr->valid = 0;
1746 error = decode_wcc_data(xdr, result->dir_attr);
1747out:
1748 return error;
1749}
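The zero-size file handle and the cleared fattr->valid act as an in-band signal to the proc layer. A sketch of the consequence there, not part of this hunk (the exact helper name is an assumption):

	/* CREATE reply omitted the optional post_op_fh3: fetch the
	 * handle and attributes with an explicit LOOKUP. */
	if (result->fh->size == 0)
		status = nfs3_proc_lookup(dir, &dentry->d_name,
					  result->fh, result->fattr);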
880 1750
881 status = ntohl(*p++); 1751static int nfs3_xdr_dec_create3res(struct rpc_rqst *req,
882 p = xdr_decode_post_op_attr(p, res->fattr); 1752 struct xdr_stream *xdr,
1753 struct nfs3_diropres *result)
1754{
1755 enum nfs_stat status;
1756 int error;
1757
1758 error = decode_nfsstat3(xdr, &status);
1759 if (unlikely(error))
1760 goto out;
1761 if (status != NFS3_OK)
1762 goto out_default;
1763 error = decode_create3resok(xdr, result);
1764out:
1765 return error;
1766out_default:
1767 error = decode_wcc_data(xdr, result->dir_attr);
1768 if (unlikely(error))
1769 goto out;
1770 return nfs_stat_to_errno(status);
1771}
883 1772
884 if (status != 0) 1773/*
885 return nfs_stat_to_errno(status); 1774 * 3.3.12 REMOVE3res
1775 *
1776 * struct REMOVE3resok {
1777 * wcc_data dir_wcc;
1778 * };
1779 *
1780 * struct REMOVE3resfail {
1781 * wcc_data dir_wcc;
1782 * };
1783 *
1784 * union REMOVE3res switch (nfsstat3 status) {
1785 * case NFS3_OK:
1786 * REMOVE3resok resok;
1787 * default:
1788 * REMOVE3resfail resfail;
1789 * };
1790 */
1791static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req,
1792 struct xdr_stream *xdr,
1793 struct nfs_removeres *result)
1794{
1795 enum nfs_stat status;
1796 int error;
1797
1798 error = decode_nfsstat3(xdr, &status);
1799 if (unlikely(error))
1800 goto out;
1801 error = decode_wcc_data(xdr, result->dir_attr);
1802 if (unlikely(error))
1803 goto out;
1804 if (status != NFS3_OK)
1805 goto out_status;
1806out:
1807 return error;
1808out_status:
1809 return nfs_stat_to_errno(status);
1810}
886 1811
887 /* Decode reply count and EOF flag. NFSv3 is somewhat redundant 1812/*
888 * in that it puts the count both in the res struct and in the 1813 * 3.3.14 RENAME3res
889 * opaque data count. */ 1814 *
890 count = ntohl(*p++); 1815 * struct RENAME3resok {
891 res->eof = ntohl(*p++); 1816 * wcc_data fromdir_wcc;
892 ocount = ntohl(*p++); 1817 * wcc_data todir_wcc;
1818 * };
1819 *
1820 * struct RENAME3resfail {
1821 * wcc_data fromdir_wcc;
1822 * wcc_data todir_wcc;
1823 * };
1824 *
1825 * union RENAME3res switch (nfsstat3 status) {
1826 * case NFS3_OK:
1827 * RENAME3resok resok;
1828 * default:
1829 * RENAME3resfail resfail;
1830 * };
1831 */
1832static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req,
1833 struct xdr_stream *xdr,
1834 struct nfs_renameres *result)
1835{
1836 enum nfs_stat status;
1837 int error;
1838
1839 error = decode_nfsstat3(xdr, &status);
1840 if (unlikely(error))
1841 goto out;
1842 error = decode_wcc_data(xdr, result->old_fattr);
1843 if (unlikely(error))
1844 goto out;
1845 error = decode_wcc_data(xdr, result->new_fattr);
1846 if (unlikely(error))
1847 goto out;
1848 if (status != NFS3_OK)
1849 goto out_status;
1850out:
1851 return error;
1852out_status:
1853 return nfs_stat_to_errno(status);
1854}
893 1855
894 if (ocount != count) { 1856/*
895 dprintk("NFS: READ count doesn't match RPC opaque count.\n"); 1857 * 3.3.15 LINK3res
896 return -errno_NFSERR_IO; 1858 *
897 } 1859 * struct LINK3resok {
1860 * post_op_attr file_attributes;
1861 * wcc_data linkdir_wcc;
1862 * };
1863 *
1864 * struct LINK3resfail {
1865 * post_op_attr file_attributes;
1866 * wcc_data linkdir_wcc;
1867 * };
1868 *
1869 * union LINK3res switch (nfsstat3 status) {
1870 * case NFS3_OK:
1871 * LINK3resok resok;
1872 * default:
1873 * LINK3resfail resfail;
1874 * };
1875 */
1876static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr,
1877 struct nfs3_linkres *result)
1878{
1879 enum nfs_stat status;
1880 int error;
1881
1882 error = decode_nfsstat3(xdr, &status);
1883 if (unlikely(error))
1884 goto out;
1885 error = decode_post_op_attr(xdr, result->fattr);
1886 if (unlikely(error))
1887 goto out;
1888 error = decode_wcc_data(xdr, result->dir_attr);
1889 if (unlikely(error))
1890 goto out;
1891 if (status != NFS3_OK)
1892 goto out_status;
1893out:
1894 return error;
1895out_status:
1896 return nfs_stat_to_errno(status);
1897}
898 1898
899 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 1899/**
900 if (iov->iov_len < hdrlen) { 1900 * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in
901 dprintk("NFS: READ reply header overflowed:" 1901 * the local page cache
902 "length %Zu > %Zu\n", hdrlen, iov->iov_len); 1902 * @xdr: XDR stream where entry resides
903 return -errno_NFSERR_IO; 1903 * @entry: buffer to fill in with entry data
904 } else if (iov->iov_len != hdrlen) { 1904 * @plus: boolean indicating whether this should be a readdirplus entry
905 dprintk("NFS: READ header is short. iovec will be shifted.\n"); 1905 *
906 xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen); 1906 * Returns zero if successful, otherwise a negative errno value is
907 } 1907 * returned.
1908 *
1909 * This function is not invoked during READDIR reply decoding, but
1910 * rather whenever an application invokes the getdents(2) system call
1911 * on a directory already in our cache.
1912 *
1913 * 3.3.16 entry3
1914 *
1915 * struct entry3 {
1916 * fileid3 fileid;
1917 * filename3 name;
1918 * cookie3 cookie;
1921 * entry3 *nextentry;
1922 * };
1923 *
1924 * 3.3.17 entryplus3
1925 * struct entryplus3 {
1926 * fileid3 fileid;
1927 * filename3 name;
1928 * cookie3 cookie;
1929 * post_op_attr name_attributes;
1930 * post_op_fh3 name_handle;
1931 * entryplus3 *nextentry;
1932 * };
1933 */
1934int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
1935 int plus)
1936{
1937 struct nfs_entry old = *entry;
1938 __be32 *p;
1939 int error;
908 1940
909 recvd = req->rq_rcv_buf.len - hdrlen; 1941 p = xdr_inline_decode(xdr, 4);
910 if (count > recvd) { 1942 if (unlikely(p == NULL))
911 dprintk("NFS: server cheating in read reply: " 1943 goto out_overflow;
912 "count %u > recvd %u\n", count, recvd); 1944 if (*p == xdr_zero) {
913 count = recvd; 1945 p = xdr_inline_decode(xdr, 4);
914 res->eof = 0; 1946 if (unlikely(p == NULL))
1947 goto out_overflow;
1948 if (*p == xdr_zero)
1949 return -EAGAIN;
1950 entry->eof = 1;
1951 return -EBADCOOKIE;
915 } 1952 }
916 1953
917 if (count < res->count) 1954 error = decode_fileid3(xdr, &entry->ino);
918 res->count = count; 1955 if (unlikely(error))
1956 return error;
919 1957
920 return count; 1958 error = decode_inline_filename3(xdr, &entry->name, &entry->len);
921} 1959 if (unlikely(error))
1960 return error;
922 1961
923/* 1962 entry->prev_cookie = entry->cookie;
924 * Decode WRITE response 1963 error = decode_cookie3(xdr, &entry->cookie);
925 */ 1964 if (unlikely(error))
926static int 1965 return error;
927nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
928{
929 int status;
930 1966
931 status = ntohl(*p++); 1967 entry->d_type = DT_UNKNOWN;
932 p = xdr_decode_wcc_data(p, res->fattr);
933 1968
934 if (status != 0) 1969 if (plus) {
935 return nfs_stat_to_errno(status); 1970 entry->fattr->valid = 0;
1971 error = decode_post_op_attr(xdr, entry->fattr);
1972 if (unlikely(error))
1973 return error;
1974 if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
1975 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
936 1976
937 res->count = ntohl(*p++); 1977 /* In fact, a post_op_fh3: */
938 res->verf->committed = (enum nfs3_stable_how)ntohl(*p++); 1978 p = xdr_inline_decode(xdr, 4);
939 res->verf->verifier[0] = *p++; 1979 if (unlikely(p == NULL))
940 res->verf->verifier[1] = *p++; 1980 goto out_overflow;
1981 if (*p != xdr_zero) {
1982 error = decode_nfs_fh3(xdr, entry->fh);
1983 if (unlikely(error)) {
1984 if (error == -E2BIG)
1985 goto out_truncated;
1986 return error;
1987 }
1988 } else
1989 zero_nfs_fh3(entry->fh);
1990 }
941 1991
942 return res->count; 1992 return 0;
943}
944 1993
945/* 1994out_overflow:
946 * Decode a CREATE response 1995 print_overflow_msg(__func__, xdr);
947 */ 1996 return -EAGAIN;
948static int 1997out_truncated:
949nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) 1998 dprintk("NFS: directory entry contains invalid file handle\n");
950{ 1999 *entry = old;
951 int status; 2000 return -EAGAIN;
952
953 status = ntohl(*p++);
954 if (status == 0) {
955 if (*p++) {
956 if (!(p = xdr_decode_fhandle(p, res->fh)))
957 return -errno_NFSERR_IO;
958 p = xdr_decode_post_op_attr(p, res->fattr);
959 } else {
960 memset(res->fh, 0, sizeof(*res->fh));
961 /* Do decode post_op_attr but set it to NULL */
962 p = xdr_decode_post_op_attr(p, res->fattr);
963 res->fattr->valid = 0;
964 }
965 } else {
966 status = nfs_stat_to_errno(status);
967 }
968 p = xdr_decode_wcc_data(p, res->dir_attr);
969 return status;
970} 2001}
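For context, a minimal sketch of the loop that consumes this decoder. The real loop lives in the generic readdir code in fs/nfs/dir.c; the function name and the callback comment here are illustrative only:

static int walk_cached_entries(struct xdr_stream *xdr,
			       struct nfs_entry *entry, int plus)
{
	int status;

	for (;;) {
		status = nfs3_decode_dirent(xdr, entry, plus);
		if (status == -EAGAIN)
			return 0;	/* end of this entry list */
		if (status == -EBADCOOKIE && entry->eof)
			return 0;	/* true end of directory */
		if (status < 0)
			return status;
		/* ... emit entry->name / entry->ino to the caller ... */
	}
}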
971 2002
972/* 2003/*
973 * Decode RENAME reply 2004 * 3.3.16 READDIR3res
2005 *
2006 * struct dirlist3 {
2007 * entry3 *entries;
2008 * bool eof;
2009 * };
2010 *
2011 * struct READDIR3resok {
2012 * post_op_attr dir_attributes;
2013 * cookieverf3 cookieverf;
2014 * dirlist3 reply;
2015 * };
2016 *
2017 * struct READDIR3resfail {
2018 * post_op_attr dir_attributes;
2019 * };
2020 *
2021 * union READDIR3res switch (nfsstat3 status) {
2022 * case NFS3_OK:
2023 * READDIR3resok resok;
2024 * default:
2025 * READDIR3resfail resfail;
2026 * };
2027 *
2028 * Read the directory contents into the page cache, but otherwise
 2029 * don't touch them. The actual decoding is done by nfs3_decode_dirent()
2030 * during subsequent nfs_readdir() calls.
974 */ 2031 */
975static int 2032static int decode_dirlist3(struct xdr_stream *xdr)
976nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res)
977{ 2033{
978 int status; 2034 u32 recvd, pglen;
2035 size_t hdrlen;
979 2036
980 if ((status = ntohl(*p++)) != 0) 2037 pglen = xdr->buf->page_len;
981 status = nfs_stat_to_errno(status); 2038 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
982 p = xdr_decode_wcc_data(p, res->old_fattr); 2039 recvd = xdr->buf->len - hdrlen;
983 p = xdr_decode_wcc_data(p, res->new_fattr); 2040 if (unlikely(pglen > recvd))
984 return status; 2041 goto out_cheating;
2042out:
2043 xdr_read_pages(xdr, pglen);
2044 return pglen;
2045out_cheating:
2046 dprintk("NFS: server cheating in readdir result: "
2047 "pglen %u > recvd %u\n", pglen, recvd);
2048 pglen = recvd;
2049 goto out;
985} 2050}
986 2051
987/* 2052static int decode_readdir3resok(struct xdr_stream *xdr,
988 * Decode LINK reply 2053 struct nfs3_readdirres *result)
989 */
990static int
991nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res)
992{ 2054{
993 int status; 2055 int error;
2056
2057 error = decode_post_op_attr(xdr, result->dir_attr);
2058 if (unlikely(error))
2059 goto out;
2060 /* XXX: do we need to check if result->verf != NULL ? */
2061 error = decode_cookieverf3(xdr, result->verf);
2062 if (unlikely(error))
2063 goto out;
2064 error = decode_dirlist3(xdr);
2065out:
2066 return error;
2067}
994 2068
995 if ((status = ntohl(*p++)) != 0) 2069static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req,
996 status = nfs_stat_to_errno(status); 2070 struct xdr_stream *xdr,
997 p = xdr_decode_post_op_attr(p, res->fattr); 2071 struct nfs3_readdirres *result)
998 p = xdr_decode_wcc_data(p, res->dir_attr); 2072{
999 return status; 2073 enum nfs_stat status;
2074 int error;
2075
2076 error = decode_nfsstat3(xdr, &status);
2077 if (unlikely(error))
2078 goto out;
2079 if (status != NFS3_OK)
2080 goto out_default;
2081 error = decode_readdir3resok(xdr, result);
2082out:
2083 return error;
2084out_default:
2085 error = decode_post_op_attr(xdr, result->dir_attr);
2086 if (unlikely(error))
2087 goto out;
2088 return nfs_stat_to_errno(status);
1000} 2089}
1001 2090
1002/* 2091/*
1003 * Decode FSSTAT reply 2092 * 3.3.18 FSSTAT3res
2093 *
2094 * struct FSSTAT3resok {
2095 * post_op_attr obj_attributes;
2096 * size3 tbytes;
2097 * size3 fbytes;
2098 * size3 abytes;
2099 * size3 tfiles;
2100 * size3 ffiles;
2101 * size3 afiles;
2102 * uint32 invarsec;
2103 * };
2104 *
2105 * struct FSSTAT3resfail {
2106 * post_op_attr obj_attributes;
2107 * };
2108 *
2109 * union FSSTAT3res switch (nfsstat3 status) {
2110 * case NFS3_OK:
2111 * FSSTAT3resok resok;
2112 * default:
2113 * FSSTAT3resfail resfail;
2114 * };
1004 */ 2115 */
1005static int 2116static int decode_fsstat3resok(struct xdr_stream *xdr,
1006nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res) 2117 struct nfs_fsstat *result)
1007{ 2118{
1008 int status; 2119 __be32 *p;
1009
1010 status = ntohl(*p++);
1011
1012 p = xdr_decode_post_op_attr(p, res->fattr);
1013 if (status != 0)
1014 return nfs_stat_to_errno(status);
1015
1016 p = xdr_decode_hyper(p, &res->tbytes);
1017 p = xdr_decode_hyper(p, &res->fbytes);
1018 p = xdr_decode_hyper(p, &res->abytes);
1019 p = xdr_decode_hyper(p, &res->tfiles);
1020 p = xdr_decode_hyper(p, &res->ffiles);
1021 p = xdr_decode_hyper(p, &res->afiles);
1022 2120
2121 p = xdr_inline_decode(xdr, 8 * 6 + 4);
2122 if (unlikely(p == NULL))
2123 goto out_overflow;
2124 p = xdr_decode_size3(p, &result->tbytes);
2125 p = xdr_decode_size3(p, &result->fbytes);
2126 p = xdr_decode_size3(p, &result->abytes);
2127 p = xdr_decode_size3(p, &result->tfiles);
2128 p = xdr_decode_size3(p, &result->ffiles);
2129 xdr_decode_size3(p, &result->afiles);
1023 /* ignore invarsec */ 2130 /* ignore invarsec */
1024 return 0; 2131 return 0;
2132out_overflow:
2133 print_overflow_msg(__func__, xdr);
2134 return -EIO;
2135}
2136
2137static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
2138 struct xdr_stream *xdr,
2139 struct nfs_fsstat *result)
2140{
2141 enum nfs_stat status;
2142 int error;
2143
2144 error = decode_nfsstat3(xdr, &status);
2145 if (unlikely(error))
2146 goto out;
2147 error = decode_post_op_attr(xdr, result->fattr);
2148 if (unlikely(error))
2149 goto out;
2150 if (status != NFS3_OK)
2151 goto out_status;
2152 error = decode_fsstat3resok(xdr, result);
2153out:
2154 return error;
2155out_status:
2156 return nfs_stat_to_errno(status);
1025} 2157}
1026 2158
1027/* 2159/*
1028 * Decode FSINFO reply 2160 * 3.3.19 FSINFO3res
2161 *
2162 * struct FSINFO3resok {
2163 * post_op_attr obj_attributes;
2164 * uint32 rtmax;
2165 * uint32 rtpref;
2166 * uint32 rtmult;
2167 * uint32 wtmax;
2168 * uint32 wtpref;
2169 * uint32 wtmult;
2170 * uint32 dtpref;
2171 * size3 maxfilesize;
2172 * nfstime3 time_delta;
2173 * uint32 properties;
2174 * };
2175 *
2176 * struct FSINFO3resfail {
2177 * post_op_attr obj_attributes;
2178 * };
2179 *
2180 * union FSINFO3res switch (nfsstat3 status) {
2181 * case NFS3_OK:
2182 * FSINFO3resok resok;
2183 * default:
2184 * FSINFO3resfail resfail;
2185 * };
1029 */ 2186 */
1030static int 2187static int decode_fsinfo3resok(struct xdr_stream *xdr,
1031nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res) 2188 struct nfs_fsinfo *result)
1032{ 2189{
1033 int status; 2190 __be32 *p;
1034
1035 status = ntohl(*p++);
1036
1037 p = xdr_decode_post_op_attr(p, res->fattr);
1038 if (status != 0)
1039 return nfs_stat_to_errno(status);
1040 2191
1041 res->rtmax = ntohl(*p++); 2192 p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4);
1042 res->rtpref = ntohl(*p++); 2193 if (unlikely(p == NULL))
1043 res->rtmult = ntohl(*p++); 2194 goto out_overflow;
1044 res->wtmax = ntohl(*p++); 2195 result->rtmax = be32_to_cpup(p++);
1045 res->wtpref = ntohl(*p++); 2196 result->rtpref = be32_to_cpup(p++);
1046 res->wtmult = ntohl(*p++); 2197 result->rtmult = be32_to_cpup(p++);
1047 res->dtpref = ntohl(*p++); 2198 result->wtmax = be32_to_cpup(p++);
1048 p = xdr_decode_hyper(p, &res->maxfilesize); 2199 result->wtpref = be32_to_cpup(p++);
1049 p = xdr_decode_time3(p, &res->time_delta); 2200 result->wtmult = be32_to_cpup(p++);
2201 result->dtpref = be32_to_cpup(p++);
2202 p = xdr_decode_size3(p, &result->maxfilesize);
2203 xdr_decode_nfstime3(p, &result->time_delta);
1050 2204
1051 /* ignore properties */ 2205 /* ignore properties */
1052 res->lease_time = 0; 2206 result->lease_time = 0;
1053 return 0; 2207 return 0;
2208out_overflow:
2209 print_overflow_msg(__func__, xdr);
2210 return -EIO;
2211}
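The byte count passed to xdr_inline_decode() above is simply the fixed XDR size of the FSINFO3resok tail that follows obj_attributes:

	/*
	 * rtmax .. dtpref : 7 x uint32  = 28 bytes
	 * maxfilesize     : size3       =  8 bytes
	 * time_delta      : nfstime3    =  8 bytes
	 * properties      : uint32      =  4 bytes
	 *                   total       = 48 = 4 * 7 + 8 + 8 + 4
	 */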
2212
2213static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
2214 struct xdr_stream *xdr,
2215 struct nfs_fsinfo *result)
2216{
2217 enum nfs_stat status;
2218 int error;
2219
2220 error = decode_nfsstat3(xdr, &status);
2221 if (unlikely(error))
2222 goto out;
2223 error = decode_post_op_attr(xdr, result->fattr);
2224 if (unlikely(error))
2225 goto out;
2226 if (status != NFS3_OK)
2227 goto out_status;
2228 error = decode_fsinfo3resok(xdr, result);
2229out:
2230 return error;
2231out_status:
2232 return nfs_stat_to_errno(status);
1054} 2233}
1055 2234
1056/* 2235/*
1057 * Decode PATHCONF reply 2236 * 3.3.20 PATHCONF3res
2237 *
2238 * struct PATHCONF3resok {
2239 * post_op_attr obj_attributes;
2240 * uint32 linkmax;
2241 * uint32 name_max;
2242 * bool no_trunc;
2243 * bool chown_restricted;
2244 * bool case_insensitive;
2245 * bool case_preserving;
2246 * };
2247 *
2248 * struct PATHCONF3resfail {
2249 * post_op_attr obj_attributes;
2250 * };
2251 *
2252 * union PATHCONF3res switch (nfsstat3 status) {
2253 * case NFS3_OK:
2254 * PATHCONF3resok resok;
2255 * default:
2256 * PATHCONF3resfail resfail;
2257 * };
1058 */ 2258 */
1059static int 2259static int decode_pathconf3resok(struct xdr_stream *xdr,
1060nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res) 2260 struct nfs_pathconf *result)
1061{ 2261{
1062 int status; 2262 __be32 *p;
1063
1064 status = ntohl(*p++);
1065
1066 p = xdr_decode_post_op_attr(p, res->fattr);
1067 if (status != 0)
1068 return nfs_stat_to_errno(status);
1069 res->max_link = ntohl(*p++);
1070 res->max_namelen = ntohl(*p++);
1071 2263
2264 p = xdr_inline_decode(xdr, 4 * 6);
2265 if (unlikely(p == NULL))
2266 goto out_overflow;
2267 result->max_link = be32_to_cpup(p++);
2268 result->max_namelen = be32_to_cpup(p);
1072 /* ignore remaining fields */ 2269 /* ignore remaining fields */
1073 return 0; 2270 return 0;
2271out_overflow:
2272 print_overflow_msg(__func__, xdr);
2273 return -EIO;
2274}
2275
2276static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
2277 struct xdr_stream *xdr,
2278 struct nfs_pathconf *result)
2279{
2280 enum nfs_stat status;
2281 int error;
2282
2283 error = decode_nfsstat3(xdr, &status);
2284 if (unlikely(error))
2285 goto out;
2286 error = decode_post_op_attr(xdr, result->fattr);
2287 if (unlikely(error))
2288 goto out;
2289 if (status != NFS3_OK)
2290 goto out_status;
2291 error = decode_pathconf3resok(xdr, result);
2292out:
2293 return error;
2294out_status:
2295 return nfs_stat_to_errno(status);
1074} 2296}
1075 2297
1076/* 2298/*
1077 * Decode COMMIT reply 2299 * 3.3.21 COMMIT3res
2300 *
2301 * struct COMMIT3resok {
2302 * wcc_data file_wcc;
2303 * writeverf3 verf;
2304 * };
2305 *
2306 * struct COMMIT3resfail {
2307 * wcc_data file_wcc;
2308 * };
2309 *
2310 * union COMMIT3res switch (nfsstat3 status) {
2311 * case NFS3_OK:
2312 * COMMIT3resok resok;
2313 * default:
2314 * COMMIT3resfail resfail;
2315 * };
1078 */ 2316 */
1079static int 2317static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
1080nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) 2318 struct xdr_stream *xdr,
2319 struct nfs_writeres *result)
1081{ 2320{
1082 int status; 2321 enum nfs_stat status;
1083 2322 int error;
1084 status = ntohl(*p++); 2323
1085 p = xdr_decode_wcc_data(p, res->fattr); 2324 error = decode_nfsstat3(xdr, &status);
1086 if (status != 0) 2325 if (unlikely(error))
1087 return nfs_stat_to_errno(status); 2326 goto out;
1088 2327 error = decode_wcc_data(xdr, result->fattr);
1089 res->verf->verifier[0] = *p++; 2328 if (unlikely(error))
1090 res->verf->verifier[1] = *p++; 2329 goto out;
1091 return 0; 2330 if (status != NFS3_OK)
2331 goto out_status;
2332 error = decode_writeverf3(xdr, result->verf->verifier);
2333out:
2334 return error;
2335out_status:
2336 return nfs_stat_to_errno(status);
1092} 2337}
1093 2338
1094#ifdef CONFIG_NFS_V3_ACL 2339#ifdef CONFIG_NFS_V3_ACL
1095/* 2340
1096 * Decode GETACL reply 2341static inline int decode_getacl3resok(struct xdr_stream *xdr,
1097 */ 2342 struct nfs3_getaclres *result)
1098static int
1099nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p,
1100 struct nfs3_getaclres *res)
1101{ 2343{
1102 struct xdr_buf *buf = &req->rq_rcv_buf;
1103 int status = ntohl(*p++);
1104 struct posix_acl **acl; 2344 struct posix_acl **acl;
1105 unsigned int *aclcnt; 2345 unsigned int *aclcnt;
1106 int err, base; 2346 size_t hdrlen;
1107 2347 int error;
1108 if (status != 0) 2348
1109 return nfs_stat_to_errno(status); 2349 error = decode_post_op_attr(xdr, result->fattr);
1110 p = xdr_decode_post_op_attr(p, res->fattr); 2350 if (unlikely(error))
1111 res->mask = ntohl(*p++); 2351 goto out;
1112 if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) 2352 error = decode_uint32(xdr, &result->mask);
1113 return -EINVAL; 2353 if (unlikely(error))
1114 base = (char *)p - (char *)req->rq_rcv_buf.head->iov_base; 2354 goto out;
1115 2355 error = -EINVAL;
1116 acl = (res->mask & NFS_ACL) ? &res->acl_access : NULL; 2356 if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
1117 aclcnt = (res->mask & NFS_ACLCNT) ? &res->acl_access_count : NULL; 2357 goto out;
1118 err = nfsacl_decode(buf, base, aclcnt, acl); 2358
1119 2359 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
1120 acl = (res->mask & NFS_DFACL) ? &res->acl_default : NULL; 2360
1121 aclcnt = (res->mask & NFS_DFACLCNT) ? &res->acl_default_count : NULL; 2361 acl = NULL;
1122 if (err > 0) 2362 if (result->mask & NFS_ACL)
1123 err = nfsacl_decode(buf, base + err, aclcnt, acl); 2363 acl = &result->acl_access;
1124 return (err > 0) ? 0 : err; 2364 aclcnt = NULL;
2365 if (result->mask & NFS_ACLCNT)
2366 aclcnt = &result->acl_access_count;
2367 error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl);
2368 if (unlikely(error <= 0))
2369 goto out;
2370
2371 acl = NULL;
2372 if (result->mask & NFS_DFACL)
2373 acl = &result->acl_default;
2374 aclcnt = NULL;
2375 if (result->mask & NFS_DFACLCNT)
2376 aclcnt = &result->acl_default_count;
2377 error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl);
2378 if (unlikely(error <= 0))
2379 return error;
2380 error = 0;
2381out:
2382 return error;
1125} 2383}
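Note the offset arithmetic in the two nfsacl_decode() calls above: on success the function returns the number of XDR bytes it consumed, so the second call starts decoding the default ACL at hdrlen plus that count, immediately after the access ACL. Schematically (n is a hypothetical byte count):

	n = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl);	/* access ACL */
	if (n > 0)						/* consumed n bytes */
		n = nfsacl_decode(xdr->buf, hdrlen + n, aclcnt, acl);	/* default ACL */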
1126 2384
1127/* 2385static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req,
1128 * Decode setacl reply. 2386 struct xdr_stream *xdr,
1129 */ 2387 struct nfs3_getaclres *result)
1130static int
1131nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
1132{ 2388{
1133 int status = ntohl(*p++); 2389 enum nfs_stat status;
2390 int error;
2391
2392 error = decode_nfsstat3(xdr, &status);
2393 if (unlikely(error))
2394 goto out;
2395 if (status != NFS3_OK)
2396 goto out_default;
2397 error = decode_getacl3resok(xdr, result);
2398out:
2399 return error;
2400out_default:
2401 return nfs_stat_to_errno(status);
2402}
1134 2403
1135 if (status) 2404static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
1136 return nfs_stat_to_errno(status); 2405 struct xdr_stream *xdr,
1137 xdr_decode_post_op_attr(p, fattr); 2406 struct nfs_fattr *result)
1138 return 0; 2407{
2408 enum nfs_stat status;
2409 int error;
2410
2411 error = decode_nfsstat3(xdr, &status);
2412 if (unlikely(error))
2413 goto out;
2414 if (status != NFS3_OK)
2415 goto out_default;
2416 error = decode_post_op_attr(xdr, result);
2417out:
2418 return error;
2419out_default:
2420 return nfs_stat_to_errno(status);
1139} 2421}
2422
1140#endif /* CONFIG_NFS_V3_ACL */ 2423#endif /* CONFIG_NFS_V3_ACL */
1141 2424
1142#define PROC(proc, argtype, restype, timer) \ 2425#define PROC(proc, argtype, restype, timer) \
1143[NFS3PROC_##proc] = { \ 2426[NFS3PROC_##proc] = { \
1144 .p_proc = NFS3PROC_##proc, \ 2427 .p_proc = NFS3PROC_##proc, \
1145 .p_encode = (kxdrproc_t) nfs3_xdr_##argtype, \ 2428 .p_encode = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args, \
1146 .p_decode = (kxdrproc_t) nfs3_xdr_##restype, \ 2429 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res, \
1147 .p_arglen = NFS3_##argtype##_sz, \ 2430 .p_arglen = NFS3_##argtype##args_sz, \
1148 .p_replen = NFS3_##restype##_sz, \ 2431 .p_replen = NFS3_##restype##res_sz, \
1149 .p_timer = timer, \ 2432 .p_timer = timer, \
1150 .p_statidx = NFS3PROC_##proc, \ 2433 .p_statidx = NFS3PROC_##proc, \
1151 .p_name = #proc, \ 2434 .p_name = #proc, \
1152 } 2435 }
1153 2436
1154struct rpc_procinfo nfs3_procedures[] = { 2437struct rpc_procinfo nfs3_procedures[] = {
1155 PROC(GETATTR, fhandle, attrstat, 1), 2438 PROC(GETATTR, getattr, getattr, 1),
1156 PROC(SETATTR, sattrargs, wccstat, 0), 2439 PROC(SETATTR, setattr, setattr, 0),
1157 PROC(LOOKUP, diropargs, lookupres, 2), 2440 PROC(LOOKUP, lookup, lookup, 2),
1158 PROC(ACCESS, accessargs, accessres, 1), 2441 PROC(ACCESS, access, access, 1),
1159 PROC(READLINK, readlinkargs, readlinkres, 3), 2442 PROC(READLINK, readlink, readlink, 3),
1160 PROC(READ, readargs, readres, 3), 2443 PROC(READ, read, read, 3),
1161 PROC(WRITE, writeargs, writeres, 4), 2444 PROC(WRITE, write, write, 4),
1162 PROC(CREATE, createargs, createres, 0), 2445 PROC(CREATE, create, create, 0),
1163 PROC(MKDIR, mkdirargs, createres, 0), 2446 PROC(MKDIR, mkdir, create, 0),
1164 PROC(SYMLINK, symlinkargs, createres, 0), 2447 PROC(SYMLINK, symlink, create, 0),
1165 PROC(MKNOD, mknodargs, createres, 0), 2448 PROC(MKNOD, mknod, create, 0),
1166 PROC(REMOVE, removeargs, removeres, 0), 2449 PROC(REMOVE, remove, remove, 0),
1167 PROC(RMDIR, diropargs, wccstat, 0), 2450 PROC(RMDIR, lookup, setattr, 0),
1168 PROC(RENAME, renameargs, renameres, 0), 2451 PROC(RENAME, rename, rename, 0),
1169 PROC(LINK, linkargs, linkres, 0), 2452 PROC(LINK, link, link, 0),
1170 PROC(READDIR, readdirargs, readdirres, 3), 2453 PROC(READDIR, readdir, readdir, 3),
1171 PROC(READDIRPLUS, readdirargs, readdirres, 3), 2454 PROC(READDIRPLUS, readdirplus, readdir, 3),
1172 PROC(FSSTAT, fhandle, fsstatres, 0), 2455 PROC(FSSTAT, getattr, fsstat, 0),
1173 PROC(FSINFO, fhandle, fsinfores, 0), 2456 PROC(FSINFO, getattr, fsinfo, 0),
1174 PROC(PATHCONF, fhandle, pathconfres, 0), 2457 PROC(PATHCONF, getattr, pathconf, 0),
1175 PROC(COMMIT, commitargs, commitres, 5), 2458 PROC(COMMIT, commit, commit, 5),
1176}; 2459};
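For reference, the first entry above is a mechanical expansion of the PROC() macro:

[NFS3PROC_GETATTR] = {
	.p_proc    = NFS3PROC_GETATTR,
	.p_encode  = (kxdreproc_t)nfs3_xdr_enc_getattr3args,
	.p_decode  = (kxdrdproc_t)nfs3_xdr_dec_getattr3res,
	.p_arglen  = NFS3_getattrargs_sz,
	.p_replen  = NFS3_getattrres_sz,
	.p_timer   = 1,
	.p_statidx = NFS3PROC_GETATTR,
	.p_name    = "GETATTR",
},

The renamed encode/decode pairs also make the sharing in the table explicit: RMDIR reuses the lookup encoder (its arguments are a plain diropargs3) and the setattr decoder (its result is bare wcc_data), while FSSTAT, FSINFO, and PATHCONF all reuse the getattr encoder, which only emits a file handle.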
1177 2460
1178struct rpc_version nfs_version3 = { 2461struct rpc_version nfs_version3 = {
@@ -1185,8 +2468,8 @@ struct rpc_version nfs_version3 = {
1185static struct rpc_procinfo nfs3_acl_procedures[] = { 2468static struct rpc_procinfo nfs3_acl_procedures[] = {
1186 [ACLPROC3_GETACL] = { 2469 [ACLPROC3_GETACL] = {
1187 .p_proc = ACLPROC3_GETACL, 2470 .p_proc = ACLPROC3_GETACL,
1188 .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs, 2471 .p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args,
1189 .p_decode = (kxdrproc_t) nfs3_xdr_getaclres, 2472 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res,
1190 .p_arglen = ACL3_getaclargs_sz, 2473 .p_arglen = ACL3_getaclargs_sz,
1191 .p_replen = ACL3_getaclres_sz, 2474 .p_replen = ACL3_getaclres_sz,
1192 .p_timer = 1, 2475 .p_timer = 1,
@@ -1194,8 +2477,8 @@ static struct rpc_procinfo nfs3_acl_procedures[] = {
1194 }, 2477 },
1195 [ACLPROC3_SETACL] = { 2478 [ACLPROC3_SETACL] = {
1196 .p_proc = ACLPROC3_SETACL, 2479 .p_proc = ACLPROC3_SETACL,
1197 .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs, 2480 .p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args,
1198 .p_decode = (kxdrproc_t) nfs3_xdr_setaclres, 2481 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res,
1199 .p_arglen = ACL3_setaclargs_sz, 2482 .p_arglen = ACL3_setaclargs_sz,
1200 .p_replen = ACL3_setaclres_sz, 2483 .p_replen = ACL3_setaclres_sz,
1201 .p_timer = 0, 2484 .p_timer = 0,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 9fa496387fdf..7a7474073148 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
44 NFS4CLNT_RECLAIM_REBOOT, 44 NFS4CLNT_RECLAIM_REBOOT,
45 NFS4CLNT_RECLAIM_NOGRACE, 45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_LAYOUTRECALL,
47 NFS4CLNT_SESSION_RESET, 48 NFS4CLNT_SESSION_RESET,
48 NFS4CLNT_RECALL_SLOT, 49 NFS4CLNT_RECALL_SLOT,
49}; 50};
@@ -109,7 +110,7 @@ struct nfs_unique_id {
109struct nfs4_state_owner { 110struct nfs4_state_owner {
110 struct nfs_unique_id so_owner_id; 111 struct nfs_unique_id so_owner_id;
111 struct nfs_server *so_server; 112 struct nfs_server *so_server;
112 struct rb_node so_client_node; 113 struct rb_node so_server_node;
113 114
114 struct rpc_cred *so_cred; /* Associated cred */ 115 struct rpc_cred *so_cred; /* Associated cred */
115 116
@@ -227,12 +228,6 @@ struct nfs4_state_maintenance_ops {
227extern const struct dentry_operations nfs4_dentry_operations; 228extern const struct dentry_operations nfs4_dentry_operations;
228extern const struct inode_operations nfs4_dir_inode_operations; 229extern const struct inode_operations nfs4_dir_inode_operations;
229 230
230/* inode.c */
231extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t);
232extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int);
233extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
234
235
236/* nfs4proc.c */ 231/* nfs4proc.c */
237extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); 232extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
238extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); 233extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
@@ -241,11 +236,12 @@ extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
241extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); 236extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
242extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 237extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
243extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 238extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
244extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); 239extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
245extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 240extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
246extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 241extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
247 struct nfs4_fs_locations *fs_locations, struct page *page); 242 struct nfs4_fs_locations *fs_locations, struct page *page);
248extern void nfs4_release_lockowner(const struct nfs4_lock_state *); 243extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
244extern const struct xattr_handler *nfs4_xattr_handlers[];
249 245
250#if defined(CONFIG_NFS_V4_1) 246#if defined(CONFIG_NFS_V4_1)
251static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 247static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -331,7 +327,6 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
331extern const nfs4_stateid zero_stateid; 327extern const nfs4_stateid zero_stateid;
332 328
333/* nfs4xdr.c */ 329/* nfs4xdr.c */
334extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
335extern struct rpc_procinfo nfs4_procedures[]; 330extern struct rpc_procinfo nfs4_procedures[];
336 331
337struct nfs4_mount_data; 332struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 2e92f0d8d654..23f930caf1e2 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -82,7 +82,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
82{ 82{
83 struct nfs4_file_layout_dsaddr *dsaddr; 83 struct nfs4_file_layout_dsaddr *dsaddr;
84 int status = -EINVAL; 84 int status = -EINVAL;
85 struct nfs_server *nfss = NFS_SERVER(lo->inode); 85 struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
86 86
87 dprintk("--> %s\n", __func__); 87 dprintk("--> %s\n", __func__);
88 88
@@ -101,7 +101,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
101 /* find and reference the deviceid */ 101 /* find and reference the deviceid */
102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); 102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
103 if (dsaddr == NULL) { 103 if (dsaddr == NULL) {
104 dsaddr = get_device_info(lo->inode, id); 104 dsaddr = get_device_info(lo->plh_inode, id);
105 if (dsaddr == NULL) 105 if (dsaddr == NULL)
106 goto out; 106 goto out;
107 } 107 }
@@ -243,7 +243,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
243static void 243static void
244filelayout_free_lseg(struct pnfs_layout_segment *lseg) 244filelayout_free_lseg(struct pnfs_layout_segment *lseg)
245{ 245{
246 struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode); 246 struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
248 248
249 dprintk("--> %s\n", __func__); 249 dprintk("--> %s\n", __func__);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4435e5e1f904..9d992b0346e3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -49,6 +49,7 @@
49#include <linux/mount.h> 49#include <linux/mount.h>
50#include <linux/module.h> 50#include <linux/module.h>
51#include <linux/sunrpc/bc_xprt.h> 51#include <linux/sunrpc/bc_xprt.h>
52#include <linux/xattr.h>
52 53
53#include "nfs4_fs.h" 54#include "nfs4_fs.h"
54#include "delegation.h" 55#include "delegation.h"
@@ -355,9 +356,9 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
355} 356}
356 357
357/* 358/*
358 * Signal state manager thread if session is drained 359 * Signal state manager thread if session fore channel is drained
359 */ 360 */
360static void nfs41_check_drain_session_complete(struct nfs4_session *ses) 361static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
361{ 362{
362 struct rpc_task *task; 363 struct rpc_task *task;
363 364
@@ -371,8 +372,20 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
371 if (ses->fc_slot_table.highest_used_slotid != -1) 372 if (ses->fc_slot_table.highest_used_slotid != -1)
372 return; 373 return;
373 374
374 dprintk("%s COMPLETE: Session Drained\n", __func__); 375 dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
375 complete(&ses->complete); 376 complete(&ses->fc_slot_table.complete);
377}
378
379/*
380 * Signal state manager thread if session back channel is drained
381 */
382void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
383{
384 if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
385 ses->bc_slot_table.highest_used_slotid != -1)
386 return;
387 dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
388 complete(&ses->bc_slot_table.complete);
376} 389}
377 390
378static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) 391static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
@@ -389,7 +402,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
389 402
390 spin_lock(&tbl->slot_tbl_lock); 403 spin_lock(&tbl->slot_tbl_lock);
391 nfs4_free_slot(tbl, res->sr_slot); 404 nfs4_free_slot(tbl, res->sr_slot);
392 nfs41_check_drain_session_complete(res->sr_session); 405 nfs4_check_drain_fc_complete(res->sr_session);
393 spin_unlock(&tbl->slot_tbl_lock); 406 spin_unlock(&tbl->slot_tbl_lock);
394 res->sr_slot = NULL; 407 res->sr_slot = NULL;
395} 408}
@@ -1826,6 +1839,8 @@ struct nfs4_closedata {
1826 struct nfs_closeres res; 1839 struct nfs_closeres res;
1827 struct nfs_fattr fattr; 1840 struct nfs_fattr fattr;
1828 unsigned long timestamp; 1841 unsigned long timestamp;
1842 bool roc;
1843 u32 roc_barrier;
1829}; 1844};
1830 1845
1831static void nfs4_free_closedata(void *data) 1846static void nfs4_free_closedata(void *data)
@@ -1833,6 +1848,8 @@ static void nfs4_free_closedata(void *data)
1833 struct nfs4_closedata *calldata = data; 1848 struct nfs4_closedata *calldata = data;
1834 struct nfs4_state_owner *sp = calldata->state->owner; 1849 struct nfs4_state_owner *sp = calldata->state->owner;
1835 1850
1851 if (calldata->roc)
1852 pnfs_roc_release(calldata->state->inode);
1836 nfs4_put_open_state(calldata->state); 1853 nfs4_put_open_state(calldata->state);
1837 nfs_free_seqid(calldata->arg.seqid); 1854 nfs_free_seqid(calldata->arg.seqid);
1838 nfs4_put_state_owner(sp); 1855 nfs4_put_state_owner(sp);
@@ -1865,6 +1882,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1865 */ 1882 */
1866 switch (task->tk_status) { 1883 switch (task->tk_status) {
1867 case 0: 1884 case 0:
1885 if (calldata->roc)
1886 pnfs_roc_set_barrier(state->inode,
1887 calldata->roc_barrier);
1868 nfs_set_open_stateid(state, &calldata->res.stateid, 0); 1888 nfs_set_open_stateid(state, &calldata->res.stateid, 0);
1869 renew_lease(server, calldata->timestamp); 1889 renew_lease(server, calldata->timestamp);
1870 nfs4_close_clear_stateid_flags(state, 1890 nfs4_close_clear_stateid_flags(state,
@@ -1917,8 +1937,15 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1917 return; 1937 return;
1918 } 1938 }
1919 1939
1920 if (calldata->arg.fmode == 0) 1940 if (calldata->arg.fmode == 0) {
1921 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; 1941 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
1942 if (calldata->roc &&
1943 pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
1944 rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
1945 task, NULL);
1946 return;
1947 }
1948 }
1922 1949
1923 nfs_fattr_init(calldata->res.fattr); 1950 nfs_fattr_init(calldata->res.fattr);
1924 calldata->timestamp = jiffies; 1951 calldata->timestamp = jiffies;
@@ -1946,7 +1973,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
1946 * 1973 *
1947 * NOTE: Caller must be holding the sp->so_owner semaphore! 1974 * NOTE: Caller must be holding the sp->so_owner semaphore!
1948 */ 1975 */
1949int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait) 1976int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
1950{ 1977{
1951 struct nfs_server *server = NFS_SERVER(state->inode); 1978 struct nfs_server *server = NFS_SERVER(state->inode);
1952 struct nfs4_closedata *calldata; 1979 struct nfs4_closedata *calldata;
@@ -1981,11 +2008,12 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
1981 calldata->res.fattr = &calldata->fattr; 2008 calldata->res.fattr = &calldata->fattr;
1982 calldata->res.seqid = calldata->arg.seqid; 2009 calldata->res.seqid = calldata->arg.seqid;
1983 calldata->res.server = server; 2010 calldata->res.server = server;
2011 calldata->roc = roc;
1984 path_get(path); 2012 path_get(path);
1985 calldata->path = *path; 2013 calldata->path = *path;
1986 2014
1987 msg.rpc_argp = &calldata->arg, 2015 msg.rpc_argp = &calldata->arg;
1988 msg.rpc_resp = &calldata->res, 2016 msg.rpc_resp = &calldata->res;
1989 task_setup_data.callback_data = calldata; 2017 task_setup_data.callback_data = calldata;
1990 task = rpc_run_task(&task_setup_data); 2018 task = rpc_run_task(&task_setup_data);
1991 if (IS_ERR(task)) 2019 if (IS_ERR(task))
@@ -1998,6 +2026,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
1998out_free_calldata: 2026out_free_calldata:
1999 kfree(calldata); 2027 kfree(calldata);
2000out: 2028out:
2029 if (roc)
2030 pnfs_roc_release(state->inode);
2001 nfs4_put_open_state(state); 2031 nfs4_put_open_state(state);
2002 nfs4_put_state_owner(sp); 2032 nfs4_put_state_owner(sp);
2003 return status; 2033 return status;
@@ -2486,6 +2516,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2486 path = &ctx->path; 2516 path = &ctx->path;
2487 fmode = ctx->mode; 2517 fmode = ctx->mode;
2488 } 2518 }
2519 sattr->ia_mode &= ~current_umask();
2489 state = nfs4_do_open(dir, path, fmode, flags, sattr, cred); 2520 state = nfs4_do_open(dir, path, fmode, flags, sattr, cred);
2490 d_drop(dentry); 2521 d_drop(dentry);
2491 if (IS_ERR(state)) { 2522 if (IS_ERR(state)) {
@@ -2816,6 +2847,8 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
2816{ 2847{
2817 struct nfs4_exception exception = { }; 2848 struct nfs4_exception exception = { };
2818 int err; 2849 int err;
2850
2851 sattr->ia_mode &= ~current_umask();
2819 do { 2852 do {
2820 err = nfs4_handle_exception(NFS_SERVER(dir), 2853 err = nfs4_handle_exception(NFS_SERVER(dir),
2821 _nfs4_proc_mkdir(dir, dentry, sattr), 2854 _nfs4_proc_mkdir(dir, dentry, sattr),
@@ -2916,6 +2949,8 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
2916{ 2949{
2917 struct nfs4_exception exception = { }; 2950 struct nfs4_exception exception = { };
2918 int err; 2951 int err;
2952
2953 sattr->ia_mode &= ~current_umask();
2919 do { 2954 do {
2920 err = nfs4_handle_exception(NFS_SERVER(dir), 2955 err = nfs4_handle_exception(NFS_SERVER(dir),
2921 _nfs4_proc_mknod(dir, dentry, sattr, rdev), 2956 _nfs4_proc_mknod(dir, dentry, sattr, rdev),
@@ -3478,6 +3513,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3478 struct nfs4_setclientid setclientid = { 3513 struct nfs4_setclientid setclientid = {
3479 .sc_verifier = &sc_verifier, 3514 .sc_verifier = &sc_verifier,
3480 .sc_prog = program, 3515 .sc_prog = program,
3516 .sc_cb_ident = clp->cl_cb_ident,
3481 }; 3517 };
3482 struct rpc_message msg = { 3518 struct rpc_message msg = {
3483 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], 3519 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@@ -3517,7 +3553,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3517 if (signalled()) 3553 if (signalled())
3518 break; 3554 break;
3519 if (loop++ & 1) 3555 if (loop++ & 1)
3520 ssleep(clp->cl_lease_time + 1); 3556 ssleep(clp->cl_lease_time / HZ + 1);
3521 else 3557 else
3522 if (++clp->cl_id_uniquifier == 0) 3558 if (++clp->cl_id_uniquifier == 0)
3523 break; 3559 break;
@@ -3663,8 +3699,8 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3663 data->rpc_status = 0; 3699 data->rpc_status = 0;
3664 3700
3665 task_setup_data.callback_data = data; 3701 task_setup_data.callback_data = data;
3666 msg.rpc_argp = &data->args, 3702 msg.rpc_argp = &data->args;
3667 msg.rpc_resp = &data->res, 3703 msg.rpc_resp = &data->res;
3668 task = rpc_run_task(&task_setup_data); 3704 task = rpc_run_task(&task_setup_data);
3669 if (IS_ERR(task)) 3705 if (IS_ERR(task))
3670 return PTR_ERR(task); 3706 return PTR_ERR(task);
@@ -3743,6 +3779,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3743 goto out; 3779 goto out;
3744 lsp = request->fl_u.nfs4_fl.owner; 3780 lsp = request->fl_u.nfs4_fl.owner;
3745 arg.lock_owner.id = lsp->ls_id.id; 3781 arg.lock_owner.id = lsp->ls_id.id;
3782 arg.lock_owner.s_dev = server->s_dev;
3746 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 3783 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
3747 switch (status) { 3784 switch (status) {
3748 case 0: 3785 case 0:
@@ -3908,8 +3945,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
3908 return ERR_PTR(-ENOMEM); 3945 return ERR_PTR(-ENOMEM);
3909 } 3946 }
3910 3947
3911 msg.rpc_argp = &data->arg, 3948 msg.rpc_argp = &data->arg;
3912 msg.rpc_resp = &data->res, 3949 msg.rpc_resp = &data->res;
3913 task_setup_data.callback_data = data; 3950 task_setup_data.callback_data = data;
3914 return rpc_run_task(&task_setup_data); 3951 return rpc_run_task(&task_setup_data);
3915} 3952}
@@ -3988,6 +4025,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
3988 p->arg.lock_stateid = &lsp->ls_stateid; 4025 p->arg.lock_stateid = &lsp->ls_stateid;
3989 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 4026 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
3990 p->arg.lock_owner.id = lsp->ls_id.id; 4027 p->arg.lock_owner.id = lsp->ls_id.id;
4028 p->arg.lock_owner.s_dev = server->s_dev;
3991 p->res.lock_seqid = p->arg.lock_seqid; 4029 p->res.lock_seqid = p->arg.lock_seqid;
3992 p->lsp = lsp; 4030 p->lsp = lsp;
3993 p->server = server; 4031 p->server = server;
@@ -4145,8 +4183,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
4145 data->arg.reclaim = NFS_LOCK_RECLAIM; 4183 data->arg.reclaim = NFS_LOCK_RECLAIM;
4146 task_setup_data.callback_ops = &nfs4_recover_lock_ops; 4184 task_setup_data.callback_ops = &nfs4_recover_lock_ops;
4147 } 4185 }
4148 msg.rpc_argp = &data->arg, 4186 msg.rpc_argp = &data->arg;
4149 msg.rpc_resp = &data->res, 4187 msg.rpc_resp = &data->res;
4150 task_setup_data.callback_data = data; 4188 task_setup_data.callback_data = data;
4151 task = rpc_run_task(&task_setup_data); 4189 task = rpc_run_task(&task_setup_data);
4152 if (IS_ERR(task)) 4190 if (IS_ERR(task))
@@ -4392,48 +4430,43 @@ void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
4392 return; 4430 return;
4393 args->lock_owner.clientid = server->nfs_client->cl_clientid; 4431 args->lock_owner.clientid = server->nfs_client->cl_clientid;
4394 args->lock_owner.id = lsp->ls_id.id; 4432 args->lock_owner.id = lsp->ls_id.id;
4433 args->lock_owner.s_dev = server->s_dev;
4395 msg.rpc_argp = args; 4434 msg.rpc_argp = args;
4396 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); 4435 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
4397} 4436}
4398 4437
4399#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" 4438#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
4400 4439
4401int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, 4440static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
4402 size_t buflen, int flags) 4441 const void *buf, size_t buflen,
4442 int flags, int type)
4403{ 4443{
4404 struct inode *inode = dentry->d_inode; 4444 if (strcmp(key, "") != 0)
4405 4445 return -EINVAL;
4406 if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
4407 return -EOPNOTSUPP;
4408 4446
4409 return nfs4_proc_set_acl(inode, buf, buflen); 4447 return nfs4_proc_set_acl(dentry->d_inode, buf, buflen);
4410} 4448}
4411 4449
4412/* The getxattr man page suggests returning -ENODATA for unknown attributes, 4450static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
4413 * and that's what we'll do for e.g. user attributes that haven't been set. 4451 void *buf, size_t buflen, int type)
4414 * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported
4415 * attributes in kernel-managed attribute namespaces. */
4416ssize_t nfs4_getxattr(struct dentry *dentry, const char *key, void *buf,
4417 size_t buflen)
4418{ 4452{
4419 struct inode *inode = dentry->d_inode; 4453 if (strcmp(key, "") != 0)
4420 4454 return -EINVAL;
4421 if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
4422 return -EOPNOTSUPP;
4423 4455
4424 return nfs4_proc_get_acl(inode, buf, buflen); 4456 return nfs4_proc_get_acl(dentry->d_inode, buf, buflen);
4425} 4457}
4426 4458
4427ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) 4459static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
4460 size_t list_len, const char *name,
4461 size_t name_len, int type)
4428{ 4462{
4429 size_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1; 4463 size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
4430 4464
4431 if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode))) 4465 if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode)))
4432 return 0; 4466 return 0;
4433 if (buf && buflen < len) 4467
4434 return -ERANGE; 4468 if (list && len <= list_len)
4435 if (buf) 4469 memcpy(list, XATTR_NAME_NFSV4_ACL, len);
4436 memcpy(buf, XATTR_NAME_NFSV4_ACL, len);
4437 return len; 4470 return len;
4438} 4471}
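These three static functions plug into the generic xattr handler mechanism; the nfs4_xattr_handlers table this series declares is presumably populated along these lines (a sketch, not a literal hunk from the patch):

static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
	.prefix	= XATTR_NAME_NFSV4_ACL,
	.list	= nfs4_xattr_list_nfs4_acl,
	.get	= nfs4_xattr_get_nfs4_acl,
	.set	= nfs4_xattr_set_nfs4_acl,
};

const struct xattr_handler *nfs4_xattr_handlers[] = {
	&nfs4_xattr_nfs4_acl_handler,
	NULL,
};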
4439 4472
@@ -4486,6 +4519,25 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
4486 4519
4487#ifdef CONFIG_NFS_V4_1 4520#ifdef CONFIG_NFS_V4_1
4488/* 4521/*
 4522 * Validate the exchange flags returned by the server.  Fail if the server
 4523 * set any invalid flags, set both the PNFS and NON_PNFS flags, or set none
 4524 * of the NON_PNFS, PNFS, or DS flags.
4525 */
4526static int nfs4_check_cl_exchange_flags(u32 flags)
4527{
4528 if (flags & ~EXCHGID4_FLAG_MASK_R)
4529 goto out_inval;
4530 if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) &&
4531 (flags & EXCHGID4_FLAG_USE_NON_PNFS))
4532 goto out_inval;
4533 if (!(flags & (EXCHGID4_FLAG_MASK_PNFS)))
4534 goto out_inval;
4535 return NFS_OK;
4536out_inval:
4537 return -NFS4ERR_INVAL;
4538}
4539
4540/*
4489 * nfs4_proc_exchange_id() 4541 * nfs4_proc_exchange_id()
4490 * 4542 *
4491 * Since the clientid has expired, all compounds using sessions 4543 * Since the clientid has expired, all compounds using sessions
@@ -4498,7 +4550,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 	nfs4_verifier verifier;
 	struct nfs41_exchange_id_args args = {
 		.client = clp,
-		.flags = clp->cl_exchange_flags,
+		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
 	};
 	struct nfs41_exchange_id_res res = {
 		.client = clp,
@@ -4515,9 +4567,6 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 	dprintk("--> %s\n", __func__);
 	BUG_ON(clp == NULL);
 
-	/* Remove server-only flags */
-	args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R;
-
 	p = (u32 *)verifier.data;
 	*p++ = htonl((u32)clp->cl_boot_time.tv_sec);
 	*p = htonl((u32)clp->cl_boot_time.tv_nsec);
@@ -4543,6 +4592,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 		break;
 	}
 
+	status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
 	dprintk("<-- %s status= %d\n", __func__, status);
 	return status;
 }
@@ -4776,17 +4826,17 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 	if (!session)
 		return NULL;
 
-	init_completion(&session->complete);
-
 	tbl = &session->fc_slot_table;
 	tbl->highest_used_slotid = -1;
 	spin_lock_init(&tbl->slot_tbl_lock);
 	rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
+	init_completion(&tbl->complete);
 
 	tbl = &session->bc_slot_table;
 	tbl->highest_used_slotid = -1;
 	spin_lock_init(&tbl->slot_tbl_lock);
 	rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
+	init_completion(&tbl->complete);
 
 	session->session_state = 1<<NFS4_SESSION_INITING;
 
@@ -5280,13 +5330,23 @@ static void
 nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_layoutget *lgp = calldata;
-	struct inode *ino = lgp->args.inode;
-	struct nfs_server *server = NFS_SERVER(ino);
+	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
 
 	dprintk("--> %s\n", __func__);
+	/* Note that there is a race here, where a CB_LAYOUTRECALL can come
+	 * in right now covering the LAYOUTGET we are about to send.
+	 * However, that is not so catastrophic, and there seems
+	 * to be no way to prevent it completely.
+	 */
 	if (nfs4_setup_sequence(server, &lgp->args.seq_args,
 				&lgp->res.seq_res, 0, task))
 		return;
+	if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
+					  NFS_I(lgp->args.inode)->layout,
+					  lgp->args.ctx->state)) {
+		rpc_exit(task, NFS4_OK);
+		return;
+	}
 	rpc_call_start(task);
 }
 
@@ -5313,7 +5373,6 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 			return;
 		}
 	}
-	lgp->status = task->tk_status;
 	dprintk("<-- %s\n", __func__);
 }
 
@@ -5322,7 +5381,6 @@ static void nfs4_layoutget_release(void *calldata)
 	struct nfs4_layoutget *lgp = calldata;
 
 	dprintk("--> %s\n", __func__);
-	put_layout_hdr(lgp->args.inode);
 	if (lgp->res.layout.buf != NULL)
 		free_page((unsigned long) lgp->res.layout.buf);
 	put_nfs_open_context(lgp->args.ctx);
@@ -5367,13 +5425,10 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
 	if (IS_ERR(task))
 		return PTR_ERR(task);
 	status = nfs4_wait_for_completion_rpc_task(task);
-	if (status != 0)
-		goto out;
-	status = lgp->status;
-	if (status != 0)
-		goto out;
-	status = pnfs_layout_process(lgp);
-out:
+	if (status == 0)
+		status = task->tk_status;
+	if (status == 0)
+		status = pnfs_layout_process(lgp);
 	rpc_put_task(task);
 	dprintk("<-- %s status=%d\n", __func__, status);
 	return status;
@@ -5504,9 +5559,10 @@ static const struct inode_operations nfs4_file_inode_operations = {
 	.permission	= nfs_permission,
 	.getattr	= nfs_getattr,
 	.setattr	= nfs_setattr,
-	.getxattr	= nfs4_getxattr,
-	.setxattr	= nfs4_setxattr,
-	.listxattr	= nfs4_listxattr,
+	.getxattr	= generic_getxattr,
+	.setxattr	= generic_setxattr,
+	.listxattr	= generic_listxattr,
+	.removexattr	= generic_removexattr,
 };
 
 const struct nfs_rpc_ops nfs_v4_clientops = {
@@ -5551,6 +5607,18 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.open_context	= nfs4_atomic_open,
 };
 
+static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
+	.prefix	= XATTR_NAME_NFSV4_ACL,
+	.list	= nfs4_xattr_list_nfs4_acl,
+	.get	= nfs4_xattr_get_nfs4_acl,
+	.set	= nfs4_xattr_set_nfs4_acl,
+};
+
+const struct xattr_handler *nfs4_xattr_handlers[] = {
+	&nfs4_xattr_nfs4_acl_handler,
+	NULL
+};
+
 /*
  * Local variables:
  *  c-basic-offset: 8
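The hunks above convert NFSv4 ACL access from dedicated getxattr/listxattr
inode operations to the VFS's prefix-dispatched xattr handler tables. As a
rough sketch of the consumer side, assuming the generic_getxattr()
conventions of this kernel generation: the VFS walks the NULL-terminated
sb->s_xattr array, matches the attribute name against each handler's
->prefix, and invokes ->get() with only the remainder of the name, which is
why nfs4_xattr_get_nfs4_acl() accepts nothing but an empty suffix. The
fill_super helper below is illustrative only, not part of this patch:

	static int nfs4_example_fill_super(struct super_block *sb)
	{
		/* hand the prefix-dispatched handler table to the VFS */
		sb->s_xattr = nfs4_xattr_handlers;
		return 0;
	}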
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 72b6c580af13..402143d75fc5 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -63,9 +63,14 @@ nfs4_renew_state(struct work_struct *work)
 
 	ops = clp->cl_mvops->state_renewal_ops;
 	dprintk("%s: start\n", __func__);
-	/* Are there any active superblocks? */
-	if (list_empty(&clp->cl_superblocks))
+
+	rcu_read_lock();
+	if (list_empty(&clp->cl_superblocks)) {
+		rcu_read_unlock();
 		goto out;
+	}
+	rcu_read_unlock();
+
 	spin_lock(&clp->cl_lock);
 	lease = clp->cl_lease_time;
 	last = clp->cl_last_renewal;
@@ -75,7 +80,7 @@ nfs4_renew_state(struct work_struct *work)
 	cred = ops->get_state_renewal_cred_locked(clp);
 	spin_unlock(&clp->cl_lock);
 	if (cred == NULL) {
-		if (list_empty(&clp->cl_delegations)) {
+		if (!nfs_delegations_present(clp)) {
 			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
 			goto out;
 		}
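nfs4_renew_state() now asks nfs_delegations_present() instead of peeking at
clp->cl_delegations because this series moves the delegation list from the
nfs_client to each nfs_server. A minimal sketch of the shape such a helper
takes, assuming a per-server list named "delegations" and the same
RCU-protected cl_superblocks walk used elsewhere in this diff:

	int nfs_delegations_present(struct nfs_client *clp)
	{
		struct nfs_server *server;
		int ret = 0;

		/* walk every superblock sharing this client state */
		rcu_read_lock();
		list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
			if (!list_empty(&server->delegations)) {
				ret = 1;
				break;
			}
		rcu_read_unlock();
		return ret;
	}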
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f575a3126737..2336d532cf66 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -105,14 +105,17 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp)
 	put_rpccred(cred);
 }
 
-struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
+static struct rpc_cred *
+nfs4_get_renew_cred_server_locked(struct nfs_server *server)
 {
+	struct rpc_cred *cred = NULL;
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
-	struct rpc_cred *cred = NULL;
 
-	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
-		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+	for (pos = rb_first(&server->state_owners);
+	     pos != NULL;
+	     pos = rb_next(pos)) {
+		sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
 		if (list_empty(&sp->so_states))
 			continue;
 		cred = get_rpccred(sp->so_cred);
@@ -121,6 +124,28 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
 	return cred;
 }
 
+/**
+ * nfs4_get_renew_cred_locked - Acquire credential for a renew operation
+ * @clp: client state handle
+ *
+ * Returns an rpc_cred with reference count bumped, or NULL.
+ * Caller must hold clp->cl_lock.
+ */
+struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
+{
+	struct rpc_cred *cred = NULL;
+	struct nfs_server *server;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		cred = nfs4_get_renew_cred_server_locked(server);
+		if (cred != NULL)
+			break;
+	}
+	rcu_read_unlock();
+	return cred;
+}
+
 #if defined(CONFIG_NFS_V4_1)
 
 static int nfs41_setup_state_renewal(struct nfs_client *clp)
@@ -142,6 +167,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
 	return status;
 }
 
+/*
+ * Back channel returns NFS4ERR_DELAY for new requests when
+ * NFS4_SESSION_DRAINING is set so there is no work to be done when draining
+ * is ended.
+ */
 static void nfs4_end_drain_session(struct nfs_client *clp)
 {
 	struct nfs4_session *ses = clp->cl_session;
@@ -165,22 +195,32 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
 	}
 }
 
-static int nfs4_begin_drain_session(struct nfs_client *clp)
+static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
 {
-	struct nfs4_session *ses = clp->cl_session;
-	struct nfs4_slot_table *tbl = &ses->fc_slot_table;
-
 	spin_lock(&tbl->slot_tbl_lock);
-	set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
 	if (tbl->highest_used_slotid != -1) {
-		INIT_COMPLETION(ses->complete);
+		INIT_COMPLETION(tbl->complete);
 		spin_unlock(&tbl->slot_tbl_lock);
-		return wait_for_completion_interruptible(&ses->complete);
+		return wait_for_completion_interruptible(&tbl->complete);
 	}
 	spin_unlock(&tbl->slot_tbl_lock);
 	return 0;
 }
 
+static int nfs4_begin_drain_session(struct nfs_client *clp)
+{
+	struct nfs4_session *ses = clp->cl_session;
+	int ret = 0;
+
+	set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
+	/* back channel */
+	ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table);
+	if (ret)
+		return ret;
+	/* fore channel */
+	return nfs4_wait_on_slot_tbl(&ses->fc_slot_table);
+}
+
 int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	int status;
@@ -192,6 +232,12 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 	status = nfs4_proc_create_session(clp);
 	if (status != 0)
 		goto out;
+	status = nfs4_set_callback_sessionid(clp);
+	if (status != 0) {
+		printk(KERN_WARNING "Sessionid not set. No callback service\n");
+		nfs_callback_down(1);
+		status = 0;
+	}
 	nfs41_setup_state_renewal(clp);
 	nfs_mark_client_ready(clp, NFS_CS_READY);
 out:
@@ -210,28 +256,56 @@ struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
 
 #endif /* CONFIG_NFS_V4_1 */
 
-struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
+static struct rpc_cred *
+nfs4_get_setclientid_cred_server(struct nfs_server *server)
 {
+	struct nfs_client *clp = server->nfs_client;
+	struct rpc_cred *cred = NULL;
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
+
+	spin_lock(&clp->cl_lock);
+	pos = rb_first(&server->state_owners);
+	if (pos != NULL) {
+		sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
+		cred = get_rpccred(sp->so_cred);
+	}
+	spin_unlock(&clp->cl_lock);
+	return cred;
+}
+
+/**
+ * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation
+ * @clp: client state handle
+ *
+ * Returns an rpc_cred with reference count bumped, or NULL.
+ */
+struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
+{
+	struct nfs_server *server;
 	struct rpc_cred *cred;
 
 	spin_lock(&clp->cl_lock);
 	cred = nfs4_get_machine_cred_locked(clp);
+	spin_unlock(&clp->cl_lock);
 	if (cred != NULL)
 		goto out;
-	pos = rb_first(&clp->cl_state_owners);
-	if (pos != NULL) {
-		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
-		cred = get_rpccred(sp->so_cred);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		cred = nfs4_get_setclientid_cred_server(server);
+		if (cred != NULL)
+			break;
 	}
+	rcu_read_unlock();
+
 out:
-	spin_unlock(&clp->cl_lock);
 	return cred;
 }
 
-static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new,
-		__u64 minval, int maxbits)
+static void nfs_alloc_unique_id_locked(struct rb_root *root,
+				       struct nfs_unique_id *new,
+				       __u64 minval, int maxbits)
 {
 	struct rb_node **p, *parent;
 	struct nfs_unique_id *pos;
@@ -286,16 +360,15 @@ static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id)
 }
 
 static struct nfs4_state_owner *
-nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred)
+nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
 {
-	struct nfs_client *clp = server->nfs_client;
-	struct rb_node **p = &clp->cl_state_owners.rb_node,
+	struct rb_node **p = &server->state_owners.rb_node,
 		       *parent = NULL;
 	struct nfs4_state_owner *sp, *res = NULL;
 
 	while (*p != NULL) {
 		parent = *p;
-		sp = rb_entry(parent, struct nfs4_state_owner, so_client_node);
+		sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
 
 		if (server < sp->so_server) {
 			p = &parent->rb_left;
@@ -319,24 +392,17 @@ nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred)
 }
 
 static struct nfs4_state_owner *
-nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new)
+nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
 {
-	struct rb_node **p = &clp->cl_state_owners.rb_node,
+	struct nfs_server *server = new->so_server;
+	struct rb_node **p = &server->state_owners.rb_node,
 		       *parent = NULL;
 	struct nfs4_state_owner *sp;
 
 	while (*p != NULL) {
 		parent = *p;
-		sp = rb_entry(parent, struct nfs4_state_owner, so_client_node);
+		sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
 
-		if (new->so_server < sp->so_server) {
-			p = &parent->rb_left;
-			continue;
-		}
-		if (new->so_server > sp->so_server) {
-			p = &parent->rb_right;
-			continue;
-		}
 		if (new->so_cred < sp->so_cred)
 			p = &parent->rb_left;
 		else if (new->so_cred > sp->so_cred)
@@ -346,18 +412,21 @@ nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new)
 			return sp;
 		}
 	}
-	nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64);
-	rb_link_node(&new->so_client_node, parent, p);
-	rb_insert_color(&new->so_client_node, &clp->cl_state_owners);
+	nfs_alloc_unique_id_locked(&server->openowner_id,
+				   &new->so_owner_id, 1, 64);
+	rb_link_node(&new->so_server_node, parent, p);
+	rb_insert_color(&new->so_server_node, &server->state_owners);
 	return new;
 }
 
 static void
-nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp)
+nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
 {
-	if (!RB_EMPTY_NODE(&sp->so_client_node))
-		rb_erase(&sp->so_client_node, &clp->cl_state_owners);
-	nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id);
+	struct nfs_server *server = sp->so_server;
+
+	if (!RB_EMPTY_NODE(&sp->so_server_node))
+		rb_erase(&sp->so_server_node, &server->state_owners);
+	nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id);
 }
 
 /*
@@ -386,23 +455,32 @@ nfs4_alloc_state_owner(void)
 static void
 nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 {
-	if (!RB_EMPTY_NODE(&sp->so_client_node)) {
-		struct nfs_client *clp = sp->so_server->nfs_client;
+	if (!RB_EMPTY_NODE(&sp->so_server_node)) {
+		struct nfs_server *server = sp->so_server;
+		struct nfs_client *clp = server->nfs_client;
 
 		spin_lock(&clp->cl_lock);
-		rb_erase(&sp->so_client_node, &clp->cl_state_owners);
-		RB_CLEAR_NODE(&sp->so_client_node);
+		rb_erase(&sp->so_server_node, &server->state_owners);
+		RB_CLEAR_NODE(&sp->so_server_node);
 		spin_unlock(&clp->cl_lock);
 	}
 }
 
-struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
+/**
+ * nfs4_get_state_owner - Look up a state owner given a credential
+ * @server: nfs_server to search
+ * @cred: RPC credential to match
+ *
+ * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
+ */
+struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
+					      struct rpc_cred *cred)
 {
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_state_owner *sp, *new;
 
 	spin_lock(&clp->cl_lock);
-	sp = nfs4_find_state_owner(server, cred);
+	sp = nfs4_find_state_owner_locked(server, cred);
 	spin_unlock(&clp->cl_lock);
 	if (sp != NULL)
 		return sp;
@@ -412,7 +490,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 	new->so_server = server;
 	new->so_cred = cred;
 	spin_lock(&clp->cl_lock);
-	sp = nfs4_insert_state_owner(clp, new);
+	sp = nfs4_insert_state_owner_locked(new);
 	spin_unlock(&clp->cl_lock);
 	if (sp == new)
 		get_rpccred(cred);
@@ -423,6 +501,11 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 	return sp;
 }
 
+/**
+ * nfs4_put_state_owner - Release a nfs4_state_owner
+ * @sp: state owner data to release
+ *
+ */
 void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 {
 	struct nfs_client *clp = sp->so_server->nfs_client;
@@ -430,7 +513,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 
 	if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
 		return;
-	nfs4_remove_state_owner(clp, sp);
+	nfs4_remove_state_owner_locked(sp);
 	spin_unlock(&clp->cl_lock);
 	rpc_destroy_wait_queue(&sp->so_sequence.wait);
 	put_rpccred(cred);
@@ -585,8 +668,11 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
 	if (!call_close) {
 		nfs4_put_open_state(state);
 		nfs4_put_state_owner(owner);
-	} else
-		nfs4_do_close(path, state, gfp_mask, wait);
+	} else {
+		bool roc = pnfs_roc(state->inode);
+
+		nfs4_do_close(path, state, gfp_mask, wait, roc);
+	}
 }
 
 void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
@@ -633,7 +719,8 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p
 static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
 {
 	struct nfs4_lock_state *lsp;
-	struct nfs_client *clp = state->owner->so_server->nfs_client;
+	struct nfs_server *server = state->owner->so_server;
+	struct nfs_client *clp = server->nfs_client;
 
 	lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
 	if (lsp == NULL)
@@ -657,7 +744,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 		return NULL;
 	}
 	spin_lock(&clp->cl_lock);
-	nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
+	nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64);
 	spin_unlock(&clp->cl_lock);
 	INIT_LIST_HEAD(&lsp->ls_locks);
 	return lsp;
@@ -665,10 +752,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 
 static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
 {
-	struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client;
+	struct nfs_server *server = lsp->ls_state->owner->so_server;
+	struct nfs_client *clp = server->nfs_client;
 
 	spin_lock(&clp->cl_lock);
-	nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id);
+	nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id);
 	spin_unlock(&clp->cl_lock);
 	rpc_destroy_wait_queue(&lsp->ls_sequence.wait);
 	kfree(lsp);
@@ -1114,15 +1202,19 @@ static void nfs4_clear_open_state(struct nfs4_state *state)
 	}
 }
 
-static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
+static void nfs4_reset_seqids(struct nfs_server *server,
+	int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
 {
+	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
 	struct nfs4_state *state;
 
-	/* Reset all sequence ids to zero */
-	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
-		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+	spin_lock(&clp->cl_lock);
+	for (pos = rb_first(&server->state_owners);
+	     pos != NULL;
+	     pos = rb_next(pos)) {
+		sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
 		sp->so_seqid.flags = 0;
 		spin_lock(&sp->so_lock);
 		list_for_each_entry(state, &sp->so_states, open_states) {
@@ -1131,6 +1223,18 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_re
 		}
 		spin_unlock(&sp->so_lock);
 	}
+	spin_unlock(&clp->cl_lock);
+}
+
+static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp,
+	int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
+{
+	struct nfs_server *server;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		nfs4_reset_seqids(server, mark_reclaim);
+	rcu_read_unlock();
 }
 
 static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
@@ -1148,25 +1252,41 @@ static void nfs4_reclaim_complete(struct nfs_client *clp,
 	(void)ops->reclaim_complete(clp);
 }
 
-static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
+static void nfs4_clear_reclaim_server(struct nfs_server *server)
 {
+	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
 	struct nfs4_state *state;
 
-	if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
-		return 0;
-
-	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
-		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+	spin_lock(&clp->cl_lock);
+	for (pos = rb_first(&server->state_owners);
+	     pos != NULL;
+	     pos = rb_next(pos)) {
+		sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
 		spin_lock(&sp->so_lock);
 		list_for_each_entry(state, &sp->so_states, open_states) {
-			if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags))
+			if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT,
+						&state->flags))
 				continue;
 			nfs4_state_mark_reclaim_nograce(clp, state);
 		}
 		spin_unlock(&sp->so_lock);
 	}
+	spin_unlock(&clp->cl_lock);
+}
+
+static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
+{
+	struct nfs_server *server;
+
+	if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+		return 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		nfs4_clear_reclaim_server(server);
+	rcu_read_unlock();
 
 	nfs_delegation_reap_unclaimed(clp);
 	return 1;
@@ -1238,27 +1358,40 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 
 static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
 {
+	struct nfs4_state_owner *sp;
+	struct nfs_server *server;
 	struct rb_node *pos;
 	int status = 0;
 
 restart:
-	spin_lock(&clp->cl_lock);
-	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
-		struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
-		if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags))
-			continue;
-		atomic_inc(&sp->so_count);
-		spin_unlock(&clp->cl_lock);
-		status = nfs4_reclaim_open_state(sp, ops);
-		if (status < 0) {
-			set_bit(ops->owner_flag_bit, &sp->so_flags);
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		spin_lock(&clp->cl_lock);
+		for (pos = rb_first(&server->state_owners);
+		     pos != NULL;
+		     pos = rb_next(pos)) {
+			sp = rb_entry(pos,
+			     struct nfs4_state_owner, so_server_node);
+			if (!test_and_clear_bit(ops->owner_flag_bit,
+						&sp->so_flags))
+				continue;
+			atomic_inc(&sp->so_count);
+			spin_unlock(&clp->cl_lock);
+			rcu_read_unlock();
+
+			status = nfs4_reclaim_open_state(sp, ops);
+			if (status < 0) {
+				set_bit(ops->owner_flag_bit, &sp->so_flags);
+				nfs4_put_state_owner(sp);
+				return nfs4_recovery_handle_error(clp, status);
+			}
+
 			nfs4_put_state_owner(sp);
-			return nfs4_recovery_handle_error(clp, status);
+			goto restart;
 		}
-		nfs4_put_state_owner(sp);
-		goto restart;
+		spin_unlock(&clp->cl_lock);
 	}
-	spin_unlock(&clp->cl_lock);
+	rcu_read_unlock();
 	return status;
 }
 
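Every hunk in fs/nfs/nfs4state.c above is one mechanical move: the red-black
tree of state owners and the unique-ID allocators migrate from the shared
nfs_client to each nfs_server, so each walk becomes an RCU iteration over
clp->cl_superblocks plus a per-server tree walk under clp->cl_lock. A sketch
of the nfs_server fields this code relies on, assuming the header side of
the series (the comments are explanatory, not quoted from it):

	struct nfs_server {
		/* ... existing fields ... */
		struct list_head	client_link;	/* entry on clp->cl_superblocks, walked under RCU */
		struct rb_root		state_owners;	/* was clp->cl_state_owners */
		struct rb_root		openowner_id;	/* was clp->cl_openowner_id */
		struct rb_root		lockowner_id;	/* was clp->cl_lockowner_id */
		/* ... */
	};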
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 9f1826b012e6..2ab8e5cb8f59 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -71,8 +71,8 @@ static int nfs4_stat_to_errno(int);
 /* lock,open owner id:
  * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
  */
-#define open_owner_id_maxsz	(1 + 4)
-#define lock_owner_id_maxsz	(1 + 4)
+#define open_owner_id_maxsz	(1 + 1 + 4)
+#define lock_owner_id_maxsz	(1 + 1 + 4)
 #define decode_lockowner_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))
 #define compound_encode_hdr_maxsz	(3 + (NFS4_MAXTAGLEN >> 2))
 #define compound_decode_hdr_maxsz	(3 + (NFS4_MAXTAGLEN >> 2))
@@ -1088,10 +1088,11 @@ static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lo
 {
 	__be32 *p;
 
-	p = reserve_space(xdr, 28);
+	p = reserve_space(xdr, 32);
 	p = xdr_encode_hyper(p, lowner->clientid);
-	*p++ = cpu_to_be32(16);
+	*p++ = cpu_to_be32(20);
 	p = xdr_encode_opaque_fixed(p, "lock id:", 8);
+	*p++ = cpu_to_be32(lowner->s_dev);
 	xdr_encode_hyper(p, lowner->id);
 }
 
@@ -1210,10 +1211,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 	*p++ = cpu_to_be32(OP_OPEN);
 	*p = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
-	p = reserve_space(xdr, 28);
+	p = reserve_space(xdr, 32);
 	p = xdr_encode_hyper(p, arg->clientid);
-	*p++ = cpu_to_be32(16);
+	*p++ = cpu_to_be32(20);
 	p = xdr_encode_opaque_fixed(p, "open id:", 8);
+	*p++ = cpu_to_be32(arg->server->s_dev);
 	xdr_encode_hyper(p, arg->id);
 }
 
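(Worked size arithmetic for the two owner-encoding hunks above: the encoded
owner is clientid[8] + opaque length word[4] + "open id:" or "lock id:"[8] +
s_dev[4] + id[8], so the opaque body grows from 16 to 20 bytes, the reserved
buffer from 28 to 32, and the *_owner_id_maxsz macros earlier in this file
each gain one extra 32-bit word.)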
@@ -1510,7 +1512,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	hdr->replen += decode_restorefh_maxsz;
 }
 
-static int
+static void
 encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr)
 {
 	__be32 *p;
@@ -1521,14 +1523,12 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
 	p = reserve_space(xdr, 2*4);
 	*p++ = cpu_to_be32(1);
 	*p = cpu_to_be32(FATTR4_WORD0_ACL);
-	if (arg->acl_len % 4)
-		return -EINVAL;
+	BUG_ON(arg->acl_len % 4);
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(arg->acl_len);
 	xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
 	hdr->nops++;
 	hdr->replen += decode_setacl_maxsz;
-	return 0;
 }
 
 static void
@@ -1789,7 +1789,6 @@ encode_layoutget(struct xdr_stream *xdr,
 		const struct nfs4_layoutget_args *args,
 		struct compound_hdr *hdr)
 {
-	nfs4_stateid stateid;
 	__be32 *p;
 
 	p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
@@ -1800,9 +1799,7 @@ encode_layoutget(struct xdr_stream *xdr,
 	p = xdr_encode_hyper(p, args->range.offset);
 	p = xdr_encode_hyper(p, args->range.length);
 	p = xdr_encode_hyper(p, args->minlength);
-	pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
-				args->ctx->state);
-	p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
 	*p = cpu_to_be32(args->maxcount);
 
 	dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
@@ -1833,393 +1830,362 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
 /*
  * Encode an ACCESS request
  */
-static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args)
+static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_accessargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_access(&xdr, args->access, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_access(xdr, args->access, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode LOOKUP request
  */
-static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args)
+static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_lookup_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->dir_fh, &hdr);
-	encode_lookup(&xdr, args->name, &hdr);
-	encode_getfh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_lookup(xdr, args->name, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode LOOKUP_ROOT request
  */
-static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args)
+static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_lookup_root_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putrootfh(&xdr, &hdr);
-	encode_getfh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putrootfh(xdr, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode REMOVE request
  */
-static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
+static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs_removeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_remove(&xdr, &args->name, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_remove(xdr, &args->name, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode RENAME request
  */
-static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args)
+static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs_renameargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->old_dir, &hdr);
-	encode_savefh(&xdr, &hdr);
-	encode_putfh(&xdr, args->new_dir, &hdr);
-	encode_rename(&xdr, args->old_name, args->new_name, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
-	encode_restorefh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->old_dir, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_putfh(xdr, args->new_dir, &hdr);
+	encode_rename(xdr, args->old_name, args->new_name, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_restorefh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode LINK request
  */
-static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args)
+static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      const struct nfs4_link_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_savefh(&xdr, &hdr);
-	encode_putfh(&xdr, args->dir_fh, &hdr);
-	encode_link(&xdr, args->name, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
-	encode_restorefh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_link(xdr, args->name, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_restorefh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode CREATE request
  */
-static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
+static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_create_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->dir_fh, &hdr);
-	encode_savefh(&xdr, &hdr);
-	encode_create(&xdr, args, &hdr);
-	encode_getfh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
-	encode_restorefh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_create(xdr, args, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_restorefh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode SYMLINK request
  */
-static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
+static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 const struct nfs4_create_arg *args)
 {
-	return nfs4_xdr_enc_create(req, p, args);
+	nfs4_xdr_enc_create(req, xdr, args);
 }
 
 /*
  * Encode GETATTR request
  */
-static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args)
+static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 const struct nfs4_getattr_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a CLOSE request
  */
-static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
+static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_closeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_close(&xdr, args, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_close(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an OPEN request
  */
-static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
+static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      struct nfs_openargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_savefh(&xdr, &hdr);
-	encode_open(&xdr, args, &hdr);
-	encode_getfh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
-	encode_restorefh(&xdr, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_savefh(xdr, &hdr);
+	encode_open(xdr, args, &hdr);
+	encode_getfh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_restorefh(xdr, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an OPEN_CONFIRM request
  */
-static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args)
+static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs_open_confirmargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops = 0,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_open_confirm(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open_confirm(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an OPEN request with no attributes.
  */
-static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
+static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs_openargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_open(&xdr, args, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an OPEN_DOWNGRADE request
  */
-static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
+static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs_closeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_open_downgrade(&xdr, args, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_open_downgrade(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a LOCK request
  */
-static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args)
+static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      struct nfs_lock_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_lock(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_lock(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a LOCKT request
  */
-static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args)
+static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_lockt_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_lockt(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_lockt(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a LOCKU request
  */
-static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args)
+static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_locku_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_locku(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_locku(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
-static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args)
+static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req,
+					   struct xdr_stream *xdr,
+					   struct nfs_release_lockowner_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = 0,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_release_lockowner(xdr, &args->lock_owner, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a READLINK request
  */
-static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args)
+static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  const struct nfs4_readlink *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_readlink(&xdr, args, req, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_readlink(xdr, args, req, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
 			 args->pgbase, args->pglen);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a READDIR request
  */
-static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args)
+static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 const struct nfs4_readdir_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_readdir(&xdr, args, req, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_readdir(xdr, args, req, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
 			 args->pgbase, args->count);
@@ -2227,428 +2193,387 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
2227 __func__, hdr.replen << 2, args->pages, 2193 __func__, hdr.replen << 2, args->pages,
2228 args->pgbase, args->count); 2194 args->pgbase, args->count);
2229 encode_nops(&hdr); 2195 encode_nops(&hdr);
2230 return 0;
2231} 2196}
 
 /*
  * Encode a READ request
  */
-static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
+static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      struct nfs_readargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_read(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_read(xdr, args, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
 			 args->pages, args->pgbase, args->count);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode an SETATTR request
  */
-static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args)
+static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 struct nfs_setattrargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_setattr(&xdr, args, args->server, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_setattr(xdr, args, args->server, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a GETACL request
  */
-static int
-nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
-		struct nfs_getaclargs *args)
+static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_getaclargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 	uint32_t replen;
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
 	replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
-	encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
+	encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
 		args->acl_pages, args->acl_pgbase, args->acl_len);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode a WRITE request
  */
-static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_writeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_write(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_write(xdr, args, &hdr);
 	req->rq_snd_buf.flags |= XDRBUF_WRITE;
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  *  a COMMIT request
  */
-static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
+static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_writeargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_commit(&xdr, args, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_commit(xdr, args, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * FSINFO request
  */
-static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args)
+static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs4_fsinfo_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_fsinfo(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_fsinfo(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a PATHCONF request
  */
-static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args)
+static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  const struct nfs4_pathconf_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
 			   &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a STATFS request
  */
-static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args)
+static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
+				const struct nfs4_statfs_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
 			   args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * GETATTR_BITMAP request
  */
-static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p,
-				    struct nfs4_server_caps_arg *args)
+static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs4_server_caps_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fhandle, &hdr);
-	encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fhandle, &hdr);
+	encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
 			   FATTR4_WORD0_LINK_SUPPORT|
 			   FATTR4_WORD0_SYMLINK_SUPPORT|
 			   FATTR4_WORD0_ACLSUPPORT, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a RENEW request
  */
-static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
+static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,
+			       struct nfs_client *clp)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops = 0,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_renew(&xdr, clp, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_renew(xdr, clp, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a SETCLIENTID request
  */
-static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc)
+static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs4_setclientid *sc)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops = 0,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_setclientid(&xdr, sc, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_setclientid(xdr, sc, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a SETCLIENTID_CONFIRM request
  */
-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg)
+static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
+					     struct xdr_stream *xdr,
+					     struct nfs4_setclientid_res *arg)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops = 0,
 	};
 	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_setclientid_confirm(&xdr, arg, &hdr);
-	encode_putrootfh(&xdr, &hdr);
-	encode_fsinfo(&xdr, lease_bitmap, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_setclientid_confirm(xdr, arg, &hdr);
+	encode_putrootfh(xdr, &hdr);
+	encode_fsinfo(xdr, lease_bitmap, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * DELEGRETURN request
  */
-static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args)
+static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_delegreturnargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fhandle, &hdr);
-	encode_delegreturn(&xdr, args->stateid, &hdr);
-	encode_getfattr(&xdr, args->bitmask, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fhandle, &hdr);
+	encode_delegreturn(xdr, args->stateid, &hdr);
+	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode FS_LOCATIONS request
  */
-static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args)
+static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs4_fs_locations_arg *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 	uint32_t replen;
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->dir_fh, &hdr);
-	encode_lookup(&xdr, args->name, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->dir_fh, &hdr);
+	encode_lookup(xdr, args->name, &hdr);
 	replen = hdr.replen;	/* get the attribute into args->page */
-	encode_fs_locations(&xdr, args->bitmask, &hdr);
+	encode_fs_locations(xdr, args->bitmask, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
 			0, PAGE_SIZE);
 	encode_nops(&hdr);
-	return 0;
 }
 
 #if defined(CONFIG_NFS_V4_1)
 /*
  * EXCHANGE_ID request
  */
-static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
-				    struct nfs41_exchange_id_args *args)
+static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs41_exchange_id_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = args->client->cl_mvops->minor_version,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_exchange_id(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_exchange_id(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a CREATE_SESSION request
  */
-static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
-				       struct nfs41_create_session_args *args)
+static void nfs4_xdr_enc_create_session(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs41_create_session_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = args->client->cl_mvops->minor_version,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_create_session(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_create_session(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a DESTROY_SESSION request
  */
-static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p,
-					struct nfs4_session *session)
+static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req,
+					 struct xdr_stream *xdr,
+					 struct nfs4_session *session)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = session->clp->cl_mvops->minor_version,
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_destroy_session(&xdr, session, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_destroy_session(xdr, session, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a SEQUENCE request
  */
-static int nfs4_xdr_enc_sequence(struct rpc_rqst *req, uint32_t *p,
-				 struct nfs4_sequence_args *args)
+static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  struct nfs4_sequence_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a GET_LEASE_TIME request
  */
-static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
-				       struct nfs4_get_lease_time_args *args)
+static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					struct nfs4_get_lease_time_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
 	};
 	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->la_seq_args, &hdr);
-	encode_putrootfh(&xdr, &hdr);
-	encode_fsinfo(&xdr, lease_bitmap, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->la_seq_args, &hdr);
+	encode_putrootfh(xdr, &hdr);
+	encode_fsinfo(xdr, lease_bitmap, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * a RECLAIM_COMPLETE request
  */
-static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
-				     struct nfs41_reclaim_complete_args *args)
+static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
+					  struct xdr_stream *xdr,
+				struct nfs41_reclaim_complete_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args)
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_reclaim_complete(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_reclaim_complete(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode GETDEVICEINFO request
  */
-static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
-				      struct nfs4_getdeviceinfo_args *args)
+static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
+				       struct xdr_stream *xdr,
+				       struct nfs4_getdeviceinfo_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_getdeviceinfo(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_getdeviceinfo(xdr, args, &hdr);
 
 	/* set up reply kvec. Subtract notification bitmap max size (2)
 	 * so that notification bitmap is put in xdr_buf tail */
@@ -2657,27 +2582,24 @@ static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
 			args->pdev->pglen);
 
 	encode_nops(&hdr);
-	return 0;
 }
 
 /*
  * Encode LAYOUTGET request
  */
-static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
-				  struct nfs4_layoutget_args *args)
+static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   struct nfs4_layoutget_args *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
-	encode_layoutget(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+	encode_layoutget(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return 0;
 }
 #endif /* CONFIG_NFS_V4_1 */
 
@@ -4475,7 +4397,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 		goto out_overflow;
 	eof = be32_to_cpup(p++);
 	count = be32_to_cpup(p);
-	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
+	hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base;
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
 		dprintk("NFS: server cheating in read reply: "
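
Note: the decode_read() hunk above is a knock-on effect of the same conversion: the routine no longer holds the `__be32 *p` that was passed to xdr_init_decode(), so the count of reply-header bytes already consumed has to be recomputed from the stream's cursor. Assuming `iov` is the head iovec of `rqstp->rq_rcv_buf`, as in the surrounding code, the relationship is:

    /* bytes consumed so far = current stream position - buffer start */
    hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base;
    recvd = req->rq_rcv_buf.len - hdrlen;	/* payload actually received */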
@@ -5000,7 +4922,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
 		goto out_overflow;
 	len = be32_to_cpup(p);
 	if (len) {
-		int i;
+		uint32_t i;
 
 		p = xdr_inline_decode(xdr, 4 * len);
 		if (unlikely(!p))
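
Note: the `int i` to `uint32_t i` change in decode_getdeviceinfo() looks like a small type-correctness fix riding along with the interface work: the count is decoded from the wire with be32_to_cpup() as an unsigned 32-bit value, so the loop index is given the matching unsigned type. A hypothetical illustration (the helper name is invented):

    uint32_t len = be32_to_cpup(p++);	/* unsigned count off the wire */
    uint32_t i;				/* index type now matches len */

    for (i = 0; i < len; i++)
            handle_notify_type(be32_to_cpup(p++));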
@@ -5090,26 +5012,26 @@ out_overflow:
 /*
  * Decode OPEN_DOWNGRADE response
  */
-static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       struct nfs_closeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_open_downgrade(&xdr, res);
+	status = decode_open_downgrade(xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
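
Note: the decoders keep their `int` return because the COMPOUND status must still propagate, but they likewise lose the local stream and the xdr_init_decode() call. A sketch of the converted decoder shape, again with a hypothetical FOO procedure (names invented for illustration):

    static int nfs4_xdr_dec_foo(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
                                struct nfs_foo_res *res)
    {
            struct compound_hdr hdr;
            int status;

            /* No xdr_init_decode() here: the transport wrapped
             * rqstp->rq_rcv_buf in *xdr before calling this routine. */
            status = decode_compound_hdr(xdr, &hdr);
            if (status)
                    goto out;
            status = decode_putfh(xdr);
            if (status)
                    goto out;
            status = decode_foo(xdr, res);
    out:
            return status;
    }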
@@ -5118,26 +5040,25 @@ out:
 /*
  * Decode ACCESS response
  */
-static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
+static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_accessres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status != 0)
 		goto out;
-	status = decode_access(&xdr, res);
+	status = decode_access(xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5146,26 +5067,28 @@ out:
 /*
  * Decode LOOKUP response
  */
-static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_lookup_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_lookup(&xdr)) != 0)
+	status = decode_lookup(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) != 0)
+	status = decode_getfh(xdr, res->fh);
+	if (status)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server
+	status = decode_getfattr(xdr, res->fattr, res->server
 			,!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
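
Note: while these decoders are being touched, the patch also unpicks the `if ((status = f()) != 0)` assignments inside conditionals, which checkpatch flags, into the usual two-line kernel form:

    /* before */
    if ((status = decode_putfh(&xdr)) != 0)
            goto out;

    /* after */
    status = decode_putfh(xdr);
    if (status)
            goto out;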
@@ -5174,23 +5097,25 @@ out:
 /*
  * Decode LOOKUP_ROOT response
  */
-static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs4_lookup_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putrootfh(&xdr)) != 0)
+	status = decode_putrootfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) == 0)
-		status = decode_getfattr(&xdr, res->fattr, res->server,
+	status = decode_getfh(xdr, res->fh);
+	if (status == 0)
+		status = decode_getfattr(xdr, res->fattr, res->server,
 				!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5199,24 +5124,25 @@ out:
 /*
  * Decode REMOVE response
  */
-static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_removeres *res)
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_removeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
+	status = decode_remove(xdr, &res->cinfo);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->dir_attr, res->server,
+	decode_getfattr(xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5225,34 +5151,38 @@ out:
 /*
  * Decode RENAME response
  */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_renameres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_savefh(&xdr)) != 0)
+	status = decode_savefh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
+	status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo);
+	if (status)
 		goto out;
 	/* Current FH is target directory */
-	if (decode_getfattr(&xdr, res->new_fattr, res->server,
+	if (decode_getfattr(xdr, res->new_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if ((status = decode_restorefh(&xdr)) != 0)
+	status = decode_restorefh(xdr);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->old_fattr, res->server,
+	decode_getfattr(xdr, res->old_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5261,37 +5191,41 @@ out:
 /*
  * Decode LINK response
  */
-static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res)
+static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs4_link_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_savefh(&xdr)) != 0)
+	status = decode_savefh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_link(&xdr, &res->cinfo)) != 0)
+	status = decode_link(xdr, &res->cinfo);
+	if (status)
 		goto out;
 	/*
 	 * Note order: OP_LINK leaves the directory as the current
 	 * filehandle.
 	 */
-	if (decode_getfattr(&xdr, res->dir_attr, res->server,
+	if (decode_getfattr(xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if ((status = decode_restorefh(&xdr)) != 0)
+	status = decode_restorefh(xdr);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5300,33 +5234,37 @@ out:
 /*
  * Decode CREATE response
  */
-static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_create_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_savefh(&xdr)) != 0)
+	status = decode_savefh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0)
+	status = decode_create(xdr, &res->dir_cinfo);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) != 0)
+	status = decode_getfh(xdr, res->fh);
+	if (status)
 		goto out;
-	if (decode_getfattr(&xdr, res->fattr, res->server,
+	if (decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if ((status = decode_restorefh(&xdr)) != 0)
+	status = decode_restorefh(xdr);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->dir_fattr, res->server,
+	decode_getfattr(xdr, res->dir_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5335,31 +5273,31 @@ out:
 /*
  * Decode SYMLINK response
  */
-static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+				struct nfs4_create_res *res)
 {
-	return nfs4_xdr_dec_create(rqstp, p, res);
+	return nfs4_xdr_dec_create(rqstp, xdr, res);
 }
 
 /*
  * Decode GETATTR response
  */
-static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res)
+static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+				struct nfs4_getattr_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server,
+	status = decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5368,46 +5306,40 @@ out:
 /*
  * Encode an SETACL request
  */
-static int
-nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args)
+static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_setaclargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	int status;
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	status = encode_setacl(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_setacl(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return status;
 }
 
 /*
  * Decode SETACL response
  */
 static int
-nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p,
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 		    struct nfs_setaclres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_setattr(&xdr);
+	status = decode_setattr(xdr);
 out:
 	return status;
 }
@@ -5416,24 +5348,22 @@ out:
  * Decode GETACL response
  */
 static int
-nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p,
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 		    struct nfs_getaclres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_getacl(&xdr, rqstp, &res->acl_len);
+	status = decode_getacl(xdr, rqstp, &res->acl_len);
 
 out:
 	return status;
@@ -5442,23 +5372,22 @@ out:
 /*
  * Decode CLOSE response
  */
-static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_closeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_close(&xdr, res);
+	status = decode_close(xdr, res);
 	if (status != 0)
 		goto out;
 	/*
@@ -5467,7 +5396,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
 	 * an ESTALE error. Shouldn't be a problem,
 	 * though, since fattr->valid will remain unset.
 	 */
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5476,36 +5405,35 @@ out:
 /*
  * Decode OPEN response
  */
-static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs_openres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_savefh(&xdr);
+	status = decode_savefh(xdr);
 	if (status)
 		goto out;
-	status = decode_open(&xdr, res);
+	status = decode_open(xdr, res);
 	if (status)
 		goto out;
-	if (decode_getfh(&xdr, &res->fh) != 0)
+	if (decode_getfh(xdr, &res->fh) != 0)
 		goto out;
-	if (decode_getfattr(&xdr, res->f_attr, res->server,
+	if (decode_getfattr(xdr, res->f_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if (decode_restorefh(&xdr) != 0)
+	if (decode_restorefh(xdr) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->dir_attr, res->server,
+	decode_getfattr(xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5514,20 +5442,20 @@ out:
 /*
  * Decode OPEN_CONFIRM response
  */
-static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res)
+static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     struct nfs_open_confirmres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_open_confirm(&xdr, res);
+	status = decode_open_confirm(xdr, res);
 out:
 	return status;
 }
@@ -5535,26 +5463,26 @@ out:
 /*
  * Decode OPEN response
  */
-static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs_openres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_open(&xdr, res);
+	status = decode_open(xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->f_attr, res->server,
+	decode_getfattr(xdr, res->f_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5563,26 +5491,26 @@ out:
 /*
  * Decode SETATTR response
  */
-static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res)
+static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct nfs_setattrres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_setattr(&xdr);
+	status = decode_setattr(xdr);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5591,23 +5519,22 @@ out:
 /*
  * Decode LOCK response
  */
-static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res)
+static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs_lock_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_lock(&xdr, res);
+	status = decode_lock(xdr, res);
 out:
 	return status;
 }
@@ -5615,23 +5542,22 @@ out:
 /*
  * Decode LOCKT response
  */
-static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res)
+static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_lockt_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_lockt(&xdr, res);
+	status = decode_lockt(xdr, res);
 out:
 	return status;
 }
@@ -5639,61 +5565,58 @@ out:
 /*
  * Decode LOCKU response
  */
-static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res)
+static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_locku_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_locku(&xdr, res);
+	status = decode_locku(xdr, res);
 out:
 	return status;
 }
 
-static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
+static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp,
+					  struct xdr_stream *xdr, void *dummy)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_release_lockowner(&xdr);
+		status = decode_release_lockowner(xdr);
 	return status;
 }
 
 /*
  * Decode READLINK response
  */
-static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p,
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp,
+				 struct xdr_stream *xdr,
 				 struct nfs4_readlink_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_readlink(&xdr, rqstp);
+	status = decode_readlink(xdr, rqstp);
 out:
 	return status;
 }
@@ -5701,23 +5624,22 @@ out:
5701/* 5624/*
5702 * Decode READDIR response 5625 * Decode READDIR response
5703 */ 5626 */
5704static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res) 5627static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5628 struct nfs4_readdir_res *res)
5705{ 5629{
5706 struct xdr_stream xdr;
5707 struct compound_hdr hdr; 5630 struct compound_hdr hdr;
5708 int status; 5631 int status;
5709 5632
5710 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5633 status = decode_compound_hdr(xdr, &hdr);
5711 status = decode_compound_hdr(&xdr, &hdr);
5712 if (status) 5634 if (status)
5713 goto out; 5635 goto out;
5714 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5636 status = decode_sequence(xdr, &res->seq_res, rqstp);
5715 if (status) 5637 if (status)
5716 goto out; 5638 goto out;
5717 status = decode_putfh(&xdr); 5639 status = decode_putfh(xdr);
5718 if (status) 5640 if (status)
5719 goto out; 5641 goto out;
5720 status = decode_readdir(&xdr, rqstp, res); 5642 status = decode_readdir(xdr, rqstp, res);
5721out: 5643out:
5722 return status; 5644 return status;
5723} 5645}
@@ -5725,23 +5647,22 @@ out:
5725/* 5647/*
5726 * Decode Read response 5648 * Decode Read response
5727 */ 5649 */
5728static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res) 5650static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5651 struct nfs_readres *res)
5729{ 5652{
5730 struct xdr_stream xdr;
5731 struct compound_hdr hdr; 5653 struct compound_hdr hdr;
5732 int status; 5654 int status;
5733 5655
5734 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5656 status = decode_compound_hdr(xdr, &hdr);
5735 status = decode_compound_hdr(&xdr, &hdr);
5736 if (status) 5657 if (status)
5737 goto out; 5658 goto out;
5738 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5659 status = decode_sequence(xdr, &res->seq_res, rqstp);
5739 if (status) 5660 if (status)
5740 goto out; 5661 goto out;
5741 status = decode_putfh(&xdr); 5662 status = decode_putfh(xdr);
5742 if (status) 5663 if (status)
5743 goto out; 5664 goto out;
5744 status = decode_read(&xdr, rqstp, res); 5665 status = decode_read(xdr, rqstp, res);
5745 if (!status) 5666 if (!status)
5746 status = res->count; 5667 status = res->count;
5747out: 5668out:
@@ -5751,26 +5672,25 @@ out:
5751/* 5672/*
5752 * Decode WRITE response 5673 * Decode WRITE response
5753 */ 5674 */
5754static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) 5675static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5676 struct nfs_writeres *res)
5755{ 5677{
5756 struct xdr_stream xdr;
5757 struct compound_hdr hdr; 5678 struct compound_hdr hdr;
5758 int status; 5679 int status;
5759 5680
5760 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5681 status = decode_compound_hdr(xdr, &hdr);
5761 status = decode_compound_hdr(&xdr, &hdr);
5762 if (status) 5682 if (status)
5763 goto out; 5683 goto out;
5764 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5684 status = decode_sequence(xdr, &res->seq_res, rqstp);
5765 if (status) 5685 if (status)
5766 goto out; 5686 goto out;
5767 status = decode_putfh(&xdr); 5687 status = decode_putfh(xdr);
5768 if (status) 5688 if (status)
5769 goto out; 5689 goto out;
5770 status = decode_write(&xdr, res); 5690 status = decode_write(xdr, res);
5771 if (status) 5691 if (status)
5772 goto out; 5692 goto out;
5773 decode_getfattr(&xdr, res->fattr, res->server, 5693 decode_getfattr(xdr, res->fattr, res->server,
5774 !RPC_IS_ASYNC(rqstp->rq_task)); 5694 !RPC_IS_ASYNC(rqstp->rq_task));
5775 if (!status) 5695 if (!status)
5776 status = res->count; 5696 status = res->count;
@@ -5781,26 +5701,25 @@ out:
5781/* 5701/*
5782 * Decode COMMIT response 5702 * Decode COMMIT response
5783 */ 5703 */
5784static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) 5704static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5705 struct nfs_writeres *res)
5785{ 5706{
5786 struct xdr_stream xdr;
5787 struct compound_hdr hdr; 5707 struct compound_hdr hdr;
5788 int status; 5708 int status;
5789 5709
5790 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5710 status = decode_compound_hdr(xdr, &hdr);
5791 status = decode_compound_hdr(&xdr, &hdr);
5792 if (status) 5711 if (status)
5793 goto out; 5712 goto out;
5794 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5713 status = decode_sequence(xdr, &res->seq_res, rqstp);
5795 if (status) 5714 if (status)
5796 goto out; 5715 goto out;
5797 status = decode_putfh(&xdr); 5716 status = decode_putfh(xdr);
5798 if (status) 5717 if (status)
5799 goto out; 5718 goto out;
5800 status = decode_commit(&xdr, res); 5719 status = decode_commit(xdr, res);
5801 if (status) 5720 if (status)
5802 goto out; 5721 goto out;
5803 decode_getfattr(&xdr, res->fattr, res->server, 5722 decode_getfattr(xdr, res->fattr, res->server,
5804 !RPC_IS_ASYNC(rqstp->rq_task)); 5723 !RPC_IS_ASYNC(rqstp->rq_task));
5805out: 5724out:
5806 return status; 5725 return status;
@@ -5809,85 +5728,80 @@ out:
5809/* 5728/*
5810 * Decode FSINFO response 5729 * Decode FSINFO response
5811 */ 5730 */
5812static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, 5731static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
5813 struct nfs4_fsinfo_res *res) 5732 struct nfs4_fsinfo_res *res)
5814{ 5733{
5815 struct xdr_stream xdr;
5816 struct compound_hdr hdr; 5734 struct compound_hdr hdr;
5817 int status; 5735 int status;
5818 5736
5819 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5737 status = decode_compound_hdr(xdr, &hdr);
5820 status = decode_compound_hdr(&xdr, &hdr);
5821 if (!status) 5738 if (!status)
5822 status = decode_sequence(&xdr, &res->seq_res, req); 5739 status = decode_sequence(xdr, &res->seq_res, req);
5823 if (!status) 5740 if (!status)
5824 status = decode_putfh(&xdr); 5741 status = decode_putfh(xdr);
5825 if (!status) 5742 if (!status)
5826 status = decode_fsinfo(&xdr, res->fsinfo); 5743 status = decode_fsinfo(xdr, res->fsinfo);
5827 return status; 5744 return status;
5828} 5745}
5829 5746
5830/* 5747/*
5831 * Decode PATHCONF response 5748 * Decode PATHCONF response
5832 */ 5749 */
5833static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, 5750static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
5834 struct nfs4_pathconf_res *res) 5751 struct nfs4_pathconf_res *res)
5835{ 5752{
5836 struct xdr_stream xdr;
5837 struct compound_hdr hdr; 5753 struct compound_hdr hdr;
5838 int status; 5754 int status;
5839 5755
5840 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5756 status = decode_compound_hdr(xdr, &hdr);
5841 status = decode_compound_hdr(&xdr, &hdr);
5842 if (!status) 5757 if (!status)
5843 status = decode_sequence(&xdr, &res->seq_res, req); 5758 status = decode_sequence(xdr, &res->seq_res, req);
5844 if (!status) 5759 if (!status)
5845 status = decode_putfh(&xdr); 5760 status = decode_putfh(xdr);
5846 if (!status) 5761 if (!status)
5847 status = decode_pathconf(&xdr, res->pathconf); 5762 status = decode_pathconf(xdr, res->pathconf);
5848 return status; 5763 return status;
5849} 5764}
5850 5765
5851/* 5766/*
5852 * Decode STATFS response 5767 * Decode STATFS response
5853 */ 5768 */
5854static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, 5769static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
5855 struct nfs4_statfs_res *res) 5770 struct nfs4_statfs_res *res)
5856{ 5771{
5857 struct xdr_stream xdr;
5858 struct compound_hdr hdr; 5772 struct compound_hdr hdr;
5859 int status; 5773 int status;
5860 5774
5861 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5775 status = decode_compound_hdr(xdr, &hdr);
5862 status = decode_compound_hdr(&xdr, &hdr);
5863 if (!status) 5776 if (!status)
5864 status = decode_sequence(&xdr, &res->seq_res, req); 5777 status = decode_sequence(xdr, &res->seq_res, req);
5865 if (!status) 5778 if (!status)
5866 status = decode_putfh(&xdr); 5779 status = decode_putfh(xdr);
5867 if (!status) 5780 if (!status)
5868 status = decode_statfs(&xdr, res->fsstat); 5781 status = decode_statfs(xdr, res->fsstat);
5869 return status; 5782 return status;
5870} 5783}
5871 5784
5872/* 5785/*
5873 * Decode GETATTR_BITMAP response 5786 * Decode GETATTR_BITMAP response
5874 */ 5787 */
5875static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res) 5788static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req,
5789 struct xdr_stream *xdr,
5790 struct nfs4_server_caps_res *res)
5876{ 5791{
5877 struct xdr_stream xdr;
5878 struct compound_hdr hdr; 5792 struct compound_hdr hdr;
5879 int status; 5793 int status;
5880 5794
5881 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5795 status = decode_compound_hdr(xdr, &hdr);
5882 status = decode_compound_hdr(&xdr, &hdr);
5883 if (status) 5796 if (status)
5884 goto out; 5797 goto out;
5885 status = decode_sequence(&xdr, &res->seq_res, req); 5798 status = decode_sequence(xdr, &res->seq_res, req);
5886 if (status) 5799 if (status)
5887 goto out; 5800 goto out;
5888 if ((status = decode_putfh(&xdr)) != 0) 5801 status = decode_putfh(xdr);
5802 if (status)
5889 goto out; 5803 goto out;
5890 status = decode_server_caps(&xdr, res); 5804 status = decode_server_caps(xdr, res);
5891out: 5805out:
5892 return status; 5806 return status;
5893} 5807}
@@ -5895,79 +5809,77 @@ out:
5895/* 5809/*
5896 * Decode RENEW response 5810 * Decode RENEW response
5897 */ 5811 */
5898static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy) 5812static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5813 void *__unused)
5899{ 5814{
5900 struct xdr_stream xdr;
5901 struct compound_hdr hdr; 5815 struct compound_hdr hdr;
5902 int status; 5816 int status;
5903 5817
5904 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5818 status = decode_compound_hdr(xdr, &hdr);
5905 status = decode_compound_hdr(&xdr, &hdr);
5906 if (!status) 5819 if (!status)
5907 status = decode_renew(&xdr); 5820 status = decode_renew(xdr);
5908 return status; 5821 return status;
5909} 5822}
5910 5823
5911/* 5824/*
5912 * Decode SETCLIENTID response 5825 * Decode SETCLIENTID response
5913 */ 5826 */
5914static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, 5827static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req,
5915 struct nfs4_setclientid_res *res) 5828 struct xdr_stream *xdr,
5829 struct nfs4_setclientid_res *res)
5916{ 5830{
5917 struct xdr_stream xdr;
5918 struct compound_hdr hdr; 5831 struct compound_hdr hdr;
5919 int status; 5832 int status;
5920 5833
5921 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5834 status = decode_compound_hdr(xdr, &hdr);
5922 status = decode_compound_hdr(&xdr, &hdr);
5923 if (!status) 5835 if (!status)
5924 status = decode_setclientid(&xdr, res); 5836 status = decode_setclientid(xdr, res);
5925 return status; 5837 return status;
5926} 5838}
5927 5839
5928/* 5840/*
5929 * Decode SETCLIENTID_CONFIRM response 5841 * Decode SETCLIENTID_CONFIRM response
5930 */ 5842 */
5931static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo) 5843static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
5844 struct xdr_stream *xdr,
5845 struct nfs_fsinfo *fsinfo)
5932{ 5846{
5933 struct xdr_stream xdr;
5934 struct compound_hdr hdr; 5847 struct compound_hdr hdr;
5935 int status; 5848 int status;
5936 5849
5937 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5850 status = decode_compound_hdr(xdr, &hdr);
5938 status = decode_compound_hdr(&xdr, &hdr);
5939 if (!status) 5851 if (!status)
5940 status = decode_setclientid_confirm(&xdr); 5852 status = decode_setclientid_confirm(xdr);
5941 if (!status) 5853 if (!status)
5942 status = decode_putrootfh(&xdr); 5854 status = decode_putrootfh(xdr);
5943 if (!status) 5855 if (!status)
5944 status = decode_fsinfo(&xdr, fsinfo); 5856 status = decode_fsinfo(xdr, fsinfo);
5945 return status; 5857 return status;
5946} 5858}
5947 5859
5948/* 5860/*
5949 * Decode DELEGRETURN response 5861 * Decode DELEGRETURN response
5950 */ 5862 */
5951static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res) 5863static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
5864 struct xdr_stream *xdr,
5865 struct nfs4_delegreturnres *res)
5952{ 5866{
5953 struct xdr_stream xdr;
5954 struct compound_hdr hdr; 5867 struct compound_hdr hdr;
5955 int status; 5868 int status;
5956 5869
5957 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5870 status = decode_compound_hdr(xdr, &hdr);
5958 status = decode_compound_hdr(&xdr, &hdr);
5959 if (status) 5871 if (status)
5960 goto out; 5872 goto out;
5961 status = decode_sequence(&xdr, &res->seq_res, rqstp); 5873 status = decode_sequence(xdr, &res->seq_res, rqstp);
5962 if (status) 5874 if (status)
5963 goto out; 5875 goto out;
5964 status = decode_putfh(&xdr); 5876 status = decode_putfh(xdr);
5965 if (status != 0) 5877 if (status != 0)
5966 goto out; 5878 goto out;
5967 status = decode_delegreturn(&xdr); 5879 status = decode_delegreturn(xdr);
5968 if (status != 0) 5880 if (status != 0)
5969 goto out; 5881 goto out;
5970 decode_getfattr(&xdr, res->fattr, res->server, 5882 decode_getfattr(xdr, res->fattr, res->server,
5971 !RPC_IS_ASYNC(rqstp->rq_task)); 5883 !RPC_IS_ASYNC(rqstp->rq_task));
5972out: 5884out:
5973 return status; 5885 return status;
@@ -5976,26 +5888,27 @@ out:
5976/* 5888/*
5977 * Decode FS_LOCATIONS response 5889 * Decode FS_LOCATIONS response
5978 */ 5890 */
5979static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, 5891static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
5892 struct xdr_stream *xdr,
5980 struct nfs4_fs_locations_res *res) 5893 struct nfs4_fs_locations_res *res)
5981{ 5894{
5982 struct xdr_stream xdr;
5983 struct compound_hdr hdr; 5895 struct compound_hdr hdr;
5984 int status; 5896 int status;
5985 5897
5986 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5898 status = decode_compound_hdr(xdr, &hdr);
5987 status = decode_compound_hdr(&xdr, &hdr);
5988 if (status) 5899 if (status)
5989 goto out; 5900 goto out;
5990 status = decode_sequence(&xdr, &res->seq_res, req); 5901 status = decode_sequence(xdr, &res->seq_res, req);
5991 if (status) 5902 if (status)
5992 goto out; 5903 goto out;
5993 if ((status = decode_putfh(&xdr)) != 0) 5904 status = decode_putfh(xdr);
5905 if (status)
5994 goto out; 5906 goto out;
5995 if ((status = decode_lookup(&xdr)) != 0) 5907 status = decode_lookup(xdr);
5908 if (status)
5996 goto out; 5909 goto out;
5997 xdr_enter_page(&xdr, PAGE_SIZE); 5910 xdr_enter_page(xdr, PAGE_SIZE);
5998 status = decode_getfattr(&xdr, &res->fs_locations->fattr, 5911 status = decode_getfattr(xdr, &res->fs_locations->fattr,
5999 res->fs_locations->server, 5912 res->fs_locations->server,
6000 !RPC_IS_ASYNC(req->rq_task)); 5913 !RPC_IS_ASYNC(req->rq_task));
6001out: 5914out:
@@ -6006,129 +5919,122 @@ out:
6006/* 5919/*
6007 * Decode EXCHANGE_ID response 5920 * Decode EXCHANGE_ID response
6008 */ 5921 */
6009static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, 5922static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp,
5923 struct xdr_stream *xdr,
6010 void *res) 5924 void *res)
6011{ 5925{
6012 struct xdr_stream xdr;
6013 struct compound_hdr hdr; 5926 struct compound_hdr hdr;
6014 int status; 5927 int status;
6015 5928
6016 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5929 status = decode_compound_hdr(xdr, &hdr);
6017 status = decode_compound_hdr(&xdr, &hdr);
6018 if (!status) 5930 if (!status)
6019 status = decode_exchange_id(&xdr, res); 5931 status = decode_exchange_id(xdr, res);
6020 return status; 5932 return status;
6021} 5933}
6022 5934
6023/* 5935/*
6024 * Decode CREATE_SESSION response 5936 * Decode CREATE_SESSION response
6025 */ 5937 */
6026static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p, 5938static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp,
5939 struct xdr_stream *xdr,
6027 struct nfs41_create_session_res *res) 5940 struct nfs41_create_session_res *res)
6028{ 5941{
6029 struct xdr_stream xdr;
6030 struct compound_hdr hdr; 5942 struct compound_hdr hdr;
6031 int status; 5943 int status;
6032 5944
6033 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5945 status = decode_compound_hdr(xdr, &hdr);
6034 status = decode_compound_hdr(&xdr, &hdr);
6035 if (!status) 5946 if (!status)
6036 status = decode_create_session(&xdr, res); 5947 status = decode_create_session(xdr, res);
6037 return status; 5948 return status;
6038} 5949}
6039 5950
6040/* 5951/*
6041 * Decode DESTROY_SESSION response 5952 * Decode DESTROY_SESSION response
6042 */ 5953 */
6043static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p, 5954static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp,
6044 void *dummy) 5955 struct xdr_stream *xdr,
5956 void *res)
6045{ 5957{
6046 struct xdr_stream xdr;
6047 struct compound_hdr hdr; 5958 struct compound_hdr hdr;
6048 int status; 5959 int status;
6049 5960
6050 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5961 status = decode_compound_hdr(xdr, &hdr);
6051 status = decode_compound_hdr(&xdr, &hdr);
6052 if (!status) 5962 if (!status)
6053 status = decode_destroy_session(&xdr, dummy); 5963 status = decode_destroy_session(xdr, res);
6054 return status; 5964 return status;
6055} 5965}
6056 5966
6057/* 5967/*
6058 * Decode SEQUENCE response 5968 * Decode SEQUENCE response
6059 */ 5969 */
6060static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p, 5970static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp,
5971 struct xdr_stream *xdr,
6061 struct nfs4_sequence_res *res) 5972 struct nfs4_sequence_res *res)
6062{ 5973{
6063 struct xdr_stream xdr;
6064 struct compound_hdr hdr; 5974 struct compound_hdr hdr;
6065 int status; 5975 int status;
6066 5976
6067 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5977 status = decode_compound_hdr(xdr, &hdr);
6068 status = decode_compound_hdr(&xdr, &hdr);
6069 if (!status) 5978 if (!status)
6070 status = decode_sequence(&xdr, res, rqstp); 5979 status = decode_sequence(xdr, res, rqstp);
6071 return status; 5980 return status;
6072} 5981}
6073 5982
6074/* 5983/*
6075 * Decode GET_LEASE_TIME response 5984 * Decode GET_LEASE_TIME response
6076 */ 5985 */
6077static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p, 5986static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp,
5987 struct xdr_stream *xdr,
6078 struct nfs4_get_lease_time_res *res) 5988 struct nfs4_get_lease_time_res *res)
6079{ 5989{
6080 struct xdr_stream xdr;
6081 struct compound_hdr hdr; 5990 struct compound_hdr hdr;
6082 int status; 5991 int status;
6083 5992
6084 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 5993 status = decode_compound_hdr(xdr, &hdr);
6085 status = decode_compound_hdr(&xdr, &hdr);
6086 if (!status) 5994 if (!status)
6087 status = decode_sequence(&xdr, &res->lr_seq_res, rqstp); 5995 status = decode_sequence(xdr, &res->lr_seq_res, rqstp);
6088 if (!status) 5996 if (!status)
6089 status = decode_putrootfh(&xdr); 5997 status = decode_putrootfh(xdr);
6090 if (!status) 5998 if (!status)
6091 status = decode_fsinfo(&xdr, res->lr_fsinfo); 5999 status = decode_fsinfo(xdr, res->lr_fsinfo);
6092 return status; 6000 return status;
6093} 6001}
6094 6002
6095/* 6003/*
6096 * Decode RECLAIM_COMPLETE response 6004 * Decode RECLAIM_COMPLETE response
6097 */ 6005 */
6098static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, 6006static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
6007 struct xdr_stream *xdr,
6099 struct nfs41_reclaim_complete_res *res) 6008 struct nfs41_reclaim_complete_res *res)
6100{ 6009{
6101 struct xdr_stream xdr;
6102 struct compound_hdr hdr; 6010 struct compound_hdr hdr;
6103 int status; 6011 int status;
6104 6012
6105 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 6013 status = decode_compound_hdr(xdr, &hdr);
6106 status = decode_compound_hdr(&xdr, &hdr);
6107 if (!status) 6014 if (!status)
6108 status = decode_sequence(&xdr, &res->seq_res, rqstp); 6015 status = decode_sequence(xdr, &res->seq_res, rqstp);
6109 if (!status) 6016 if (!status)
6110 status = decode_reclaim_complete(&xdr, (void *)NULL); 6017 status = decode_reclaim_complete(xdr, (void *)NULL);
6111 return status; 6018 return status;
6112} 6019}
6113 6020
6114/* 6021/*
6115 * Decode GETDEVINFO response 6022 * Decode GETDEVINFO response
6116 */ 6023 */
6117static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, 6024static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
6025 struct xdr_stream *xdr,
6118 struct nfs4_getdeviceinfo_res *res) 6026 struct nfs4_getdeviceinfo_res *res)
6119{ 6027{
6120 struct xdr_stream xdr;
6121 struct compound_hdr hdr; 6028 struct compound_hdr hdr;
6122 int status; 6029 int status;
6123 6030
6124 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 6031 status = decode_compound_hdr(xdr, &hdr);
6125 status = decode_compound_hdr(&xdr, &hdr);
6126 if (status != 0) 6032 if (status != 0)
6127 goto out; 6033 goto out;
6128 status = decode_sequence(&xdr, &res->seq_res, rqstp); 6034 status = decode_sequence(xdr, &res->seq_res, rqstp);
6129 if (status != 0) 6035 if (status != 0)
6130 goto out; 6036 goto out;
6131 status = decode_getdeviceinfo(&xdr, res->pdev); 6037 status = decode_getdeviceinfo(xdr, res->pdev);
6132out: 6038out:
6133 return status; 6039 return status;
6134} 6040}
@@ -6136,31 +6042,44 @@ out:
6136/* 6042/*
6137 * Decode LAYOUTGET response 6043 * Decode LAYOUTGET response
6138 */ 6044 */
6139static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, 6045static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
6046 struct xdr_stream *xdr,
6140 struct nfs4_layoutget_res *res) 6047 struct nfs4_layoutget_res *res)
6141{ 6048{
6142 struct xdr_stream xdr;
6143 struct compound_hdr hdr; 6049 struct compound_hdr hdr;
6144 int status; 6050 int status;
6145 6051
6146 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 6052 status = decode_compound_hdr(xdr, &hdr);
6147 status = decode_compound_hdr(&xdr, &hdr);
6148 if (status) 6053 if (status)
6149 goto out; 6054 goto out;
6150 status = decode_sequence(&xdr, &res->seq_res, rqstp); 6055 status = decode_sequence(xdr, &res->seq_res, rqstp);
6151 if (status) 6056 if (status)
6152 goto out; 6057 goto out;
6153 status = decode_putfh(&xdr); 6058 status = decode_putfh(xdr);
6154 if (status) 6059 if (status)
6155 goto out; 6060 goto out;
6156 status = decode_layoutget(&xdr, rqstp, res); 6061 status = decode_layoutget(xdr, rqstp, res);
6157out: 6062out:
6158 return status; 6063 return status;
6159} 6064}
6160#endif /* CONFIG_NFS_V4_1 */ 6065#endif /* CONFIG_NFS_V4_1 */
6161 6066
6162__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, 6067/**
6163 struct nfs_server *server, int plus) 6068 * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in
6069 * the local page cache.
6070 * @xdr: XDR stream where entry resides
6071 * @entry: buffer to fill in with entry data
6072 * @plus: boolean indicating whether this should be a readdirplus entry
6073 *
6074 * Returns zero if successful, otherwise a negative errno value is
6075 * returned.
6076 *
6077 * This function is not invoked during READDIR reply decoding, but
6078 * rather whenever an application invokes the getdents(2) system call
6079 * on a directory already in our cache.
6080 */
6081int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6082 int plus)
6164{ 6083{
6165 uint32_t bitmap[2] = {0}; 6084 uint32_t bitmap[2] = {0};
6166 uint32_t len; 6085 uint32_t len;
@@ -6172,9 +6091,9 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6172 if (unlikely(!p)) 6091 if (unlikely(!p))
6173 goto out_overflow; 6092 goto out_overflow;
6174 if (!ntohl(*p++)) 6093 if (!ntohl(*p++))
6175 return ERR_PTR(-EAGAIN); 6094 return -EAGAIN;
6176 entry->eof = 1; 6095 entry->eof = 1;
6177 return ERR_PTR(-EBADCOOKIE); 6096 return -EBADCOOKIE;
6178 } 6097 }
6179 6098
6180 p = xdr_inline_decode(xdr, 12); 6099 p = xdr_inline_decode(xdr, 12);
@@ -6203,7 +6122,8 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6203 if (decode_attr_length(xdr, &len, &p) < 0) 6122 if (decode_attr_length(xdr, &len, &p) < 0)
6204 goto out_overflow; 6123 goto out_overflow;
6205 6124
6206 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0) 6125 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
6126 entry->server, 1) < 0)
6207 goto out_overflow; 6127 goto out_overflow;
6208 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) 6128 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
6209 entry->ino = entry->fattr->fileid; 6129 entry->ino = entry->fattr->fileid;
@@ -6215,17 +6135,11 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6215 if (verify_attr_len(xdr, p, len) < 0) 6135 if (verify_attr_len(xdr, p, len) < 0)
6216 goto out_overflow; 6136 goto out_overflow;
6217 6137
6218 p = xdr_inline_peek(xdr, 8); 6138 return 0;
6219 if (p != NULL)
6220 entry->eof = !p[0] && p[1];
6221 else
6222 entry->eof = 0;
6223
6224 return p;
6225 6139
6226out_overflow: 6140out_overflow:
6227 print_overflow_msg(__func__, xdr); 6141 print_overflow_msg(__func__, xdr);
6228 return ERR_PTR(-EAGAIN); 6142 return -EAGAIN;
6229} 6143}
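
Two conventions change in nfs4_decode_dirent() above: errors are now
reported as a plain negative errno instead of an ERR_PTR-encoded __be32
pointer, and the function no longer peeks ahead to set entry->eof itself.
A hypothetical caller loop showing the new contract (consume_entry is an
assumed helper, not part of this patch):

	static int fill_dir_entries(struct xdr_stream *xdr,
				    struct nfs_entry *entry, int plus)
	{
		int status;

		for (;;) {
			status = nfs4_decode_dirent(xdr, entry, plus);
			if (status)
				break;
			consume_entry(entry);	/* assumed helper */
		}
		/* -EBADCOOKIE with entry->eof set means a clean end of
		 * directory; -EAGAIN means refill the stream and retry. */
		return status;
	}
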
6230 6144
6231/* 6145/*
@@ -6301,8 +6215,8 @@ nfs4_stat_to_errno(int stat)
6301#define PROC(proc, argtype, restype) \ 6215#define PROC(proc, argtype, restype) \
6302[NFSPROC4_CLNT_##proc] = { \ 6216[NFSPROC4_CLNT_##proc] = { \
6303 .p_proc = NFSPROC4_COMPOUND, \ 6217 .p_proc = NFSPROC4_COMPOUND, \
6304 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 6218 .p_encode = (kxdreproc_t)nfs4_xdr_##argtype, \
6305 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 6219 .p_decode = (kxdrdproc_t)nfs4_xdr_##restype, \
6306 .p_arglen = NFS4_##argtype##_sz, \ 6220 .p_arglen = NFS4_##argtype##_sz, \
6307 .p_replen = NFS4_##restype##_sz, \ 6221 .p_replen = NFS4_##restype##_sz, \
6308 .p_statidx = NFSPROC4_CLNT_##proc, \ 6222 .p_statidx = NFSPROC4_CLNT_##proc, \
@@ -6310,50 +6224,50 @@ nfs4_stat_to_errno(int stat)
6310} 6224}
6311 6225
6312struct rpc_procinfo nfs4_procedures[] = { 6226struct rpc_procinfo nfs4_procedures[] = {
6313 PROC(READ, enc_read, dec_read), 6227 PROC(READ, enc_read, dec_read),
6314 PROC(WRITE, enc_write, dec_write), 6228 PROC(WRITE, enc_write, dec_write),
6315 PROC(COMMIT, enc_commit, dec_commit), 6229 PROC(COMMIT, enc_commit, dec_commit),
6316 PROC(OPEN, enc_open, dec_open), 6230 PROC(OPEN, enc_open, dec_open),
6317 PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), 6231 PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm),
6318 PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), 6232 PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr),
6319 PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), 6233 PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade),
6320 PROC(CLOSE, enc_close, dec_close), 6234 PROC(CLOSE, enc_close, dec_close),
6321 PROC(SETATTR, enc_setattr, dec_setattr), 6235 PROC(SETATTR, enc_setattr, dec_setattr),
6322 PROC(FSINFO, enc_fsinfo, dec_fsinfo), 6236 PROC(FSINFO, enc_fsinfo, dec_fsinfo),
6323 PROC(RENEW, enc_renew, dec_renew), 6237 PROC(RENEW, enc_renew, dec_renew),
6324 PROC(SETCLIENTID, enc_setclientid, dec_setclientid), 6238 PROC(SETCLIENTID, enc_setclientid, dec_setclientid),
6325 PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), 6239 PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm),
6326 PROC(LOCK, enc_lock, dec_lock), 6240 PROC(LOCK, enc_lock, dec_lock),
6327 PROC(LOCKT, enc_lockt, dec_lockt), 6241 PROC(LOCKT, enc_lockt, dec_lockt),
6328 PROC(LOCKU, enc_locku, dec_locku), 6242 PROC(LOCKU, enc_locku, dec_locku),
6329 PROC(ACCESS, enc_access, dec_access), 6243 PROC(ACCESS, enc_access, dec_access),
6330 PROC(GETATTR, enc_getattr, dec_getattr), 6244 PROC(GETATTR, enc_getattr, dec_getattr),
6331 PROC(LOOKUP, enc_lookup, dec_lookup), 6245 PROC(LOOKUP, enc_lookup, dec_lookup),
6332 PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), 6246 PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root),
6333 PROC(REMOVE, enc_remove, dec_remove), 6247 PROC(REMOVE, enc_remove, dec_remove),
6334 PROC(RENAME, enc_rename, dec_rename), 6248 PROC(RENAME, enc_rename, dec_rename),
6335 PROC(LINK, enc_link, dec_link), 6249 PROC(LINK, enc_link, dec_link),
6336 PROC(SYMLINK, enc_symlink, dec_symlink), 6250 PROC(SYMLINK, enc_symlink, dec_symlink),
6337 PROC(CREATE, enc_create, dec_create), 6251 PROC(CREATE, enc_create, dec_create),
6338 PROC(PATHCONF, enc_pathconf, dec_pathconf), 6252 PROC(PATHCONF, enc_pathconf, dec_pathconf),
6339 PROC(STATFS, enc_statfs, dec_statfs), 6253 PROC(STATFS, enc_statfs, dec_statfs),
6340 PROC(READLINK, enc_readlink, dec_readlink), 6254 PROC(READLINK, enc_readlink, dec_readlink),
6341 PROC(READDIR, enc_readdir, dec_readdir), 6255 PROC(READDIR, enc_readdir, dec_readdir),
6342 PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), 6256 PROC(SERVER_CAPS, enc_server_caps, dec_server_caps),
6343 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), 6257 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn),
6344 PROC(GETACL, enc_getacl, dec_getacl), 6258 PROC(GETACL, enc_getacl, dec_getacl),
6345 PROC(SETACL, enc_setacl, dec_setacl), 6259 PROC(SETACL, enc_setacl, dec_setacl),
6346 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), 6260 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
6347 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), 6261 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
6348#if defined(CONFIG_NFS_V4_1) 6262#if defined(CONFIG_NFS_V4_1)
6349 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), 6263 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
6350 PROC(CREATE_SESSION, enc_create_session, dec_create_session), 6264 PROC(CREATE_SESSION, enc_create_session, dec_create_session),
6351 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), 6265 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session),
6352 PROC(SEQUENCE, enc_sequence, dec_sequence), 6266 PROC(SEQUENCE, enc_sequence, dec_sequence),
6353 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), 6267 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
6354 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), 6268 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
6355 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), 6269 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
6356 PROC(LAYOUTGET, enc_layoutget, dec_layoutget), 6270 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
6357#endif /* CONFIG_NFS_V4_1 */ 6271#endif /* CONFIG_NFS_V4_1 */
6358}; 6272};
6359 6273
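
For reference, each entry in the table above is generated by the PROC()
macro; after this patch PROC(READ, enc_read, dec_read) expands to roughly
the following (fields hidden by the hunk boundary, such as the procedure
name, are omitted):

	[NFSPROC4_CLNT_READ] = {
		.p_proc    = NFSPROC4_COMPOUND,
		.p_encode  = (kxdreproc_t)nfs4_xdr_enc_read,
		.p_decode  = (kxdrdproc_t)nfs4_xdr_dec_read,
		.p_arglen  = NFS4_enc_read_sz,
		.p_replen  = NFS4_dec_read_sz,
		.p_statidx = NFSPROC4_CLNT_READ,
	},
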
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index b68536cc9046..e1164e3f9e69 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -26,12 +26,9 @@ static struct kmem_cache *nfs_page_cachep;
26static inline struct nfs_page * 26static inline struct nfs_page *
27nfs_page_alloc(void) 27nfs_page_alloc(void)
28{ 28{
29 struct nfs_page *p; 29 struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
30 p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL); 30 if (p)
31 if (p) {
32 memset(p, 0, sizeof(*p));
33 INIT_LIST_HEAD(&p->wb_list); 31 INIT_LIST_HEAD(&p->wb_list);
34 }
35 return p; 32 return p;
36} 33}
37 34
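
The nfs_page_alloc() change is a pure consolidation: kmem_cache_zalloc()
is defined in the slab headers as an allocation with __GFP_ZERO, so the
explicit memset() becomes redundant. Equivalent definition, for reference:

	static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
	{
		return kmem_cache_alloc(k, flags | __GFP_ZERO);
	}
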
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index db773428f95f..bc4089769735 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,105 +177,149 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
177 * pNFS client layout cache 177 * pNFS client layout cache
178 */ 178 */
179 179
180/* Need to hold i_lock if caller does not already hold reference */
181void
182get_layout_hdr(struct pnfs_layout_hdr *lo)
183{
184 atomic_inc(&lo->plh_refcount);
185}
186
180static void 187static void
181get_layout_hdr_locked(struct pnfs_layout_hdr *lo) 188destroy_layout_hdr(struct pnfs_layout_hdr *lo)
182{ 189{
183 assert_spin_locked(&lo->inode->i_lock); 190 dprintk("%s: freeing layout cache %p\n", __func__, lo);
184 lo->refcount++; 191 BUG_ON(!list_empty(&lo->plh_layouts));
192 NFS_I(lo->plh_inode)->layout = NULL;
193 kfree(lo);
185} 194}
186 195
187static void 196static void
188put_layout_hdr_locked(struct pnfs_layout_hdr *lo) 197put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
189{ 198{
190 assert_spin_locked(&lo->inode->i_lock); 199 if (atomic_dec_and_test(&lo->plh_refcount))
191 BUG_ON(lo->refcount == 0); 200 destroy_layout_hdr(lo);
192
193 lo->refcount--;
194 if (!lo->refcount) {
195 dprintk("%s: freeing layout cache %p\n", __func__, lo);
196 BUG_ON(!list_empty(&lo->layouts));
197 NFS_I(lo->inode)->layout = NULL;
198 kfree(lo);
199 }
200} 201}
201 202
202void 203void
203put_layout_hdr(struct inode *inode) 204put_layout_hdr(struct pnfs_layout_hdr *lo)
204{ 205{
205 spin_lock(&inode->i_lock); 206 struct inode *inode = lo->plh_inode;
206 put_layout_hdr_locked(NFS_I(inode)->layout); 207
207 spin_unlock(&inode->i_lock); 208 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
209 destroy_layout_hdr(lo);
210 spin_unlock(&inode->i_lock);
211 }
208} 212}
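
put_layout_hdr() now relies on atomic_dec_and_lock(), which takes i_lock
only when the refcount actually drops to zero. A sketch of that helper's
logic, open-coded (the real implementation lives in the generic kernel
library, not in this patch):

	static int dec_and_lock_sketch(atomic_t *cnt, spinlock_t *lock)
	{
		/* Fast path: decrement while the count stays above one. */
		if (atomic_add_unless(cnt, -1, 1))
			return 0;
		/* Slow path: possibly the last reference; decide under the lock. */
		spin_lock(lock);
		if (atomic_dec_and_test(cnt))
			return 1;	/* count hit zero; caller must unlock */
		spin_unlock(lock);
		return 0;
	}
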
209 213
210static void 214static void
211init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) 215init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
212{ 216{
213 INIT_LIST_HEAD(&lseg->fi_list); 217 INIT_LIST_HEAD(&lseg->pls_list);
214 kref_init(&lseg->kref); 218 atomic_set(&lseg->pls_refcount, 1);
215 lseg->layout = lo; 219 smp_mb();
220 set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
221 lseg->pls_layout = lo;
216} 222}
217 223
218/* Called without i_lock held, as the free_lseg call may sleep */ 224static void free_lseg(struct pnfs_layout_segment *lseg)
219static void
220destroy_lseg(struct kref *kref)
221{ 225{
222 struct pnfs_layout_segment *lseg = 226 struct inode *ino = lseg->pls_layout->plh_inode;
223 container_of(kref, struct pnfs_layout_segment, kref);
224 struct inode *ino = lseg->layout->inode;
225 227
226 dprintk("--> %s\n", __func__);
227 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); 228 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
228 /* Matched by get_layout_hdr_locked in pnfs_insert_layout */ 229 /* Matched by get_layout_hdr in pnfs_insert_layout */
229 put_layout_hdr(ino); 230 put_layout_hdr(NFS_I(ino)->layout);
230} 231}
231 232
232static void 233/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
233put_lseg(struct pnfs_layout_segment *lseg) 234 * could sleep, so must be called outside of the lock.
 235 * Returns 1 if the object was removed, otherwise 0.
236 */
237static int
238put_lseg_locked(struct pnfs_layout_segment *lseg,
239 struct list_head *tmp_list)
240{
241 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
242 atomic_read(&lseg->pls_refcount),
243 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
244 if (atomic_dec_and_test(&lseg->pls_refcount)) {
245 struct inode *ino = lseg->pls_layout->plh_inode;
246
247 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
248 list_del(&lseg->pls_list);
249 if (list_empty(&lseg->pls_layout->plh_segs)) {
250 struct nfs_client *clp;
251
252 clp = NFS_SERVER(ino)->nfs_client;
253 spin_lock(&clp->cl_lock);
254 /* List does not take a reference, so no need for put here */
255 list_del_init(&lseg->pls_layout->plh_layouts);
256 spin_unlock(&clp->cl_lock);
257 clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
258 }
259 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
260 list_add(&lseg->pls_list, tmp_list);
261 return 1;
262 }
263 return 0;
264}
265
266static bool
267should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
234{ 268{
235 if (!lseg) 269 return (recall_iomode == IOMODE_ANY ||
236 return; 270 lseg_iomode == recall_iomode);
271}
237 272
238 dprintk("%s: lseg %p ref %d\n", __func__, lseg, 273/* Returns 1 if lseg is removed from list, 0 otherwise */
239 atomic_read(&lseg->kref.refcount)); 274static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
240 kref_put(&lseg->kref, destroy_lseg); 275 struct list_head *tmp_list)
276{
277 int rv = 0;
278
279 if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
280 /* Remove the reference keeping the lseg in the
281 * list. It will now be removed when all
282 * outstanding io is finished.
283 */
284 rv = put_lseg_locked(lseg, tmp_list);
285 }
286 return rv;
241} 287}
242 288
243static void 289/* Returns count of number of matching invalid lsegs remaining in list
244pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list) 290 * after call.
291 */
292int
293mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
294 struct list_head *tmp_list,
295 u32 iomode)
245{ 296{
246 struct pnfs_layout_segment *lseg, *next; 297 struct pnfs_layout_segment *lseg, *next;
247 struct nfs_client *clp; 298 int invalid = 0, removed = 0;
248 299
249 dprintk("%s:Begin lo %p\n", __func__, lo); 300 dprintk("%s:Begin lo %p\n", __func__, lo);
250 301
251 assert_spin_locked(&lo->inode->i_lock); 302 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
252 list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) { 303 if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
253 dprintk("%s: freeing lseg %p\n", __func__, lseg); 304 dprintk("%s: freeing lseg %p iomode %d "
254 list_move(&lseg->fi_list, tmp_list); 305 "offset %llu length %llu\n", __func__,
255 } 306 lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
256 clp = NFS_SERVER(lo->inode)->nfs_client; 307 lseg->pls_range.length);
257 spin_lock(&clp->cl_lock); 308 invalid++;
258 /* List does not take a reference, so no need for put here */ 309 removed += mark_lseg_invalid(lseg, tmp_list);
259 list_del_init(&lo->layouts); 310 }
260 spin_unlock(&clp->cl_lock); 311 dprintk("%s:Return %i\n", __func__, invalid - removed);
261 write_seqlock(&lo->seqlock); 312 return invalid - removed;
262 clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
263 write_sequnlock(&lo->seqlock);
264
265 dprintk("%s:Return\n", __func__);
266} 313}
267 314
268static void 315void
269pnfs_free_lseg_list(struct list_head *tmp_list) 316pnfs_free_lseg_list(struct list_head *free_me)
270{ 317{
271 struct pnfs_layout_segment *lseg; 318 struct pnfs_layout_segment *lseg, *tmp;
272 319
273 while (!list_empty(tmp_list)) { 320 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
274 lseg = list_entry(tmp_list->next, struct pnfs_layout_segment, 321 list_del(&lseg->pls_list);
275 fi_list); 322 free_lseg(lseg);
276 dprintk("%s calling put_lseg on %p\n", __func__, lseg);
277 list_del(&lseg->fi_list);
278 put_lseg(lseg);
279 } 323 }
280} 324}
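
The tmp_list comment above describes the locking discipline every caller
in this patch follows: segments are unhooked from the layout under
i_lock, but ->free_lseg() may sleep, so the actual freeing happens only
after the unlock. Distilled into the caller pattern (pnfs_roc() below is
a concrete instance of it):

	LIST_HEAD(tmp_list);

	spin_lock(&ino->i_lock);
	/* detach victims under the lock; nothing is freed yet */
	mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
	spin_unlock(&ino->i_lock);

	/* now safe to call into the layout driver, which may sleep */
	pnfs_free_lseg_list(&tmp_list);
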
281 325
@@ -288,7 +332,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
288 spin_lock(&nfsi->vfs_inode.i_lock); 332 spin_lock(&nfsi->vfs_inode.i_lock);
289 lo = nfsi->layout; 333 lo = nfsi->layout;
290 if (lo) { 334 if (lo) {
291 pnfs_clear_lseg_list(lo, &tmp_list); 335 set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
336 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
292 /* Matched by refcount set to 1 in alloc_init_layout_hdr */ 337 /* Matched by refcount set to 1 in alloc_init_layout_hdr */
293 put_layout_hdr_locked(lo); 338 put_layout_hdr_locked(lo);
294 } 339 }
@@ -312,76 +357,80 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
312 357
313 while (!list_empty(&tmp_list)) { 358 while (!list_empty(&tmp_list)) {
314 lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, 359 lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
315 layouts); 360 plh_layouts);
316 dprintk("%s freeing layout for inode %lu\n", __func__, 361 dprintk("%s freeing layout for inode %lu\n", __func__,
317 lo->inode->i_ino); 362 lo->plh_inode->i_ino);
318 pnfs_destroy_layout(NFS_I(lo->inode)); 363 pnfs_destroy_layout(NFS_I(lo->plh_inode));
319 } 364 }
320} 365}
321 366
322/* update lo->stateid with new if is more recent 367/* update lo->plh_stateid with new if is more recent */
323 * 368void
324 * lo->stateid could be the open stateid, in which case we just use what given. 369pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
325 */ 370 bool update_barrier)
326static void 371{
327pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, 372 u32 oldseq, newseq;
328 const nfs4_stateid *new) 373
329{ 374 oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
330 nfs4_stateid *old = &lo->stateid; 375 newseq = be32_to_cpu(new->stateid.seqid);
331 bool overwrite = false; 376 if ((int)(newseq - oldseq) > 0) {
332 377 memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
333 write_seqlock(&lo->seqlock); 378 if (update_barrier) {
334 if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) || 379 u32 new_barrier = be32_to_cpu(new->stateid.seqid);
335 memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other))) 380
336 overwrite = true; 381 if ((int)(new_barrier - lo->plh_barrier))
337 else { 382 lo->plh_barrier = new_barrier;
338 u32 oldseq, newseq; 383 } else {
339 384 /* Because of wraparound, we want to keep the barrier
340 oldseq = be32_to_cpu(old->stateid.seqid); 385 * "close" to the current seqids. It needs to be
341 newseq = be32_to_cpu(new->stateid.seqid); 386 * within 2**31 to count as "behind", so if it
 342 if ((int)(newseq - oldseq) > 0) 387 * gets too near that limit, give us a little leeway
343 overwrite = true; 388 * and bring it to within 2**30.
389 * NOTE - and yes, this is all unsigned arithmetic.
390 */
391 if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
392 lo->plh_barrier = newseq - (1 << 30);
393 }
344 } 394 }
345 if (overwrite)
346 memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
347 write_sequnlock(&lo->seqlock);
348} 395}
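
The signed-cast comparison used throughout pnfs_set_layout_stateid() is
the standard serial-number trick: a u32 seqid counts as "more recent"
when the unsigned difference, reinterpreted as a signed int, is positive,
and this stays correct across wraparound as long as the two values are
within 2**31 of each other. A worked example with values straddling the
wrap:

	u32 oldseq = 0xfffffffeU;	/* two steps before wraparound */
	u32 newseq = 0x00000003U;	/* five increments later */

	/* Unsigned subtraction gives 0x00000003 - 0xfffffffe == 5 (mod 2^32),
	 * so the signed cast yields +5 and newseq is correctly treated as
	 * newer, even though newseq < oldseq as plain unsigned values. */
	BUG_ON((int)(newseq - oldseq) <= 0);
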
349 396
350static void 397/* lget is set to 1 if called from inside send_layoutget call chain */
351pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, 398static bool
352 struct nfs4_state *state) 399pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
400 int lget)
353{ 401{
354 int seq; 402 if ((stateid) &&
355 403 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
356 dprintk("--> %s\n", __func__); 404 return true;
357 write_seqlock(&lo->seqlock); 405 return lo->plh_block_lgets ||
358 do { 406 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
359 seq = read_seqbegin(&state->seqlock); 407 (list_empty(&lo->plh_segs) &&
360 memcpy(lo->stateid.data, state->stateid.data, 408 (atomic_read(&lo->plh_outstanding) > lget));
361 sizeof(state->stateid.data));
362 } while (read_seqretry(&state->seqlock, seq));
363 set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
364 write_sequnlock(&lo->seqlock);
365 dprintk("<-- %s\n", __func__);
366} 409}
367 410
368void 411int
369pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 412pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
370 struct nfs4_state *open_state) 413 struct nfs4_state *open_state)
371{ 414{
372 int seq; 415 int status = 0;
373 416
374 dprintk("--> %s\n", __func__); 417 dprintk("--> %s\n", __func__);
375 do { 418 spin_lock(&lo->plh_inode->i_lock);
376 seq = read_seqbegin(&lo->seqlock); 419 if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
377 if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) { 420 status = -EAGAIN;
378 /* This will trigger retry of the read */ 421 } else if (list_empty(&lo->plh_segs)) {
379 pnfs_layout_from_open_stateid(lo, open_state); 422 int seq;
380 } else 423
381 memcpy(dst->data, lo->stateid.data, 424 do {
382 sizeof(lo->stateid.data)); 425 seq = read_seqbegin(&open_state->seqlock);
383 } while (read_seqretry(&lo->seqlock, seq)); 426 memcpy(dst->data, open_state->stateid.data,
427 sizeof(open_state->stateid.data));
428 } while (read_seqretry(&open_state->seqlock, seq));
429 } else
430 memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
431 spin_unlock(&lo->plh_inode->i_lock);
384 dprintk("<-- %s\n", __func__); 432 dprintk("<-- %s\n", __func__);
433 return status;
385} 434}
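
The do/while loop in pnfs_choose_layoutget_stateid() is the seqlock
reader idiom: copy the open stateid, then retry if read_seqretry()
reports that a writer raced with the copy. In isolation (a sketch; the
field names are those used above):

	static void copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
	{
		unsigned int seq;

		do {
			seq = read_seqbegin(&state->seqlock);
			memcpy(dst->data, state->stateid.data, sizeof(dst->data));
		} while (read_seqretry(&state->seqlock, seq));
	}
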
386 435
387/* 436/*
@@ -395,7 +444,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
395 struct nfs_open_context *ctx, 444 struct nfs_open_context *ctx,
396 u32 iomode) 445 u32 iomode)
397{ 446{
398 struct inode *ino = lo->inode; 447 struct inode *ino = lo->plh_inode;
399 struct nfs_server *server = NFS_SERVER(ino); 448 struct nfs_server *server = NFS_SERVER(ino);
400 struct nfs4_layoutget *lgp; 449 struct nfs4_layoutget *lgp;
401 struct pnfs_layout_segment *lseg = NULL; 450 struct pnfs_layout_segment *lseg = NULL;
@@ -404,10 +453,8 @@ send_layoutget(struct pnfs_layout_hdr *lo,
404 453
405 BUG_ON(ctx == NULL); 454 BUG_ON(ctx == NULL);
406 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); 455 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
407 if (lgp == NULL) { 456 if (lgp == NULL)
408 put_layout_hdr(lo->inode);
409 return NULL; 457 return NULL;
410 }
411 lgp->args.minlength = NFS4_MAX_UINT64; 458 lgp->args.minlength = NFS4_MAX_UINT64;
412 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; 459 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
413 lgp->args.range.iomode = iomode; 460 lgp->args.range.iomode = iomode;
@@ -424,11 +471,88 @@ send_layoutget(struct pnfs_layout_hdr *lo,
424 nfs4_proc_layoutget(lgp); 471 nfs4_proc_layoutget(lgp);
425 if (!lseg) { 472 if (!lseg) {
426 /* remember that LAYOUTGET failed and suspend trying */ 473 /* remember that LAYOUTGET failed and suspend trying */
427 set_bit(lo_fail_bit(iomode), &lo->state); 474 set_bit(lo_fail_bit(iomode), &lo->plh_flags);
428 } 475 }
429 return lseg; 476 return lseg;
430} 477}
431 478
479bool pnfs_roc(struct inode *ino)
480{
481 struct pnfs_layout_hdr *lo;
482 struct pnfs_layout_segment *lseg, *tmp;
483 LIST_HEAD(tmp_list);
484 bool found = false;
485
486 spin_lock(&ino->i_lock);
487 lo = NFS_I(ino)->layout;
488 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
489 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
490 goto out_nolayout;
491 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
492 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
493 mark_lseg_invalid(lseg, &tmp_list);
494 found = true;
495 }
496 if (!found)
497 goto out_nolayout;
498 lo->plh_block_lgets++;
499 get_layout_hdr(lo); /* matched in pnfs_roc_release */
500 spin_unlock(&ino->i_lock);
501 pnfs_free_lseg_list(&tmp_list);
502 return true;
503
504out_nolayout:
505 spin_unlock(&ino->i_lock);
506 return false;
507}
508
509void pnfs_roc_release(struct inode *ino)
510{
511 struct pnfs_layout_hdr *lo;
512
513 spin_lock(&ino->i_lock);
514 lo = NFS_I(ino)->layout;
515 lo->plh_block_lgets--;
516 put_layout_hdr_locked(lo);
517 spin_unlock(&ino->i_lock);
518}
519
520void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
521{
522 struct pnfs_layout_hdr *lo;
523
524 spin_lock(&ino->i_lock);
525 lo = NFS_I(ino)->layout;
526 if ((int)(barrier - lo->plh_barrier) > 0)
527 lo->plh_barrier = barrier;
528 spin_unlock(&ino->i_lock);
529}
530
531bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
532{
533 struct nfs_inode *nfsi = NFS_I(ino);
534 struct pnfs_layout_segment *lseg;
535 bool found = false;
536
537 spin_lock(&ino->i_lock);
538 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
539 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
540 found = true;
541 break;
542 }
543 if (!found) {
544 struct pnfs_layout_hdr *lo = nfsi->layout;
545 u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);
546
547 /* Since close does not return a layout stateid for use as
548 * a barrier, we choose the worst-case barrier.
549 */
550 *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
551 }
552 spin_unlock(&ino->i_lock);
553 return found;
554}
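
Taken together, pnfs_roc(), pnfs_roc_release(), pnfs_roc_set_barrier()
and pnfs_roc_drain() implement the return-on-close handshake for CLOSE.
A hypothetical caller, shown only to make the intended ordering explicit
(the real wiring lives in the NFSv4 state machine, and a real caller
sleeps instead of spinning):

	void close_roc_sketch(struct inode *ino)
	{
		u32 barrier = 0;

		if (!pnfs_roc(ino))
			return;		/* nothing marked return-on-close */

		/* ...CLOSE is sent to the server at this point... */

		while (pnfs_roc_drain(ino, &barrier))
			;		/* ROC segments still busy; really: wait */

		pnfs_roc_set_barrier(ino, barrier);
		pnfs_roc_release(ino);	/* drops the ref taken in pnfs_roc() */
	}
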
555
432/* 556/*
433 * Compare two layout segments for sorting into layout cache. 557 * Compare two layout segments for sorting into layout cache.
434 * We want to preferentially return RW over RO layouts, so ensure those 558 * We want to preferentially return RW over RO layouts, so ensure those
@@ -450,37 +574,29 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
450 574
451 dprintk("%s:Begin\n", __func__); 575 dprintk("%s:Begin\n", __func__);
452 576
453 assert_spin_locked(&lo->inode->i_lock); 577 assert_spin_locked(&lo->plh_inode->i_lock);
454 if (list_empty(&lo->segs)) { 578 list_for_each_entry(lp, &lo->plh_segs, pls_list) {
455 struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client; 579 if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
456
457 spin_lock(&clp->cl_lock);
458 BUG_ON(!list_empty(&lo->layouts));
459 list_add_tail(&lo->layouts, &clp->cl_layouts);
460 spin_unlock(&clp->cl_lock);
461 }
462 list_for_each_entry(lp, &lo->segs, fi_list) {
463 if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
464 continue; 580 continue;
465 list_add_tail(&lseg->fi_list, &lp->fi_list); 581 list_add_tail(&lseg->pls_list, &lp->pls_list);
466 dprintk("%s: inserted lseg %p " 582 dprintk("%s: inserted lseg %p "
467 "iomode %d offset %llu length %llu before " 583 "iomode %d offset %llu length %llu before "
468 "lp %p iomode %d offset %llu length %llu\n", 584 "lp %p iomode %d offset %llu length %llu\n",
469 __func__, lseg, lseg->range.iomode, 585 __func__, lseg, lseg->pls_range.iomode,
470 lseg->range.offset, lseg->range.length, 586 lseg->pls_range.offset, lseg->pls_range.length,
471 lp, lp->range.iomode, lp->range.offset, 587 lp, lp->pls_range.iomode, lp->pls_range.offset,
472 lp->range.length); 588 lp->pls_range.length);
473 found = 1; 589 found = 1;
474 break; 590 break;
475 } 591 }
476 if (!found) { 592 if (!found) {
477 list_add_tail(&lseg->fi_list, &lo->segs); 593 list_add_tail(&lseg->pls_list, &lo->plh_segs);
478 dprintk("%s: inserted lseg %p " 594 dprintk("%s: inserted lseg %p "
479 "iomode %d offset %llu length %llu at tail\n", 595 "iomode %d offset %llu length %llu at tail\n",
480 __func__, lseg, lseg->range.iomode, 596 __func__, lseg, lseg->pls_range.iomode,
481 lseg->range.offset, lseg->range.length); 597 lseg->pls_range.offset, lseg->pls_range.length);
482 } 598 }
483 get_layout_hdr_locked(lo); 599 get_layout_hdr(lo);
484 600
485 dprintk("%s:Return\n", __func__); 601 dprintk("%s:Return\n", __func__);
486} 602}
@@ -493,11 +609,11 @@ alloc_init_layout_hdr(struct inode *ino)
493 lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); 609 lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
494 if (!lo) 610 if (!lo)
495 return NULL; 611 return NULL;
496 lo->refcount = 1; 612 atomic_set(&lo->plh_refcount, 1);
497 INIT_LIST_HEAD(&lo->layouts); 613 INIT_LIST_HEAD(&lo->plh_layouts);
498 INIT_LIST_HEAD(&lo->segs); 614 INIT_LIST_HEAD(&lo->plh_segs);
499 seqlock_init(&lo->seqlock); 615 INIT_LIST_HEAD(&lo->plh_bulk_recall);
500 lo->inode = ino; 616 lo->plh_inode = ino;
501 return lo; 617 return lo;
502} 618}
503 619
@@ -510,9 +626,12 @@ pnfs_find_alloc_layout(struct inode *ino)
510 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); 626 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
511 627
512 assert_spin_locked(&ino->i_lock); 628 assert_spin_locked(&ino->i_lock);
513 if (nfsi->layout) 629 if (nfsi->layout) {
514 return nfsi->layout; 630 if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
515 631 return NULL;
632 else
633 return nfsi->layout;
634 }
516 spin_unlock(&ino->i_lock); 635 spin_unlock(&ino->i_lock);
517 new = alloc_init_layout_hdr(ino); 636 new = alloc_init_layout_hdr(ino);
518 spin_lock(&ino->i_lock); 637 spin_lock(&ino->i_lock);
@@ -538,31 +657,32 @@ pnfs_find_alloc_layout(struct inode *ino)
538static int 657static int
539is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) 658is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
540{ 659{
541 return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW); 660 return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
542} 661}
543 662
544/* 663/*
545 * lookup range in layout 664 * lookup range in layout
546 */ 665 */
547static struct pnfs_layout_segment * 666static struct pnfs_layout_segment *
548pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode) 667pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
549{ 668{
550 struct pnfs_layout_segment *lseg, *ret = NULL; 669 struct pnfs_layout_segment *lseg, *ret = NULL;
551 670
552 dprintk("%s:Begin\n", __func__); 671 dprintk("%s:Begin\n", __func__);
553 672
554 assert_spin_locked(&lo->inode->i_lock); 673 assert_spin_locked(&lo->plh_inode->i_lock);
555 list_for_each_entry(lseg, &lo->segs, fi_list) { 674 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
556 if (is_matching_lseg(lseg, iomode)) { 675 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
676 is_matching_lseg(lseg, iomode)) {
557 ret = lseg; 677 ret = lseg;
558 break; 678 break;
559 } 679 }
560 if (cmp_layout(iomode, lseg->range.iomode) > 0) 680 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
561 break; 681 break;
562 } 682 }
563 683
564 dprintk("%s:Return lseg %p ref %d\n", 684 dprintk("%s:Return lseg %p ref %d\n",
565 __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0); 685 __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
566 return ret; 686 return ret;
567} 687}
568 688
@@ -576,6 +696,7 @@ pnfs_update_layout(struct inode *ino,
576 enum pnfs_iomode iomode) 696 enum pnfs_iomode iomode)
577{ 697{
578 struct nfs_inode *nfsi = NFS_I(ino); 698 struct nfs_inode *nfsi = NFS_I(ino);
699 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
579 struct pnfs_layout_hdr *lo; 700 struct pnfs_layout_hdr *lo;
580 struct pnfs_layout_segment *lseg = NULL; 701 struct pnfs_layout_segment *lseg = NULL;
581 702
@@ -588,25 +709,53 @@ pnfs_update_layout(struct inode *ino,
588 goto out_unlock; 709 goto out_unlock;
589 } 710 }
590 711
591 /* Check to see if the layout for the given range already exists */ 712 /* Do we even need to bother with this? */
592 lseg = pnfs_has_layout(lo, iomode); 713 if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
593 if (lseg) { 714 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
594 dprintk("%s: Using cached lseg %p for iomode %d)\n", 715 dprintk("%s matches recall, use MDS\n", __func__);
595 __func__, lseg, iomode);
596 goto out_unlock; 716 goto out_unlock;
597 } 717 }
718 /* Check to see if the layout for the given range already exists */
719 lseg = pnfs_find_lseg(lo, iomode);
720 if (lseg)
721 goto out_unlock;
598 722
599 /* if LAYOUTGET already failed once we don't try again */ 723 /* if LAYOUTGET already failed once we don't try again */
600 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) 724 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
725 goto out_unlock;
726
727 if (pnfs_layoutgets_blocked(lo, NULL, 0))
601 goto out_unlock; 728 goto out_unlock;
729 atomic_inc(&lo->plh_outstanding);
602 730
603 get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */ 731 get_layout_hdr(lo);
732 if (list_empty(&lo->plh_segs)) {
733 /* The lo must be on the clp list if there is any
734 * chance of a CB_LAYOUTRECALL(FILE) coming in.
735 */
736 spin_lock(&clp->cl_lock);
737 BUG_ON(!list_empty(&lo->plh_layouts));
738 list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
739 spin_unlock(&clp->cl_lock);
740 }
604 spin_unlock(&ino->i_lock); 741 spin_unlock(&ino->i_lock);
605 742
606 lseg = send_layoutget(lo, ctx, iomode); 743 lseg = send_layoutget(lo, ctx, iomode);
744 if (!lseg) {
745 spin_lock(&ino->i_lock);
746 if (list_empty(&lo->plh_segs)) {
747 spin_lock(&clp->cl_lock);
748 list_del_init(&lo->plh_layouts);
749 spin_unlock(&clp->cl_lock);
750 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
751 }
752 spin_unlock(&ino->i_lock);
753 }
754 atomic_dec(&lo->plh_outstanding);
755 put_layout_hdr(lo);
607out: 756out:
608 dprintk("%s end, state 0x%lx lseg %p\n", __func__, 757 dprintk("%s end, state 0x%lx lseg %p\n", __func__,
609 nfsi->layout->state, lseg); 758 nfsi->layout->plh_flags, lseg);
610 return lseg; 759 return lseg;
611out_unlock: 760out_unlock:
612 spin_unlock(&ino->i_lock); 761 spin_unlock(&ino->i_lock);
@@ -619,9 +768,21 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
619 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; 768 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
620 struct nfs4_layoutget_res *res = &lgp->res; 769 struct nfs4_layoutget_res *res = &lgp->res;
621 struct pnfs_layout_segment *lseg; 770 struct pnfs_layout_segment *lseg;
622 struct inode *ino = lo->inode; 771 struct inode *ino = lo->plh_inode;
772 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
623 int status = 0; 773 int status = 0;
624 774
775 /* Verify we got what we asked for.
776 * Note that because the xdr parsing only accepts a single
777 * element array, this can fail even if the server is behaving
778 * correctly.
779 */
780 if (lgp->args.range.iomode > res->range.iomode ||
781 res->range.offset != 0 ||
782 res->range.length != NFS4_MAX_UINT64) {
783 status = -EINVAL;
784 goto out;
785 }
625 /* Inject layout blob into I/O device driver */ 786 /* Inject layout blob into I/O device driver */
626 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res); 787 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
627 if (!lseg || IS_ERR(lseg)) { 788 if (!lseg || IS_ERR(lseg)) {
@@ -635,16 +796,37 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
635 } 796 }
636 797
637 spin_lock(&ino->i_lock); 798 spin_lock(&ino->i_lock);
799 if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
800 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
801 dprintk("%s forget reply due to recall\n", __func__);
802 goto out_forget_reply;
803 }
804
805 if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
806 dprintk("%s forget reply due to state\n", __func__);
807 goto out_forget_reply;
808 }
638 init_lseg(lo, lseg); 809 init_lseg(lo, lseg);
639 lseg->range = res->range; 810 lseg->pls_range = res->range;
640 *lgp->lsegpp = lseg; 811 *lgp->lsegpp = lseg;
641 pnfs_insert_layout(lo, lseg); 812 pnfs_insert_layout(lo, lseg);
642 813
814 if (res->return_on_close) {
815 set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
816 set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
817 }
818
643 /* Done processing layoutget. Set the layout stateid */ 819 /* Done processing layoutget. Set the layout stateid */
644 pnfs_set_layout_stateid(lo, &res->stateid); 820 pnfs_set_layout_stateid(lo, &res->stateid, false);
645 spin_unlock(&ino->i_lock); 821 spin_unlock(&ino->i_lock);
646out: 822out:
647 return status; 823 return status;
824
825out_forget_reply:
826 spin_unlock(&ino->i_lock);
827 lseg->pls_layout = lo;
828 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
829 goto out;
648} 830}
649 831
650/* 832/*
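The hunks above also switch the segment refcount from a struct kref to a bare
atomic_t (pls_refcount), so segment lifetime is now managed by hand under the
inode's i_lock rather than through kref release callbacks. A minimal sketch of
the get/put pair this implies — the helper names here are hypothetical, not
taken from the patch:

	static void example_get_lseg(struct pnfs_layout_segment *lseg)
	{
		atomic_inc(&lseg->pls_refcount);
	}

	/* Assumes the caller holds the inode's i_lock, as pnfs_find_lseg()
	 * does above; the final teardown is driver-specific (free_lseg). */
	static void example_put_lseg_locked(struct pnfs_layout_segment *lseg)
	{
		if (atomic_dec_and_test(&lseg->pls_refcount))
			list_del_init(&lseg->pls_list);
	}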
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e12367d50489..e2612ea0cbed 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,11 +30,17 @@
30#ifndef FS_NFS_PNFS_H 30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H 31#define FS_NFS_PNFS_H
32 32
33enum {
34 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
35 NFS_LSEG_ROC, /* roc bit received from server */
36};
37
33struct pnfs_layout_segment { 38struct pnfs_layout_segment {
34 struct list_head fi_list; 39 struct list_head pls_list;
35 struct pnfs_layout_range range; 40 struct pnfs_layout_range pls_range;
36 struct kref kref; 41 atomic_t pls_refcount;
37 struct pnfs_layout_hdr *layout; 42 unsigned long pls_flags;
43 struct pnfs_layout_hdr *pls_layout;
38}; 44};
39 45
40#ifdef CONFIG_NFS_V4_1 46#ifdef CONFIG_NFS_V4_1
@@ -44,7 +50,9 @@ struct pnfs_layout_segment {
44enum { 50enum {
45 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ 51 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
46 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ 52 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
47 NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */ 53 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
54 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
55 NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
48}; 56};
49 57
50/* Per-layout driver specific registration structure */ 58/* Per-layout driver specific registration structure */
@@ -60,13 +68,16 @@ struct pnfs_layoutdriver_type {
60}; 68};
61 69
62struct pnfs_layout_hdr { 70struct pnfs_layout_hdr {
63 unsigned long refcount; 71 atomic_t plh_refcount;
64 struct list_head layouts; /* other client layouts */ 72 struct list_head plh_layouts; /* other client layouts */
65 struct list_head segs; /* layout segments list */ 73 struct list_head plh_bulk_recall; /* clnt list of bulk recalls */
66 seqlock_t seqlock; /* Protects the stateid */ 74 struct list_head plh_segs; /* layout segments list */
67 nfs4_stateid stateid; 75 nfs4_stateid plh_stateid;
68 unsigned long state; 76 atomic_t plh_outstanding; /* number of RPCs out */
69 struct inode *inode; 77 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
78 u32 plh_barrier; /* ignore lower seqids */
79 unsigned long plh_flags;
80 struct inode *plh_inode;
70}; 81};
71 82
72struct pnfs_device { 83struct pnfs_device {
@@ -134,17 +145,30 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
134extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); 145extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
135 146
136/* pnfs.c */ 147/* pnfs.c */
148void get_layout_hdr(struct pnfs_layout_hdr *lo);
137struct pnfs_layout_segment * 149struct pnfs_layout_segment *
138pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 150pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
139 enum pnfs_iomode access_type); 151 enum pnfs_iomode access_type);
140void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 152void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
141void unset_pnfs_layoutdriver(struct nfs_server *); 153void unset_pnfs_layoutdriver(struct nfs_server *);
142int pnfs_layout_process(struct nfs4_layoutget *lgp); 154int pnfs_layout_process(struct nfs4_layoutget *lgp);
155void pnfs_free_lseg_list(struct list_head *tmp_list);
143void pnfs_destroy_layout(struct nfs_inode *); 156void pnfs_destroy_layout(struct nfs_inode *);
144void pnfs_destroy_all_layouts(struct nfs_client *); 157void pnfs_destroy_all_layouts(struct nfs_client *);
145void put_layout_hdr(struct inode *inode); 158void put_layout_hdr(struct pnfs_layout_hdr *lo);
146void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 159void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
147 struct nfs4_state *open_state); 160 const nfs4_stateid *new,
161 bool update_barrier);
162int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
163 struct pnfs_layout_hdr *lo,
164 struct nfs4_state *open_state);
165int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
166 struct list_head *tmp_list,
167 u32 iomode);
168bool pnfs_roc(struct inode *ino);
169void pnfs_roc_release(struct inode *ino);
170void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
171bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
148 172
149 173
150static inline int lo_fail_bit(u32 iomode) 174static inline int lo_fail_bit(u32 iomode)
@@ -176,6 +200,28 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
176 return NULL; 200 return NULL;
177} 201}
178 202
203static inline bool
204pnfs_roc(struct inode *ino)
205{
206 return false;
207}
208
209static inline void
210pnfs_roc_release(struct inode *ino)
211{
212}
213
214static inline void
215pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
216{
217}
218
219static inline bool
220pnfs_roc_drain(struct inode *ino, u32 *barrier)
221{
222 return false;
223}
224
179static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) 225static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
180{ 226{
181} 227}
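The !CONFIG_NFS_V4_1 stubs above let generic NFS code call the pnfs_roc*()
entry points unconditionally: when pNFS is compiled out, the inline bodies
collapse to constants and the compiler discards the branches. A hedged sketch
of a caller that builds in both configurations (example_close() is
illustrative, not from the patch):

	static void example_close(struct inode *inode)
	{
		/* With CONFIG_NFS_V4_1 unset this resolves to the inline
		 * stub, pnfs_roc() returns false, and the branch is dead
		 * code — no #ifdef needed at the call site. */
		if (pnfs_roc(inode)) {
			/* layoutreturn-on-close handling (v4.1 only) */
			return;
		}
		/* ordinary CLOSE path */
	}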
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 58e7f84fc1fd..77d5e21c4ad6 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -458,7 +458,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
458 fattr = nfs_alloc_fattr(); 458 fattr = nfs_alloc_fattr();
459 status = -ENOMEM; 459 status = -ENOMEM;
460 if (fh == NULL || fattr == NULL) 460 if (fh == NULL || fattr == NULL)
461 goto out; 461 goto out_free;
462 462
463 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 463 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
464 nfs_mark_for_revalidate(dir); 464 nfs_mark_for_revalidate(dir);
@@ -471,6 +471,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
471 if (status == 0) 471 if (status == 0)
472 status = nfs_instantiate(dentry, fh, fattr); 472 status = nfs_instantiate(dentry, fh, fattr);
473 473
474out_free:
474 nfs_free_fattr(fattr); 475 nfs_free_fattr(fattr);
475 nfs_free_fhandle(fh); 476 nfs_free_fhandle(fh);
476out: 477out:
@@ -731,7 +732,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
731 .statfs = nfs_proc_statfs, 732 .statfs = nfs_proc_statfs,
732 .fsinfo = nfs_proc_fsinfo, 733 .fsinfo = nfs_proc_fsinfo,
733 .pathconf = nfs_proc_pathconf, 734 .pathconf = nfs_proc_pathconf,
734 .decode_dirent = nfs_decode_dirent, 735 .decode_dirent = nfs2_decode_dirent,
735 .read_setup = nfs_proc_read_setup, 736 .read_setup = nfs_proc_read_setup,
736 .read_done = nfs_read_done, 737 .read_done = nfs_read_done,
737 .write_setup = nfs_proc_write_setup, 738 .write_setup = nfs_proc_write_setup,
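The nfs_proc_symlink() fix above is the classic two-label unwind: jumping to
the plain `out` label on allocation failure skipped the free calls, leaking
whichever of the two objects had been allocated. A minimal sketch of the
corrected shape (illustrative names, not the real function body):

	static int example_op(void)
	{
		void *fh = kmalloc(16, GFP_KERNEL);
		void *fattr = kmalloc(32, GFP_KERNEL);
		int status = -ENOMEM;

		if (fh == NULL || fattr == NULL)
			goto out_free;	/* kfree(NULL) is a no-op, so
					 * freeing both is always safe */
		status = 0;		/* ... do the real work here ... */
	out_free:
		kfree(fattr);
		kfree(fh);
		return status;
	}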
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 4100630c9a5b..0f9ea73e7789 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -598,7 +598,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
598 598
599 if (nfss->mountd_version || showdefaults) 599 if (nfss->mountd_version || showdefaults)
600 seq_printf(m, ",mountvers=%u", nfss->mountd_version); 600 seq_printf(m, ",mountvers=%u", nfss->mountd_version);
601 if (nfss->mountd_port || showdefaults) 601 if ((nfss->mountd_port &&
602 nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) ||
603 showdefaults)
602 seq_printf(m, ",mountport=%u", nfss->mountd_port); 604 seq_printf(m, ",mountport=%u", nfss->mountd_port);
603 605
604 nfs_show_mountd_netid(m, nfss, showdefaults); 606 nfs_show_mountd_netid(m, nfss, showdefaults);
@@ -2494,7 +2496,13 @@ static void nfs4_clone_super(struct super_block *sb,
2494 sb->s_maxbytes = old_sb->s_maxbytes; 2496 sb->s_maxbytes = old_sb->s_maxbytes;
2495 sb->s_time_gran = 1; 2497 sb->s_time_gran = 1;
2496 sb->s_op = old_sb->s_op; 2498 sb->s_op = old_sb->s_op;
2497 nfs_initialise_sb(sb); 2499 /*
2500 * The VFS shouldn't apply the umask to mode bits. We will do
2501 * so ourselves when necessary.
2502 */
2503 sb->s_flags |= MS_POSIXACL;
2504 sb->s_xattr = old_sb->s_xattr;
2505 nfs_initialise_sb(sb);
2498} 2506}
2499 2507
2500/* 2508/*
@@ -2504,6 +2512,12 @@ static void nfs4_fill_super(struct super_block *sb)
2504{ 2512{
2505 sb->s_time_gran = 1; 2513 sb->s_time_gran = 1;
2506 sb->s_op = &nfs4_sops; 2514 sb->s_op = &nfs4_sops;
2515 /*
2516 * The VFS shouldn't apply the umask to mode bits. We will do
2517 * so ourselves when necessary.
2518 */
2519 sb->s_flags |= MS_POSIXACL;
2520 sb->s_xattr = nfs4_xattr_handlers;
2507 nfs_initialise_sb(sb); 2521 nfs_initialise_sb(sb);
2508} 2522}
2509 2523
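Setting MS_POSIXACL here works because the VFS only applies the process umask
to create modes when the superblock lacks that flag; with it set, the
filesystem applies the umask itself once it knows whether a default ACL
supplies the permissions. Roughly the check the VFS performs at create time
(paraphrased from the fs/namei.c pattern of this era, not part of this patch):

	if (!IS_POSIXACL(dir->d_inode))
		mode &= ~current_umask();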
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 8fe9eb47a97f..e313a51acdd1 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -429,7 +429,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
429 data = kzalloc(sizeof(*data), GFP_KERNEL); 429 data = kzalloc(sizeof(*data), GFP_KERNEL);
430 if (data == NULL) 430 if (data == NULL)
431 return ERR_PTR(-ENOMEM); 431 return ERR_PTR(-ENOMEM);
432 task_setup_data.callback_data = data, 432 task_setup_data.callback_data = data;
433 433
434 data->cred = rpc_lookup_cred(); 434 data->cred = rpc_lookup_cred();
435 if (IS_ERR(data->cred)) { 435 if (IS_ERR(data->cred)) {
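The one-character unlink.c fix replaces a comma operator with a semicolon.
The original compiled cleanly because `a = b, c;` is a single legal
expression, which is exactly why the typo survived review. A tiny standalone
illustration (not from the patch):

	int x;

	x = 1,		/* comma operator: still one statement ... */
	x = 2;		/* ... so both assignments run; harmless here, but
			 * it silently splices in whatever the next line
			 * happens to be */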
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 143da2eecd7b..21a63da305ff 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -50,11 +50,6 @@ enum {
50 NFSPROC4_CLNT_CB_SEQUENCE, 50 NFSPROC4_CLNT_CB_SEQUENCE,
51}; 51};
52 52
53enum nfs_cb_opnum4 {
54 OP_CB_RECALL = 4,
55 OP_CB_SEQUENCE = 11,
56};
57
58#define NFS4_MAXTAGLEN 20 53#define NFS4_MAXTAGLEN 20
59 54
60#define NFS4_enc_cb_null_sz 0 55#define NFS4_enc_cb_null_sz 0
@@ -79,61 +74,6 @@ enum nfs_cb_opnum4 {
79 cb_sequence_dec_sz + \ 74 cb_sequence_dec_sz + \
80 op_dec_sz) 75 op_dec_sz)
81 76
82/*
83* Generic encode routines from fs/nfs/nfs4xdr.c
84*/
85static inline __be32 *
86xdr_writemem(__be32 *p, const void *ptr, int nbytes)
87{
88 int tmp = XDR_QUADLEN(nbytes);
89 if (!tmp)
90 return p;
91 p[tmp-1] = 0;
92 memcpy(p, ptr, nbytes);
93 return p + tmp;
94}
95
96#define WRITE32(n) *p++ = htonl(n)
97#define WRITEMEM(ptr,nbytes) do { \
98 p = xdr_writemem(p, ptr, nbytes); \
99} while (0)
100#define RESERVE_SPACE(nbytes) do { \
101 p = xdr_reserve_space(xdr, nbytes); \
102 if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __func__); \
103 BUG_ON(!p); \
104} while (0)
105
106/*
107 * Generic decode routines from fs/nfs/nfs4xdr.c
108 */
109#define DECODE_TAIL \
110 status = 0; \
111out: \
112 return status; \
113xdr_error: \
114 dprintk("NFSD: xdr error! (%s:%d)\n", __FILE__, __LINE__); \
115 status = -EIO; \
116 goto out
117
118#define READ32(x) (x) = ntohl(*p++)
119#define READ64(x) do { \
120 (x) = (u64)ntohl(*p++) << 32; \
121 (x) |= ntohl(*p++); \
122} while (0)
123#define READTIME(x) do { \
124 p++; \
125 (x.tv_sec) = ntohl(*p++); \
126 (x.tv_nsec) = ntohl(*p++); \
127} while (0)
128#define READ_BUF(nbytes) do { \
129 p = xdr_inline_decode(xdr, nbytes); \
130 if (!p) { \
131 dprintk("NFSD: %s: reply buffer overflowed in line %d.\n", \
132 __func__, __LINE__); \
133 return -EIO; \
134 } \
135} while (0)
136
137struct nfs4_cb_compound_hdr { 77struct nfs4_cb_compound_hdr {
138 /* args */ 78 /* args */
139 u32 ident; /* minorversion 0 only */ 79 u32 ident; /* minorversion 0 only */
@@ -144,295 +84,513 @@ struct nfs4_cb_compound_hdr {
144 int status; 84 int status;
145}; 85};
146 86
147static struct { 87/*
148int stat; 88 * Handle decode buffer overflows out-of-line.
149int errno; 89 */
150} nfs_cb_errtbl[] = { 90static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
151 { NFS4_OK, 0 }, 91{
152 { NFS4ERR_PERM, EPERM }, 92 dprintk("NFS: %s prematurely hit the end of our receive buffer. "
153 { NFS4ERR_NOENT, ENOENT }, 93 "Remaining buffer length is %tu words.\n",
154 { NFS4ERR_IO, EIO }, 94 func, xdr->end - xdr->p);
155 { NFS4ERR_NXIO, ENXIO }, 95}
156 { NFS4ERR_ACCESS, EACCES },
157 { NFS4ERR_EXIST, EEXIST },
158 { NFS4ERR_XDEV, EXDEV },
159 { NFS4ERR_NOTDIR, ENOTDIR },
160 { NFS4ERR_ISDIR, EISDIR },
161 { NFS4ERR_INVAL, EINVAL },
162 { NFS4ERR_FBIG, EFBIG },
163 { NFS4ERR_NOSPC, ENOSPC },
164 { NFS4ERR_ROFS, EROFS },
165 { NFS4ERR_MLINK, EMLINK },
166 { NFS4ERR_NAMETOOLONG, ENAMETOOLONG },
167 { NFS4ERR_NOTEMPTY, ENOTEMPTY },
168 { NFS4ERR_DQUOT, EDQUOT },
169 { NFS4ERR_STALE, ESTALE },
170 { NFS4ERR_BADHANDLE, EBADHANDLE },
171 { NFS4ERR_BAD_COOKIE, EBADCOOKIE },
172 { NFS4ERR_NOTSUPP, ENOTSUPP },
173 { NFS4ERR_TOOSMALL, ETOOSMALL },
174 { NFS4ERR_SERVERFAULT, ESERVERFAULT },
175 { NFS4ERR_BADTYPE, EBADTYPE },
176 { NFS4ERR_LOCKED, EAGAIN },
177 { NFS4ERR_RESOURCE, EREMOTEIO },
178 { NFS4ERR_SYMLINK, ELOOP },
179 { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP },
180 { NFS4ERR_DEADLOCK, EDEADLK },
181 { -1, EIO }
182};
183 96
184static int 97static __be32 *xdr_encode_empty_array(__be32 *p)
185nfs_cb_stat_to_errno(int stat)
186{ 98{
187 int i; 99 *p++ = xdr_zero;
188 for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { 100 return p;
189 if (nfs_cb_errtbl[i].stat == stat)
190 return nfs_cb_errtbl[i].errno;
191 }
192 /* If we cannot translate the error, the recovery routines should
193 * handle it.
194 * Note: remaining NFSv4 error codes have values > 10000, so should
195 * not conflict with native Linux error codes.
196 */
197 return stat;
198} 101}
199 102
200/* 103/*
201 * XDR encode 104 * Encode/decode NFSv4 CB basic data types
105 *
106 * Basic NFSv4 callback data types are defined in section 15 of RFC
107 * 3530: "Network File System (NFS) version 4 Protocol" and section
108 * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version
109 * 1 Protocol"
110 */
111
112/*
113 * nfs_cb_opnum4
114 *
115 * enum nfs_cb_opnum4 {
116 * OP_CB_GETATTR = 3,
117 * ...
118 * };
202 */ 119 */
120enum nfs_cb_opnum4 {
121 OP_CB_GETATTR = 3,
122 OP_CB_RECALL = 4,
123 OP_CB_LAYOUTRECALL = 5,
124 OP_CB_NOTIFY = 6,
125 OP_CB_PUSH_DELEG = 7,
126 OP_CB_RECALL_ANY = 8,
127 OP_CB_RECALLABLE_OBJ_AVAIL = 9,
128 OP_CB_RECALL_SLOT = 10,
129 OP_CB_SEQUENCE = 11,
130 OP_CB_WANTS_CANCELLED = 12,
131 OP_CB_NOTIFY_LOCK = 13,
132 OP_CB_NOTIFY_DEVICEID = 14,
133 OP_CB_ILLEGAL = 10044
134};
203 135
204static void 136static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op)
205encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
206{ 137{
207 __be32 *p; 138 __be32 *p;
208 139
209 RESERVE_SPACE(sizeof(stateid_t)); 140 p = xdr_reserve_space(xdr, 4);
210 WRITE32(sid->si_generation); 141 *p = cpu_to_be32(op);
211 WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
212} 142}
213 143
214static void 144/*
215encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) 145 * nfs_fh4
146 *
147 * typedef opaque nfs_fh4<NFS4_FHSIZE>;
148 */
149static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)
216{ 150{
217 __be32 * p; 151 u32 length = fh->fh_size;
152 __be32 *p;
218 153
219 RESERVE_SPACE(16); 154 BUG_ON(length > NFS4_FHSIZE);
220 WRITE32(0); /* tag length is always 0 */ 155 p = xdr_reserve_space(xdr, 4 + length);
221 WRITE32(hdr->minorversion); 156 xdr_encode_opaque(p, &fh->fh_base, length);
222 WRITE32(hdr->ident);
223 hdr->nops_p = p;
224 WRITE32(hdr->nops);
225} 157}
226 158
227static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr) 159/*
160 * stateid4
161 *
162 * struct stateid4 {
163 * uint32_t seqid;
164 * opaque other[12];
165 * };
166 */
167static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid)
228{ 168{
229 *hdr->nops_p = htonl(hdr->nops); 169 __be32 *p;
170
171 p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE);
172 *p++ = cpu_to_be32(sid->si_generation);
173 xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE);
230} 174}
231 175
232static void 176/*
233encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp, 177 * sessionid4
234 struct nfs4_cb_compound_hdr *hdr) 178 *
179 * typedef opaque sessionid4[NFS4_SESSIONID_SIZE];
180 */
181static void encode_sessionid4(struct xdr_stream *xdr,
182 const struct nfsd4_session *session)
235{ 183{
236 __be32 *p; 184 __be32 *p;
237 int len = dp->dl_fh.fh_size; 185
238 186 p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
239 RESERVE_SPACE(4); 187 xdr_encode_opaque_fixed(p, session->se_sessionid.data,
240 WRITE32(OP_CB_RECALL); 188 NFS4_MAX_SESSIONID_LEN);
241 encode_stateid(xdr, &dp->dl_stateid);
242 RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2));
243 WRITE32(0); /* truncate optimization not implemented */
244 WRITE32(len);
245 WRITEMEM(&dp->dl_fh.fh_base, len);
246 hdr->nops++;
247} 189}
248 190
249static void 191/*
250encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb, 192 * nfsstat4
251 struct nfs4_cb_compound_hdr *hdr) 193 */
252{ 194static const struct {
253 __be32 *p; 195 int stat;
254 struct nfsd4_session *ses = cb->cb_clp->cl_cb_session; 196 int errno;
197} nfs_cb_errtbl[] = {
198 { NFS4_OK, 0 },
199 { NFS4ERR_PERM, -EPERM },
200 { NFS4ERR_NOENT, -ENOENT },
201 { NFS4ERR_IO, -EIO },
202 { NFS4ERR_NXIO, -ENXIO },
203 { NFS4ERR_ACCESS, -EACCES },
204 { NFS4ERR_EXIST, -EEXIST },
205 { NFS4ERR_XDEV, -EXDEV },
206 { NFS4ERR_NOTDIR, -ENOTDIR },
207 { NFS4ERR_ISDIR, -EISDIR },
208 { NFS4ERR_INVAL, -EINVAL },
209 { NFS4ERR_FBIG, -EFBIG },
210 { NFS4ERR_NOSPC, -ENOSPC },
211 { NFS4ERR_ROFS, -EROFS },
212 { NFS4ERR_MLINK, -EMLINK },
213 { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG },
214 { NFS4ERR_NOTEMPTY, -ENOTEMPTY },
215 { NFS4ERR_DQUOT, -EDQUOT },
216 { NFS4ERR_STALE, -ESTALE },
217 { NFS4ERR_BADHANDLE, -EBADHANDLE },
218 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
219 { NFS4ERR_NOTSUPP, -ENOTSUPP },
220 { NFS4ERR_TOOSMALL, -ETOOSMALL },
221 { NFS4ERR_SERVERFAULT, -ESERVERFAULT },
222 { NFS4ERR_BADTYPE, -EBADTYPE },
223 { NFS4ERR_LOCKED, -EAGAIN },
224 { NFS4ERR_RESOURCE, -EREMOTEIO },
225 { NFS4ERR_SYMLINK, -ELOOP },
226 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
227 { NFS4ERR_DEADLOCK, -EDEADLK },
228 { -1, -EIO }
229};
255 230
256 if (hdr->minorversion == 0) 231/*
257 return; 232 * If we cannot translate the error, the recovery routines should
233 * handle it.
234 *
235 * Note: remaining NFSv4 error codes have values > 10000, so should
236 * not conflict with native Linux error codes.
237 */
238static int nfs_cb_stat_to_errno(int status)
239{
240 int i;
258 241
259 RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20); 242 for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) {
243 if (nfs_cb_errtbl[i].stat == status)
244 return nfs_cb_errtbl[i].errno;
245 }
260 246
261 WRITE32(OP_CB_SEQUENCE); 247 dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status);
262 WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); 248 return -status;
263 WRITE32(ses->se_cb_seq_nr);
264 WRITE32(0); /* slotid, always 0 */
265 WRITE32(0); /* highest slotid always 0 */
266 WRITE32(0); /* cachethis always 0 */
267 WRITE32(0); /* FIXME: support referring_call_lists */
268 hdr->nops++;
269} 249}
270 250
271static int 251static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected,
272nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) 252 enum nfsstat4 *status)
273{ 253{
274 struct xdr_stream xdrs, *xdr = &xdrs; 254 __be32 *p;
255 u32 op;
275 256
276 xdr_init_encode(&xdrs, &req->rq_snd_buf, p); 257 p = xdr_inline_decode(xdr, 4 + 4);
277 RESERVE_SPACE(0); 258 if (unlikely(p == NULL))
259 goto out_overflow;
260 op = be32_to_cpup(p++);
261 if (unlikely(op != expected))
262 goto out_unexpected;
263 *status = be32_to_cpup(p);
278 return 0; 264 return 0;
265out_overflow:
266 print_overflow_msg(__func__, xdr);
267 return -EIO;
268out_unexpected:
269 dprintk("NFSD: Callback server returned operation %d but "
270 "we issued a request for %d\n", op, expected);
271 return -EIO;
279} 272}
280 273
281static int 274/*
282nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, 275 * CB_COMPOUND4args
283 struct nfsd4_callback *cb) 276 *
277 * struct CB_COMPOUND4args {
278 * utf8str_cs tag;
279 * uint32_t minorversion;
280 * uint32_t callback_ident;
281 * nfs_cb_argop4 argarray<>;
282 * };
283*/
284static void encode_cb_compound4args(struct xdr_stream *xdr,
285 struct nfs4_cb_compound_hdr *hdr)
284{ 286{
285 struct xdr_stream xdr; 287 __be32 *p;
286 struct nfs4_delegation *args = cb->cb_op;
287 struct nfs4_cb_compound_hdr hdr = {
288 .ident = cb->cb_clp->cl_cb_ident,
289 .minorversion = cb->cb_minorversion,
290 };
291 288
292 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 289 p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
293 encode_cb_compound_hdr(&xdr, &hdr); 290 p = xdr_encode_empty_array(p); /* empty tag */
294 encode_cb_sequence(&xdr, cb, &hdr); 291 *p++ = cpu_to_be32(hdr->minorversion);
295 encode_cb_recall(&xdr, args, &hdr); 292 *p++ = cpu_to_be32(hdr->ident);
296 encode_cb_nops(&hdr); 293
294 hdr->nops_p = p;
295 *p = cpu_to_be32(hdr->nops); /* argarray element count */
296}
297
298/*
299 * Update argarray element count
300 */
301static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
302{
303 BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS);
304 *hdr->nops_p = cpu_to_be32(hdr->nops);
305}
306
307/*
308 * CB_COMPOUND4res
309 *
310 * struct CB_COMPOUND4res {
311 * nfsstat4 status;
312 * utf8str_cs tag;
313 * nfs_cb_resop4 resarray<>;
314 * };
315 */
316static int decode_cb_compound4res(struct xdr_stream *xdr,
317 struct nfs4_cb_compound_hdr *hdr)
318{
319 u32 length;
320 __be32 *p;
321
322 p = xdr_inline_decode(xdr, 4 + 4);
323 if (unlikely(p == NULL))
324 goto out_overflow;
325 hdr->status = be32_to_cpup(p++);
326 /* Ignore the tag */
327 length = be32_to_cpup(p++);
328 p = xdr_inline_decode(xdr, length + 4);
329 if (unlikely(p == NULL))
330 goto out_overflow;
331 hdr->nops = be32_to_cpup(p);
297 return 0; 332 return 0;
333out_overflow:
334 print_overflow_msg(__func__, xdr);
335 return -EIO;
298} 336}
299 337
338/*
339 * CB_RECALL4args
340 *
341 * struct CB_RECALL4args {
342 * stateid4 stateid;
343 * bool truncate;
344 * nfs_fh4 fh;
345 * };
346 */
347static void encode_cb_recall4args(struct xdr_stream *xdr,
348 const struct nfs4_delegation *dp,
349 struct nfs4_cb_compound_hdr *hdr)
350{
351 __be32 *p;
352
353 encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
354 encode_stateid4(xdr, &dp->dl_stateid);
355
356 p = xdr_reserve_space(xdr, 4);
357 *p++ = xdr_zero; /* truncate */
300 358
301static int 359 encode_nfs_fh4(xdr, &dp->dl_fh);
302decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
303 __be32 *p;
304 u32 taglen;
305 360
306 READ_BUF(8); 361 hdr->nops++;
307 READ32(hdr->status);
308 /* We've got no use for the tag; ignore it: */
309 READ32(taglen);
310 READ_BUF(taglen + 4);
311 p += XDR_QUADLEN(taglen);
312 READ32(hdr->nops);
313 return 0;
314} 362}
315 363
316static int 364/*
317decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) 365 * CB_SEQUENCE4args
366 *
367 * struct CB_SEQUENCE4args {
368 * sessionid4 csa_sessionid;
369 * sequenceid4 csa_sequenceid;
370 * slotid4 csa_slotid;
371 * slotid4 csa_highest_slotid;
372 * bool csa_cachethis;
373 * referring_call_list4 csa_referring_call_lists<>;
374 * };
375 */
376static void encode_cb_sequence4args(struct xdr_stream *xdr,
377 const struct nfsd4_callback *cb,
378 struct nfs4_cb_compound_hdr *hdr)
318{ 379{
380 struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
319 __be32 *p; 381 __be32 *p;
320 u32 op; 382
321 int32_t nfserr; 383 if (hdr->minorversion == 0)
322 384 return;
323 READ_BUF(8); 385
324 READ32(op); 386 encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE);
325 if (op != expected) { 387 encode_sessionid4(xdr, session);
326 dprintk("NFSD: decode_cb_op_hdr: Callback server returned " 388
327 " operation %d but we issued a request for %d\n", 389 p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
328 op, expected); 390 *p++ = cpu_to_be32(session->se_cb_seq_nr); /* csa_sequenceid */
329 return -EIO; 391 *p++ = xdr_zero; /* csa_slotid */
330 } 392 *p++ = xdr_zero; /* csa_highest_slotid */
331 READ32(nfserr); 393 *p++ = xdr_zero; /* csa_cachethis */
332 if (nfserr != NFS_OK) 394 xdr_encode_empty_array(p); /* csa_referring_call_lists */
333 return -nfs_cb_stat_to_errno(nfserr); 395
334 return 0; 396 hdr->nops++;
335} 397}
336 398
337/* 399/*
400 * CB_SEQUENCE4resok
401 *
402 * struct CB_SEQUENCE4resok {
403 * sessionid4 csr_sessionid;
404 * sequenceid4 csr_sequenceid;
405 * slotid4 csr_slotid;
406 * slotid4 csr_highest_slotid;
407 * slotid4 csr_target_highest_slotid;
408 * };
409 *
410 * union CB_SEQUENCE4res switch (nfsstat4 csr_status) {
411 * case NFS4_OK:
412 * CB_SEQUENCE4resok csr_resok4;
413 * default:
414 * void;
415 * };
416 *
338 * Our current back channel implementation supports a single backchannel 417
339 * with a single slot. 418 * with a single slot.
340 */ 419 */
341static int 420static int decode_cb_sequence4resok(struct xdr_stream *xdr,
342decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb, 421 struct nfsd4_callback *cb)
343 struct rpc_rqst *rqstp)
344{ 422{
345 struct nfsd4_session *ses = cb->cb_clp->cl_cb_session; 423 struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
346 struct nfs4_sessionid id; 424 struct nfs4_sessionid id;
347 int status; 425 int status;
348 u32 dummy;
349 __be32 *p; 426 __be32 *p;
427 u32 dummy;
350 428
351 if (cb->cb_minorversion == 0) 429 status = -ESERVERFAULT;
352 return 0;
353
354 status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
355 if (status)
356 return status;
357 430
358 /* 431 /*
359 * If the server returns different values for sessionID, slotID or 432 * If the server returns different values for sessionID, slotID or
360 * sequence number, the server is looney tunes. 433 * sequence number, the server is looney tunes.
361 */ 434 */
362 status = -ESERVERFAULT; 435 p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4);
363 436 if (unlikely(p == NULL))
364 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); 437 goto out_overflow;
365 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); 438 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
366 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); 439 if (memcmp(id.data, session->se_sessionid.data,
367 if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) { 440 NFS4_MAX_SESSIONID_LEN) != 0) {
368 dprintk("%s Invalid session id\n", __func__); 441 dprintk("NFS: %s Invalid session id\n", __func__);
369 goto out; 442 goto out;
370 } 443 }
371 READ32(dummy); 444 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
372 if (dummy != ses->se_cb_seq_nr) { 445
373 dprintk("%s Invalid sequence number\n", __func__); 446 dummy = be32_to_cpup(p++);
447 if (dummy != session->se_cb_seq_nr) {
448 dprintk("NFS: %s Invalid sequence number\n", __func__);
374 goto out; 449 goto out;
375 } 450 }
376 READ32(dummy); /* slotid must be 0 */ 451
452 dummy = be32_to_cpup(p++);
377 if (dummy != 0) { 453 if (dummy != 0) {
378 dprintk("%s Invalid slotid\n", __func__); 454 dprintk("NFS: %s Invalid slotid\n", __func__);
379 goto out; 455 goto out;
380 } 456 }
381 /* FIXME: process highest slotid and target highest slotid */ 457
458 /*
459 * FIXME: process highest slotid and target highest slotid
460 */
382 status = 0; 461 status = 0;
383out: 462out:
384 return status; 463 return status;
464out_overflow:
465 print_overflow_msg(__func__, xdr);
466 return -EIO;
385} 467}
386 468
469static int decode_cb_sequence4res(struct xdr_stream *xdr,
470 struct nfsd4_callback *cb)
471{
472 enum nfsstat4 nfserr;
473 int status;
474
475 if (cb->cb_minorversion == 0)
476 return 0;
477
478 status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr);
479 if (unlikely(status))
480 goto out;
481 if (unlikely(nfserr != NFS4_OK))
482 goto out_default;
483 status = decode_cb_sequence4resok(xdr, cb);
484out:
485 return status;
486out_default:
487 return nfs_cb_stat_to_errno(nfserr);
488}
387 489
388static int 490/*
389nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p) 491 * NFSv4.0 and NFSv4.1 XDR encode functions
492 *
493 * NFSv4.0 callback argument types are defined in section 15 of RFC
494 * 3530: "Network File System (NFS) version 4 Protocol" and section 20
495 * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1
496 * Protocol".
497 */
498
499/*
500 * NB: Without this zero space reservation, callbacks over krb5p fail
501 */
502static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
503 void *__unused)
504{
505 xdr_reserve_space(xdr, 0);
506}
507
508/*
509 * 20.2. Operation 4: CB_RECALL - Recall a Delegation
510 */
511static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
512 const struct nfsd4_callback *cb)
513{
514 const struct nfs4_delegation *args = cb->cb_op;
515 struct nfs4_cb_compound_hdr hdr = {
516 .ident = cb->cb_clp->cl_cb_ident,
517 .minorversion = cb->cb_minorversion,
518 };
519
520 encode_cb_compound4args(xdr, &hdr);
521 encode_cb_sequence4args(xdr, cb, &hdr);
522 encode_cb_recall4args(xdr, args, &hdr);
523 encode_cb_nops(&hdr);
524}
525
526
527/*
528 * NFSv4.0 and NFSv4.1 XDR decode functions
529 *
530 * NFSv4.0 callback result types are defined in section 15 of RFC
531 * 3530: "Network File System (NFS) version 4 Protocol" and section 20
532 * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1
533 * Protocol".
534 */
535
536static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
537 void *__unused)
390{ 538{
391 return 0; 539 return 0;
392} 540}
393 541
394static int 542/*
395nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p, 543 * 20.2. Operation 4: CB_RECALL - Recall a Delegation
396 struct nfsd4_callback *cb) 544 */
545static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
546 struct xdr_stream *xdr,
547 struct nfsd4_callback *cb)
397{ 548{
398 struct xdr_stream xdr;
399 struct nfs4_cb_compound_hdr hdr; 549 struct nfs4_cb_compound_hdr hdr;
550 enum nfsstat4 nfserr;
400 int status; 551 int status;
401 552
402 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 553 status = decode_cb_compound4res(xdr, &hdr);
403 status = decode_cb_compound_hdr(&xdr, &hdr); 554 if (unlikely(status))
404 if (status)
405 goto out; 555 goto out;
406 if (cb) { 556
407 status = decode_cb_sequence(&xdr, cb, rqstp); 557 if (cb != NULL) {
408 if (status) 558 status = decode_cb_sequence4res(xdr, cb);
559 if (unlikely(status))
409 goto out; 560 goto out;
410 } 561 }
411 status = decode_cb_op_hdr(&xdr, OP_CB_RECALL); 562
563 status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr);
564 if (unlikely(status))
565 goto out;
566 if (unlikely(nfserr != NFS4_OK))
567 goto out_default;
412out: 568out:
413 return status; 569 return status;
570out_default:
571 return nfs_cb_stat_to_errno(nfserr);
414} 572}
415 573
416/* 574/*
417 * RPC procedure tables 575 * RPC procedure tables
418 */ 576 */
419#define PROC(proc, call, argtype, restype) \ 577#define PROC(proc, call, argtype, restype) \
420[NFSPROC4_CLNT_##proc] = { \ 578[NFSPROC4_CLNT_##proc] = { \
421 .p_proc = NFSPROC4_CB_##call, \ 579 .p_proc = NFSPROC4_CB_##call, \
422 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 580 .p_encode = (kxdreproc_t)nfs4_xdr_enc_##argtype, \
423 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 581 .p_decode = (kxdrdproc_t)nfs4_xdr_dec_##restype, \
424 .p_arglen = NFS4_##argtype##_sz, \ 582 .p_arglen = NFS4_enc_##argtype##_sz, \
425 .p_replen = NFS4_##restype##_sz, \ 583 .p_replen = NFS4_dec_##restype##_sz, \
426 .p_statidx = NFSPROC4_CB_##call, \ 584 .p_statidx = NFSPROC4_CB_##call, \
427 .p_name = #proc, \ 585 .p_name = #proc, \
428} 586}
429 587
430static struct rpc_procinfo nfs4_cb_procedures[] = { 588static struct rpc_procinfo nfs4_cb_procedures[] = {
431 PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), 589 PROC(CB_NULL, NULL, cb_null, cb_null),
432 PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), 590 PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall),
433}; 591};
434 592
435static struct rpc_version nfs_cb_version4 = { 593static struct rpc_version nfs_cb_version4 = {
436/* 594/*
437 * Note on the callback rpc program version number: despite language in rfc 595 * Note on the callback rpc program version number: despite language in rfc
438 * 5661 section 18.36.3 requiring servers to use 4 in this field, the 596 * 5661 section 18.36.3 requiring servers to use 4 in this field, the
@@ -440,29 +598,29 @@ static struct rpc_version nfs_cb_version4 = {
440 * in practice that appears to be what implementations use. The section 598 * in practice that appears to be what implementations use. The section
441 * 18.36.3 language is expected to be fixed in an erratum. 599 * 18.36.3 language is expected to be fixed in an erratum.
442 */ 600 */
443 .number = 1, 601 .number = 1,
444 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), 602 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures),
445 .procs = nfs4_cb_procedures 603 .procs = nfs4_cb_procedures
446}; 604};
447 605
448static struct rpc_version * nfs_cb_version[] = { 606static struct rpc_version *nfs_cb_version[] = {
449 &nfs_cb_version4, 607 &nfs_cb_version4,
450}; 608};
451 609
452static struct rpc_program cb_program; 610static struct rpc_program cb_program;
453 611
454static struct rpc_stat cb_stats = { 612static struct rpc_stat cb_stats = {
455 .program = &cb_program 613 .program = &cb_program
456}; 614};
457 615
458#define NFS4_CALLBACK 0x40000000 616#define NFS4_CALLBACK 0x40000000
459static struct rpc_program cb_program = { 617static struct rpc_program cb_program = {
460 .name = "nfs4_cb", 618 .name = "nfs4_cb",
461 .number = NFS4_CALLBACK, 619 .number = NFS4_CALLBACK,
462 .nrvers = ARRAY_SIZE(nfs_cb_version), 620 .nrvers = ARRAY_SIZE(nfs_cb_version),
463 .version = nfs_cb_version, 621 .version = nfs_cb_version,
464 .stats = &cb_stats, 622 .stats = &cb_stats,
465 .pipe_dir_name = "/nfsd4_cb", 623 .pipe_dir_name = "/nfsd4_cb",
466}; 624};
467 625
468static int max_cb_time(void) 626static int max_cb_time(void)
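The rewritten XDR routines plug into the RPC layer through the PROC() table
above, now cast as kxdreproc_t/kxdrdproc_t functions that receive a
ready-made xdr_stream instead of a raw buffer pointer. A skeleton of what a
new-style decoder looks like, built only from the helpers defined above (the
function itself is hypothetical):

	static int nfs4_xdr_dec_cb_example(struct rpc_rqst *rqstp,
					   struct xdr_stream *xdr,
					   struct nfsd4_callback *cb)
	{
		struct nfs4_cb_compound_hdr hdr;
		int status;

		/* No xdr_init_decode() here: the RPC layer sets up the
		 * stream before invoking p_decode. */
		status = decode_cb_compound4res(xdr, &hdr);
		if (unlikely(status))
			return status;
		return decode_cb_sequence4res(xdr, cb);
	}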
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 8b782b062baa..3ee67c67cc52 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -35,7 +35,20 @@
35 35
36struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) 36struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
37{ 37{
38 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); 38 return NILFS_I_NILFS(bmap->b_inode)->ns_dat;
39}
40
41static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
42 const char *fname, int err)
43{
44 struct inode *inode = bmap->b_inode;
45
46 if (err == -EINVAL) {
47 nilfs_error(inode->i_sb, fname,
48 "broken bmap (inode number=%lu)\n", inode->i_ino);
49 err = -EIO;
50 }
51 return err;
39} 52}
40 53
41/** 54/**
@@ -66,8 +79,10 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
66 79
67 down_read(&bmap->b_sem); 80 down_read(&bmap->b_sem);
68 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); 81 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
69 if (ret < 0) 82 if (ret < 0) {
83 ret = nilfs_bmap_convert_error(bmap, __func__, ret);
70 goto out; 84 goto out;
85 }
71 if (NILFS_BMAP_USE_VBN(bmap)) { 86 if (NILFS_BMAP_USE_VBN(bmap)) {
72 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp, 87 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp,
73 &blocknr); 88 &blocknr);
@@ -88,7 +103,8 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
88 down_read(&bmap->b_sem); 103 down_read(&bmap->b_sem);
89 ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks); 104 ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks);
90 up_read(&bmap->b_sem); 105 up_read(&bmap->b_sem);
91 return ret; 106
107 return nilfs_bmap_convert_error(bmap, __func__, ret);
92} 108}
93 109
94static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 110static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
@@ -144,7 +160,8 @@ int nilfs_bmap_insert(struct nilfs_bmap *bmap,
144 down_write(&bmap->b_sem); 160 down_write(&bmap->b_sem);
145 ret = nilfs_bmap_do_insert(bmap, key, rec); 161 ret = nilfs_bmap_do_insert(bmap, key, rec);
146 up_write(&bmap->b_sem); 162 up_write(&bmap->b_sem);
147 return ret; 163
164 return nilfs_bmap_convert_error(bmap, __func__, ret);
148} 165}
149 166
150static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) 167static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
@@ -180,9 +197,12 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
180 197
181 down_read(&bmap->b_sem); 198 down_read(&bmap->b_sem);
182 ret = bmap->b_ops->bop_last_key(bmap, &lastkey); 199 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
183 if (!ret)
184 *key = lastkey;
185 up_read(&bmap->b_sem); 200 up_read(&bmap->b_sem);
201
202 if (ret < 0)
203 ret = nilfs_bmap_convert_error(bmap, __func__, ret);
204 else
205 *key = lastkey;
186 return ret; 206 return ret;
187} 207}
188 208
@@ -210,7 +230,8 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
210 down_write(&bmap->b_sem); 230 down_write(&bmap->b_sem);
211 ret = nilfs_bmap_do_delete(bmap, key); 231 ret = nilfs_bmap_do_delete(bmap, key);
212 up_write(&bmap->b_sem); 232 up_write(&bmap->b_sem);
213 return ret; 233
234 return nilfs_bmap_convert_error(bmap, __func__, ret);
214} 235}
215 236
216static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) 237static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
@@ -261,7 +282,8 @@ int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
261 down_write(&bmap->b_sem); 282 down_write(&bmap->b_sem);
262 ret = nilfs_bmap_do_truncate(bmap, key); 283 ret = nilfs_bmap_do_truncate(bmap, key);
263 up_write(&bmap->b_sem); 284 up_write(&bmap->b_sem);
264 return ret; 285
286 return nilfs_bmap_convert_error(bmap, __func__, ret);
265} 287}
266 288
267/** 289/**
@@ -300,7 +322,8 @@ int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
300 down_write(&bmap->b_sem); 322 down_write(&bmap->b_sem);
301 ret = bmap->b_ops->bop_propagate(bmap, bh); 323 ret = bmap->b_ops->bop_propagate(bmap, bh);
302 up_write(&bmap->b_sem); 324 up_write(&bmap->b_sem);
303 return ret; 325
326 return nilfs_bmap_convert_error(bmap, __func__, ret);
304} 327}
305 328
306/** 329/**
@@ -344,7 +367,8 @@ int nilfs_bmap_assign(struct nilfs_bmap *bmap,
344 down_write(&bmap->b_sem); 367 down_write(&bmap->b_sem);
345 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); 368 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
346 up_write(&bmap->b_sem); 369 up_write(&bmap->b_sem);
347 return ret; 370
371 return nilfs_bmap_convert_error(bmap, __func__, ret);
348} 372}
349 373
350/** 374/**
@@ -373,7 +397,8 @@ int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
373 down_write(&bmap->b_sem); 397 down_write(&bmap->b_sem);
374 ret = bmap->b_ops->bop_mark(bmap, key, level); 398 ret = bmap->b_ops->bop_mark(bmap, key, level);
375 up_write(&bmap->b_sem); 399 up_write(&bmap->b_sem);
376 return ret; 400
401 return nilfs_bmap_convert_error(bmap, __func__, ret);
377} 402}
378 403
379/** 404/**
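All of the hunks above funnel bmap results through
nilfs_bmap_convert_error(), which rewrites only -EINVAL (a broken bmap) into
-EIO after reporting the corruption, and passes every other value through
unchanged. A compressed sketch of the resulting caller shape (hypothetical
wrapper, mirroring the pattern above):

	static int example_bmap_op(struct nilfs_bmap *bmap, __u64 key)
	{
		int ret;

		down_write(&bmap->b_sem);
		ret = nilfs_bmap_do_delete(bmap, key);
		up_write(&bmap->b_sem);

		/* -EINVAL becomes -EIO (plus a nilfs_error() report);
		 * 0, -ENOENT, -EIO, etc. pass through unchanged */
		return nilfs_bmap_convert_error(bmap, __func__, ret);
	}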
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 5115814cb745..388e9e8f5286 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -104,8 +104,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
104 if (pblocknr == 0) { 104 if (pblocknr == 0) {
105 pblocknr = blocknr; 105 pblocknr = blocknr;
106 if (inode->i_ino != NILFS_DAT_INO) { 106 if (inode->i_ino != NILFS_DAT_INO) {
107 struct inode *dat = 107 struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
108 nilfs_dat_inode(NILFS_I_NILFS(inode));
109 108
110 /* blocknr is a virtual block number */ 109 /* blocknr is a virtual block number */
111 err = nilfs_dat_translate(dat, blocknr, &pblocknr); 110 err = nilfs_dat_translate(dat, blocknr, &pblocknr);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index cb003c8ee1f6..9d45773b79e6 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -91,7 +91,6 @@ static void nilfs_commit_chunk(struct page *page,
91 unsigned from, unsigned to) 91 unsigned from, unsigned to)
92{ 92{
93 struct inode *dir = mapping->host; 93 struct inode *dir = mapping->host;
94 struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
95 loff_t pos = page_offset(page) + from; 94 loff_t pos = page_offset(page) + from;
96 unsigned len = to - from; 95 unsigned len = to - from;
97 unsigned nr_dirty, copied; 96 unsigned nr_dirty, copied;
@@ -103,7 +102,7 @@ static void nilfs_commit_chunk(struct page *page,
103 i_size_write(dir, pos + copied); 102 i_size_write(dir, pos + copied);
104 if (IS_DIRSYNC(dir)) 103 if (IS_DIRSYNC(dir))
105 nilfs_set_transaction_flag(NILFS_TI_SYNC); 104 nilfs_set_transaction_flag(NILFS_TI_SYNC);
106 err = nilfs_set_file_dirty(sbi, dir, nr_dirty); 105 err = nilfs_set_file_dirty(dir, nr_dirty);
107 WARN_ON(err); /* do not happen */ 106 WARN_ON(err); /* do not happen */
108 unlock_page(page); 107 unlock_page(page);
109} 108}
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index c9a30d7ff6fc..2f560c9fb808 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -155,6 +155,7 @@ const struct inode_operations nilfs_file_inode_operations = {
155 .truncate = nilfs_truncate, 155 .truncate = nilfs_truncate,
156 .setattr = nilfs_setattr, 156 .setattr = nilfs_setattr,
157 .permission = nilfs_permission, 157 .permission = nilfs_permission,
158 .fiemap = nilfs_fiemap,
158}; 159};
159 160
160/* end of file */ 161/* end of file */
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 9f8a2da67f90..bfc73d3a30ed 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -149,14 +149,9 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
149 } 149 }
150 150
151 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); 151 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
152 if (unlikely(err)) { 152 if (unlikely(err))
153 if (err == -EINVAL) 153 nilfs_warning(sb, __func__, "unable to read inode: %lu",
154 nilfs_error(sb, __func__, "ifile is broken"); 154 (unsigned long) ino);
155 else
156 nilfs_warning(sb, __func__,
157 "unable to read inode: %lu",
158 (unsigned long) ino);
159 }
160 return err; 155 return err;
161} 156}
162 157
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 77b48c8fab17..2fd440d8d6b8 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -58,7 +58,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
58 struct nilfs_inode_info *ii = NILFS_I(inode); 58 struct nilfs_inode_info *ii = NILFS_I(inode);
59 __u64 blknum = 0; 59 __u64 blknum = 0;
60 int err = 0, ret; 60 int err = 0, ret;
61 struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode)); 61 struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
62 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; 62 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
63 63
64 down_read(&NILFS_MDT(dat)->mi_sem); 64 down_read(&NILFS_MDT(dat)->mi_sem);
@@ -96,11 +96,6 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
96 inode->i_ino, 96 inode->i_ino,
97 (unsigned long long)blkoff); 97 (unsigned long long)blkoff);
98 err = 0; 98 err = 0;
99 } else if (err == -EINVAL) {
100 nilfs_error(inode->i_sb, __func__,
101 "broken bmap (inode=%lu)\n",
102 inode->i_ino);
103 err = -EIO;
104 } 99 }
105 nilfs_transaction_abort(inode->i_sb); 100 nilfs_transaction_abort(inode->i_sb);
106 goto out; 101 goto out;
@@ -109,6 +104,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
109 nilfs_transaction_commit(inode->i_sb); /* never fails */ 104 nilfs_transaction_commit(inode->i_sb); /* never fails */
110 /* Error handling should be detailed */ 105 /* Error handling should be detailed */
111 set_buffer_new(bh_result); 106 set_buffer_new(bh_result);
107 set_buffer_delay(bh_result);
112 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed 108 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
113 to proper value */ 109 to proper value */
114 } else if (ret == -ENOENT) { 110 } else if (ret == -ENOENT) {
@@ -185,10 +181,9 @@ static int nilfs_set_page_dirty(struct page *page)
185 181
186 if (ret) { 182 if (ret) {
187 struct inode *inode = page->mapping->host; 183 struct inode *inode = page->mapping->host;
188 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
189 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); 184 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
190 185
191 nilfs_set_file_dirty(sbi, inode, nr_dirty); 186 nilfs_set_file_dirty(inode, nr_dirty);
192 } 187 }
193 return ret; 188 return ret;
194} 189}
@@ -229,7 +224,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
229 start + copied); 224 start + copied);
230 copied = generic_write_end(file, mapping, pos, len, copied, page, 225 copied = generic_write_end(file, mapping, pos, len, copied, page,
231 fsdata); 226 fsdata);
232 nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty); 227 nilfs_set_file_dirty(inode, nr_dirty);
233 err = nilfs_transaction_commit(inode->i_sb); 228 err = nilfs_transaction_commit(inode->i_sb);
234 return err ? : copied; 229 return err ? : copied;
235} 230}
@@ -425,13 +420,12 @@ static int __nilfs_read_inode(struct super_block *sb,
425 struct nilfs_root *root, unsigned long ino, 420 struct nilfs_root *root, unsigned long ino,
426 struct inode *inode) 421 struct inode *inode)
427{ 422{
428 struct nilfs_sb_info *sbi = NILFS_SB(sb); 423 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
429 struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
430 struct buffer_head *bh; 424 struct buffer_head *bh;
431 struct nilfs_inode *raw_inode; 425 struct nilfs_inode *raw_inode;
432 int err; 426 int err;
433 427
434 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 428 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
435 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); 429 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
436 if (unlikely(err)) 430 if (unlikely(err))
437 goto bad_inode; 431 goto bad_inode;
@@ -461,7 +455,7 @@ static int __nilfs_read_inode(struct super_block *sb,
461 } 455 }
462 nilfs_ifile_unmap_inode(root->ifile, ino, bh); 456 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
463 brelse(bh); 457 brelse(bh);
464 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 458 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
465 nilfs_set_inode_flags(inode); 459 nilfs_set_inode_flags(inode);
466 return 0; 460 return 0;
467 461
@@ -470,7 +464,7 @@ static int __nilfs_read_inode(struct super_block *sb,
470 brelse(bh); 464 brelse(bh);
471 465
472 bad_inode: 466 bad_inode:
473 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 467 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
474 return err; 468 return err;
475} 469}
476 470
@@ -629,7 +623,7 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
629 623
630 if (!test_bit(NILFS_I_BMAP, &ii->i_state)) 624 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
631 return; 625 return;
632 repeat: 626repeat:
633 ret = nilfs_bmap_last_key(ii->i_bmap, &b); 627 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
634 if (ret == -ENOENT) 628 if (ret == -ENOENT)
635 return; 629 return;
@@ -646,14 +640,10 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
646 nilfs_bmap_truncate(ii->i_bmap, b) == 0)) 640 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
647 goto repeat; 641 goto repeat;
648 642
649 failed: 643failed:
650 if (ret == -EINVAL) 644 nilfs_warning(ii->vfs_inode.i_sb, __func__,
651 nilfs_error(ii->vfs_inode.i_sb, __func__, 645 "failed to truncate bmap (ino=%lu, err=%d)",
652 "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino); 646 ii->vfs_inode.i_ino, ret);
653 else
654 nilfs_warning(ii->vfs_inode.i_sb, __func__,
655 "failed to truncate bmap (ino=%lu, err=%d)",
656 ii->vfs_inode.i_ino, ret);
657} 647}
658 648
659void nilfs_truncate(struct inode *inode) 649void nilfs_truncate(struct inode *inode)
@@ -682,7 +672,7 @@ void nilfs_truncate(struct inode *inode)
682 nilfs_set_transaction_flag(NILFS_TI_SYNC); 672 nilfs_set_transaction_flag(NILFS_TI_SYNC);
683 673
684 nilfs_mark_inode_dirty(inode); 674 nilfs_mark_inode_dirty(inode);
685 nilfs_set_file_dirty(NILFS_SB(sb), inode, 0); 675 nilfs_set_file_dirty(inode, 0);
686 nilfs_transaction_commit(sb); 676 nilfs_transaction_commit(sb);
687 /* May construct a logical segment and may fail in sync mode. 677 /* May construct a logical segment and may fail in sync mode.
688 But truncate has no return value. */ 678 But truncate has no return value. */
@@ -800,9 +790,9 @@ int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
800 return generic_permission(inode, mask, flags, NULL); 790 return generic_permission(inode, mask, flags, NULL);
801} 791}
802 792
803int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, 793int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
804 struct buffer_head **pbh)
805{ 794{
795 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
806 struct nilfs_inode_info *ii = NILFS_I(inode); 796 struct nilfs_inode_info *ii = NILFS_I(inode);
807 int err; 797 int err;
808 798
@@ -843,9 +833,9 @@ int nilfs_inode_dirty(struct inode *inode)
843 return ret; 833 return ret;
844} 834}
845 835
846int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, 836int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
847 unsigned nr_dirty)
848{ 837{
838 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
849 struct nilfs_inode_info *ii = NILFS_I(inode); 839 struct nilfs_inode_info *ii = NILFS_I(inode);
850 840
851 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); 841 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
@@ -878,11 +868,10 @@ int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
878 868
879int nilfs_mark_inode_dirty(struct inode *inode) 869int nilfs_mark_inode_dirty(struct inode *inode)
880{ 870{
881 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
882 struct buffer_head *ibh; 871 struct buffer_head *ibh;
883 int err; 872 int err;
884 873
885 err = nilfs_load_inode_block(sbi, inode, &ibh); 874 err = nilfs_load_inode_block(inode, &ibh);
886 if (unlikely(err)) { 875 if (unlikely(err)) {
887 nilfs_warning(inode->i_sb, __func__, 876 nilfs_warning(inode->i_sb, __func__,
888 "failed to reget inode block.\n"); 877 "failed to reget inode block.\n");
@@ -924,3 +913,134 @@ void nilfs_dirty_inode(struct inode *inode)
924 nilfs_mark_inode_dirty(inode); 913 nilfs_mark_inode_dirty(inode);
925 nilfs_transaction_commit(inode->i_sb); /* never fails */ 914 nilfs_transaction_commit(inode->i_sb); /* never fails */
926} 915}
916
917int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
918 __u64 start, __u64 len)
919{
920 struct the_nilfs *nilfs = NILFS_I_NILFS(inode);
921 __u64 logical = 0, phys = 0, size = 0;
922 __u32 flags = 0;
923 loff_t isize;
924 sector_t blkoff, end_blkoff;
925 sector_t delalloc_blkoff;
926 unsigned long delalloc_blklen;
927 unsigned int blkbits = inode->i_blkbits;
928 int ret, n;
929
930 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
931 if (ret)
932 return ret;
933
934 mutex_lock(&inode->i_mutex);
935
936 isize = i_size_read(inode);
937
938 blkoff = start >> blkbits;
939 end_blkoff = (start + len - 1) >> blkbits;
940
941 delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
942 &delalloc_blkoff);
943
944 do {
945 __u64 blkphy;
946 unsigned int maxblocks;
947
948 if (delalloc_blklen && blkoff == delalloc_blkoff) {
949 if (size) {
950 /* End of the current extent */
951 ret = fiemap_fill_next_extent(
952 fieinfo, logical, phys, size, flags);
953 if (ret)
954 break;
955 }
956 if (blkoff > end_blkoff)
957 break;
958
959 flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
960 logical = blkoff << blkbits;
961 phys = 0;
962 size = delalloc_blklen << blkbits;
963
964 blkoff = delalloc_blkoff + delalloc_blklen;
965 delalloc_blklen = nilfs_find_uncommitted_extent(
966 inode, blkoff, &delalloc_blkoff);
967 continue;
968 }
969
970 /*
971 * Limit the number of blocks that we look up so as
972 * not to get into the next delayed allocation extent.
973 */
974 maxblocks = INT_MAX;
975 if (delalloc_blklen)
976 maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
977 maxblocks);
978 blkphy = 0;
979
980 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
981 n = nilfs_bmap_lookup_contig(
982 NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
983 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
984
985 if (n < 0) {
986 int past_eof;
987
988 if (unlikely(n != -ENOENT))
989 break; /* error */
990
991 /* HOLE */
992 blkoff++;
993 past_eof = ((blkoff << blkbits) >= isize);
994
995 if (size) {
996 /* End of the current extent */
997
998 if (past_eof)
999 flags |= FIEMAP_EXTENT_LAST;
1000
1001 ret = fiemap_fill_next_extent(
1002 fieinfo, logical, phys, size, flags);
1003 if (ret)
1004 break;
1005 size = 0;
1006 }
1007 if (blkoff > end_blkoff || past_eof)
1008 break;
1009 } else {
1010 if (size) {
1011 if (phys && blkphy << blkbits == phys + size) {
1012 /* The current extent goes on */
1013 size += n << blkbits;
1014 } else {
1015 /* Terminate the current extent */
1016 ret = fiemap_fill_next_extent(
1017 fieinfo, logical, phys, size,
1018 flags);
1019 if (ret || blkoff > end_blkoff)
1020 break;
1021
1022 /* Start another extent */
1023 flags = FIEMAP_EXTENT_MERGED;
1024 logical = blkoff << blkbits;
1025 phys = blkphy << blkbits;
1026 size = n << blkbits;
1027 }
1028 } else {
1029 /* Start a new extent */
1030 flags = FIEMAP_EXTENT_MERGED;
1031 logical = blkoff << blkbits;
1032 phys = blkphy << blkbits;
1033 size = n << blkbits;
1034 }
1035 blkoff += n;
1036 }
1037 cond_resched();
1038 } while (true);
1039
1040 /* If ret is 1 then we just hit the end of the extent array */
1041 if (ret == 1)
1042 ret = 0;
1043
1044 mutex_unlock(&inode->i_mutex);
1045 return ret;
1046}
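The new nilfs_fiemap() is reached through the generic FS_IOC_FIEMAP ioctl, so
it can be exercised from userspace with no nilfs-specific interface. A
minimal, error-handling-free sketch using the standard fiemap UAPI (nothing
here is nilfs-specific):

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <linux/fiemap.h>

	int main(int argc, char **argv)
	{
		struct fiemap *fm;
		unsigned int i;
		int fd = open(argv[1], O_RDONLY);

		/* room for up to 32 extents after the header */
		fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
		fm->fm_length = ~0ULL;			/* map the whole file */
		fm->fm_flags = FIEMAP_FLAG_SYNC;	/* the flag checked above */
		fm->fm_extent_count = 32;

		if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
			for (i = 0; i < fm->fm_mapped_extents; i++)
				printf("logical %llu phys %llu len %llu flags %#x\n",
				       (unsigned long long)fm->fm_extents[i].fe_logical,
				       (unsigned long long)fm->fm_extents[i].fe_physical,
				       (unsigned long long)fm->fm_extents[i].fe_length,
				       fm->fm_extents[i].fe_flags);
		return 0;
	}

Delayed-allocation blocks show up here with FIEMAP_EXTENT_DELALLOC set and a
zero physical address, matching the delalloc branch in nilfs_fiemap() above.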
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index b185e937a335..496738963fdb 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -233,7 +233,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
233 int ret; 233 int ret;
234 234
235 down_read(&nilfs->ns_segctor_sem); 235 down_read(&nilfs->ns_segctor_sem);
236 ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs); 236 ret = nilfs_dat_get_vinfo(nilfs->ns_dat, buf, size, nmembs);
237 up_read(&nilfs->ns_segctor_sem); 237 up_read(&nilfs->ns_segctor_sem);
238 return ret; 238 return ret;
239} 239}
@@ -242,8 +242,7 @@ static ssize_t
242nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, 242nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
243 void *buf, size_t size, size_t nmembs) 243 void *buf, size_t size, size_t nmembs)
244{ 244{
245 struct inode *dat = nilfs_dat_inode(nilfs); 245 struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
246 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
247 struct nilfs_bdesc *bdescs = buf; 246 struct nilfs_bdesc *bdescs = buf;
248 int ret, i; 247 int ret, i;
249 248
@@ -421,7 +420,7 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
421 size_t nmembs = argv->v_nmembs; 420 size_t nmembs = argv->v_nmembs;
422 int ret; 421 int ret;
423 422
424 ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs); 423 ret = nilfs_dat_freev(nilfs->ns_dat, buf, nmembs);
425 424
426 return (ret < 0) ? ret : nmembs; 425 return (ret < 0) ? ret : nmembs;
427} 426}
@@ -430,8 +429,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
430 struct nilfs_argv *argv, void *buf) 429 struct nilfs_argv *argv, void *buf)
431{ 430{
432 size_t nmembs = argv->v_nmembs; 431 size_t nmembs = argv->v_nmembs;
433 struct inode *dat = nilfs_dat_inode(nilfs); 432 struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
434 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
435 struct nilfs_bdesc *bdescs = buf; 433 struct nilfs_bdesc *bdescs = buf;
436 int ret, i; 434 int ret, i;
437 435
@@ -450,7 +448,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
450 /* skip dead block */ 448 /* skip dead block */
451 continue; 449 continue;
452 if (bdescs[i].bd_level == 0) { 450 if (bdescs[i].bd_level == 0) {
453 ret = nilfs_mdt_mark_block_dirty(dat, 451 ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat,
454 bdescs[i].bd_offset); 452 bdescs[i].bd_offset);
455 if (ret < 0) { 453 if (ret < 0) {
456 WARN_ON(ret == -ENOENT); 454 WARN_ON(ret == -ENOENT);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 39a5b84e2c9f..6a0e2a189f60 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -237,8 +237,6 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
237 * 237 *
238 * %-ENOENT - the specified block does not exist (hole block) 238 * %-ENOENT - the specified block does not exist (hole block)
239 * 239 *
240 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
241 *
242 * %-EROFS - Read only filesystem (for create mode) 240 * %-EROFS - Read only filesystem (for create mode)
243 */ 241 */
244int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, 242int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
@@ -273,8 +271,6 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
273 * %-ENOMEM - Insufficient memory available. 271 * %-ENOMEM - Insufficient memory available.
274 * 272 *
275 * %-EIO - I/O error 273 * %-EIO - I/O error
276 *
277 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
278 */ 274 */
279int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) 275int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
280{ 276{
@@ -350,8 +346,6 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
350 * %-EIO - I/O error 346 * %-EIO - I/O error
351 * 347 *
352 * %-ENOENT - the specified block does not exist (hole block) 348 * %-ENOENT - the specified block does not exist (hole block)
353 *
354 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
355 */ 349 */
356int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) 350int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
357{ 351{
@@ -499,31 +493,29 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
499 struct buffer_head *bh_frozen; 493 struct buffer_head *bh_frozen;
500 struct page *page; 494 struct page *page;
501 int blkbits = inode->i_blkbits; 495 int blkbits = inode->i_blkbits;
502 int ret = -ENOMEM;
503 496
504 page = grab_cache_page(&shadow->frozen_data, bh->b_page->index); 497 page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
505 if (!page) 498 if (!page)
506 return ret; 499 return -ENOMEM;
507 500
508 if (!page_has_buffers(page)) 501 if (!page_has_buffers(page))
509 create_empty_buffers(page, 1 << blkbits, 0); 502 create_empty_buffers(page, 1 << blkbits, 0);
510 503
511 bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits); 504 bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
512 if (bh_frozen) { 505
513 if (!buffer_uptodate(bh_frozen)) 506 if (!buffer_uptodate(bh_frozen))
514 nilfs_copy_buffer(bh_frozen, bh); 507 nilfs_copy_buffer(bh_frozen, bh);
515 if (list_empty(&bh_frozen->b_assoc_buffers)) { 508 if (list_empty(&bh_frozen->b_assoc_buffers)) {
516 list_add_tail(&bh_frozen->b_assoc_buffers, 509 list_add_tail(&bh_frozen->b_assoc_buffers,
517 &shadow->frozen_buffers); 510 &shadow->frozen_buffers);
518 set_buffer_nilfs_redirected(bh); 511 set_buffer_nilfs_redirected(bh);
519 } else { 512 } else {
520 brelse(bh_frozen); /* already frozen */ 513 brelse(bh_frozen); /* already frozen */
521 }
522 ret = 0;
523 } 514 }
515
524 unlock_page(page); 516 unlock_page(page);
525 page_cache_release(page); 517 page_cache_release(page);
526 return ret; 518 return 0;
527} 519}
528 520
529struct buffer_head * 521struct buffer_head *
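
The nilfs_mdt_freeze_buffer() rewrite above drops the ret bookkeeping because, once the page is known to have buffers, nilfs_page_get_nth_block() is treated as infallible and the only failure left is the initial -ENOMEM. The copy-on-first-freeze behaviour itself, where a shadow copy is filled once and later freezes become no-ops, can be sketched in plain C (struct buf and freeze_buffer() here are illustrative stand-ins, not kernel APIs):

#include <stdbool.h>
#include <string.h>
#include <stdio.h>

struct buf {
	char data[16];
	bool uptodate;	/* shadow already holds a frozen copy */
};

/* Freeze: copy into the shadow slot only once; later calls are no-ops. */
static void freeze_buffer(struct buf *shadow, const char *live)
{
	if (!shadow->uptodate) {
		memcpy(shadow->data, live, sizeof(shadow->data));
		shadow->uptodate = true;
	}
}

int main(void)
{
	struct buf shadow = { .uptodate = false };

	freeze_buffer(&shadow, "checkpoint-1....");
	freeze_buffer(&shadow, "checkpoint-2....");	/* ignored: frozen */
	printf("frozen: %.16s\n", shadow.data);
	return 0;
}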
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 6e9557ecf161..98034271cd02 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -577,6 +577,7 @@ const struct inode_operations nilfs_dir_inode_operations = {
577 .rename = nilfs_rename, 577 .rename = nilfs_rename,
578 .setattr = nilfs_setattr, 578 .setattr = nilfs_setattr,
579 .permission = nilfs_permission, 579 .permission = nilfs_permission,
580 .fiemap = nilfs_fiemap,
580}; 581};
581 582
582const struct inode_operations nilfs_special_inode_operations = { 583const struct inode_operations nilfs_special_inode_operations = {
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 0ca98823db59..777e8fd04304 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -190,11 +190,6 @@ static inline int nilfs_doing_construction(void)
190 return nilfs_test_transaction_flag(NILFS_TI_WRITER); 190 return nilfs_test_transaction_flag(NILFS_TI_WRITER);
191} 191}
192 192
193static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
194{
195 return nilfs->ns_dat;
196}
197
198/* 193/*
199 * function prototype 194 * function prototype
200 */ 195 */
@@ -257,13 +252,13 @@ extern void nilfs_truncate(struct inode *);
257extern void nilfs_evict_inode(struct inode *); 252extern void nilfs_evict_inode(struct inode *);
258extern int nilfs_setattr(struct dentry *, struct iattr *); 253extern int nilfs_setattr(struct dentry *, struct iattr *);
259int nilfs_permission(struct inode *inode, int mask, unsigned int flags); 254int nilfs_permission(struct inode *inode, int mask, unsigned int flags);
260extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *, 255int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
261 struct buffer_head **);
262extern int nilfs_inode_dirty(struct inode *); 256extern int nilfs_inode_dirty(struct inode *);
263extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *, 257int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
264 unsigned);
265extern int nilfs_mark_inode_dirty(struct inode *); 258extern int nilfs_mark_inode_dirty(struct inode *);
266extern void nilfs_dirty_inode(struct inode *); 259extern void nilfs_dirty_inode(struct inode *);
260int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
261 __u64 start, __u64 len);
267 262
268/* super.c */ 263/* super.c */
269extern struct inode *nilfs_alloc_inode(struct super_block *); 264extern struct inode *nilfs_alloc_inode(struct super_block *);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a6c3c2e817f8..0c432416cfef 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -491,7 +491,7 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
491 } 491 }
492 return nc; 492 return nc;
493} 493}
494 494
495void nilfs_mapping_init_once(struct address_space *mapping) 495void nilfs_mapping_init_once(struct address_space *mapping)
496{ 496{
497 memset(mapping, 0, sizeof(*mapping)); 497 memset(mapping, 0, sizeof(*mapping));
@@ -546,3 +546,87 @@ int __nilfs_clear_page_dirty(struct page *page)
546 } 546 }
547 return TestClearPageDirty(page); 547 return TestClearPageDirty(page);
548} 548}
549
550/**
551 * nilfs_find_uncommitted_extent - find extent of uncommitted data
552 * @inode: inode
553 * @start_blk: start block offset (in)
554 * @blkoff: start offset of the found extent (out)
555 *
556 * This function searches for an extent of buffers marked "delayed"
557 * that starts at a block offset equal to or larger than @start_blk.
558 * If such an extent is found, its start offset is stored in @blkoff
559 * and its length in blocks is returned as the function value.
560 * Otherwise, zero is returned.
561 */
562unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
563 sector_t start_blk,
564 sector_t *blkoff)
565{
566 unsigned int i;
567 pgoff_t index;
568 unsigned int nblocks_in_page;
569 unsigned long length = 0;
570 sector_t b;
571 struct pagevec pvec;
572 struct page *page;
573
574 if (inode->i_mapping->nrpages == 0)
575 return 0;
576
577 index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
578 nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits);
579
580 pagevec_init(&pvec, 0);
581
582repeat:
583 pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
584 pvec.pages);
585 if (pvec.nr == 0)
586 return length;
587
588 if (length > 0 && pvec.pages[0]->index > index)
589 goto out;
590
591 b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
592 i = 0;
593 do {
594 page = pvec.pages[i];
595
596 lock_page(page);
597 if (page_has_buffers(page)) {
598 struct buffer_head *bh, *head;
599
600 bh = head = page_buffers(page);
601 do {
602 if (b < start_blk)
603 continue;
604 if (buffer_delay(bh)) {
605 if (length == 0)
606 *blkoff = b;
607 length++;
608 } else if (length > 0) {
609 goto out_locked;
610 }
611 } while (++b, bh = bh->b_this_page, bh != head);
612 } else {
613 if (length > 0)
614 goto out_locked;
615
616 b += nblocks_in_page;
617 }
618 unlock_page(page);
619
620 } while (++i < pagevec_count(&pvec));
621
622 index = page->index + 1;
623 pagevec_release(&pvec);
624 cond_resched();
625 goto repeat;
626
627out_locked:
628 unlock_page(page);
629out:
630 pagevec_release(&pvec);
631 return length;
632}
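
Stripped of the pagevec and buffer-head machinery, nilfs_find_uncommitted_extent() is a linear scan for the first run of "delayed" blocks at or after start_blk. A minimal sketch over a plain flag array (delayed[] is hypothetical; the real code derives the flag from buffer_delay(bh)):

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

/* 1 = block has uncommitted (delayed) data, 0 = committed or hole. */
static const bool delayed[] = { 0, 0, 1, 1, 1, 0, 1 };
#define NBLK (sizeof(delayed) / sizeof(delayed[0]))

static unsigned long find_uncommitted_extent(size_t start_blk,
					     size_t *blkoff)
{
	unsigned long length = 0;

	for (size_t b = start_blk; b < NBLK; b++) {
		if (delayed[b]) {
			if (length == 0)
				*blkoff = b;	/* extent starts here */
			length++;
		} else if (length > 0) {
			break;			/* extent ended */
		}
	}
	return length;
}

int main(void)
{
	size_t off = 0;
	unsigned long len = find_uncommitted_extent(0, &off);

	printf("extent at block %zu, %lu blocks\n", off, len);
	return 0;
}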
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index fb9e8a8a2038..622df27cd891 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -66,6 +66,9 @@ void nilfs_mapping_init(struct address_space *mapping,
66 struct backing_dev_info *bdi, 66 struct backing_dev_info *bdi,
67 const struct address_space_operations *aops); 67 const struct address_space_operations *aops);
68unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 68unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
69unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
70 sector_t start_blk,
71 sector_t *blkoff);
69 72
70#define NILFS_PAGE_BUG(page, m, a...) \ 73#define NILFS_PAGE_BUG(page, m, a...) \
71 do { nilfs_page_bug(page); BUG(); } while (0) 74 do { nilfs_page_bug(page); BUG(); } while (0)
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 5d2711c28da7..3dfcd3b7d389 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -535,7 +535,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
535 if (unlikely(err)) 535 if (unlikely(err))
536 goto failed_page; 536 goto failed_page;
537 537
538 err = nilfs_set_file_dirty(sbi, inode, 1); 538 err = nilfs_set_file_dirty(inode, 1);
539 if (unlikely(err)) 539 if (unlikely(err))
540 goto failed_page; 540 goto failed_page;
541 541
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 35a07157b980..7a17715f215f 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -27,14 +27,6 @@
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29 29
30/*
31 * Mount options
32 */
33struct nilfs_mount_options {
34 unsigned long mount_opt;
35 __u64 snapshot_cno;
36};
37
38struct the_nilfs; 30struct the_nilfs;
39struct nilfs_sc_info; 31struct nilfs_sc_info;
40 32
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 687d090cea34..55ebae5c7f39 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -504,17 +504,6 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
504 return err; 504 return err;
505} 505}
506 506
507static int nilfs_handle_bmap_error(int err, const char *fname,
508 struct inode *inode, struct super_block *sb)
509{
510 if (err == -EINVAL) {
511 nilfs_error(sb, fname, "broken bmap (inode=%lu)\n",
512 inode->i_ino);
513 err = -EIO;
514 }
515 return err;
516}
517
518/* 507/*
519 * Callback functions that enumerate, mark, and collect dirty blocks 508 * Callback functions that enumerate, mark, and collect dirty blocks
520 */ 509 */
@@ -524,9 +513,8 @@ static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
524 int err; 513 int err;
525 514
526 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); 515 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
527 if (unlikely(err < 0)) 516 if (err < 0)
528 return nilfs_handle_bmap_error(err, __func__, inode, 517 return err;
529 sci->sc_super);
530 518
531 err = nilfs_segctor_add_file_block(sci, bh, inode, 519 err = nilfs_segctor_add_file_block(sci, bh, inode,
532 sizeof(struct nilfs_binfo_v)); 520 sizeof(struct nilfs_binfo_v));
@@ -539,13 +527,7 @@ static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
539 struct buffer_head *bh, 527 struct buffer_head *bh,
540 struct inode *inode) 528 struct inode *inode)
541{ 529{
542 int err; 530 return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
543
544 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
545 if (unlikely(err < 0))
546 return nilfs_handle_bmap_error(err, __func__, inode,
547 sci->sc_super);
548 return 0;
549} 531}
550 532
551static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, 533static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
@@ -588,9 +570,8 @@ static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
588 int err; 570 int err;
589 571
590 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); 572 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
591 if (unlikely(err < 0)) 573 if (err < 0)
592 return nilfs_handle_bmap_error(err, __func__, inode, 574 return err;
593 sci->sc_super);
594 575
595 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); 576 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
596 if (!err) 577 if (!err)
@@ -776,9 +757,8 @@ static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
776 ret++; 757 ret++;
777 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) 758 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
778 ret++; 759 ret++;
779 if (ret || nilfs_doing_gc()) 760 if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat))
780 if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs))) 761 ret++;
781 ret++;
782 return ret; 762 return ret;
783} 763}
784 764
@@ -814,7 +794,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
814 nilfs_mdt_clear_dirty(sci->sc_root->ifile); 794 nilfs_mdt_clear_dirty(sci->sc_root->ifile);
815 nilfs_mdt_clear_dirty(nilfs->ns_cpfile); 795 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
816 nilfs_mdt_clear_dirty(nilfs->ns_sufile); 796 nilfs_mdt_clear_dirty(nilfs->ns_sufile);
817 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs)); 797 nilfs_mdt_clear_dirty(nilfs->ns_dat);
818} 798}
819 799
820static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) 800static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
@@ -923,7 +903,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
923 nilfs->ns_nongc_ctime : sci->sc_seg_ctime); 903 nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
924 raw_sr->sr_flags = 0; 904 raw_sr->sr_flags = 0;
925 905
926 nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr + 906 nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr +
927 NILFS_SR_DAT_OFFSET(isz), 1); 907 NILFS_SR_DAT_OFFSET(isz), 1);
928 nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + 908 nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
929 NILFS_SR_CPFILE_OFFSET(isz), 1); 909 NILFS_SR_CPFILE_OFFSET(isz), 1);
@@ -1179,7 +1159,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1179 sci->sc_stage.scnt++; /* Fall through */ 1159 sci->sc_stage.scnt++; /* Fall through */
1180 case NILFS_ST_DAT: 1160 case NILFS_ST_DAT:
1181 dat_stage: 1161 dat_stage:
1182 err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs), 1162 err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
1183 &nilfs_sc_dat_ops); 1163 &nilfs_sc_dat_ops);
1184 if (unlikely(err)) 1164 if (unlikely(err))
1185 break; 1165 break;
@@ -1563,7 +1543,6 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1563 return 0; 1543 return 0;
1564 1544
1565 failed_bmap: 1545 failed_bmap:
1566 err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
1567 return err; 1546 return err;
1568} 1547}
1569 1548
@@ -1783,6 +1762,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1783 if (!err) { 1762 if (!err) {
1784 set_buffer_uptodate(bh); 1763 set_buffer_uptodate(bh);
1785 clear_buffer_dirty(bh); 1764 clear_buffer_dirty(bh);
1765 clear_buffer_delay(bh);
1786 clear_buffer_nilfs_volatile(bh); 1766 clear_buffer_nilfs_volatile(bh);
1787 } 1767 }
1788 brelse(bh); /* for b_assoc_buffers */ 1768 brelse(bh); /* for b_assoc_buffers */
@@ -1909,6 +1889,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1909 b_assoc_buffers) { 1889 b_assoc_buffers) {
1910 set_buffer_uptodate(bh); 1890 set_buffer_uptodate(bh);
1911 clear_buffer_dirty(bh); 1891 clear_buffer_dirty(bh);
1892 clear_buffer_delay(bh);
1912 clear_buffer_nilfs_volatile(bh); 1893 clear_buffer_nilfs_volatile(bh);
1913 clear_buffer_nilfs_redirected(bh); 1894 clear_buffer_nilfs_redirected(bh);
1914 if (bh == segbuf->sb_super_root) { 1895 if (bh == segbuf->sb_super_root) {
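
nilfs_handle_bmap_error() could be deleted above presumably because, in this series, the -EINVAL-to--EIO conversion and the nilfs_error() report happen inside the bmap layer itself, so callers simply propagate whatever nilfs_bmap_propagate() returns. The general shape of pushing error conversion down to the layer that detects the corruption looks like this sketch (bmap_lookup_raw() and bmap_lookup() are hypothetical):

#include <errno.h>
#include <stdio.h>

/* Hypothetical low-level lookup: -EINVAL signals a corrupted bmap. */
static int bmap_lookup_raw(unsigned long key)
{
	return key == 7 ? -EINVAL : 0;
}

/*
 * Convert the corruption code at the layer that detects it, so every
 * caller can simply propagate the result instead of special-casing
 * -EINVAL (mirroring the removal of nilfs_handle_bmap_error() above).
 */
static int bmap_lookup(unsigned long key)
{
	int err = bmap_lookup_raw(key);

	if (err == -EINVAL) {
		fprintf(stderr, "broken bmap (key=%lu)\n", key);
		err = -EIO;
	}
	return err;
}

int main(void)
{
	printf("lookup(7) = %d\n", bmap_lookup(7));	/* prints -EIO */
	return 0;
}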
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index e2dcc9c733f7..70dfdd532b83 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -47,7 +47,6 @@
47#include <linux/crc32.h> 47#include <linux/crc32.h>
48#include <linux/vfs.h> 48#include <linux/vfs.h>
49#include <linux/writeback.h> 49#include <linux/writeback.h>
50#include <linux/kobject.h>
51#include <linux/seq_file.h> 50#include <linux/seq_file.h>
52#include <linux/mount.h> 51#include <linux/mount.h>
53#include "nilfs.h" 52#include "nilfs.h"
@@ -111,12 +110,17 @@ void nilfs_error(struct super_block *sb, const char *function,
111 const char *fmt, ...) 110 const char *fmt, ...)
112{ 111{
113 struct nilfs_sb_info *sbi = NILFS_SB(sb); 112 struct nilfs_sb_info *sbi = NILFS_SB(sb);
113 struct va_format vaf;
114 va_list args; 114 va_list args;
115 115
116 va_start(args, fmt); 116 va_start(args, fmt);
117 printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function); 117
118 vprintk(fmt, args); 118 vaf.fmt = fmt;
119 printk("\n"); 119 vaf.va = &args;
120
121 printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n",
122 sb->s_id, function, &vaf);
123
120 va_end(args); 124 va_end(args);
121 125
122 if (!(sb->s_flags & MS_RDONLY)) { 126 if (!(sb->s_flags & MS_RDONLY)) {
@@ -136,13 +140,17 @@ void nilfs_error(struct super_block *sb, const char *function,
136void nilfs_warning(struct super_block *sb, const char *function, 140void nilfs_warning(struct super_block *sb, const char *function,
137 const char *fmt, ...) 141 const char *fmt, ...)
138{ 142{
143 struct va_format vaf;
139 va_list args; 144 va_list args;
140 145
141 va_start(args, fmt); 146 va_start(args, fmt);
142 printk(KERN_WARNING "NILFS warning (device %s): %s: ", 147
143 sb->s_id, function); 148 vaf.fmt = fmt;
144 vprintk(fmt, args); 149 vaf.va = &args;
145 printk("\n"); 150
151 printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n",
152 sb->s_id, function, &vaf);
153
146 va_end(args); 154 va_end(args);
147} 155}
148 156
@@ -1010,11 +1018,11 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1010 struct nilfs_sb_info *sbi = NILFS_SB(sb); 1018 struct nilfs_sb_info *sbi = NILFS_SB(sb);
1011 struct the_nilfs *nilfs = sbi->s_nilfs; 1019 struct the_nilfs *nilfs = sbi->s_nilfs;
1012 unsigned long old_sb_flags; 1020 unsigned long old_sb_flags;
1013 struct nilfs_mount_options old_opts; 1021 unsigned long old_mount_opt;
1014 int err; 1022 int err;
1015 1023
1016 old_sb_flags = sb->s_flags; 1024 old_sb_flags = sb->s_flags;
1017 old_opts.mount_opt = sbi->s_mount_opt; 1025 old_mount_opt = sbi->s_mount_opt;
1018 1026
1019 if (!parse_options(data, sb, 1)) { 1027 if (!parse_options(data, sb, 1)) {
1020 err = -EINVAL; 1028 err = -EINVAL;
@@ -1083,7 +1091,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1083 1091
1084 restore_opts: 1092 restore_opts:
1085 sb->s_flags = old_sb_flags; 1093 sb->s_flags = old_sb_flags;
1086 sbi->s_mount_opt = old_opts.mount_opt; 1094 sbi->s_mount_opt = old_mount_opt;
1087 return err; 1095 return err;
1088} 1096}
1089 1097
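
The switch to struct va_format and %pV in nilfs_error()/nilfs_warning() above turns three printk calls (prefix, vprintk body, trailing newline) into a single one, so concurrent messages can no longer interleave mid-line. %pV is a kernel printk extension; the equivalent "format the caller's va_list once, emit one line" shape in portable C looks like this sketch (nilfs_warning here is a userspace stand-in, not the kernel function):

#include <stdarg.h>
#include <stdio.h>

/* Emit "prefix: message\n" as a single stream operation. */
static void nilfs_warning(const char *dev, const char *func,
			  const char *fmt, ...)
{
	char msg[256];
	va_list args;

	va_start(args, fmt);
	vsnprintf(msg, sizeof(msg), fmt, args);
	va_end(args);

	fprintf(stderr, "NILFS warning (device %s): %s: %s\n",
		dev, func, msg);
}

int main(void)
{
	nilfs_warning("sda1", "main", "broken bmap (inode=%lu)", 42UL);
	return 0;
}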
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 0254be2d73c6..ad4ac607cf57 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -329,7 +329,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
329 printk(KERN_INFO "NILFS: recovery complete.\n"); 329 printk(KERN_INFO "NILFS: recovery complete.\n");
330 330
331 skip_recovery: 331 skip_recovery:
332 set_nilfs_loaded(nilfs);
333 nilfs_clear_recovery_info(&ri); 332 nilfs_clear_recovery_info(&ri);
334 sbi->s_super->s_flags = s_flags; 333 sbi->s_super->s_flags = s_flags;
335 return 0; 334 return 0;
@@ -651,12 +650,11 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
651 650
652int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) 651int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
653{ 652{
654 struct inode *dat = nilfs_dat_inode(nilfs);
655 unsigned long ncleansegs; 653 unsigned long ncleansegs;
656 654
657 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 655 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
658 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); 656 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
659 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 657 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
660 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; 658 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
661 return 0; 659 return 0;
662} 660}
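
nilfs_count_free_blocks() computes free space as clean segments times blocks per segment; note that the (sector_t) cast widens ncleansegs before the multiply so the product cannot overflow an unsigned long on 32-bit builds. A worked example of the same arithmetic (the numbers are made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned long ncleansegs = 1200;	/* from the sufile */
	unsigned long blocks_per_segment = 2048;

	/* Widen before multiplying, as the (sector_t) cast does above. */
	uint64_t nblocks = (uint64_t)ncleansegs * blocks_per_segment;

	printf("free blocks: %llu\n", (unsigned long long)nblocks);
	return 0;
}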
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 69226e14b745..fd85e4c05c6b 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -36,8 +36,6 @@
36/* the_nilfs struct */ 36/* the_nilfs struct */
37enum { 37enum {
38 THE_NILFS_INIT = 0, /* Information from super_block is set */ 38 THE_NILFS_INIT = 0, /* Information from super_block is set */
39 THE_NILFS_LOADED, /* Roll-back/roll-forward has done and
40 the latest checkpoint was loaded */
41 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
42 THE_NILFS_GC_RUNNING, /* gc process is running */ 40 THE_NILFS_GC_RUNNING, /* gc process is running */
43 THE_NILFS_SB_DIRTY, /* super block is dirty */ 41 THE_NILFS_SB_DIRTY, /* super block is dirty */
@@ -178,7 +176,6 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \
178} 176}
179 177
180THE_NILFS_FNS(INIT, init) 178THE_NILFS_FNS(INIT, init)
181THE_NILFS_FNS(LOADED, loaded)
182THE_NILFS_FNS(DISCONTINUED, discontinued) 179THE_NILFS_FNS(DISCONTINUED, discontinued)
183THE_NILFS_FNS(GC_RUNNING, gc_running) 180THE_NILFS_FNS(GC_RUNNING, gc_running)
184THE_NILFS_FNS(SB_DIRTY, sb_dirty) 181THE_NILFS_FNS(SB_DIRTY, sb_dirty)
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 0d840669698e..ab152c00cd3a 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -51,7 +51,7 @@ config OCFS2_FS_USERSPACE_CLUSTER
51 51
52config OCFS2_FS_STATS 52config OCFS2_FS_STATS
53 bool "OCFS2 statistics" 53 bool "OCFS2 statistics"
54 depends on OCFS2_FS 54 depends on OCFS2_FS && DEBUG_FS
55 default y 55 default y
56 help 56 help
57 This option allows some fs statistics to be captured. Enabling 57 This option allows some fs statistics to be captured. Enabling
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 592fae5007d1..e4984e259cb6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -565,7 +565,6 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
565 return ret; 565 return ret;
566} 566}
567 567
568static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
569static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 568static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
570 struct ocfs2_extent_block *eb); 569 struct ocfs2_extent_block *eb);
571static void ocfs2_adjust_rightmost_records(handle_t *handle, 570static void ocfs2_adjust_rightmost_records(handle_t *handle,
@@ -5858,6 +5857,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5858 5857
5859 ocfs2_journal_dirty(handle, tl_bh); 5858 ocfs2_journal_dirty(handle, tl_bh);
5860 5859
5860 osb->truncated_clusters += num_clusters;
5861bail: 5861bail:
5862 mlog_exit(status); 5862 mlog_exit(status);
5863 return status; 5863 return status;
@@ -5929,6 +5929,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5929 i--; 5929 i--;
5930 } 5930 }
5931 5931
5932 osb->truncated_clusters = 0;
5933
5932bail: 5934bail:
5933 mlog_exit(status); 5935 mlog_exit(status);
5934 return status; 5936 return status;
@@ -7139,64 +7141,6 @@ bail:
7139} 7141}
7140 7142
7141/* 7143/*
7142 * Expects the inode to already be locked.
7143 */
7144int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7145 struct inode *inode,
7146 struct buffer_head *fe_bh,
7147 struct ocfs2_truncate_context **tc)
7148{
7149 int status;
7150 unsigned int new_i_clusters;
7151 struct ocfs2_dinode *fe;
7152 struct ocfs2_extent_block *eb;
7153 struct buffer_head *last_eb_bh = NULL;
7154
7155 mlog_entry_void();
7156
7157 *tc = NULL;
7158
7159 new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7160 i_size_read(inode));
7161 fe = (struct ocfs2_dinode *) fe_bh->b_data;
7162
7163 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
7164 "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7165 (unsigned long long)le64_to_cpu(fe->i_size));
7166
7167 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7168 if (!(*tc)) {
7169 status = -ENOMEM;
7170 mlog_errno(status);
7171 goto bail;
7172 }
7173 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7174
7175 if (fe->id2.i_list.l_tree_depth) {
7176 status = ocfs2_read_extent_block(INODE_CACHE(inode),
7177 le64_to_cpu(fe->i_last_eb_blk),
7178 &last_eb_bh);
7179 if (status < 0) {
7180 mlog_errno(status);
7181 goto bail;
7182 }
7183 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7184 }
7185
7186 (*tc)->tc_last_eb_bh = last_eb_bh;
7187
7188 status = 0;
7189bail:
7190 if (status < 0) {
7191 if (*tc)
7192 ocfs2_free_truncate_context(*tc);
7193 *tc = NULL;
7194 }
7195 mlog_exit_void();
7196 return status;
7197}
7198
7199/*
7200 * 'start' is inclusive, 'end' is not. 7144 * 'start' is inclusive, 'end' is not.
7201 */ 7145 */
7202int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 7146int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
@@ -7270,18 +7214,3 @@ out_commit:
7270out: 7214out:
7271 return ret; 7215 return ret;
7272} 7216}
7273
7274static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7275{
7276 /*
7277 * The caller is responsible for completing deallocation
7278 * before freeing the context.
7279 */
7280 if (tc->tc_dealloc.c_first_suballocator != NULL)
7281 mlog(ML_NOTICE,
7282 "Truncate completion has non-empty dealloc context\n");
7283
7284 brelse(tc->tc_last_eb_bh);
7285
7286 kfree(tc);
7287}
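
The two ocfs2/alloc.c hunks above keep osb->truncated_clusters in step with the truncate log: ocfs2_truncate_log_append() adds num_clusters, and a completed replay resets the counter to zero, both under the truncate-log inode's i_mutex. A small sketch of that counter discipline, using a pthread mutex in place of i_mutex (names are illustrative):

#include <pthread.h>
#include <stdio.h>

/* Cluster count held in the truncate log, guarded by a mutex. */
static pthread_mutex_t tl_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int truncated_clusters;

static void truncate_log_append(unsigned int num_clusters)
{
	pthread_mutex_lock(&tl_lock);
	truncated_clusters += num_clusters;
	pthread_mutex_unlock(&tl_lock);
}

static void truncate_log_replay(void)
{
	pthread_mutex_lock(&tl_lock);
	truncated_clusters = 0;	/* everything was freed back */
	pthread_mutex_unlock(&tl_lock);
}

int main(void)
{
	truncate_log_append(16);
	truncate_log_append(8);
	printf("pending: %u clusters\n", truncated_clusters);
	truncate_log_replay();
	printf("after replay: %u clusters\n", truncated_clusters);
	return 0;
}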
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 55762b554b99..3bd08a03251c 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -228,10 +228,6 @@ struct ocfs2_truncate_context {
228 228
229int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, 229int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
230 u64 range_start, u64 range_end); 230 u64 range_start, u64 range_end);
231int ocfs2_prepare_truncate(struct ocfs2_super *osb,
232 struct inode *inode,
233 struct buffer_head *fe_bh,
234 struct ocfs2_truncate_context **tc);
235int ocfs2_commit_truncate(struct ocfs2_super *osb, 231int ocfs2_commit_truncate(struct ocfs2_super *osb,
236 struct inode *inode, 232 struct inode *inode,
237 struct buffer_head *di_bh); 233 struct buffer_head *di_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0d7c5540ad66..1fbb0e20131b 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1630,6 +1630,43 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
1630 return ret; 1630 return ret;
1631} 1631}
1632 1632
1633/*
1634 * Try to flush the truncate log if doing so would free enough clusters.
1635 * Return value: "< 0" means error, "0" means no space was freed, and "1"
1636 * means enough space was freed for the caller to retry the allocation.
1637 */
1638static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
1639 unsigned int needed)
1640{
1641 tid_t target;
1642 int ret = 0;
1643 unsigned int truncated_clusters;
1644
1645 mutex_lock(&osb->osb_tl_inode->i_mutex);
1646 truncated_clusters = osb->truncated_clusters;
1647 mutex_unlock(&osb->osb_tl_inode->i_mutex);
1648
1649 /*
1650 * Check whether we can succeed in allocating if we free
1651 * the truncate log.
1652 */
1653 if (truncated_clusters < needed)
1654 goto out;
1655
1656 ret = ocfs2_flush_truncate_log(osb);
1657 if (ret) {
1658 mlog_errno(ret);
1659 goto out;
1660 }
1661
1662 if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
1663 jbd2_log_wait_commit(osb->journal->j_journal, target);
1664 ret = 1;
1665 }
1666out:
1667 return ret;
1668}
1669
1633int ocfs2_write_begin_nolock(struct file *filp, 1670int ocfs2_write_begin_nolock(struct file *filp,
1634 struct address_space *mapping, 1671 struct address_space *mapping,
1635 loff_t pos, unsigned len, unsigned flags, 1672 loff_t pos, unsigned len, unsigned flags,
@@ -1637,7 +1674,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
1637 struct buffer_head *di_bh, struct page *mmap_page) 1674 struct buffer_head *di_bh, struct page *mmap_page)
1638{ 1675{
1639 int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; 1676 int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
1640 unsigned int clusters_to_alloc, extents_to_split; 1677 unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
1641 struct ocfs2_write_ctxt *wc; 1678 struct ocfs2_write_ctxt *wc;
1642 struct inode *inode = mapping->host; 1679 struct inode *inode = mapping->host;
1643 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1680 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1646,7 +1683,9 @@ int ocfs2_write_begin_nolock(struct file *filp,
1646 struct ocfs2_alloc_context *meta_ac = NULL; 1683 struct ocfs2_alloc_context *meta_ac = NULL;
1647 handle_t *handle; 1684 handle_t *handle;
1648 struct ocfs2_extent_tree et; 1685 struct ocfs2_extent_tree et;
1686 int try_free = 1, ret1;
1649 1687
1688try_again:
1650 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1689 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1651 if (ret) { 1690 if (ret) {
1652 mlog_errno(ret); 1691 mlog_errno(ret);
@@ -1681,6 +1720,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
1681 mlog_errno(ret); 1720 mlog_errno(ret);
1682 goto out; 1721 goto out;
1683 } else if (ret == 1) { 1722 } else if (ret == 1) {
1723 clusters_need = wc->w_clen;
1684 ret = ocfs2_refcount_cow(inode, filp, di_bh, 1724 ret = ocfs2_refcount_cow(inode, filp, di_bh,
1685 wc->w_cpos, wc->w_clen, UINT_MAX); 1725 wc->w_cpos, wc->w_clen, UINT_MAX);
1686 if (ret) { 1726 if (ret) {
@@ -1695,6 +1735,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
1695 mlog_errno(ret); 1735 mlog_errno(ret);
1696 goto out; 1736 goto out;
1697 } 1737 }
1738 clusters_need += clusters_to_alloc;
1698 1739
1699 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; 1740 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1700 1741
@@ -1817,6 +1858,22 @@ out:
1817 ocfs2_free_alloc_context(data_ac); 1858 ocfs2_free_alloc_context(data_ac);
1818 if (meta_ac) 1859 if (meta_ac)
1819 ocfs2_free_alloc_context(meta_ac); 1860 ocfs2_free_alloc_context(meta_ac);
1861
1862 if (ret == -ENOSPC && try_free) {
1863 /*
1864 * Try to free some clusters from the truncate log so that
1865 * the allocation can succeed.
1866 */
1867 try_free = 0;
1868
1869 ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
1870 if (ret1 == 1)
1871 goto try_again;
1872
1873 if (ret1 < 0)
1874 mlog_errno(ret1);
1875 }
1876
1820 return ret; 1877 return ret;
1821} 1878}
1822 1879
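
ocfs2_write_begin_nolock() now records how many clusters the write needs (clusters_need) and, on -ENOSPC, makes exactly one attempt to flush the truncate log before retrying via the try_again label; the try_free flag guarantees the retry cannot loop. A compact model of that retry discipline (allocate() and try_to_free_log() are hypothetical stand-ins):

#include <errno.h>
#include <stdio.h>

static unsigned int free_clusters = 0;
static unsigned int reclaimable = 32;	/* clusters parked in the log */

static int allocate(unsigned int need)
{
	if (need > free_clusters)
		return -ENOSPC;
	free_clusters -= need;
	return 0;
}

/* Model of ocfs2_try_to_free_truncate_log(): 1 = freed enough, retry. */
static int try_to_free_log(unsigned int need)
{
	if (reclaimable < need)
		return 0;
	free_clusters += reclaimable;
	reclaimable = 0;
	return 1;
}

int main(void)
{
	unsigned int need = 10;
	int try_free = 1, ret;

try_again:
	ret = allocate(need);
	if (ret == -ENOSPC && try_free) {
		try_free = 0;		/* only one reclaim attempt */
		if (try_to_free_log(need) == 1)
			goto try_again;
	}
	printf("allocation %s\n", ret ? "failed" : "succeeded");
	return 0;
}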
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9e3d45bcb5fd..a6cc05302e9f 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -82,6 +82,7 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
82#define O2HB_DB_TYPE_REGION_LIVENODES 4 82#define O2HB_DB_TYPE_REGION_LIVENODES 4
83#define O2HB_DB_TYPE_REGION_NUMBER 5 83#define O2HB_DB_TYPE_REGION_NUMBER 5
84#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6 84#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
85#define O2HB_DB_TYPE_REGION_PINNED 7
85struct o2hb_debug_buf { 86struct o2hb_debug_buf {
86 int db_type; 87 int db_type;
87 int db_size; 88 int db_size;
@@ -101,6 +102,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions;
101#define O2HB_DEBUG_FAILEDREGIONS "failed_regions" 102#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
102#define O2HB_DEBUG_REGION_NUMBER "num" 103#define O2HB_DEBUG_REGION_NUMBER "num"
103#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms" 104#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
105#define O2HB_DEBUG_REGION_PINNED "pinned"
104 106
105static struct dentry *o2hb_debug_dir; 107static struct dentry *o2hb_debug_dir;
106static struct dentry *o2hb_debug_livenodes; 108static struct dentry *o2hb_debug_livenodes;
@@ -132,6 +134,33 @@ char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
132unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; 134unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
133unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; 135unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
134 136
137/*
138 * o2hb_dependent_users tracks the number of registered callbacks that depend
139 * on heartbeat. Both o2net and o2dlm register this callback; however,
140 * only o2dlm actually depends on the heartbeat: the heartbeat must not
141 * stop while a dlm domain is still active.
142 */
143unsigned int o2hb_dependent_users;
144
145/*
146 * In global heartbeat mode, all regions are pinned if there are one or more
147 * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All
148 * regions are unpinned if the region count exceeds the cut off or the number
149 * of dependent users falls to zero.
150 */
151#define O2HB_PIN_CUT_OFF 3
152
153/*
154 * In local heartbeat mode, we assume the dlm domain name is the same as
155 * the region uuid. This is true for domains created for the file system but not
156 * necessarily true for userdlm domains. This is a known limitation.
157 *
158 * In global heartbeat mode, we pin/unpin all o2hb regions. This solution
159 * works for both file system and userdlm domains.
160 */
161static int o2hb_region_pin(const char *region_uuid);
162static void o2hb_region_unpin(const char *region_uuid);
163
135/* Only sets a new threshold if there are no active regions. 164/* Only sets a new threshold if there are no active regions.
136 * 165 *
137 * No locking or otherwise interesting code is required for reading 166 * No locking or otherwise interesting code is required for reading
@@ -186,7 +215,9 @@ struct o2hb_region {
186 struct config_item hr_item; 215 struct config_item hr_item;
187 216
188 struct list_head hr_all_item; 217 struct list_head hr_all_item;
189 unsigned hr_unclean_stop:1; 218 unsigned hr_unclean_stop:1,
219 hr_item_pinned:1,
220 hr_item_dropped:1;
190 221
191 /* protected by the hr_callback_sem */ 222 /* protected by the hr_callback_sem */
192 struct task_struct *hr_task; 223 struct task_struct *hr_task;
@@ -212,9 +243,11 @@ struct o2hb_region {
212 struct dentry *hr_debug_livenodes; 243 struct dentry *hr_debug_livenodes;
213 struct dentry *hr_debug_regnum; 244 struct dentry *hr_debug_regnum;
214 struct dentry *hr_debug_elapsed_time; 245 struct dentry *hr_debug_elapsed_time;
246 struct dentry *hr_debug_pinned;
215 struct o2hb_debug_buf *hr_db_livenodes; 247 struct o2hb_debug_buf *hr_db_livenodes;
216 struct o2hb_debug_buf *hr_db_regnum; 248 struct o2hb_debug_buf *hr_db_regnum;
217 struct o2hb_debug_buf *hr_db_elapsed_time; 249 struct o2hb_debug_buf *hr_db_elapsed_time;
250 struct o2hb_debug_buf *hr_db_pinned;
218 251
219 /* let the person setting up hb wait for it to return until it 252 /* let the person setting up hb wait for it to return until it
220 * has reached a 'steady' state. This will be fixed when we have 253 * has reached a 'steady' state. This will be fixed when we have
@@ -701,6 +734,14 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
701 config_item_name(&reg->hr_item)); 734 config_item_name(&reg->hr_item));
702 735
703 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); 736 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
737
738 /*
739 * If global heartbeat active, unpin all regions if the
740 * region count > CUT_OFF
741 */
742 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
743 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
744 o2hb_region_unpin(NULL);
704} 745}
705 746
706static int o2hb_check_slot(struct o2hb_region *reg, 747static int o2hb_check_slot(struct o2hb_region *reg,
@@ -1041,6 +1082,9 @@ static int o2hb_thread(void *data)
1041 1082
1042 set_user_nice(current, -20); 1083 set_user_nice(current, -20);
1043 1084
1085 /* Pin node */
1086 o2nm_depend_this_node();
1087
1044 while (!kthread_should_stop() && !reg->hr_unclean_stop) { 1088 while (!kthread_should_stop() && !reg->hr_unclean_stop) {
1045 /* We track the time spent inside 1089 /* We track the time spent inside
1046 * o2hb_do_disk_heartbeat so that we avoid more than 1090 * o2hb_do_disk_heartbeat so that we avoid more than
@@ -1090,6 +1134,9 @@ static int o2hb_thread(void *data)
1090 mlog_errno(ret); 1134 mlog_errno(ret);
1091 } 1135 }
1092 1136
1137 /* Unpin node */
1138 o2nm_undepend_this_node();
1139
1093 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); 1140 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
1094 1141
1095 return 0; 1142 return 0;
@@ -1142,6 +1189,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
1142 reg->hr_last_timeout_start)); 1189 reg->hr_last_timeout_start));
1143 goto done; 1190 goto done;
1144 1191
1192 case O2HB_DB_TYPE_REGION_PINNED:
1193 reg = (struct o2hb_region *)db->db_data;
1194 out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
1195 !!reg->hr_item_pinned);
1196 goto done;
1197
1145 default: 1198 default:
1146 goto done; 1199 goto done;
1147 } 1200 }
@@ -1315,6 +1368,8 @@ int o2hb_init(void)
1315 memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); 1368 memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
1316 memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); 1369 memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
1317 1370
1371 o2hb_dependent_users = 0;
1372
1318 return o2hb_debug_init(); 1373 return o2hb_debug_init();
1319} 1374}
1320 1375
@@ -1384,6 +1439,7 @@ static void o2hb_region_release(struct config_item *item)
1384 debugfs_remove(reg->hr_debug_livenodes); 1439 debugfs_remove(reg->hr_debug_livenodes);
1385 debugfs_remove(reg->hr_debug_regnum); 1440 debugfs_remove(reg->hr_debug_regnum);
1386 debugfs_remove(reg->hr_debug_elapsed_time); 1441 debugfs_remove(reg->hr_debug_elapsed_time);
1442 debugfs_remove(reg->hr_debug_pinned);
1387 debugfs_remove(reg->hr_debug_dir); 1443 debugfs_remove(reg->hr_debug_dir);
1388 1444
1389 spin_lock(&o2hb_live_lock); 1445 spin_lock(&o2hb_live_lock);
@@ -1948,6 +2004,18 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
1948 goto bail; 2004 goto bail;
1949 } 2005 }
1950 2006
2007 reg->hr_debug_pinned =
2008 o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
2009 reg->hr_debug_dir,
2010 &(reg->hr_db_pinned),
2011 sizeof(*(reg->hr_db_pinned)),
2012 O2HB_DB_TYPE_REGION_PINNED,
2013 0, 0, reg);
2014 if (!reg->hr_debug_pinned) {
2015 mlog_errno(ret);
2016 goto bail;
2017 }
2018
1951 ret = 0; 2019 ret = 0;
1952bail: 2020bail:
1953 return ret; 2021 return ret;
@@ -2002,15 +2070,20 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2002{ 2070{
2003 struct task_struct *hb_task; 2071 struct task_struct *hb_task;
2004 struct o2hb_region *reg = to_o2hb_region(item); 2072 struct o2hb_region *reg = to_o2hb_region(item);
2073 int quorum_region = 0;
2005 2074
2006 /* stop the thread when the user removes the region dir */ 2075 /* stop the thread when the user removes the region dir */
2007 spin_lock(&o2hb_live_lock); 2076 spin_lock(&o2hb_live_lock);
2008 if (o2hb_global_heartbeat_active()) { 2077 if (o2hb_global_heartbeat_active()) {
2009 clear_bit(reg->hr_region_num, o2hb_region_bitmap); 2078 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2010 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); 2079 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2080 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2081 quorum_region = 1;
2082 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2011 } 2083 }
2012 hb_task = reg->hr_task; 2084 hb_task = reg->hr_task;
2013 reg->hr_task = NULL; 2085 reg->hr_task = NULL;
2086 reg->hr_item_dropped = 1;
2014 spin_unlock(&o2hb_live_lock); 2087 spin_unlock(&o2hb_live_lock);
2015 2088
2016 if (hb_task) 2089 if (hb_task)
@@ -2028,7 +2101,27 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2028 if (o2hb_global_heartbeat_active()) 2101 if (o2hb_global_heartbeat_active())
2029 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", 2102 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
2030 config_item_name(&reg->hr_item)); 2103 config_item_name(&reg->hr_item));
2104
2031 config_item_put(item); 2105 config_item_put(item);
2106
2107 if (!o2hb_global_heartbeat_active() || !quorum_region)
2108 return;
2109
2110 /*
2111 * If global heartbeat active and there are dependent users,
2112 * pin all regions if quorum region count <= CUT_OFF
2113 */
2114 spin_lock(&o2hb_live_lock);
2115
2116 if (!o2hb_dependent_users)
2117 goto unlock;
2118
2119 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
2120 O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
2121 o2hb_region_pin(NULL);
2122
2123unlock:
2124 spin_unlock(&o2hb_live_lock);
2032} 2125}
2033 2126
2034struct o2hb_heartbeat_group_attribute { 2127struct o2hb_heartbeat_group_attribute {
@@ -2214,63 +2307,138 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
2214} 2307}
2215EXPORT_SYMBOL_GPL(o2hb_setup_callback); 2308EXPORT_SYMBOL_GPL(o2hb_setup_callback);
2216 2309
2217static struct o2hb_region *o2hb_find_region(const char *region_uuid) 2310/*
2311 * In local heartbeat mode, the region_uuid passed in matches the dlm domain name.
2312 * In global heartbeat mode, the region_uuid passed in is NULL.
2313 *
2314 * In local, we only pin the matching region. In global we pin all the active
2315 * regions.
2316 */
2317static int o2hb_region_pin(const char *region_uuid)
2218{ 2318{
2219 struct o2hb_region *p, *reg = NULL; 2319 int ret = 0, found = 0;
2320 struct o2hb_region *reg;
2321 char *uuid;
2220 2322
2221 assert_spin_locked(&o2hb_live_lock); 2323 assert_spin_locked(&o2hb_live_lock);
2222 2324
2223 list_for_each_entry(p, &o2hb_all_regions, hr_all_item) { 2325 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2224 if (!strcmp(region_uuid, config_item_name(&p->hr_item))) { 2326 uuid = config_item_name(&reg->hr_item);
2225 reg = p; 2327
2226 break; 2328 /* local heartbeat */
2329 if (region_uuid) {
2330 if (strcmp(region_uuid, uuid))
2331 continue;
2332 found = 1;
2333 }
2334
2335 if (reg->hr_item_pinned || reg->hr_item_dropped)
2336 goto skip_pin;
2337
2338 /* Ignore ENOENT only for local hb (userdlm domain) */
2339 ret = o2nm_depend_item(&reg->hr_item);
2340 if (!ret) {
2341 mlog(ML_CLUSTER, "Pin region %s\n", uuid);
2342 reg->hr_item_pinned = 1;
2343 } else {
2344 if (ret == -ENOENT && found)
2345 ret = 0;
2346 else {
2347 mlog(ML_ERROR, "Pin region %s fails with %d\n",
2348 uuid, ret);
2349 break;
2350 }
2227 } 2351 }
2352skip_pin:
2353 if (found)
2354 break;
2228 } 2355 }
2229 2356
2230 return reg; 2357 return ret;
2231} 2358}
2232 2359
2233static int o2hb_region_get(const char *region_uuid) 2360/*
2361 * In local heartbeat mode, the region_uuid passed in matches the dlm domain name.
2362 * In global heartbeat mode, the region_uuid passed in is NULL.
2363 *
2364 * In local, we only unpin the matching region. In global we unpin all the
2365 * active regions.
2366 */
2367static void o2hb_region_unpin(const char *region_uuid)
2234{ 2368{
2235 int ret = 0;
2236 struct o2hb_region *reg; 2369 struct o2hb_region *reg;
2370 char *uuid;
2371 int found = 0;
2237 2372
2238 spin_lock(&o2hb_live_lock); 2373 assert_spin_locked(&o2hb_live_lock);
2239 2374
2240 reg = o2hb_find_region(region_uuid); 2375 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2241 if (!reg) 2376 uuid = config_item_name(&reg->hr_item);
2242 ret = -ENOENT; 2377 if (region_uuid) {
2243 spin_unlock(&o2hb_live_lock); 2378 if (strcmp(region_uuid, uuid))
2379 continue;
2380 found = 1;
2381 }
2244 2382
2245 if (ret) 2383 if (reg->hr_item_pinned) {
2246 goto out; 2384 mlog(ML_CLUSTER, "Unpin region %s\n", uuid);
2385 o2nm_undepend_item(&reg->hr_item);
2386 reg->hr_item_pinned = 0;
2387 }
2388 if (found)
2389 break;
2390 }
2391}
2247 2392
2248 ret = o2nm_depend_this_node(); 2393static int o2hb_region_inc_user(const char *region_uuid)
2249 if (ret) 2394{
2250 goto out; 2395 int ret = 0;
2251 2396
2252 ret = o2nm_depend_item(&reg->hr_item); 2397 spin_lock(&o2hb_live_lock);
2253 if (ret)
2254 o2nm_undepend_this_node();
2255 2398
2256out: 2399 /* local heartbeat */
2400 if (!o2hb_global_heartbeat_active()) {
2401 ret = o2hb_region_pin(region_uuid);
2402 goto unlock;
2403 }
2404
2405 /*
2406 * if global heartbeat active and this is the first dependent user,
2407 * pin all regions if quorum region count <= CUT_OFF
2408 */
2409 o2hb_dependent_users++;
2410 if (o2hb_dependent_users > 1)
2411 goto unlock;
2412
2413 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
2414 O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
2415 ret = o2hb_region_pin(NULL);
2416
2417unlock:
2418 spin_unlock(&o2hb_live_lock);
2257 return ret; 2419 return ret;
2258} 2420}
2259 2421
2260static void o2hb_region_put(const char *region_uuid) 2422void o2hb_region_dec_user(const char *region_uuid)
2261{ 2423{
2262 struct o2hb_region *reg;
2263
2264 spin_lock(&o2hb_live_lock); 2424 spin_lock(&o2hb_live_lock);
2265 2425
2266 reg = o2hb_find_region(region_uuid); 2426 /* local heartbeat */
2427 if (!o2hb_global_heartbeat_active()) {
2428 o2hb_region_unpin(region_uuid);
2429 goto unlock;
2430 }
2267 2431
2268 spin_unlock(&o2hb_live_lock); 2432 /*
2433 * if global heartbeat active and there are no dependent users,
2434 * unpin all quorum regions
2435 */
2436 o2hb_dependent_users--;
2437 if (!o2hb_dependent_users)
2438 o2hb_region_unpin(NULL);
2269 2439
2270 if (reg) { 2440unlock:
2271 o2nm_undepend_item(&reg->hr_item); 2441 spin_unlock(&o2hb_live_lock);
2272 o2nm_undepend_this_node();
2273 }
2274} 2442}
2275 2443
2276int o2hb_register_callback(const char *region_uuid, 2444int o2hb_register_callback(const char *region_uuid,
@@ -2291,9 +2459,11 @@ int o2hb_register_callback(const char *region_uuid,
2291 } 2459 }
2292 2460
2293 if (region_uuid) { 2461 if (region_uuid) {
2294 ret = o2hb_region_get(region_uuid); 2462 ret = o2hb_region_inc_user(region_uuid);
2295 if (ret) 2463 if (ret) {
2464 mlog_errno(ret);
2296 goto out; 2465 goto out;
2466 }
2297 } 2467 }
2298 2468
2299 down_write(&o2hb_callback_sem); 2469 down_write(&o2hb_callback_sem);
@@ -2311,7 +2481,7 @@ int o2hb_register_callback(const char *region_uuid,
2311 up_write(&o2hb_callback_sem); 2481 up_write(&o2hb_callback_sem);
2312 ret = 0; 2482 ret = 0;
2313out: 2483out:
2314 mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n", 2484 mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n",
2315 ret, __builtin_return_address(0), hc); 2485 ret, __builtin_return_address(0), hc);
2316 return ret; 2486 return ret;
2317} 2487}
@@ -2322,7 +2492,7 @@ void o2hb_unregister_callback(const char *region_uuid,
2322{ 2492{
2323 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); 2493 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
2324 2494
2325 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", 2495 mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n",
2326 __builtin_return_address(0), hc); 2496 __builtin_return_address(0), hc);
2327 2497
2328 /* XXX Can this happen _with_ a region reference? */ 2498 /* XXX Can this happen _with_ a region reference? */
@@ -2330,7 +2500,7 @@ void o2hb_unregister_callback(const char *region_uuid,
2330 return; 2500 return;
2331 2501
2332 if (region_uuid) 2502 if (region_uuid)
2333 o2hb_region_put(region_uuid); 2503 o2hb_region_dec_user(region_uuid);
2334 2504
2335 down_write(&o2hb_callback_sem); 2505 down_write(&o2hb_callback_sem);
2336 2506
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index a3f150e52b02..3a5835904b3d 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -46,10 +46,15 @@
46#define O2NET_DEBUG_DIR "o2net" 46#define O2NET_DEBUG_DIR "o2net"
47#define SC_DEBUG_NAME "sock_containers" 47#define SC_DEBUG_NAME "sock_containers"
48#define NST_DEBUG_NAME "send_tracking" 48#define NST_DEBUG_NAME "send_tracking"
49#define STATS_DEBUG_NAME "stats"
50
51#define SHOW_SOCK_CONTAINERS 0
52#define SHOW_SOCK_STATS 1
49 53
50static struct dentry *o2net_dentry; 54static struct dentry *o2net_dentry;
51static struct dentry *sc_dentry; 55static struct dentry *sc_dentry;
52static struct dentry *nst_dentry; 56static struct dentry *nst_dentry;
57static struct dentry *stats_dentry;
53 58
54static DEFINE_SPINLOCK(o2net_debug_lock); 59static DEFINE_SPINLOCK(o2net_debug_lock);
55 60
@@ -123,37 +128,42 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
123static int nst_seq_show(struct seq_file *seq, void *v) 128static int nst_seq_show(struct seq_file *seq, void *v)
124{ 129{
125 struct o2net_send_tracking *nst, *dummy_nst = seq->private; 130 struct o2net_send_tracking *nst, *dummy_nst = seq->private;
131 ktime_t now;
132 s64 sock, send, status;
126 133
127 spin_lock(&o2net_debug_lock); 134 spin_lock(&o2net_debug_lock);
128 nst = next_nst(dummy_nst); 135 nst = next_nst(dummy_nst);
136 if (!nst)
137 goto out;
129 138
130 if (nst != NULL) { 139 now = ktime_get();
131 /* get_task_comm isn't exported. oh well. */ 140 sock = ktime_to_us(ktime_sub(now, nst->st_sock_time));
132 seq_printf(seq, "%p:\n" 141 send = ktime_to_us(ktime_sub(now, nst->st_send_time));
133 " pid: %lu\n" 142 status = ktime_to_us(ktime_sub(now, nst->st_status_time));
134 " tgid: %lu\n" 143
135 " process name: %s\n" 144 /* get_task_comm isn't exported. oh well. */
136 " node: %u\n" 145 seq_printf(seq, "%p:\n"
137 " sc: %p\n" 146 " pid: %lu\n"
138 " message id: %d\n" 147 " tgid: %lu\n"
139 " message type: %u\n" 148 " process name: %s\n"
140 " message key: 0x%08x\n" 149 " node: %u\n"
141 " sock acquiry: %lu.%ld\n" 150 " sc: %p\n"
142 " send start: %lu.%ld\n" 151 " message id: %d\n"
143 " wait start: %lu.%ld\n", 152 " message type: %u\n"
144 nst, (unsigned long)nst->st_task->pid, 153 " message key: 0x%08x\n"
145 (unsigned long)nst->st_task->tgid, 154 " sock acquiry: %lld usecs ago\n"
146 nst->st_task->comm, nst->st_node, 155 " send start: %lld usecs ago\n"
147 nst->st_sc, nst->st_id, nst->st_msg_type, 156 " wait start: %lld usecs ago\n",
148 nst->st_msg_key, 157 nst, (unsigned long)task_pid_nr(nst->st_task),
149 nst->st_sock_time.tv_sec, 158 (unsigned long)nst->st_task->tgid,
150 (long)nst->st_sock_time.tv_usec, 159 nst->st_task->comm, nst->st_node,
151 nst->st_send_time.tv_sec, 160 nst->st_sc, nst->st_id, nst->st_msg_type,
152 (long)nst->st_send_time.tv_usec, 161 nst->st_msg_key,
153 nst->st_status_time.tv_sec, 162 (long long)sock,
154 (long)nst->st_status_time.tv_usec); 163 (long long)send,
155 } 164 (long long)status);
156 165
166out:
157 spin_unlock(&o2net_debug_lock); 167 spin_unlock(&o2net_debug_lock);
158 168
159 return 0; 169 return 0;
@@ -228,6 +238,11 @@ void o2net_debug_del_sc(struct o2net_sock_container *sc)
228 spin_unlock(&o2net_debug_lock); 238 spin_unlock(&o2net_debug_lock);
229} 239}
230 240
241struct o2net_sock_debug {
242 int dbg_ctxt;
243 struct o2net_sock_container *dbg_sock;
244};
245
231static struct o2net_sock_container 246static struct o2net_sock_container
232 *next_sc(struct o2net_sock_container *sc_start) 247 *next_sc(struct o2net_sock_container *sc_start)
233{ 248{
@@ -253,7 +268,8 @@ static struct o2net_sock_container
253 268
254static void *sc_seq_start(struct seq_file *seq, loff_t *pos) 269static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
255{ 270{
256 struct o2net_sock_container *sc, *dummy_sc = seq->private; 271 struct o2net_sock_debug *sd = seq->private;
272 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
257 273
258 spin_lock(&o2net_debug_lock); 274 spin_lock(&o2net_debug_lock);
259 sc = next_sc(dummy_sc); 275 sc = next_sc(dummy_sc);
@@ -264,7 +280,8 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
264 280
265static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) 281static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266{ 282{
267 struct o2net_sock_container *sc, *dummy_sc = seq->private; 283 struct o2net_sock_debug *sd = seq->private;
284 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
268 285
269 spin_lock(&o2net_debug_lock); 286 spin_lock(&o2net_debug_lock);
270 sc = next_sc(dummy_sc); 287 sc = next_sc(dummy_sc);
@@ -276,65 +293,107 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
276 return sc; /* unused, just needs to be null when done */ 293 return sc; /* unused, just needs to be null when done */
277} 294}
278 295
279#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec 296#ifdef CONFIG_OCFS2_FS_STATS
297# define sc_send_count(_s) ((_s)->sc_send_count)
298# define sc_recv_count(_s) ((_s)->sc_recv_count)
299# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total))
300# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total))
301# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total))
302# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total))
303#else
304# define sc_send_count(_s) (0U)
305# define sc_recv_count(_s) (0U)
306# define sc_tv_acquiry_total_ns(_s) (0LL)
307# define sc_tv_send_total_ns(_s) (0LL)
308# define sc_tv_status_total_ns(_s) (0LL)
309# define sc_tv_process_total_ns(_s) (0LL)
310#endif
311
312/* So that debugfs.ocfs2 can determine which format is being used */
313#define O2NET_STATS_STR_VERSION 1
314static void sc_show_sock_stats(struct seq_file *seq,
315 struct o2net_sock_container *sc)
316{
317 if (!sc)
318 return;
319
320 seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION,
321 sc->sc_node->nd_num, (unsigned long)sc_send_count(sc),
322 (long long)sc_tv_acquiry_total_ns(sc),
323 (long long)sc_tv_send_total_ns(sc),
324 (long long)sc_tv_status_total_ns(sc),
325 (unsigned long)sc_recv_count(sc),
326 (long long)sc_tv_process_total_ns(sc));
327}
328
329static void sc_show_sock_container(struct seq_file *seq,
330 struct o2net_sock_container *sc)
331{
332 struct inet_sock *inet = NULL;
333 __be32 saddr = 0, daddr = 0;
334 __be16 sport = 0, dport = 0;
335
336 if (!sc)
337 return;
338
339 if (sc->sc_sock) {
340 inet = inet_sk(sc->sc_sock->sk);
341 /* the stack's structs aren't sparse endian clean */
342 saddr = (__force __be32)inet->inet_saddr;
343 daddr = (__force __be32)inet->inet_daddr;
344 sport = (__force __be16)inet->inet_sport;
345 dport = (__force __be16)inet->inet_dport;
346 }
347
348 /* XXX sigh, inet-> doesn't have sparse annotation so any
349 * use of it here generates a warning with -Wbitwise */
350 seq_printf(seq, "%p:\n"
351 " krefs: %d\n"
352 " sock: %pI4:%u -> "
353 "%pI4:%u\n"
354 " remote node: %s\n"
355 " page off: %zu\n"
356 " handshake ok: %u\n"
357 " timer: %lld usecs\n"
358 " data ready: %lld usecs\n"
359 " advance start: %lld usecs\n"
360 " advance stop: %lld usecs\n"
361 " func start: %lld usecs\n"
362 " func stop: %lld usecs\n"
363 " func key: 0x%08x\n"
364 " func type: %u\n",
365 sc,
366 atomic_read(&sc->sc_kref.refcount),
367 &saddr, inet ? ntohs(sport) : 0,
368 &daddr, inet ? ntohs(dport) : 0,
369 sc->sc_node->nd_name,
370 sc->sc_page_off,
371 sc->sc_handshake_ok,
372 (long long)ktime_to_us(sc->sc_tv_timer),
373 (long long)ktime_to_us(sc->sc_tv_data_ready),
374 (long long)ktime_to_us(sc->sc_tv_advance_start),
375 (long long)ktime_to_us(sc->sc_tv_advance_stop),
376 (long long)ktime_to_us(sc->sc_tv_func_start),
377 (long long)ktime_to_us(sc->sc_tv_func_stop),
378 sc->sc_msg_key,
379 sc->sc_msg_type);
380}
280 381
281static int sc_seq_show(struct seq_file *seq, void *v) 382static int sc_seq_show(struct seq_file *seq, void *v)
282{ 383{
283 struct o2net_sock_container *sc, *dummy_sc = seq->private; 384 struct o2net_sock_debug *sd = seq->private;
385 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
284 386
285 spin_lock(&o2net_debug_lock); 387 spin_lock(&o2net_debug_lock);
286 sc = next_sc(dummy_sc); 388 sc = next_sc(dummy_sc);
287 389
288 if (sc != NULL) { 390 if (sc) {
289 struct inet_sock *inet = NULL; 391 if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
290 392 sc_show_sock_container(seq, sc);
291 __be32 saddr = 0, daddr = 0; 393 else
292 __be16 sport = 0, dport = 0; 394 sc_show_sock_stats(seq, sc);
293
294 if (sc->sc_sock) {
295 inet = inet_sk(sc->sc_sock->sk);
296 /* the stack's structs aren't sparse endian clean */
297 saddr = (__force __be32)inet->inet_saddr;
298 daddr = (__force __be32)inet->inet_daddr;
299 sport = (__force __be16)inet->inet_sport;
300 dport = (__force __be16)inet->inet_dport;
301 }
302
303 /* XXX sigh, inet-> doesn't have sparse annotation so any
304 * use of it here generates a warning with -Wbitwise */
305 seq_printf(seq, "%p:\n"
306 " krefs: %d\n"
307 " sock: %pI4:%u -> "
308 "%pI4:%u\n"
309 " remote node: %s\n"
310 " page off: %zu\n"
311 " handshake ok: %u\n"
312 " timer: %lu.%ld\n"
313 " data ready: %lu.%ld\n"
314 " advance start: %lu.%ld\n"
315 " advance stop: %lu.%ld\n"
316 " func start: %lu.%ld\n"
317 " func stop: %lu.%ld\n"
318 " func key: %u\n"
319 " func type: %u\n",
320 sc,
321 atomic_read(&sc->sc_kref.refcount),
322 &saddr, inet ? ntohs(sport) : 0,
323 &daddr, inet ? ntohs(dport) : 0,
324 sc->sc_node->nd_name,
325 sc->sc_page_off,
326 sc->sc_handshake_ok,
327 TV_SEC_USEC(sc->sc_tv_timer),
328 TV_SEC_USEC(sc->sc_tv_data_ready),
329 TV_SEC_USEC(sc->sc_tv_advance_start),
330 TV_SEC_USEC(sc->sc_tv_advance_stop),
331 TV_SEC_USEC(sc->sc_tv_func_start),
332 TV_SEC_USEC(sc->sc_tv_func_stop),
333 sc->sc_msg_key,
334 sc->sc_msg_type);
335 } 395 }
336 396
337
338 spin_unlock(&o2net_debug_lock); 397 spin_unlock(&o2net_debug_lock);
339 398
340 return 0; 399 return 0;
@@ -351,7 +410,7 @@ static const struct seq_operations sc_seq_ops = {
351 .show = sc_seq_show, 410 .show = sc_seq_show,
352}; 411};
353 412
354static int sc_fop_open(struct inode *inode, struct file *file) 413static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
355{ 414{
356 struct o2net_sock_container *dummy_sc; 415 struct o2net_sock_container *dummy_sc;
357 struct seq_file *seq; 416 struct seq_file *seq;
@@ -369,7 +428,8 @@ static int sc_fop_open(struct inode *inode, struct file *file)
369 goto out; 428 goto out;
370 429
371 seq = file->private_data; 430 seq = file->private_data;
372 seq->private = dummy_sc; 431 seq->private = sd;
432 sd->dbg_sock = dummy_sc;
373 o2net_debug_add_sc(dummy_sc); 433 o2net_debug_add_sc(dummy_sc);
374 434
375 dummy_sc = NULL; 435 dummy_sc = NULL;
@@ -382,12 +442,48 @@ out:
382static int sc_fop_release(struct inode *inode, struct file *file) 442static int sc_fop_release(struct inode *inode, struct file *file)
383{ 443{
384 struct seq_file *seq = file->private_data; 444 struct seq_file *seq = file->private_data;
385 struct o2net_sock_container *dummy_sc = seq->private; 445 struct o2net_sock_debug *sd = seq->private;
446 struct o2net_sock_container *dummy_sc = sd->dbg_sock;
386 447
387 o2net_debug_del_sc(dummy_sc); 448 o2net_debug_del_sc(dummy_sc);
388 return seq_release_private(inode, file); 449 return seq_release_private(inode, file);
389} 450}
390 451
452static int stats_fop_open(struct inode *inode, struct file *file)
453{
454 struct o2net_sock_debug *sd;
455
456 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
457 if (sd == NULL)
458 return -ENOMEM;
459
460 sd->dbg_ctxt = SHOW_SOCK_STATS;
461 sd->dbg_sock = NULL;
462
463 return sc_common_open(file, sd);
464}
465
466static const struct file_operations stats_seq_fops = {
467 .open = stats_fop_open,
468 .read = seq_read,
469 .llseek = seq_lseek,
470 .release = sc_fop_release,
471};
472
473static int sc_fop_open(struct inode *inode, struct file *file)
474{
475 struct o2net_sock_debug *sd;
476
477 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
478 if (sd == NULL)
479 return -ENOMEM;
480
481 sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
482 sd->dbg_sock = NULL;
483
484 return sc_common_open(file, sd);
485}
486
391static const struct file_operations sc_seq_fops = { 487static const struct file_operations sc_seq_fops = {
392 .open = sc_fop_open, 488 .open = sc_fop_open,
393 .read = seq_read, 489 .read = seq_read,
@@ -419,25 +515,29 @@ int o2net_debugfs_init(void)
419 goto bail; 515 goto bail;
420 } 516 }
421 517
518 stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR,
519 o2net_dentry, NULL,
520 &stats_seq_fops);
521 if (!stats_dentry) {
522 mlog_errno(-ENOMEM);
523 goto bail;
524 }
525
422 return 0; 526 return 0;
423bail: 527bail:
424 if (sc_dentry) 528 debugfs_remove(stats_dentry);
425 debugfs_remove(sc_dentry); 529 debugfs_remove(sc_dentry);
426 if (nst_dentry) 530 debugfs_remove(nst_dentry);
427 debugfs_remove(nst_dentry); 531 debugfs_remove(o2net_dentry);
428 if (o2net_dentry)
429 debugfs_remove(o2net_dentry);
430 return -ENOMEM; 532 return -ENOMEM;
431} 533}
432 534
433void o2net_debugfs_exit(void) 535void o2net_debugfs_exit(void)
434{ 536{
435 if (sc_dentry) 537 debugfs_remove(stats_dentry);
436 debugfs_remove(sc_dentry); 538 debugfs_remove(sc_dentry);
437 if (nst_dentry) 539 debugfs_remove(nst_dentry);
438 debugfs_remove(nst_dentry); 540 debugfs_remove(o2net_dentry);
439 if (o2net_dentry)
440 debugfs_remove(o2net_dentry);
441} 541}
442 542
443#endif /* CONFIG_DEBUG_FS */ 543#endif /* CONFIG_DEBUG_FS */
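
A note on the netdebug.c changes above: hanging a small o2net_sock_debug context off seq->private lets one set of seq_operations back both the existing sock_containers file and the new stats file, with sc_seq_show() dispatching on sd->dbg_ctxt. The dropped NULL checks around debugfs_remove() are safe because debugfs_remove() is a no-op on a NULL dentry. A minimal sketch of the same context-through-seq->private idea, with hypothetical names (my_ctx, my_show, my_open_common are illustrative, not part of the patch):

#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>

struct my_ctx {
        int mode;                       /* which view this file renders */
};

static int my_show(struct seq_file *seq, void *v)
{
        struct my_ctx *ctx = seq->private;

        /* dispatch on the per-file context, as sc_seq_show() does */
        seq_printf(seq, "mode %d\n", ctx->mode);
        return 0;
}

static int my_open_common(struct file *file, int mode)
{
        struct my_ctx *ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
        int ret;

        if (!ctx)
                return -ENOMEM;
        ctx->mode = mode;
        ret = single_open(file, my_show, ctx);  /* stores ctx in seq->private */
        if (ret)
                kfree(ctx);
        return ret;
}
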
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 9aa426e42123..3b11cb1e38fc 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -153,63 +153,114 @@ static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
153 nst->st_node = node; 153 nst->st_node = node;
154} 154}
155 155
156static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 156static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
157{ 157{
158 do_gettimeofday(&nst->st_sock_time); 158 nst->st_sock_time = ktime_get();
159} 159}
160 160
161static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 161static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
162{ 162{
163 do_gettimeofday(&nst->st_send_time); 163 nst->st_send_time = ktime_get();
164} 164}
165 165
166static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 166static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
167{ 167{
168 do_gettimeofday(&nst->st_status_time); 168 nst->st_status_time = ktime_get();
169} 169}
170 170
171static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 171static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
172 struct o2net_sock_container *sc) 172 struct o2net_sock_container *sc)
173{ 173{
174 nst->st_sc = sc; 174 nst->st_sc = sc;
175} 175}
176 176
177static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) 177static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
178 u32 msg_id)
178{ 179{
179 nst->st_id = msg_id; 180 nst->st_id = msg_id;
180} 181}
181 182
182#else /* CONFIG_DEBUG_FS */ 183static inline void o2net_set_sock_timer(struct o2net_sock_container *sc)
183
184static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
185 u32 msgkey, struct task_struct *task, u8 node)
186{ 184{
185 sc->sc_tv_timer = ktime_get();
187} 186}
188 187
189static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 188static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc)
190{ 189{
190 sc->sc_tv_data_ready = ktime_get();
191} 191}
192 192
193static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 193static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc)
194{ 194{
195 sc->sc_tv_advance_start = ktime_get();
195} 196}
196 197
197static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 198static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc)
198{ 199{
200 sc->sc_tv_advance_stop = ktime_get();
199} 201}
200 202
201static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 203static inline void o2net_set_func_start_time(struct o2net_sock_container *sc)
202 struct o2net_sock_container *sc)
203{ 204{
205 sc->sc_tv_func_start = ktime_get();
204} 206}
205 207
206static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, 208static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc)
207 u32 msg_id)
208{ 209{
210 sc->sc_tv_func_stop = ktime_get();
209} 211}
210 212
213static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
214{
215 return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
216}
217#else /* CONFIG_DEBUG_FS */
218# define o2net_init_nst(a, b, c, d, e)
219# define o2net_set_nst_sock_time(a)
220# define o2net_set_nst_send_time(a)
221# define o2net_set_nst_status_time(a)
222# define o2net_set_nst_sock_container(a, b)
223# define o2net_set_nst_msg_id(a, b)
224# define o2net_set_sock_timer(a)
225# define o2net_set_data_ready_time(a)
226# define o2net_set_advance_start_time(a)
227# define o2net_set_advance_stop_time(a)
228# define o2net_set_func_start_time(a)
229# define o2net_set_func_stop_time(a)
230# define o2net_get_func_run_time(a) (ktime_t)0
211#endif /* CONFIG_DEBUG_FS */ 231#endif /* CONFIG_DEBUG_FS */
212 232
233#ifdef CONFIG_OCFS2_FS_STATS
234static void o2net_update_send_stats(struct o2net_send_tracking *nst,
235 struct o2net_sock_container *sc)
236{
237 sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total,
238 ktime_sub(ktime_get(),
239 nst->st_status_time));
240 sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total,
241 ktime_sub(nst->st_status_time,
242 nst->st_send_time));
243 sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total,
244 ktime_sub(nst->st_send_time,
245 nst->st_sock_time));
246 sc->sc_send_count++;
247}
248
249static void o2net_update_recv_stats(struct o2net_sock_container *sc)
250{
251 sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total,
252 o2net_get_func_run_time(sc));
253 sc->sc_recv_count++;
254}
255
256#else
257
258# define o2net_update_send_stats(a, b)
259
260# define o2net_update_recv_stats(sc)
261
262#endif /* CONFIG_OCFS2_FS_STATS */
263
213static inline int o2net_reconnect_delay(void) 264static inline int o2net_reconnect_delay(void)
214{ 265{
215 return o2nm_single_cluster->cl_reconnect_delay_ms; 266 return o2nm_single_cluster->cl_reconnect_delay_ms;
@@ -355,6 +406,7 @@ static void sc_kref_release(struct kref *kref)
355 sc->sc_sock = NULL; 406 sc->sc_sock = NULL;
356 } 407 }
357 408
409 o2nm_undepend_item(&sc->sc_node->nd_item);
358 o2nm_node_put(sc->sc_node); 410 o2nm_node_put(sc->sc_node);
359 sc->sc_node = NULL; 411 sc->sc_node = NULL;
360 412
@@ -376,6 +428,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
376{ 428{
377 struct o2net_sock_container *sc, *ret = NULL; 429 struct o2net_sock_container *sc, *ret = NULL;
378 struct page *page = NULL; 430 struct page *page = NULL;
431 int status = 0;
379 432
380 page = alloc_page(GFP_NOFS); 433 page = alloc_page(GFP_NOFS);
381 sc = kzalloc(sizeof(*sc), GFP_NOFS); 434 sc = kzalloc(sizeof(*sc), GFP_NOFS);
@@ -386,6 +439,13 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
386 o2nm_node_get(node); 439 o2nm_node_get(node);
387 sc->sc_node = node; 440 sc->sc_node = node;
388 441
442 /* pin the node item of the remote node */
443 status = o2nm_depend_item(&node->nd_item);
444 if (status) {
445 mlog_errno(status);
446 o2nm_node_put(node);
447 goto out;
448 }
389 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed); 449 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed);
390 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty); 450 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty);
391 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); 451 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
@@ -546,7 +606,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
546 if (sk->sk_user_data) { 606 if (sk->sk_user_data) {
547 struct o2net_sock_container *sc = sk->sk_user_data; 607 struct o2net_sock_container *sc = sk->sk_user_data;
548 sclog(sc, "data_ready hit\n"); 608 sclog(sc, "data_ready hit\n");
549 do_gettimeofday(&sc->sc_tv_data_ready); 609 o2net_set_data_ready_time(sc);
550 o2net_sc_queue_work(sc, &sc->sc_rx_work); 610 o2net_sc_queue_work(sc, &sc->sc_rx_work);
551 ready = sc->sc_data_ready; 611 ready = sc->sc_data_ready;
552 } else { 612 } else {
@@ -1070,6 +1130,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
1070 o2net_set_nst_status_time(&nst); 1130 o2net_set_nst_status_time(&nst);
1071 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); 1131 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
1072 1132
1133 o2net_update_send_stats(&nst, sc);
1134
1073 /* Note that we avoid overwriting the caller's status return 1135 /* Note that we avoid overwriting the caller's status return
1074 * variable if a system error was reported on the other 1136 * variable if a system error was reported on the other
1075 * side. Callers beware. */ 1137 * side. Callers beware. */
@@ -1183,13 +1245,15 @@ static int o2net_process_message(struct o2net_sock_container *sc,
1183 if (syserr != O2NET_ERR_NONE) 1245 if (syserr != O2NET_ERR_NONE)
1184 goto out_respond; 1246 goto out_respond;
1185 1247
1186 do_gettimeofday(&sc->sc_tv_func_start); 1248 o2net_set_func_start_time(sc);
1187 sc->sc_msg_key = be32_to_cpu(hdr->key); 1249 sc->sc_msg_key = be32_to_cpu(hdr->key);
1188 sc->sc_msg_type = be16_to_cpu(hdr->msg_type); 1250 sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
1189 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + 1251 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
1190 be16_to_cpu(hdr->data_len), 1252 be16_to_cpu(hdr->data_len),
1191 nmh->nh_func_data, &ret_data); 1253 nmh->nh_func_data, &ret_data);
1192 do_gettimeofday(&sc->sc_tv_func_stop); 1254 o2net_set_func_stop_time(sc);
1255
1256 o2net_update_recv_stats(sc);
1193 1257
1194out_respond: 1258out_respond:
1195 /* this destroys the hdr, so don't use it after this */ 1259 /* this destroys the hdr, so don't use it after this */
@@ -1300,7 +1364,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
1300 size_t datalen; 1364 size_t datalen;
1301 1365
1302 sclog(sc, "receiving\n"); 1366 sclog(sc, "receiving\n");
1303 do_gettimeofday(&sc->sc_tv_advance_start); 1367 o2net_set_advance_start_time(sc);
1304 1368
1305 if (unlikely(sc->sc_handshake_ok == 0)) { 1369 if (unlikely(sc->sc_handshake_ok == 0)) {
1306 if(sc->sc_page_off < sizeof(struct o2net_handshake)) { 1370 if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
@@ -1375,7 +1439,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
1375 1439
1376out: 1440out:
1377 sclog(sc, "ret = %d\n", ret); 1441 sclog(sc, "ret = %d\n", ret);
1378 do_gettimeofday(&sc->sc_tv_advance_stop); 1442 o2net_set_advance_stop_time(sc);
1379 return ret; 1443 return ret;
1380} 1444}
1381 1445
@@ -1475,27 +1539,28 @@ static void o2net_idle_timer(unsigned long data)
1475{ 1539{
1476 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1540 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1477 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1541 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1478 struct timeval now;
1479 1542
1480 do_gettimeofday(&now); 1543#ifdef CONFIG_DEBUG_FS
1544 ktime_t now = ktime_get();
1545#endif
1481 1546
1482 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1547 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
1483 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1548 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
1484 o2net_idle_timeout() / 1000, 1549 o2net_idle_timeout() / 1000,
1485 o2net_idle_timeout() % 1000); 1550 o2net_idle_timeout() % 1000);
1486 mlog(ML_NOTICE, "here are some times that might help debug the " 1551
1487 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " 1552#ifdef CONFIG_DEBUG_FS
1488 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", 1553 mlog(ML_NOTICE, "Here are some times that might help debug the "
1489 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, 1554 "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
1490 now.tv_sec, (long) now.tv_usec, 1555 "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
1491 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, 1556 (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
1492 sc->sc_tv_advance_start.tv_sec, 1557 (long long)ktime_to_us(sc->sc_tv_data_ready),
1493 (long) sc->sc_tv_advance_start.tv_usec, 1558 (long long)ktime_to_us(sc->sc_tv_advance_start),
1494 sc->sc_tv_advance_stop.tv_sec, 1559 (long long)ktime_to_us(sc->sc_tv_advance_stop),
1495 (long) sc->sc_tv_advance_stop.tv_usec,
1496 sc->sc_msg_key, sc->sc_msg_type, 1560 sc->sc_msg_key, sc->sc_msg_type,
1497 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, 1561 (long long)ktime_to_us(sc->sc_tv_func_start),
1498 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); 1562 (long long)ktime_to_us(sc->sc_tv_func_stop));
1563#endif
1499 1564
1500 /* 1565 /*
1501 * Initialize the nn_timeout so that the next connection attempt 1566 * Initialize the nn_timeout so that the next connection attempt
@@ -1511,7 +1576,7 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
1511 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); 1576 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
1512 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, 1577 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
1513 msecs_to_jiffies(o2net_keepalive_delay())); 1578 msecs_to_jiffies(o2net_keepalive_delay()));
1514 do_gettimeofday(&sc->sc_tv_timer); 1579 o2net_set_sock_timer(sc);
1515 mod_timer(&sc->sc_idle_timeout, 1580 mod_timer(&sc->sc_idle_timeout,
1516 jiffies + msecs_to_jiffies(o2net_idle_timeout())); 1581 jiffies + msecs_to_jiffies(o2net_idle_timeout()));
1517} 1582}
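
The tcp.c conversion above swaps do_gettimeofday() and struct timeval for ktime_get(), which reads the monotonic clock, so the recorded intervals can no longer be skewed by wall-clock adjustments; the CONFIG_OCFS2_FS_STATS accumulators then reduce to plain ktime_add()/ktime_sub() arithmetic. A sketch of the measurement idiom (names here are illustrative, not from the patch):

#include <linux/ktime.h>

static ktime_t op_start, op_total;

static void op_begin(void)
{
        op_start = ktime_get();         /* monotonic, not wall clock */
}

static void op_end(void)
{
        /* total += now - start, as o2net_update_send_stats() does */
        op_total = ktime_add(op_total, ktime_sub(ktime_get(), op_start));
}

static s64 op_total_us(void)
{
        return ktime_to_us(op_total);   /* report in microseconds */
}
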
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 15fdbdf9eb4b..4cbcb65784a3 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -166,18 +166,27 @@ struct o2net_sock_container {
166 /* original handlers for the sockets */ 166 /* original handlers for the sockets */
167 void (*sc_state_change)(struct sock *sk); 167 void (*sc_state_change)(struct sock *sk);
168 void (*sc_data_ready)(struct sock *sk, int bytes); 168 void (*sc_data_ready)(struct sock *sk, int bytes);
169#ifdef CONFIG_DEBUG_FS 169
170 struct list_head sc_net_debug_item;
171#endif
172 struct timeval sc_tv_timer;
173 struct timeval sc_tv_data_ready;
174 struct timeval sc_tv_advance_start;
175 struct timeval sc_tv_advance_stop;
176 struct timeval sc_tv_func_start;
177 struct timeval sc_tv_func_stop;
178 u32 sc_msg_key; 170 u32 sc_msg_key;
179 u16 sc_msg_type; 171 u16 sc_msg_type;
180 172
173#ifdef CONFIG_DEBUG_FS
174 struct list_head sc_net_debug_item;
175 ktime_t sc_tv_timer;
176 ktime_t sc_tv_data_ready;
177 ktime_t sc_tv_advance_start;
178 ktime_t sc_tv_advance_stop;
179 ktime_t sc_tv_func_start;
180 ktime_t sc_tv_func_stop;
181#endif
182#ifdef CONFIG_OCFS2_FS_STATS
183 ktime_t sc_tv_acquiry_total;
184 ktime_t sc_tv_send_total;
185 ktime_t sc_tv_status_total;
186 u32 sc_send_count;
187 u32 sc_recv_count;
188 ktime_t sc_tv_process_total;
189#endif
181 struct mutex sc_send_lock; 190 struct mutex sc_send_lock;
182}; 191};
183 192
@@ -220,9 +229,9 @@ struct o2net_send_tracking {
220 u32 st_msg_type; 229 u32 st_msg_type;
221 u32 st_msg_key; 230 u32 st_msg_key;
222 u8 st_node; 231 u8 st_node;
223 struct timeval st_sock_time; 232 ktime_t st_sock_time;
224 struct timeval st_send_time; 233 ktime_t st_send_time;
225 struct timeval st_status_time; 234 ktime_t st_status_time;
226}; 235};
227#else 236#else
228struct o2net_send_tracking { 237struct o2net_send_tracking {
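
In tcp_internal.h the per-event timestamps now exist only under CONFIG_DEBUG_FS (debugfs is their only reader) and the running totals only under CONFIG_OCFS2_FS_STATS, so non-debug builds carry a smaller o2net_sock_container. The other half of the idiom is the sc_*_ns() accessor block added to netdebug.c above; a compressed sketch of the pattern, with illustrative names:

#include <linux/ktime.h>
#include <linux/types.h>

struct conn {
#ifdef CONFIG_OCFS2_FS_STATS
        ktime_t send_total;     /* fields exist only when someone reads them */
        u32 send_count;
#endif
};

#ifdef CONFIG_OCFS2_FS_STATS
# define conn_send_count(c)     ((c)->send_count)
#else
# define conn_send_count(c)     (0U)    /* stats compiled out: constant 0 */
#endif
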
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index f44999156839..3a3ed4bb794b 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -90,19 +90,29 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
90 90
91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
92{ 92{
93 mlog_entry_void(); 93 struct dlm_lock_resource *res;
94 94
95 BUG_ON(!dlm); 95 BUG_ON(!dlm);
96 BUG_ON(!lock); 96 BUG_ON(!lock);
97 97
98 res = lock->lockres;
99
98 assert_spin_locked(&dlm->ast_lock); 100 assert_spin_locked(&dlm->ast_lock);
101
99 if (!list_empty(&lock->ast_list)) { 102 if (!list_empty(&lock->ast_list)) {
100 mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n", 103 mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, "
104 "AST list not empty, pending %d, newlevel %d\n",
105 dlm->name, res->lockname.len, res->lockname.name,
106 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
107 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
101 lock->ast_pending, lock->ml.type); 108 lock->ast_pending, lock->ml.type);
102 BUG(); 109 BUG();
103 } 110 }
104 if (lock->ast_pending) 111 if (lock->ast_pending)
105 mlog(0, "lock has an ast getting flushed right now\n"); 112 mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n",
113 dlm->name, res->lockname.len, res->lockname.name,
114 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
115 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
106 116
107 /* putting lock on list, add a ref */ 117 /* putting lock on list, add a ref */
108 dlm_lock_get(lock); 118 dlm_lock_get(lock);
@@ -110,9 +120,10 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
110 120
111 /* check to see if this ast obsoletes the bast */ 121 /* check to see if this ast obsoletes the bast */
112 if (dlm_should_cancel_bast(dlm, lock)) { 122 if (dlm_should_cancel_bast(dlm, lock)) {
113 struct dlm_lock_resource *res = lock->lockres; 123 mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n",
114 mlog(0, "%s: cancelling bast for %.*s\n", 124 dlm->name, res->lockname.len, res->lockname.name,
115 dlm->name, res->lockname.len, res->lockname.name); 125 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
126 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
116 lock->bast_pending = 0; 127 lock->bast_pending = 0;
117 list_del_init(&lock->bast_list); 128 list_del_init(&lock->bast_list);
118 lock->ml.highest_blocked = LKM_IVMODE; 129 lock->ml.highest_blocked = LKM_IVMODE;
@@ -134,8 +145,6 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
134 145
135void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 146void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
136{ 147{
137 mlog_entry_void();
138
139 BUG_ON(!dlm); 148 BUG_ON(!dlm);
140 BUG_ON(!lock); 149 BUG_ON(!lock);
141 150
@@ -147,15 +156,21 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
147 156
148void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 157void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
149{ 158{
150 mlog_entry_void(); 159 struct dlm_lock_resource *res;
151 160
152 BUG_ON(!dlm); 161 BUG_ON(!dlm);
153 BUG_ON(!lock); 162 BUG_ON(!lock);
163
154 assert_spin_locked(&dlm->ast_lock); 164 assert_spin_locked(&dlm->ast_lock);
155 165
166 res = lock->lockres;
167
156 BUG_ON(!list_empty(&lock->bast_list)); 168 BUG_ON(!list_empty(&lock->bast_list));
157 if (lock->bast_pending) 169 if (lock->bast_pending)
158 mlog(0, "lock has a bast getting flushed right now\n"); 170 mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n",
171 dlm->name, res->lockname.len, res->lockname.name,
172 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
173 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
159 174
160 /* putting lock on list, add a ref */ 175 /* putting lock on list, add a ref */
161 dlm_lock_get(lock); 176 dlm_lock_get(lock);
@@ -167,8 +182,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
167 182
168void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 183void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
169{ 184{
170 mlog_entry_void();
171
172 BUG_ON(!dlm); 185 BUG_ON(!dlm);
173 BUG_ON(!lock); 186 BUG_ON(!lock);
174 187
@@ -213,7 +226,10 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
213 dlm_astlockfunc_t *fn; 226 dlm_astlockfunc_t *fn;
214 struct dlm_lockstatus *lksb; 227 struct dlm_lockstatus *lksb;
215 228
216 mlog_entry_void(); 229 mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
230 res->lockname.len, res->lockname.name,
231 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
232 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
217 233
218 lksb = lock->lksb; 234 lksb = lock->lksb;
219 fn = lock->ast; 235 fn = lock->ast;
@@ -231,7 +247,10 @@ int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
231 struct dlm_lockstatus *lksb; 247 struct dlm_lockstatus *lksb;
232 int lksbflags; 248 int lksbflags;
233 249
234 mlog_entry_void(); 250 mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name,
251 res->lockname.len, res->lockname.name,
252 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
253 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
235 254
236 lksb = lock->lksb; 255 lksb = lock->lksb;
237 BUG_ON(lock->ml.node == dlm->node_num); 256 BUG_ON(lock->ml.node == dlm->node_num);
@@ -250,9 +269,14 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
250{ 269{
251 dlm_bastlockfunc_t *fn = lock->bast; 270 dlm_bastlockfunc_t *fn = lock->bast;
252 271
253 mlog_entry_void();
254 BUG_ON(lock->ml.node != dlm->node_num); 272 BUG_ON(lock->ml.node != dlm->node_num);
255 273
274 mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n",
275 dlm->name, res->lockname.len, res->lockname.name,
276 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
277 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
278 blocked_type);
279
256 (*fn)(lock->astdata, blocked_type); 280 (*fn)(lock->astdata, blocked_type);
257} 281}
258 282
@@ -332,7 +356,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
332 /* cannot get a proxy ast message if this node owns it */ 356 /* cannot get a proxy ast message if this node owns it */
333 BUG_ON(res->owner == dlm->node_num); 357 BUG_ON(res->owner == dlm->node_num);
334 358
335 mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name); 359 mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
360 res->lockname.name);
336 361
337 spin_lock(&res->spinlock); 362 spin_lock(&res->spinlock);
338 if (res->state & DLM_LOCK_RES_RECOVERING) { 363 if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -382,8 +407,12 @@ do_ast:
382 if (past->type == DLM_AST) { 407 if (past->type == DLM_AST) {
383 /* do not alter lock refcount. switching lists. */ 408 /* do not alter lock refcount. switching lists. */
384 list_move_tail(&lock->list, &res->granted); 409 list_move_tail(&lock->list, &res->granted);
385 mlog(0, "ast: Adding to granted list... type=%d, " 410 mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n",
386 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); 411 dlm->name, res->lockname.len, res->lockname.name,
412 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
413 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
414 lock->ml.type, lock->ml.convert_type);
415
387 if (lock->ml.convert_type != LKM_IVMODE) { 416 if (lock->ml.convert_type != LKM_IVMODE) {
388 lock->ml.type = lock->ml.convert_type; 417 lock->ml.type = lock->ml.convert_type;
389 lock->ml.convert_type = LKM_IVMODE; 418 lock->ml.convert_type = LKM_IVMODE;
@@ -426,9 +455,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
426 size_t veclen = 1; 455 size_t veclen = 1;
427 int status; 456 int status;
428 457
429 mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n", 458 mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name,
430 res->lockname.len, res->lockname.name, lock->ml.node, 459 res->lockname.len, res->lockname.name, lock->ml.node, msg_type,
431 msg_type, blocked_type); 460 blocked_type);
432 461
433 memset(&past, 0, sizeof(struct dlm_proxy_ast)); 462 memset(&past, 0, sizeof(struct dlm_proxy_ast));
434 past.node_idx = dlm->node_num; 463 past.node_idx = dlm->node_num;
@@ -441,7 +470,6 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
441 vec[0].iov_len = sizeof(struct dlm_proxy_ast); 470 vec[0].iov_len = sizeof(struct dlm_proxy_ast);
442 vec[0].iov_base = &past; 471 vec[0].iov_base = &past;
443 if (flags & DLM_LKSB_GET_LVB) { 472 if (flags & DLM_LKSB_GET_LVB) {
444 mlog(0, "returning requested LVB data\n");
445 be32_add_cpu(&past.flags, LKM_GET_LVB); 473 be32_add_cpu(&past.flags, LKM_GET_LVB);
446 vec[1].iov_len = DLM_LVB_LEN; 474 vec[1].iov_len = DLM_LVB_LEN;
447 vec[1].iov_base = lock->lksb->lvb; 475 vec[1].iov_base = lock->lksb->lvb;
@@ -451,8 +479,8 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, 479 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
452 lock->ml.node, &status); 480 lock->ml.node, &status);
453 if (ret < 0) 481 if (ret < 0)
454 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 482 mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n",
455 "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key, 483 dlm->name, res->lockname.len, res->lockname.name, ret,
456 lock->ml.node); 484 lock->ml.node);
457 else { 485 else {
458 if (status == DLM_RECOVERING) { 486 if (status == DLM_RECOVERING) {
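
The reworked dlmast.c messages identify every lock as "res %.*s, lock %u:%llu", where the %u:%llu pair is unpacked from the 64-bit lock cookie by the dlm_get_lock_cookie_node/seq helpers. A simplified sketch of that decomposition, assuming the layout those helpers read (node number in the top byte, per-node sequence in the low 56 bits):

#include <linux/types.h>

static inline u8 cookie_node(u64 cookie)
{
        return (u8)(cookie >> 56);              /* owning node number */
}

static inline unsigned long long cookie_seq(u64 cookie)
{
        return cookie & 0x00ffffffffffffffULL;  /* per-node sequence */
}
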
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index b36d0bf77a5a..4bdf7baee344 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -50,10 +50,10 @@
50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) 50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
51 51
52enum dlm_mle_type { 52enum dlm_mle_type {
53 DLM_MLE_BLOCK, 53 DLM_MLE_BLOCK = 0,
54 DLM_MLE_MASTER, 54 DLM_MLE_MASTER = 1,
55 DLM_MLE_MIGRATION, 55 DLM_MLE_MIGRATION = 2,
56 DLM_MLE_NUM_TYPES 56 DLM_MLE_NUM_TYPES = 3,
57}; 57};
58 58
59struct dlm_master_list_entry { 59struct dlm_master_list_entry {
@@ -82,8 +82,8 @@ struct dlm_master_list_entry {
82 82
83enum dlm_ast_type { 83enum dlm_ast_type {
84 DLM_AST = 0, 84 DLM_AST = 0,
85 DLM_BAST, 85 DLM_BAST = 1,
86 DLM_ASTUNLOCK 86 DLM_ASTUNLOCK = 2,
87}; 87};
88 88
89 89
@@ -119,9 +119,9 @@ struct dlm_recovery_ctxt
119 119
120enum dlm_ctxt_state { 120enum dlm_ctxt_state {
121 DLM_CTXT_NEW = 0, 121 DLM_CTXT_NEW = 0,
122 DLM_CTXT_JOINED, 122 DLM_CTXT_JOINED = 1,
123 DLM_CTXT_IN_SHUTDOWN, 123 DLM_CTXT_IN_SHUTDOWN = 2,
124 DLM_CTXT_LEAVING, 124 DLM_CTXT_LEAVING = 3,
125}; 125};
126 126
127struct dlm_ctxt 127struct dlm_ctxt
@@ -388,8 +388,8 @@ struct dlm_lock
388 388
389enum dlm_lockres_list { 389enum dlm_lockres_list {
390 DLM_GRANTED_LIST = 0, 390 DLM_GRANTED_LIST = 0,
391 DLM_CONVERTING_LIST, 391 DLM_CONVERTING_LIST = 1,
392 DLM_BLOCKED_LIST 392 DLM_BLOCKED_LIST = 2,
393}; 393};
394 394
395static inline int dlm_lvb_is_empty(char *lvb) 395static inline int dlm_lvb_is_empty(char *lvb)
@@ -427,27 +427,27 @@ struct dlm_node_iter
427 427
428 428
429enum { 429enum {
430 DLM_MASTER_REQUEST_MSG = 500, 430 DLM_MASTER_REQUEST_MSG = 500,
431 DLM_UNUSED_MSG1, /* 501 */ 431 DLM_UNUSED_MSG1 = 501,
432 DLM_ASSERT_MASTER_MSG, /* 502 */ 432 DLM_ASSERT_MASTER_MSG = 502,
433 DLM_CREATE_LOCK_MSG, /* 503 */ 433 DLM_CREATE_LOCK_MSG = 503,
434 DLM_CONVERT_LOCK_MSG, /* 504 */ 434 DLM_CONVERT_LOCK_MSG = 504,
435 DLM_PROXY_AST_MSG, /* 505 */ 435 DLM_PROXY_AST_MSG = 505,
436 DLM_UNLOCK_LOCK_MSG, /* 506 */ 436 DLM_UNLOCK_LOCK_MSG = 506,
437 DLM_DEREF_LOCKRES_MSG, /* 507 */ 437 DLM_DEREF_LOCKRES_MSG = 507,
438 DLM_MIGRATE_REQUEST_MSG, /* 508 */ 438 DLM_MIGRATE_REQUEST_MSG = 508,
439 DLM_MIG_LOCKRES_MSG, /* 509 */ 439 DLM_MIG_LOCKRES_MSG = 509,
440 DLM_QUERY_JOIN_MSG, /* 510 */ 440 DLM_QUERY_JOIN_MSG = 510,
441 DLM_ASSERT_JOINED_MSG, /* 511 */ 441 DLM_ASSERT_JOINED_MSG = 511,
442 DLM_CANCEL_JOIN_MSG, /* 512 */ 442 DLM_CANCEL_JOIN_MSG = 512,
443 DLM_EXIT_DOMAIN_MSG, /* 513 */ 443 DLM_EXIT_DOMAIN_MSG = 513,
444 DLM_MASTER_REQUERY_MSG, /* 514 */ 444 DLM_MASTER_REQUERY_MSG = 514,
445 DLM_LOCK_REQUEST_MSG, /* 515 */ 445 DLM_LOCK_REQUEST_MSG = 515,
446 DLM_RECO_DATA_DONE_MSG, /* 516 */ 446 DLM_RECO_DATA_DONE_MSG = 516,
447 DLM_BEGIN_RECO_MSG, /* 517 */ 447 DLM_BEGIN_RECO_MSG = 517,
448 DLM_FINALIZE_RECO_MSG, /* 518 */ 448 DLM_FINALIZE_RECO_MSG = 518,
449 DLM_QUERY_REGION, /* 519 */ 449 DLM_QUERY_REGION = 519,
450 DLM_QUERY_NODEINFO, /* 520 */ 450 DLM_QUERY_NODEINFO = 520,
451}; 451};
452 452
453struct dlm_reco_node_data 453struct dlm_reco_node_data
@@ -460,19 +460,19 @@ struct dlm_reco_node_data
460enum { 460enum {
461 DLM_RECO_NODE_DATA_DEAD = -1, 461 DLM_RECO_NODE_DATA_DEAD = -1,
462 DLM_RECO_NODE_DATA_INIT = 0, 462 DLM_RECO_NODE_DATA_INIT = 0,
463 DLM_RECO_NODE_DATA_REQUESTING, 463 DLM_RECO_NODE_DATA_REQUESTING = 1,
464 DLM_RECO_NODE_DATA_REQUESTED, 464 DLM_RECO_NODE_DATA_REQUESTED = 2,
465 DLM_RECO_NODE_DATA_RECEIVING, 465 DLM_RECO_NODE_DATA_RECEIVING = 3,
466 DLM_RECO_NODE_DATA_DONE, 466 DLM_RECO_NODE_DATA_DONE = 4,
467 DLM_RECO_NODE_DATA_FINALIZE_SENT, 467 DLM_RECO_NODE_DATA_FINALIZE_SENT = 5,
468}; 468};
469 469
470 470
471enum { 471enum {
472 DLM_MASTER_RESP_NO = 0, 472 DLM_MASTER_RESP_NO = 0,
473 DLM_MASTER_RESP_YES, 473 DLM_MASTER_RESP_YES = 1,
474 DLM_MASTER_RESP_MAYBE, 474 DLM_MASTER_RESP_MAYBE = 2,
475 DLM_MASTER_RESP_ERROR 475 DLM_MASTER_RESP_ERROR = 3,
476}; 476};
477 477
478 478
@@ -649,9 +649,9 @@ struct dlm_proxy_ast
649#define DLM_MOD_KEY (0x666c6172) 649#define DLM_MOD_KEY (0x666c6172)
650enum dlm_query_join_response_code { 650enum dlm_query_join_response_code {
651 JOIN_DISALLOW = 0, 651 JOIN_DISALLOW = 0,
652 JOIN_OK, 652 JOIN_OK = 1,
653 JOIN_OK_NO_MAP, 653 JOIN_OK_NO_MAP = 2,
654 JOIN_PROTOCOL_MISMATCH, 654 JOIN_PROTOCOL_MISMATCH = 3,
655}; 655};
656 656
657struct dlm_query_join_packet { 657struct dlm_query_join_packet {
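
Several of these dlmcommon.h enums are wire-visible (message numbers, join response codes) or used as array indices, so the hunks pin every enumerator to an explicit value: the numbering is documented at the definition and cannot drift if an entry is later inserted. An illustrative example of the failure mode this guards against:

/* Implicit numbering: inserting one entry renumbers everything after it. */
enum msg_implicit {
        MSG_FIRST = 500,
        MSG_INSERTED,   /* takes 501 ... */
        MSG_SECOND,     /* ... silently moving this from 501 to 502 */
};

/* Pinned values keep the on-wire numbers stable. */
enum msg_explicit {
        MSG_FIRST_V2 = 500,
        MSG_SECOND_V2 = 501,    /* cannot move, however entries are added */
};
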
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 272ec8631a51..04a32be0aeb9 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -370,92 +370,46 @@ static void dlm_debug_get(struct dlm_debug_ctxt *dc)
370 kref_get(&dc->debug_refcnt); 370 kref_get(&dc->debug_refcnt);
371} 371}
372 372
373static struct debug_buffer *debug_buffer_allocate(void) 373static int debug_release(struct inode *inode, struct file *file)
374{ 374{
375 struct debug_buffer *db = NULL; 375 free_page((unsigned long)file->private_data);
376 376 return 0;
377 db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
378 if (!db)
379 goto bail;
380
381 db->len = PAGE_SIZE;
382 db->buf = kmalloc(db->len, GFP_KERNEL);
383 if (!db->buf)
384 goto bail;
385
386 return db;
387bail:
388 kfree(db);
389 return NULL;
390}
391
392static ssize_t debug_buffer_read(struct file *file, char __user *buf,
393 size_t nbytes, loff_t *ppos)
394{
395 struct debug_buffer *db = file->private_data;
396
397 return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
398}
399
400static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
401{
402 struct debug_buffer *db = file->private_data;
403 loff_t new = -1;
404
405 switch (whence) {
406 case 0:
407 new = off;
408 break;
409 case 1:
410 new = file->f_pos + off;
411 break;
412 }
413
414 if (new < 0 || new > db->len)
415 return -EINVAL;
416
417 return (file->f_pos = new);
418} 377}
419 378
420static int debug_buffer_release(struct inode *inode, struct file *file) 379static ssize_t debug_read(struct file *file, char __user *buf,
380 size_t nbytes, loff_t *ppos)
421{ 381{
422 struct debug_buffer *db = file->private_data; 382 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
423 383 i_size_read(file->f_mapping->host));
424 if (db)
425 kfree(db->buf);
426 kfree(db);
427
428 return 0;
429} 384}
430/* end - util funcs */ 385/* end - util funcs */
431 386
432/* begin - purge list funcs */ 387/* begin - purge list funcs */
433static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 388static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
434{ 389{
435 struct dlm_lock_resource *res; 390 struct dlm_lock_resource *res;
436 int out = 0; 391 int out = 0;
437 unsigned long total = 0; 392 unsigned long total = 0;
438 393
439 out += snprintf(db->buf + out, db->len - out, 394 out += snprintf(buf + out, len - out,
440 "Dumping Purgelist for Domain: %s\n", dlm->name); 395 "Dumping Purgelist for Domain: %s\n", dlm->name);
441 396
442 spin_lock(&dlm->spinlock); 397 spin_lock(&dlm->spinlock);
443 list_for_each_entry(res, &dlm->purge_list, purge) { 398 list_for_each_entry(res, &dlm->purge_list, purge) {
444 ++total; 399 ++total;
445 if (db->len - out < 100) 400 if (len - out < 100)
446 continue; 401 continue;
447 spin_lock(&res->spinlock); 402 spin_lock(&res->spinlock);
448 out += stringify_lockname(res->lockname.name, 403 out += stringify_lockname(res->lockname.name,
449 res->lockname.len, 404 res->lockname.len,
450 db->buf + out, db->len - out); 405 buf + out, len - out);
451 out += snprintf(db->buf + out, db->len - out, "\t%ld\n", 406 out += snprintf(buf + out, len - out, "\t%ld\n",
452 (jiffies - res->last_used)/HZ); 407 (jiffies - res->last_used)/HZ);
453 spin_unlock(&res->spinlock); 408 spin_unlock(&res->spinlock);
454 } 409 }
455 spin_unlock(&dlm->spinlock); 410 spin_unlock(&dlm->spinlock);
456 411
457 out += snprintf(db->buf + out, db->len - out, 412 out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
458 "Total on list: %ld\n", total);
459 413
460 return out; 414 return out;
461} 415}
@@ -463,15 +417,15 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
463static int debug_purgelist_open(struct inode *inode, struct file *file) 417static int debug_purgelist_open(struct inode *inode, struct file *file)
464{ 418{
465 struct dlm_ctxt *dlm = inode->i_private; 419 struct dlm_ctxt *dlm = inode->i_private;
466 struct debug_buffer *db; 420 char *buf = NULL;
467 421
468 db = debug_buffer_allocate(); 422 buf = (char *) get_zeroed_page(GFP_NOFS);
469 if (!db) 423 if (!buf)
470 goto bail; 424 goto bail;
471 425
472 db->len = debug_purgelist_print(dlm, db); 426 i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1));
473 427
474 file->private_data = db; 428 file->private_data = buf;
475 429
476 return 0; 430 return 0;
477bail: 431bail:
@@ -480,14 +434,14 @@ bail:
480 434
481static const struct file_operations debug_purgelist_fops = { 435static const struct file_operations debug_purgelist_fops = {
482 .open = debug_purgelist_open, 436 .open = debug_purgelist_open,
483 .release = debug_buffer_release, 437 .release = debug_release,
484 .read = debug_buffer_read, 438 .read = debug_read,
485 .llseek = debug_buffer_llseek, 439 .llseek = generic_file_llseek,
486}; 440};
487/* end - purge list funcs */ 441/* end - purge list funcs */
488 442
489/* begin - debug mle funcs */ 443/* begin - debug mle funcs */
490static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 444static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
491{ 445{
492 struct dlm_master_list_entry *mle; 446 struct dlm_master_list_entry *mle;
493 struct hlist_head *bucket; 447 struct hlist_head *bucket;
@@ -495,7 +449,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
495 int i, out = 0; 449 int i, out = 0;
496 unsigned long total = 0, longest = 0, bucket_count = 0; 450 unsigned long total = 0, longest = 0, bucket_count = 0;
497 451
498 out += snprintf(db->buf + out, db->len - out, 452 out += snprintf(buf + out, len - out,
499 "Dumping MLEs for Domain: %s\n", dlm->name); 453 "Dumping MLEs for Domain: %s\n", dlm->name);
500 454
501 spin_lock(&dlm->master_lock); 455 spin_lock(&dlm->master_lock);
@@ -506,16 +460,16 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
506 master_hash_node); 460 master_hash_node);
507 ++total; 461 ++total;
508 ++bucket_count; 462 ++bucket_count;
509 if (db->len - out < 200) 463 if (len - out < 200)
510 continue; 464 continue;
511 out += dump_mle(mle, db->buf + out, db->len - out); 465 out += dump_mle(mle, buf + out, len - out);
512 } 466 }
513 longest = max(longest, bucket_count); 467 longest = max(longest, bucket_count);
514 bucket_count = 0; 468 bucket_count = 0;
515 } 469 }
516 spin_unlock(&dlm->master_lock); 470 spin_unlock(&dlm->master_lock);
517 471
518 out += snprintf(db->buf + out, db->len - out, 472 out += snprintf(buf + out, len - out,
519 "Total: %ld, Longest: %ld\n", total, longest); 473 "Total: %ld, Longest: %ld\n", total, longest);
520 return out; 474 return out;
521} 475}
@@ -523,15 +477,15 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
523static int debug_mle_open(struct inode *inode, struct file *file) 477static int debug_mle_open(struct inode *inode, struct file *file)
524{ 478{
525 struct dlm_ctxt *dlm = inode->i_private; 479 struct dlm_ctxt *dlm = inode->i_private;
526 struct debug_buffer *db; 480 char *buf = NULL;
527 481
528 db = debug_buffer_allocate(); 482 buf = (char *) get_zeroed_page(GFP_NOFS);
529 if (!db) 483 if (!buf)
530 goto bail; 484 goto bail;
531 485
532 db->len = debug_mle_print(dlm, db); 486 i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1));
533 487
534 file->private_data = db; 488 file->private_data = buf;
535 489
536 return 0; 490 return 0;
537bail: 491bail:
@@ -540,9 +494,9 @@ bail:
540 494
541static const struct file_operations debug_mle_fops = { 495static const struct file_operations debug_mle_fops = {
542 .open = debug_mle_open, 496 .open = debug_mle_open,
543 .release = debug_buffer_release, 497 .release = debug_release,
544 .read = debug_buffer_read, 498 .read = debug_read,
545 .llseek = debug_buffer_llseek, 499 .llseek = generic_file_llseek,
546}; 500};
547 501
548/* end - debug mle funcs */ 502/* end - debug mle funcs */
@@ -757,7 +711,7 @@ static const struct file_operations debug_lockres_fops = {
757/* end - debug lockres funcs */ 711/* end - debug lockres funcs */
758 712
759/* begin - debug state funcs */ 713/* begin - debug state funcs */
760static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 714static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
761{ 715{
762 int out = 0; 716 int out = 0;
763 struct dlm_reco_node_data *node; 717 struct dlm_reco_node_data *node;
@@ -781,35 +735,35 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
781 } 735 }
782 736
783 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ 737 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */
784 out += snprintf(db->buf + out, db->len - out, 738 out += snprintf(buf + out, len - out,
785 "Domain: %s Key: 0x%08x Protocol: %d.%d\n", 739 "Domain: %s Key: 0x%08x Protocol: %d.%d\n",
786 dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, 740 dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
787 dlm->dlm_locking_proto.pv_minor); 741 dlm->dlm_locking_proto.pv_minor);
788 742
789 /* Thread Pid: xxx Node: xxx State: xxxxx */ 743 /* Thread Pid: xxx Node: xxx State: xxxxx */
790 out += snprintf(db->buf + out, db->len - out, 744 out += snprintf(buf + out, len - out,
791 "Thread Pid: %d Node: %d State: %s\n", 745 "Thread Pid: %d Node: %d State: %s\n",
792 dlm->dlm_thread_task->pid, dlm->node_num, state); 746 task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state);
793 747
794 /* Number of Joins: xxx Joining Node: xxx */ 748 /* Number of Joins: xxx Joining Node: xxx */
795 out += snprintf(db->buf + out, db->len - out, 749 out += snprintf(buf + out, len - out,
796 "Number of Joins: %d Joining Node: %d\n", 750 "Number of Joins: %d Joining Node: %d\n",
797 dlm->num_joins, dlm->joining_node); 751 dlm->num_joins, dlm->joining_node);
798 752
799 /* Domain Map: xx xx xx */ 753 /* Domain Map: xx xx xx */
800 out += snprintf(db->buf + out, db->len - out, "Domain Map: "); 754 out += snprintf(buf + out, len - out, "Domain Map: ");
801 out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, 755 out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
802 db->buf + out, db->len - out); 756 buf + out, len - out);
803 out += snprintf(db->buf + out, db->len - out, "\n"); 757 out += snprintf(buf + out, len - out, "\n");
804 758
805 /* Live Map: xx xx xx */ 759 /* Live Map: xx xx xx */
806 out += snprintf(db->buf + out, db->len - out, "Live Map: "); 760 out += snprintf(buf + out, len - out, "Live Map: ");
807 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, 761 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
808 db->buf + out, db->len - out); 762 buf + out, len - out);
809 out += snprintf(db->buf + out, db->len - out, "\n"); 763 out += snprintf(buf + out, len - out, "\n");
810 764
811 /* Lock Resources: xxx (xxx) */ 765 /* Lock Resources: xxx (xxx) */
812 out += snprintf(db->buf + out, db->len - out, 766 out += snprintf(buf + out, len - out,
813 "Lock Resources: %d (%d)\n", 767 "Lock Resources: %d (%d)\n",
814 atomic_read(&dlm->res_cur_count), 768 atomic_read(&dlm->res_cur_count),
815 atomic_read(&dlm->res_tot_count)); 769 atomic_read(&dlm->res_tot_count));
@@ -821,29 +775,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
821 cur_mles += atomic_read(&dlm->mle_cur_count[i]); 775 cur_mles += atomic_read(&dlm->mle_cur_count[i]);
822 776
823 /* MLEs: xxx (xxx) */ 777 /* MLEs: xxx (xxx) */
824 out += snprintf(db->buf + out, db->len - out, 778 out += snprintf(buf + out, len - out,
825 "MLEs: %d (%d)\n", cur_mles, tot_mles); 779 "MLEs: %d (%d)\n", cur_mles, tot_mles);
826 780
827 /* Blocking: xxx (xxx) */ 781 /* Blocking: xxx (xxx) */
828 out += snprintf(db->buf + out, db->len - out, 782 out += snprintf(buf + out, len - out,
829 " Blocking: %d (%d)\n", 783 " Blocking: %d (%d)\n",
830 atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), 784 atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
831 atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); 785 atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
832 786
833 /* Mastery: xxx (xxx) */ 787 /* Mastery: xxx (xxx) */
834 out += snprintf(db->buf + out, db->len - out, 788 out += snprintf(buf + out, len - out,
835 " Mastery: %d (%d)\n", 789 " Mastery: %d (%d)\n",
836 atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), 790 atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
837 atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); 791 atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
838 792
839 /* Migration: xxx (xxx) */ 793 /* Migration: xxx (xxx) */
840 out += snprintf(db->buf + out, db->len - out, 794 out += snprintf(buf + out, len - out,
841 " Migration: %d (%d)\n", 795 " Migration: %d (%d)\n",
842 atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), 796 atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
843 atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); 797 atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
844 798
845 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ 799 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
846 out += snprintf(db->buf + out, db->len - out, 800 out += snprintf(buf + out, len - out,
847 "Lists: Dirty=%s Purge=%s PendingASTs=%s " 801 "Lists: Dirty=%s Purge=%s PendingASTs=%s "
848 "PendingBASTs=%s\n", 802 "PendingBASTs=%s\n",
849 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), 803 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
@@ -852,12 +806,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
852 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); 806 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
853 807
854 /* Purge Count: xxx Refs: xxx */ 808 /* Purge Count: xxx Refs: xxx */
855 out += snprintf(db->buf + out, db->len - out, 809 out += snprintf(buf + out, len - out,
856 "Purge Count: %d Refs: %d\n", dlm->purge_count, 810 "Purge Count: %d Refs: %d\n", dlm->purge_count,
857 atomic_read(&dlm->dlm_refs.refcount)); 811 atomic_read(&dlm->dlm_refs.refcount));
858 812
859 /* Dead Node: xxx */ 813 /* Dead Node: xxx */
860 out += snprintf(db->buf + out, db->len - out, 814 out += snprintf(buf + out, len - out,
861 "Dead Node: %d\n", dlm->reco.dead_node); 815 "Dead Node: %d\n", dlm->reco.dead_node);
862 816
863 /* What about DLM_RECO_STATE_FINALIZE? */ 817 /* What about DLM_RECO_STATE_FINALIZE? */
@@ -867,19 +821,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
867 state = "INACTIVE"; 821 state = "INACTIVE";
868 822
869 /* Recovery Pid: xxxx Master: xxx State: xxxx */ 823 /* Recovery Pid: xxxx Master: xxx State: xxxx */
870 out += snprintf(db->buf + out, db->len - out, 824 out += snprintf(buf + out, len - out,
871 "Recovery Pid: %d Master: %d State: %s\n", 825 "Recovery Pid: %d Master: %d State: %s\n",
872 dlm->dlm_reco_thread_task->pid, 826 task_pid_nr(dlm->dlm_reco_thread_task),
873 dlm->reco.new_master, state); 827 dlm->reco.new_master, state);
874 828
875 /* Recovery Map: xx xx */ 829 /* Recovery Map: xx xx */
876 out += snprintf(db->buf + out, db->len - out, "Recovery Map: "); 830 out += snprintf(buf + out, len - out, "Recovery Map: ");
877 out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, 831 out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
878 db->buf + out, db->len - out); 832 buf + out, len - out);
879 out += snprintf(db->buf + out, db->len - out, "\n"); 833 out += snprintf(buf + out, len - out, "\n");
880 834
881 /* Recovery Node State: */ 835 /* Recovery Node State: */
882 out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n"); 836 out += snprintf(buf + out, len - out, "Recovery Node State:\n");
883 list_for_each_entry(node, &dlm->reco.node_data, list) { 837 list_for_each_entry(node, &dlm->reco.node_data, list) {
884 switch (node->state) { 838 switch (node->state) {
885 case DLM_RECO_NODE_DATA_INIT: 839 case DLM_RECO_NODE_DATA_INIT:
@@ -907,7 +861,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
907 state = "BAD"; 861 state = "BAD";
908 break; 862 break;
909 } 863 }
910 out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n", 864 out += snprintf(buf + out, len - out, "\t%u - %s\n",
911 node->node_num, state); 865 node->node_num, state);
912 } 866 }
913 867
@@ -919,15 +873,15 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
919static int debug_state_open(struct inode *inode, struct file *file) 873static int debug_state_open(struct inode *inode, struct file *file)
920{ 874{
921 struct dlm_ctxt *dlm = inode->i_private; 875 struct dlm_ctxt *dlm = inode->i_private;
922 struct debug_buffer *db = NULL; 876 char *buf = NULL;
923 877
924 db = debug_buffer_allocate(); 878 buf = (char *) get_zeroed_page(GFP_NOFS);
925 if (!db) 879 if (!buf)
926 goto bail; 880 goto bail;
927 881
928 db->len = debug_state_print(dlm, db); 882 i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1));
929 883
930 file->private_data = db; 884 file->private_data = buf;
931 885
932 return 0; 886 return 0;
933bail: 887bail:
@@ -936,9 +890,9 @@ bail:
936 890
937static const struct file_operations debug_state_fops = { 891static const struct file_operations debug_state_fops = {
938 .open = debug_state_open, 892 .open = debug_state_open,
939 .release = debug_buffer_release, 893 .release = debug_release,
940 .read = debug_buffer_read, 894 .read = debug_read,
941 .llseek = debug_buffer_llseek, 895 .llseek = generic_file_llseek,
942}; 896};
943/* end - debug state funcs */ 897/* end - debug state funcs */
944 898
@@ -1002,14 +956,10 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
1002 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; 956 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
1003 957
1004 if (dc) { 958 if (dc) {
1005 if (dc->debug_purgelist_dentry) 959 debugfs_remove(dc->debug_purgelist_dentry);
1006 debugfs_remove(dc->debug_purgelist_dentry); 960 debugfs_remove(dc->debug_mle_dentry);
1007 if (dc->debug_mle_dentry) 961 debugfs_remove(dc->debug_lockres_dentry);
1008 debugfs_remove(dc->debug_mle_dentry); 962 debugfs_remove(dc->debug_state_dentry);
1009 if (dc->debug_lockres_dentry)
1010 debugfs_remove(dc->debug_lockres_dentry);
1011 if (dc->debug_state_dentry)
1012 debugfs_remove(dc->debug_state_dentry);
1013 dlm_debug_put(dc); 963 dlm_debug_put(dc);
1014 } 964 }
1015} 965}
@@ -1040,8 +990,7 @@ bail:
1040 990
1041void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) 991void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
1042{ 992{
1043 if (dlm->dlm_debugfs_subroot) 993 debugfs_remove(dlm->dlm_debugfs_subroot);
1044 debugfs_remove(dlm->dlm_debugfs_subroot);
1045} 994}
1046 995
1047/* debugfs root */ 996/* debugfs root */
@@ -1057,7 +1006,6 @@ int dlm_create_debugfs_root(void)
1057 1006
1058void dlm_destroy_debugfs_root(void) 1007void dlm_destroy_debugfs_root(void)
1059{ 1008{
1060 if (dlm_debugfs_root) 1009 debugfs_remove(dlm_debugfs_root);
1061 debugfs_remove(dlm_debugfs_root);
1062} 1010}
1063#endif /* CONFIG_DEBUG_FS */ 1011#endif /* CONFIG_DEBUG_FS */
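
dlmdebug.c drops the struct debug_buffer wrapper in favor of a bare page: each open routine renders its report once into a zeroed page, records the rendered length as the inode size, and read/llseek fall through to simple_read_from_buffer() and generic_file_llseek(). A condensed sketch of the trio, where my_print() is a hypothetical stand-in for the real debug_*_print() renderers:

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>

static int my_print(void *priv, char *buf, int len)
{
        /* stand-in for debug_state_print() and friends */
        return scnprintf(buf, len, "state of %p\n", priv);
}

static int my_open(struct inode *inode, struct file *file)
{
        char *buf = (char *)get_zeroed_page(GFP_NOFS);

        if (!buf)
                return -ENOMEM;
        /* render once; the inode size doubles as the buffer length */
        i_size_write(inode, my_print(inode->i_private, buf, PAGE_SIZE - 1));
        file->private_data = buf;
        return 0;
}

static ssize_t my_read(struct file *file, char __user *ubuf,
                       size_t nbytes, loff_t *ppos)
{
        return simple_read_from_buffer(ubuf, nbytes, ppos,
                                       file->private_data,
                                       i_size_read(file->f_mapping->host));
}

static int my_release(struct inode *inode, struct file *file)
{
        free_page((unsigned long)file->private_data);
        return 0;
}
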
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 8c686d22f9c7..1f27c4812d1a 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -37,11 +37,6 @@ struct dlm_debug_ctxt {
37 struct dentry *debug_purgelist_dentry; 37 struct dentry *debug_purgelist_dentry;
38}; 38};
39 39
40struct debug_buffer {
41 int len;
42 char *buf;
43};
44
45struct debug_lockres { 40struct debug_lockres {
46 int dl_len; 41 int dl_len;
47 char *dl_buf; 42 char *dl_buf;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index cc2aaa96cfe5..7e38a072d720 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -460,8 +460,6 @@ redo_bucket:
460 } 460 }
461 cond_resched_lock(&dlm->spinlock); 461 cond_resched_lock(&dlm->spinlock);
462 num += n; 462 num += n;
463 mlog(0, "%s: touched %d lockreses in bucket %d "
464 "(tot=%d)\n", dlm->name, n, i, num);
465 } 463 }
466 spin_unlock(&dlm->spinlock); 464 spin_unlock(&dlm->spinlock);
467 wake_up(&dlm->dlm_thread_wq); 465 wake_up(&dlm->dlm_thread_wq);
@@ -1661,8 +1659,8 @@ bail:
1661 1659
1662static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) 1660static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
1663{ 1661{
1664 o2hb_unregister_callback(NULL, &dlm->dlm_hb_up); 1662 o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up);
1665 o2hb_unregister_callback(NULL, &dlm->dlm_hb_down); 1663 o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down);
1666 o2net_unregister_handler_list(&dlm->dlm_domain_handlers); 1664 o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
1667} 1665}
1668 1666
@@ -1674,13 +1672,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1674 1672
1675 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, 1673 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
1676 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); 1674 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
1677 status = o2hb_register_callback(NULL, &dlm->dlm_hb_down); 1675 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
1678 if (status) 1676 if (status)
1679 goto bail; 1677 goto bail;
1680 1678
1681 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, 1679 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
1682 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); 1680 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
1683 status = o2hb_register_callback(NULL, &dlm->dlm_hb_up); 1681 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
1684 if (status) 1682 if (status)
1685 goto bail; 1683 goto bail;
1686 1684
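
The dlmdomain.c change passes the domain name instead of NULL when
registering and unregistering heartbeat callbacks, scoping them to the
heartbeat region backing that domain. A hedged sketch of the paired
setup/teardown, assuming the in-tree o2hb signatures (region name
first, callback descriptor second); the static descriptor is
illustrative only:

    /* Sketch, not the patch itself: attach node-down notification to
     * one domain's region and tear it down symmetrically. */
    static struct o2hb_callback_func hb_down;   /* hypothetical */

    static int example_hb_attach(struct dlm_ctxt *dlm)
    {
            o2hb_setup_callback(&hb_down, O2HB_NODE_DOWN_CB,
                                dlm_hb_node_down_cb, dlm,
                                DLM_HB_NODE_DOWN_PRI);
            return o2hb_register_callback(dlm->name, &hb_down);
    }

    static void example_hb_detach(struct dlm_ctxt *dlm)
    {
            o2hb_unregister_callback(dlm->name, &hb_down);
    }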
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 69cf369961c4..7009292aac5a 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -106,6 +106,9 @@ static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
106 106
107 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) 107 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
108 return 0; 108 return 0;
109 if (!dlm_lock_compatible(tmplock->ml.convert_type,
110 lock->ml.type))
111 return 0;
109 } 112 }
110 113
111 return 1; 114 return 1;
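
The dlmlock.c hunk makes a new lock request also prove compatibility
against the mode a queued convert is asking for, not just the mode
currently granted. For reference, a sketch of the standard three-mode
compatibility rule (NL/PR/EX) that a dlm_lock_compatible()-style helper
implements; the exact in-tree helper may differ in detail:

    /* Sketch: NL is compatible with everything, EX with nothing
     * else, PR only with PR. */
    static int example_lock_compatible(int existing, int request)
    {
            if (existing == LKM_NLMODE || request == LKM_NLMODE)
                    return 1;
            if (existing == LKM_EXMODE || request == LKM_EXMODE)
                    return 0;
            return existing == LKM_PRMODE && request == LKM_PRMODE;
    }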
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2211acf33d9b..1d6d1d22c471 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -122,15 +122,13 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
122void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, 122void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
123 struct dlm_lock_resource *res) 123 struct dlm_lock_resource *res)
124{ 124{
125 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
126
127 assert_spin_locked(&dlm->spinlock); 125 assert_spin_locked(&dlm->spinlock);
128 assert_spin_locked(&res->spinlock); 126 assert_spin_locked(&res->spinlock);
129 127
130 if (__dlm_lockres_unused(res)){ 128 if (__dlm_lockres_unused(res)){
131 if (list_empty(&res->purge)) { 129 if (list_empty(&res->purge)) {
132 mlog(0, "putting lockres %.*s:%p onto purge list\n", 130 mlog(0, "%s: Adding res %.*s to purge list\n",
133 res->lockname.len, res->lockname.name, res); 131 dlm->name, res->lockname.len, res->lockname.name);
134 132
135 res->last_used = jiffies; 133 res->last_used = jiffies;
136 dlm_lockres_get(res); 134 dlm_lockres_get(res);
@@ -138,8 +136,8 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
138 dlm->purge_count++; 136 dlm->purge_count++;
139 } 137 }
140 } else if (!list_empty(&res->purge)) { 138 } else if (!list_empty(&res->purge)) {
141 mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n", 139 mlog(0, "%s: Removing res %.*s from purge list\n",
142 res->lockname.len, res->lockname.name, res, res->owner); 140 dlm->name, res->lockname.len, res->lockname.name);
143 141
144 list_del_init(&res->purge); 142 list_del_init(&res->purge);
145 dlm_lockres_put(res); 143 dlm_lockres_put(res);
@@ -150,7 +148,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
150void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, 148void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
151 struct dlm_lock_resource *res) 149 struct dlm_lock_resource *res)
152{ 150{
153 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
154 spin_lock(&dlm->spinlock); 151 spin_lock(&dlm->spinlock);
155 spin_lock(&res->spinlock); 152 spin_lock(&res->spinlock);
156 153
@@ -171,9 +168,8 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
171 168
172 master = (res->owner == dlm->node_num); 169 master = (res->owner == dlm->node_num);
173 170
174 171 mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name,
175 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, 172 res->lockname.len, res->lockname.name, master);
176 res->lockname.name, master);
177 173
178 if (!master) { 174 if (!master) {
179 res->state |= DLM_LOCK_RES_DROPPING_REF; 175 res->state |= DLM_LOCK_RES_DROPPING_REF;
@@ -189,27 +185,25 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
189 /* clear our bit from the master's refmap, ignore errors */ 185 /* clear our bit from the master's refmap, ignore errors */
190 ret = dlm_drop_lockres_ref(dlm, res); 186 ret = dlm_drop_lockres_ref(dlm, res);
191 if (ret < 0) { 187 if (ret < 0) {
192 mlog_errno(ret); 188 mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
189 res->lockname.len, res->lockname.name, ret);
193 if (!dlm_is_host_down(ret)) 190 if (!dlm_is_host_down(ret))
194 BUG(); 191 BUG();
195 } 192 }
196 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
197 dlm->name, res->lockname.len, res->lockname.name, ret);
198 spin_lock(&dlm->spinlock); 193 spin_lock(&dlm->spinlock);
199 spin_lock(&res->spinlock); 194 spin_lock(&res->spinlock);
200 } 195 }
201 196
202 if (!list_empty(&res->purge)) { 197 if (!list_empty(&res->purge)) {
203 mlog(0, "removing lockres %.*s:%p from purgelist, " 198 mlog(0, "%s: Removing res %.*s from purgelist, master %d\n",
204 "master = %d\n", res->lockname.len, res->lockname.name, 199 dlm->name, res->lockname.len, res->lockname.name, master);
205 res, master);
206 list_del_init(&res->purge); 200 list_del_init(&res->purge);
207 dlm_lockres_put(res); 201 dlm_lockres_put(res);
208 dlm->purge_count--; 202 dlm->purge_count--;
209 } 203 }
210 204
211 if (!__dlm_lockres_unused(res)) { 205 if (!__dlm_lockres_unused(res)) {
212 mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n", 206 mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
213 dlm->name, res->lockname.len, res->lockname.name); 207 dlm->name, res->lockname.len, res->lockname.name);
214 __dlm_print_one_lock_resource(res); 208 __dlm_print_one_lock_resource(res);
215 BUG(); 209 BUG();
@@ -266,10 +260,10 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
266 unused = __dlm_lockres_unused(lockres); 260 unused = __dlm_lockres_unused(lockres);
267 if (!unused || 261 if (!unused ||
268 (lockres->state & DLM_LOCK_RES_MIGRATING)) { 262 (lockres->state & DLM_LOCK_RES_MIGRATING)) {
269 mlog(0, "lockres %s:%.*s: is in use or " 263 mlog(0, "%s: res %.*s is in use or being remastered, "
270 "being remastered, used %d, state %d\n", 264 "used %d, state %d\n", dlm->name,
271 dlm->name, lockres->lockname.len, 265 lockres->lockname.len, lockres->lockname.name,
272 lockres->lockname.name, !unused, lockres->state); 266 !unused, lockres->state);
273 list_move_tail(&dlm->purge_list, &lockres->purge); 267 list_move_tail(&dlm->purge_list, &lockres->purge);
274 spin_unlock(&lockres->spinlock); 268 spin_unlock(&lockres->spinlock);
275 continue; 269 continue;
@@ -296,15 +290,12 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
296 struct list_head *head; 290 struct list_head *head;
297 int can_grant = 1; 291 int can_grant = 1;
298 292
299 //mlog(0, "res->lockname.len=%d\n", res->lockname.len); 293 /*
300 //mlog(0, "res->lockname.name=%p\n", res->lockname.name); 294 * Because this function is called with the lockres
301 //mlog(0, "shuffle res %.*s\n", res->lockname.len,
302 // res->lockname.name);
303
304 /* because this function is called with the lockres
305 * spinlock, and because we know that it is not migrating/ 295 * spinlock, and because we know that it is not migrating/
306 * recovering/in-progress, it is fine to reserve asts and 296 * recovering/in-progress, it is fine to reserve asts and
307 * basts right before queueing them all throughout */ 297 * basts right before queueing them all throughout
298 */
308 assert_spin_locked(&dlm->ast_lock); 299 assert_spin_locked(&dlm->ast_lock);
309 assert_spin_locked(&res->spinlock); 300 assert_spin_locked(&res->spinlock);
310 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| 301 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
@@ -314,13 +305,13 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
314converting: 305converting:
315 if (list_empty(&res->converting)) 306 if (list_empty(&res->converting))
316 goto blocked; 307 goto blocked;
317 mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len, 308 mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name,
318 res->lockname.name); 309 res->lockname.len, res->lockname.name);
319 310
320 target = list_entry(res->converting.next, struct dlm_lock, list); 311 target = list_entry(res->converting.next, struct dlm_lock, list);
321 if (target->ml.convert_type == LKM_IVMODE) { 312 if (target->ml.convert_type == LKM_IVMODE) {
322 mlog(ML_ERROR, "%.*s: converting a lock with no " 313 mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n",
323 "convert_type!\n", res->lockname.len, res->lockname.name); 314 dlm->name, res->lockname.len, res->lockname.name);
324 BUG(); 315 BUG();
325 } 316 }
326 head = &res->granted; 317 head = &res->granted;
@@ -365,9 +356,12 @@ converting:
365 spin_lock(&target->spinlock); 356 spin_lock(&target->spinlock);
366 BUG_ON(target->ml.highest_blocked != LKM_IVMODE); 357 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
367 358
368 mlog(0, "calling ast for converting lock: %.*s, have: %d, " 359 mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type "
369 "granting: %d, node: %u\n", res->lockname.len, 360 "%d => %d, node %u\n", dlm->name, res->lockname.len,
370 res->lockname.name, target->ml.type, 361 res->lockname.name,
362 dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
363 dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
364 target->ml.type,
371 target->ml.convert_type, target->ml.node); 365 target->ml.convert_type, target->ml.node);
372 366
373 target->ml.type = target->ml.convert_type; 367 target->ml.type = target->ml.convert_type;
@@ -428,11 +422,14 @@ blocked:
428 spin_lock(&target->spinlock); 422 spin_lock(&target->spinlock);
429 BUG_ON(target->ml.highest_blocked != LKM_IVMODE); 423 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
430 424
431 mlog(0, "calling ast for blocked lock: %.*s, granting: %d, " 425 mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, "
432 "node: %u\n", res->lockname.len, res->lockname.name, 426 "node %u\n", dlm->name, res->lockname.len,
427 res->lockname.name,
428 dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
429 dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
433 target->ml.type, target->ml.node); 430 target->ml.type, target->ml.node);
434 431
435 // target->ml.type is already correct 432 /* target->ml.type is already correct */
436 list_move_tail(&target->list, &res->granted); 433 list_move_tail(&target->list, &res->granted);
437 434
438 BUG_ON(!target->lksb); 435 BUG_ON(!target->lksb);
@@ -453,7 +450,6 @@ leave:
453/* must have NO locks when calling this with res !=NULL * */ 450/* must have NO locks when calling this with res !=NULL * */
454void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 451void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
455{ 452{
456 mlog_entry("dlm=%p, res=%p\n", dlm, res);
457 if (res) { 453 if (res) {
458 spin_lock(&dlm->spinlock); 454 spin_lock(&dlm->spinlock);
459 spin_lock(&res->spinlock); 455 spin_lock(&res->spinlock);
@@ -466,8 +462,6 @@ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
466 462
467void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 463void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
468{ 464{
469 mlog_entry("dlm=%p, res=%p\n", dlm, res);
470
471 assert_spin_locked(&dlm->spinlock); 465 assert_spin_locked(&dlm->spinlock);
472 assert_spin_locked(&res->spinlock); 466 assert_spin_locked(&res->spinlock);
473 467
@@ -484,13 +478,16 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
484 res->state |= DLM_LOCK_RES_DIRTY; 478 res->state |= DLM_LOCK_RES_DIRTY;
485 } 479 }
486 } 480 }
481
482 mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
483 res->lockname.name);
487} 484}
488 485
489 486
490/* Launch the NM thread for the mounted volume */ 487/* Launch the NM thread for the mounted volume */
491int dlm_launch_thread(struct dlm_ctxt *dlm) 488int dlm_launch_thread(struct dlm_ctxt *dlm)
492{ 489{
493 mlog(0, "starting dlm thread...\n"); 490 mlog(0, "Starting dlm_thread...\n");
494 491
495 dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); 492 dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
496 if (IS_ERR(dlm->dlm_thread_task)) { 493 if (IS_ERR(dlm->dlm_thread_task)) {
@@ -505,7 +502,7 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
505void dlm_complete_thread(struct dlm_ctxt *dlm) 502void dlm_complete_thread(struct dlm_ctxt *dlm)
506{ 503{
507 if (dlm->dlm_thread_task) { 504 if (dlm->dlm_thread_task) {
508 mlog(ML_KTHREAD, "waiting for dlm thread to exit\n"); 505 mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n");
509 kthread_stop(dlm->dlm_thread_task); 506 kthread_stop(dlm->dlm_thread_task);
510 dlm->dlm_thread_task = NULL; 507 dlm->dlm_thread_task = NULL;
511 } 508 }
@@ -536,7 +533,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
536 /* get an extra ref on lock */ 533 /* get an extra ref on lock */
537 dlm_lock_get(lock); 534 dlm_lock_get(lock);
538 res = lock->lockres; 535 res = lock->lockres;
539 mlog(0, "delivering an ast for this lockres\n"); 536 mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, "
537 "node %u\n", dlm->name, res->lockname.len,
538 res->lockname.name,
539 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
540 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
541 lock->ml.type, lock->ml.node);
540 542
541 BUG_ON(!lock->ast_pending); 543 BUG_ON(!lock->ast_pending);
542 544
@@ -557,9 +559,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
557 /* possible that another ast was queued while 559 /* possible that another ast was queued while
558 * we were delivering the last one */ 560 * we were delivering the last one */
559 if (!list_empty(&lock->ast_list)) { 561 if (!list_empty(&lock->ast_list)) {
560 mlog(0, "aha another ast got queued while " 562 mlog(0, "%s: res %.*s, AST queued while flushing last "
561 "we were finishing the last one. will " 563 "one\n", dlm->name, res->lockname.len,
562 "keep the ast_pending flag set.\n"); 564 res->lockname.name);
563 } else 565 } else
564 lock->ast_pending = 0; 566 lock->ast_pending = 0;
565 567
@@ -590,8 +592,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
590 dlm_lock_put(lock); 592 dlm_lock_put(lock);
591 spin_unlock(&dlm->ast_lock); 593 spin_unlock(&dlm->ast_lock);
592 594
593 mlog(0, "delivering a bast for this lockres " 595 mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, "
594 "(blocked = %d\n", hi); 596 "blocked %d, node %u\n",
597 dlm->name, res->lockname.len, res->lockname.name,
598 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
599 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
600 hi, lock->ml.node);
595 601
596 if (lock->ml.node != dlm->node_num) { 602 if (lock->ml.node != dlm->node_num) {
597 ret = dlm_send_proxy_bast(dlm, res, lock, hi); 603 ret = dlm_send_proxy_bast(dlm, res, lock, hi);
@@ -605,9 +611,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
605 /* possible that another bast was queued while 611 /* possible that another bast was queued while
606 * we were delivering the last one */ 612 * we were delivering the last one */
607 if (!list_empty(&lock->bast_list)) { 613 if (!list_empty(&lock->bast_list)) {
608 mlog(0, "aha another bast got queued while " 614 mlog(0, "%s: res %.*s, BAST queued while flushing last "
609 "we were finishing the last one. will " 615 "one\n", dlm->name, res->lockname.len,
610 "keep the bast_pending flag set.\n"); 616 res->lockname.name);
611 } else 617 } else
612 lock->bast_pending = 0; 618 lock->bast_pending = 0;
613 619
@@ -675,11 +681,12 @@ static int dlm_thread(void *data)
675 spin_lock(&res->spinlock); 681 spin_lock(&res->spinlock);
676 if (res->owner != dlm->node_num) { 682 if (res->owner != dlm->node_num) {
677 __dlm_print_one_lock_resource(res); 683 __dlm_print_one_lock_resource(res);
678 mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n", 684 mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d,"
679 res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no", 685 " dirty %d\n", dlm->name,
680 res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no", 686 !!(res->state & DLM_LOCK_RES_IN_PROGRESS),
681 res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no", 687 !!(res->state & DLM_LOCK_RES_MIGRATING),
682 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); 688 !!(res->state & DLM_LOCK_RES_RECOVERING),
689 !!(res->state & DLM_LOCK_RES_DIRTY));
683 } 690 }
684 BUG_ON(res->owner != dlm->node_num); 691 BUG_ON(res->owner != dlm->node_num);
685 692
@@ -693,8 +700,8 @@ static int dlm_thread(void *data)
693 res->state &= ~DLM_LOCK_RES_DIRTY; 700 res->state &= ~DLM_LOCK_RES_DIRTY;
694 spin_unlock(&res->spinlock); 701 spin_unlock(&res->spinlock);
695 spin_unlock(&dlm->ast_lock); 702 spin_unlock(&dlm->ast_lock);
696 mlog(0, "delaying list shuffling for in-" 703 mlog(0, "%s: res %.*s, inprogress, delay list "
697 "progress lockres %.*s, state=%d\n", 704 "shuffle, state %d\n", dlm->name,
698 res->lockname.len, res->lockname.name, 705 res->lockname.len, res->lockname.name,
699 res->state); 706 res->state);
700 delay = 1; 707 delay = 1;
@@ -706,10 +713,6 @@ static int dlm_thread(void *data)
706 * spinlock and do NOT have the dlm lock. 713 * spinlock and do NOT have the dlm lock.
707 * safe to reserve/queue asts and run the lists. */ 714 * safe to reserve/queue asts and run the lists. */
708 715
709 mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
710 "res=%.*s\n", dlm->name,
711 res->lockname.len, res->lockname.name);
712
713 /* called while holding lockres lock */ 716 /* called while holding lockres lock */
714 dlm_shuffle_lists(dlm, res); 717 dlm_shuffle_lists(dlm, res);
715 res->state &= ~DLM_LOCK_RES_DIRTY; 718 res->state &= ~DLM_LOCK_RES_DIRTY;
@@ -733,7 +736,8 @@ in_progress:
733 /* unlikely, but we may need to give time to 736 /* unlikely, but we may need to give time to
734 * other tasks */ 737 * other tasks */
735 if (!--n) { 738 if (!--n) {
736 mlog(0, "throttling dlm_thread\n"); 739 mlog(0, "%s: Throttling dlm thread\n",
740 dlm->name);
737 break; 741 break;
738 } 742 }
739 } 743 }
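
Several of the reworked dlmthread.c messages identify a lock as
"node:sequence" by unpacking its 64-bit cookie with the
dlm_get_lock_cookie_node/seq helpers. A hedged sketch of that decoding,
assuming the usual layout of the owning node in the top byte:

    #include <linux/types.h>

    /* Sketch: top 8 bits carry the owning node, the remaining 56 bits
     * a per-node sequence number. */
    static inline unsigned int example_cookie_node(u64 cookie)
    {
            return (unsigned int)((cookie >> 56) & 0xffULL);
    }

    static inline unsigned long long example_cookie_seq(u64 cookie)
    {
            return cookie & 0x00ffffffffffffffULL;
    }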
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d14cad6e2e41..30c523144452 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1017,8 +1017,11 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1017 * An error return must mean that no cluster locks 1017 * An error return must mean that no cluster locks
1018 * were held on function exit. 1018 * were held on function exit.
1019 */ 1019 */
1020 if (oi1->ip_blkno != oi2->ip_blkno) 1020 if (oi1->ip_blkno != oi2->ip_blkno) {
1021 ocfs2_inode_unlock(inode2, 1); 1021 ocfs2_inode_unlock(inode2, 1);
1022 brelse(*bh2);
1023 *bh2 = NULL;
1024 }
1022 1025
1023 if (status != -ENOENT) 1026 if (status != -ENOENT)
1024 mlog_errno(status); 1027 mlog_errno(status);
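
The ocfs2_double_lock() fix pairs the unlock of the second inode with
releasing the buffer head that was read under that lock; otherwise the
bh leaks on the error path. A sketch of the unwind, relying on
brelse(NULL) being a safe no-op (the helper name is hypothetical):

    /* Sketch of the error-path unwind: drop the cluster lock, release
     * the buffer head read under it, and clear the caller's pointer
     * so it cannot be released twice. */
    static void example_unwind(struct inode *inode2,
                               struct buffer_head **bh2)
    {
            ocfs2_inode_unlock(inode2, 1);
            brelse(*bh2);           /* brelse(NULL) is a no-op */
            *bh2 = NULL;
    }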
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 70dd3b1798f1..51cd6898e7f1 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -420,6 +420,11 @@ struct ocfs2_super
420 struct inode *osb_tl_inode; 420 struct inode *osb_tl_inode;
421 struct buffer_head *osb_tl_bh; 421 struct buffer_head *osb_tl_bh;
422 struct delayed_work osb_truncate_log_wq; 422 struct delayed_work osb_truncate_log_wq;
423 /*
424 * How many clusters in our truncate log.
425 * It must be protected by osb_tl_inode->i_mutex.
426 */
427 unsigned int truncated_clusters;
423 428
424 struct ocfs2_node_map osb_recovering_orphan_dirs; 429 struct ocfs2_node_map osb_recovering_orphan_dirs;
425 unsigned int *osb_orphan_wipes; 430 unsigned int *osb_orphan_wipes;
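
The new truncated_clusters field documents its own locking rule: it is
only touched under the truncate-log inode's i_mutex. A sketch of the
discipline the comment implies; the helper name is hypothetical:

    /* Sketch: all reads and writes of the counter happen with
     * osb_tl_inode->i_mutex held. */
    static void example_tl_account(struct ocfs2_super *osb,
                                   unsigned int clusters)
    {
            mutex_lock(&osb->osb_tl_inode->i_mutex);
            osb->truncated_clusters += clusters;
            mutex_unlock(&osb->osb_tl_inode->i_mutex);
    }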
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 0fed41e6efcd..84becd3e4772 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -133,16 +133,20 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
133EXPORT_SYMBOL(dq_data_lock); 133EXPORT_SYMBOL(dq_data_lock);
134 134
135void __quota_error(struct super_block *sb, const char *func, 135void __quota_error(struct super_block *sb, const char *func,
136 const char *fmt, ...) 136 const char *fmt, ...)
137{ 137{
138 va_list args;
139
140 if (printk_ratelimit()) { 138 if (printk_ratelimit()) {
139 va_list args;
140 struct va_format vaf;
141
141 va_start(args, fmt); 142 va_start(args, fmt);
142 printk(KERN_ERR "Quota error (device %s): %s: ", 143
143 sb->s_id, func); 144 vaf.fmt = fmt;
144 vprintk(fmt, args); 145 vaf.va = &args;
145 printk("\n"); 146
147 printk(KERN_ERR "Quota error (device %s): %s: %pV\n",
148 sb->s_id, func, &vaf);
149
146 va_end(args); 150 va_end(args);
147 } 151 }
148} 152}
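
The rewritten __quota_error() wraps the caller's format string and
va_list in a struct va_format and prints it via the %pV extension, so
the prefix and the message are emitted by a single printk() and cannot
be interleaved with concurrent console output. A self-contained sketch
of the idiom (the function name is illustrative):

    #include <linux/kernel.h>

    /* Sketch of the %pV idiom: one printk() expands the wrapped
     * format together with its prefix. */
    static void example_error(const char *prefix, const char *fmt, ...)
    {
            struct va_format vaf;
            va_list args;

            va_start(args, fmt);
            vaf.fmt = fmt;
            vaf.va = &args;
            printk(KERN_ERR "%s: %pV\n", prefix, &vaf);
            va_end(args);
    }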
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 9e48874eabcc..e41c1becf096 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -468,8 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
468 return -ENOMEM; 468 return -ENOMEM;
469 ret = read_blk(info, *blk, buf); 469 ret = read_blk(info, *blk, buf);
470 if (ret < 0) { 470 if (ret < 0) {
471 quota_error(dquot->dq_sb, "Can't read quota data " 471 quota_error(dquot->dq_sb, "Can't read quota data block %u",
472 "block %u", blk); 472 *blk);
473 goto out_buf; 473 goto out_buf;
474 } 474 }
475 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); 475 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -493,8 +493,9 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
493 } else { 493 } else {
494 ret = write_blk(info, *blk, buf); 494 ret = write_blk(info, *blk, buf);
495 if (ret < 0) 495 if (ret < 0)
496 quota_error(dquot->dq_sb, "Can't write quota " 496 quota_error(dquot->dq_sb,
497 "tree block %u", blk); 497 "Can't write quota tree block %u",
498 *blk);
498 } 499 }
499 } 500 }
500out_buf: 501out_buf:
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 442f34ff1af8..c8769dc222d8 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -165,10 +165,7 @@ int sysfs_merge_group(struct kobject *kobj,
165 struct attribute *const *attr; 165 struct attribute *const *attr;
166 int i; 166 int i;
167 167
168 if (grp) 168 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
169 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
170 else
171 dir_sd = sysfs_get(kobj->sd);
172 if (!dir_sd) 169 if (!dir_sd)
173 return -ENOENT; 170 return -ENOENT;
174 171
@@ -195,10 +192,7 @@ void sysfs_unmerge_group(struct kobject *kobj,
195 struct sysfs_dirent *dir_sd; 192 struct sysfs_dirent *dir_sd;
196 struct attribute *const *attr; 193 struct attribute *const *attr;
197 194
198 if (grp) 195 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
199 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
200 else
201 dir_sd = sysfs_get(kobj->sd);
202 if (dir_sd) { 196 if (dir_sd) {
203 for (attr = grp->attrs; *attr; ++attr) 197 for (attr = grp->attrs; *attr; ++attr)
204 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); 198 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
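
After the group.c change, sysfs_merge_group() and sysfs_unmerge_group()
no longer accept a NULL group: the target directory is always resolved
from grp->name. A hedged sketch of the resulting caller contract; the
wrapper and its WARN_ON are assumptions, not part of the patch:

    /* Sketch: callers must pass a named attribute group; the old
     * NULL-group "use the kobject's own directory" case is gone. */
    static int example_merge(struct kobject *kobj,
                             const struct attribute_group *grp)
    {
            if (WARN_ON(!grp || !grp->name))
                    return -EINVAL;
            return sysfs_merge_group(kobj, grp);
    }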
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 30ac27345586..0a12eb89cd32 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -19,6 +19,7 @@
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/sysfs.h>
22#include <linux/xattr.h> 23#include <linux/xattr.h>
23#include <linux/security.h> 24#include <linux/security.h>
24#include "sysfs.h" 25#include "sysfs.h"
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index ffaaa816bfba..3d28af31d863 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/lockdep.h> 11#include <linux/lockdep.h>
12#include <linux/kobject_ns.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13 14
14struct sysfs_open_dirent; 15struct sysfs_open_dirent;
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index f8def3c8ea4c..0e0e99bd6bce 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,6 +1,5 @@
1config UDF_FS 1config UDF_FS
2 tristate "UDF file system support" 2 tristate "UDF file system support"
3 depends on BKL # needs serious work to remove
4 select CRC_ITU_T 3 select CRC_ITU_T
5 help 4 help
6 This is the new file system used on some CD-ROMs and DVDs. Say Y if 5 This is the new file system used on some CD-ROMs and DVDs. Say Y if
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index b608efaa4cee..306ee39ef2c3 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -157,10 +157,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
157 udf_debug("bit %ld already set\n", bit + i); 157 udf_debug("bit %ld already set\n", bit + i);
158 udf_debug("byte=%2x\n", 158 udf_debug("byte=%2x\n",
159 ((char *)bh->b_data)[(bit + i) >> 3]); 159 ((char *)bh->b_data)[(bit + i) >> 3]);
160 } else {
161 udf_add_free_space(sb, sbi->s_partition, 1);
162 } 160 }
163 } 161 }
162 udf_add_free_space(sb, sbi->s_partition, count);
164 mark_buffer_dirty(bh); 163 mark_buffer_dirty(bh);
165 if (overflow) { 164 if (overflow) {
166 block += count; 165 block += count;
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 51552bf50225..eb8bfe2b89a5 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -30,7 +30,6 @@
30#include <linux/errno.h> 30#include <linux/errno.h>
31#include <linux/mm.h> 31#include <linux/mm.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/smp_lock.h>
34#include <linux/buffer_head.h> 33#include <linux/buffer_head.h>
35 34
36#include "udf_i.h" 35#include "udf_i.h"
@@ -190,18 +189,14 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
190 struct inode *dir = filp->f_path.dentry->d_inode; 189 struct inode *dir = filp->f_path.dentry->d_inode;
191 int result; 190 int result;
192 191
193 lock_kernel();
194
195 if (filp->f_pos == 0) { 192 if (filp->f_pos == 0) {
196 if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) { 193 if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) {
197 unlock_kernel();
198 return 0; 194 return 0;
199 } 195 }
200 filp->f_pos++; 196 filp->f_pos++;
201 } 197 }
202 198
203 result = do_udf_readdir(dir, filp, filldir, dirent); 199 result = do_udf_readdir(dir, filp, filldir, dirent);
204 unlock_kernel();
205 return result; 200 return result;
206} 201}
207 202
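
No replacement lock appears in udf_readdir(), presumably because the
VFS of this era already serializes ->readdir() against directory
updates by holding i_mutex around the call. A simplified sketch of the
caller-side pattern the filesystem relies on (error and permission
checks from the real vfs_readdir() are omitted):

    /* Sketch of vfs_readdir()-style serialization. */
    static int example_vfs_readdir(struct file *file, void *buf,
                                   filldir_t filler)
    {
            struct inode *inode = file->f_path.dentry->d_inode;
            int res;

            mutex_lock(&inode->i_mutex);
            res = file->f_op->readdir(file, buf, filler);
            mutex_unlock(&inode->i_mutex);
            return res;
    }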
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 66b9e7e7e4c5..89c78486cbbe 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -32,7 +32,6 @@
32#include <linux/string.h> /* memset */ 32#include <linux/string.h> /* memset */
33#include <linux/capability.h> 33#include <linux/capability.h>
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 35#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
38#include <linux/aio.h> 37#include <linux/aio.h>
@@ -114,6 +113,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
114 size_t count = iocb->ki_left; 113 size_t count = iocb->ki_left;
115 struct udf_inode_info *iinfo = UDF_I(inode); 114 struct udf_inode_info *iinfo = UDF_I(inode);
116 115
116 down_write(&iinfo->i_data_sem);
117 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 117 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
118 if (file->f_flags & O_APPEND) 118 if (file->f_flags & O_APPEND)
119 pos = inode->i_size; 119 pos = inode->i_size;
@@ -126,6 +126,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
126 udf_expand_file_adinicb(inode, pos + count, &err); 126 udf_expand_file_adinicb(inode, pos + count, &err);
127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
128 udf_debug("udf_expand_adinicb: err=%d\n", err); 128 udf_debug("udf_expand_adinicb: err=%d\n", err);
129 up_write(&iinfo->i_data_sem);
129 return err; 130 return err;
130 } 131 }
131 } else { 132 } else {
@@ -135,6 +136,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
135 iinfo->i_lenAlloc = inode->i_size; 136 iinfo->i_lenAlloc = inode->i_size;
136 } 137 }
137 } 138 }
139 up_write(&iinfo->i_data_sem);
138 140
139 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); 141 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos);
140 if (retval > 0) 142 if (retval > 0)
@@ -149,8 +151,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
149 long old_block, new_block; 151 long old_block, new_block;
150 int result = -EINVAL; 152 int result = -EINVAL;
151 153
152 lock_kernel();
153
154 if (file_permission(filp, MAY_READ) != 0) { 154 if (file_permission(filp, MAY_READ) != 0) {
155 udf_debug("no permission to access inode %lu\n", inode->i_ino); 155 udf_debug("no permission to access inode %lu\n", inode->i_ino);
156 result = -EPERM; 156 result = -EPERM;
@@ -196,7 +196,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
196 } 196 }
197 197
198out: 198out:
199 unlock_kernel();
200 return result; 199 return result;
201} 200}
202 201
@@ -204,10 +203,10 @@ static int udf_release_file(struct inode *inode, struct file *filp)
204{ 203{
205 if (filp->f_mode & FMODE_WRITE) { 204 if (filp->f_mode & FMODE_WRITE) {
206 mutex_lock(&inode->i_mutex); 205 mutex_lock(&inode->i_mutex);
207 lock_kernel(); 206 down_write(&UDF_I(inode)->i_data_sem);
208 udf_discard_prealloc(inode); 207 udf_discard_prealloc(inode);
209 udf_truncate_tail_extent(inode); 208 udf_truncate_tail_extent(inode);
210 unlock_kernel(); 209 up_write(&UDF_I(inode)->i_data_sem);
211 mutex_unlock(&inode->i_mutex); 210 mutex_unlock(&inode->i_mutex);
212 } 211 }
213 return 0; 212 return 0;
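
The file.c hunks belong to the wider BKL removal in this series: state
that lock_kernel() used to cover (the in-ICB file data and the
allocation type) is now guarded by a per-inode rw_semaphore,
i_data_sem. A sketch of the writer side, with a minimal stand-in for
udf_inode_info:

    #include <linux/rwsem.h>

    struct example_inode_info {
            struct rw_semaphore i_data_sem; /* guards alloc type / ICB data */
    };

    static void example_expand(struct example_inode_info *ei)
    {
            down_write(&ei->i_data_sem); /* exclusive: may change alloc type */
            /* ... expand in-ICB data into real blocks here ... */
            up_write(&ei->i_data_sem);
    }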
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 75d9304d0dc3..6fb7e0adcda0 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -92,28 +92,19 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
92 return NULL; 92 return NULL;
93 } 93 }
94 94
95 mutex_lock(&sbi->s_alloc_mutex);
96 if (sbi->s_lvid_bh) { 95 if (sbi->s_lvid_bh) {
97 struct logicalVolIntegrityDesc *lvid = 96 struct logicalVolIntegrityDescImpUse *lvidiu;
98 (struct logicalVolIntegrityDesc *) 97
99 sbi->s_lvid_bh->b_data; 98 iinfo->i_unique = lvid_get_unique_id(sb);
100 struct logicalVolIntegrityDescImpUse *lvidiu = 99 mutex_lock(&sbi->s_alloc_mutex);
101 udf_sb_lvidiu(sbi); 100 lvidiu = udf_sb_lvidiu(sbi);
102 struct logicalVolHeaderDesc *lvhd;
103 uint64_t uniqueID;
104 lvhd = (struct logicalVolHeaderDesc *)
105 (lvid->logicalVolContentsUse);
106 if (S_ISDIR(mode)) 101 if (S_ISDIR(mode))
107 le32_add_cpu(&lvidiu->numDirs, 1); 102 le32_add_cpu(&lvidiu->numDirs, 1);
108 else 103 else
109 le32_add_cpu(&lvidiu->numFiles, 1); 104 le32_add_cpu(&lvidiu->numFiles, 1);
110 iinfo->i_unique = uniqueID = le64_to_cpu(lvhd->uniqueID);
111 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
112 uniqueID += 16;
113 lvhd->uniqueID = cpu_to_le64(uniqueID);
114 udf_updated_lvid(sb); 105 udf_updated_lvid(sb);
106 mutex_unlock(&sbi->s_alloc_mutex);
115 } 107 }
116 mutex_unlock(&sbi->s_alloc_mutex);
117 108
118 inode_init_owner(inode, dir, mode); 109 inode_init_owner(inode, dir, mode);
119 110
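
udf_new_inode() now takes its uniqueID from an lvid_get_unique_id()
helper instead of open-coding the LVID header update; the helper's
definition presumably lands in the super.c hunks of this series. A
sketch reconstructed from the open-coded logic being removed (field
names as in the old hunk):

    /* Sketch: fetch the next uniqueID from the logical volume
     * integrity descriptor under s_alloc_mutex, skipping values whose
     * low 32 bits wrap to the reserved range 0..15. */
    static u64 example_lvid_get_unique_id(struct super_block *sb)
    {
            struct udf_sb_info *sbi = UDF_SB(sb);
            struct logicalVolIntegrityDesc *lvid;
            struct logicalVolHeaderDesc *lvhd;
            u64 uniqueID, ret;

            if (!sbi->s_lvid_bh)
                    return 0;
            lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
            lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse;

            mutex_lock(&sbi->s_alloc_mutex);
            ret = uniqueID = le64_to_cpu(lvhd->uniqueID);
            if (!(++uniqueID & 0x00000000FFFFFFFFULL))
                    uniqueID += 16;
            lvhd->uniqueID = cpu_to_le64(uniqueID);
            mutex_unlock(&sbi->s_alloc_mutex);
            mark_buffer_dirty(sbi->s_lvid_bh);

            return ret;
    }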
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index fc48f37aa2dd..c6a2e782b97b 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -31,7 +31,6 @@
31 31
32#include "udfdecl.h" 32#include "udfdecl.h"
33#include <linux/mm.h> 33#include <linux/mm.h>
34#include <linux/smp_lock.h>
35#include <linux/module.h> 34#include <linux/module.h>
36#include <linux/pagemap.h> 35#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
@@ -51,6 +50,7 @@ MODULE_LICENSE("GPL");
51static mode_t udf_convert_permissions(struct fileEntry *); 50static mode_t udf_convert_permissions(struct fileEntry *);
52static int udf_update_inode(struct inode *, int); 51static int udf_update_inode(struct inode *, int);
53static void udf_fill_inode(struct inode *, struct buffer_head *); 52static void udf_fill_inode(struct inode *, struct buffer_head *);
53static int udf_sync_inode(struct inode *inode);
54static int udf_alloc_i_data(struct inode *inode, size_t size); 54static int udf_alloc_i_data(struct inode *inode, size_t size);
55static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, 55static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
56 sector_t *, int *); 56 sector_t *, int *);
@@ -79,9 +79,7 @@ void udf_evict_inode(struct inode *inode)
79 want_delete = 1; 79 want_delete = 1;
80 inode->i_size = 0; 80 inode->i_size = 0;
81 udf_truncate(inode); 81 udf_truncate(inode);
82 lock_kernel();
83 udf_update_inode(inode, IS_SYNC(inode)); 82 udf_update_inode(inode, IS_SYNC(inode));
84 unlock_kernel();
85 } 83 }
86 invalidate_inode_buffers(inode); 84 invalidate_inode_buffers(inode);
87 end_writeback(inode); 85 end_writeback(inode);
@@ -97,9 +95,7 @@ void udf_evict_inode(struct inode *inode)
97 kfree(iinfo->i_ext.i_data); 95 kfree(iinfo->i_ext.i_data);
98 iinfo->i_ext.i_data = NULL; 96 iinfo->i_ext.i_data = NULL;
99 if (want_delete) { 97 if (want_delete) {
100 lock_kernel();
101 udf_free_inode(inode); 98 udf_free_inode(inode);
102 unlock_kernel();
103 } 99 }
104} 100}
105 101
@@ -302,10 +298,9 @@ static int udf_get_block(struct inode *inode, sector_t block,
302 err = -EIO; 298 err = -EIO;
303 new = 0; 299 new = 0;
304 bh = NULL; 300 bh = NULL;
305
306 lock_kernel();
307
308 iinfo = UDF_I(inode); 301 iinfo = UDF_I(inode);
302
303 down_write(&iinfo->i_data_sem);
309 if (block == iinfo->i_next_alloc_block + 1) { 304 if (block == iinfo->i_next_alloc_block + 1) {
310 iinfo->i_next_alloc_block++; 305 iinfo->i_next_alloc_block++;
311 iinfo->i_next_alloc_goal++; 306 iinfo->i_next_alloc_goal++;
@@ -324,7 +319,7 @@ static int udf_get_block(struct inode *inode, sector_t block,
324 map_bh(bh_result, inode->i_sb, phys); 319 map_bh(bh_result, inode->i_sb, phys);
325 320
326abort: 321abort:
327 unlock_kernel(); 322 up_write(&iinfo->i_data_sem);
328 return err; 323 return err;
329} 324}
330 325
@@ -1022,16 +1017,16 @@ void udf_truncate(struct inode *inode)
1022 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 1017 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1023 return; 1018 return;
1024 1019
1025 lock_kernel();
1026 iinfo = UDF_I(inode); 1020 iinfo = UDF_I(inode);
1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1021 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1022 down_write(&iinfo->i_data_sem);
1028 if (inode->i_sb->s_blocksize < 1023 if (inode->i_sb->s_blocksize <
1029 (udf_file_entry_alloc_offset(inode) + 1024 (udf_file_entry_alloc_offset(inode) +
1030 inode->i_size)) { 1025 inode->i_size)) {
1031 udf_expand_file_adinicb(inode, inode->i_size, &err); 1026 udf_expand_file_adinicb(inode, inode->i_size, &err);
1032 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1033 inode->i_size = iinfo->i_lenAlloc; 1028 inode->i_size = iinfo->i_lenAlloc;
1034 unlock_kernel(); 1029 up_write(&iinfo->i_data_sem);
1035 return; 1030 return;
1036 } else 1031 } else
1037 udf_truncate_extents(inode); 1032 udf_truncate_extents(inode);
@@ -1042,10 +1037,13 @@ void udf_truncate(struct inode *inode)
1042 offset - udf_file_entry_alloc_offset(inode)); 1037 offset - udf_file_entry_alloc_offset(inode));
1043 iinfo->i_lenAlloc = inode->i_size; 1038 iinfo->i_lenAlloc = inode->i_size;
1044 } 1039 }
1040 up_write(&iinfo->i_data_sem);
1045 } else { 1041 } else {
1046 block_truncate_page(inode->i_mapping, inode->i_size, 1042 block_truncate_page(inode->i_mapping, inode->i_size,
1047 udf_get_block); 1043 udf_get_block);
1044 down_write(&iinfo->i_data_sem);
1048 udf_truncate_extents(inode); 1045 udf_truncate_extents(inode);
1046 up_write(&iinfo->i_data_sem);
1049 } 1047 }
1050 1048
1051 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 1049 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
@@ -1053,7 +1051,6 @@ void udf_truncate(struct inode *inode)
1053 udf_sync_inode(inode); 1051 udf_sync_inode(inode);
1054 else 1052 else
1055 mark_inode_dirty(inode); 1053 mark_inode_dirty(inode);
1056 unlock_kernel();
1057} 1054}
1058 1055
1059static void __udf_read_inode(struct inode *inode) 1056static void __udf_read_inode(struct inode *inode)
@@ -1202,6 +1199,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1202 return; 1199 return;
1203 } 1200 }
1204 1201
1202 read_lock(&sbi->s_cred_lock);
1205 inode->i_uid = le32_to_cpu(fe->uid); 1203 inode->i_uid = le32_to_cpu(fe->uid);
1206 if (inode->i_uid == -1 || 1204 if (inode->i_uid == -1 ||
1207 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || 1205 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) ||
@@ -1214,13 +1212,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1214 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) 1212 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET))
1215 inode->i_gid = UDF_SB(inode->i_sb)->s_gid; 1213 inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
1216 1214
1217 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1218 if (!inode->i_nlink)
1219 inode->i_nlink = 1;
1220
1221 inode->i_size = le64_to_cpu(fe->informationLength);
1222 iinfo->i_lenExtents = inode->i_size;
1223
1224 if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && 1215 if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
1225 sbi->s_fmode != UDF_INVALID_MODE) 1216 sbi->s_fmode != UDF_INVALID_MODE)
1226 inode->i_mode = sbi->s_fmode; 1217 inode->i_mode = sbi->s_fmode;
@@ -1230,6 +1221,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1230 else 1221 else
1231 inode->i_mode = udf_convert_permissions(fe); 1222 inode->i_mode = udf_convert_permissions(fe);
1232 inode->i_mode &= ~sbi->s_umask; 1223 inode->i_mode &= ~sbi->s_umask;
1224 read_unlock(&sbi->s_cred_lock);
1225
1226 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1227 if (!inode->i_nlink)
1228 inode->i_nlink = 1;
1229
1230 inode->i_size = le64_to_cpu(fe->informationLength);
1231 iinfo->i_lenExtents = inode->i_size;
1233 1232
1234 if (iinfo->i_efe == 0) { 1233 if (iinfo->i_efe == 0) {
1235 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << 1234 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1373,16 +1372,10 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
1373 1372
1374int udf_write_inode(struct inode *inode, struct writeback_control *wbc) 1373int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
1375{ 1374{
1376 int ret; 1375 return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1377
1378 lock_kernel();
1379 ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1380 unlock_kernel();
1381
1382 return ret;
1383} 1376}
1384 1377
1385int udf_sync_inode(struct inode *inode) 1378static int udf_sync_inode(struct inode *inode)
1386{ 1379{
1387 return udf_update_inode(inode, 1); 1380 return udf_update_inode(inode, 1);
1388} 1381}
@@ -2048,7 +2041,7 @@ long udf_block_map(struct inode *inode, sector_t block)
2048 struct extent_position epos = {}; 2041 struct extent_position epos = {};
2049 int ret; 2042 int ret;
2050 2043
2051 lock_kernel(); 2044 down_read(&UDF_I(inode)->i_data_sem);
2052 2045
2053 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == 2046 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
2054 (EXT_RECORDED_ALLOCATED >> 30)) 2047 (EXT_RECORDED_ALLOCATED >> 30))
@@ -2056,7 +2049,7 @@ long udf_block_map(struct inode *inode, sector_t block)
2056 else 2049 else
2057 ret = 0; 2050 ret = 0;
2058 2051
2059 unlock_kernel(); 2052 up_read(&UDF_I(inode)->i_data_sem);
2060 brelse(epos.bh); 2053 brelse(epos.bh);
2061 2054
2062 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) 2055 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV))
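
inode.c shows the other half of the i_data_sem conversion: pure lookups
such as udf_block_map() only need the shared side of the semaphore,
while the uid/gid/mode mount options gain their own s_cred_lock rwlock.
A sketch of the read path, reusing the minimal inode-info stand-in from
the file.c note above:

    /* Sketch: an extent lookup only reads the mapping, so a shared
     * hold is enough and concurrent readers do not serialize. */
    static long example_block_map(struct example_inode_info *ei)
    {
            long phys = 0;

            down_read(&ei->i_data_sem);
            /* ... inode_bmap()-style extent walk fills phys ... */
            up_read(&ei->i_data_sem);
            return phys;
    }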
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 6d8dc02baebb..2be0f9eb86d2 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/sched.h> 31#include <linux/sched.h>
33#include <linux/crc-itu-t.h> 32#include <linux/crc-itu-t.h>
@@ -228,10 +227,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
228 } 227 }
229 228
230 if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && 229 if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) &&
231 isdotdot) { 230 isdotdot)
232 brelse(epos.bh); 231 goto out_ok;
233 return fi;
234 }
235 232
236 if (!lfi) 233 if (!lfi)
237 continue; 234 continue;
@@ -263,7 +260,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
263 if (dentry->d_name.len > UDF_NAME_LEN - 2) 260 if (dentry->d_name.len > UDF_NAME_LEN - 2)
264 return ERR_PTR(-ENAMETOOLONG); 261 return ERR_PTR(-ENAMETOOLONG);
265 262
266 lock_kernel();
267#ifdef UDF_RECOVERY 263#ifdef UDF_RECOVERY
268 /* temporary shorthand for specifying files by inode number */ 264 /* temporary shorthand for specifying files by inode number */
269 if (!strncmp(dentry->d_name.name, ".B=", 3)) { 265 if (!strncmp(dentry->d_name.name, ".B=", 3)) {
@@ -275,7 +271,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
275 }; 271 };
276 inode = udf_iget(dir->i_sb, lb); 272 inode = udf_iget(dir->i_sb, lb);
277 if (!inode) { 273 if (!inode) {
278 unlock_kernel();
279 return ERR_PTR(-EACCES); 274 return ERR_PTR(-EACCES);
280 } 275 }
281 } else 276 } else
@@ -291,11 +286,9 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
291 loc = lelb_to_cpu(cfi.icb.extLocation); 286 loc = lelb_to_cpu(cfi.icb.extLocation);
292 inode = udf_iget(dir->i_sb, &loc); 287 inode = udf_iget(dir->i_sb, &loc);
293 if (!inode) { 288 if (!inode) {
294 unlock_kernel();
295 return ERR_PTR(-EACCES); 289 return ERR_PTR(-EACCES);
296 } 290 }
297 } 291 }
298 unlock_kernel();
299 292
300 return d_splice_alias(inode, dentry); 293 return d_splice_alias(inode, dentry);
301} 294}
@@ -476,15 +469,19 @@ add:
476 f_pos >> dir->i_sb->s_blocksize_bits, 1, err); 469 f_pos >> dir->i_sb->s_blocksize_bits, 1, err);
477 if (!fibh->ebh) 470 if (!fibh->ebh)
478 goto out_err; 471 goto out_err;
472 /* Extents could have been merged, invalidate our position */
473 brelse(epos.bh);
474 epos.bh = NULL;
475 epos.block = dinfo->i_location;
476 epos.offset = udf_file_entry_alloc_offset(dir);
479 477
480 if (!fibh->soffset) { 478 if (!fibh->soffset) {
481 if (udf_next_aext(dir, &epos, &eloc, &elen, 1) == 479 /* Find the freshly allocated block */
482 (EXT_RECORDED_ALLOCATED >> 30)) { 480 while (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
483 block = eloc.logicalBlockNum + ((elen - 1) >> 481 (EXT_RECORDED_ALLOCATED >> 30))
482 ;
483 block = eloc.logicalBlockNum + ((elen - 1) >>
484 dir->i_sb->s_blocksize_bits); 484 dir->i_sb->s_blocksize_bits);
485 } else
486 block++;
487
488 brelse(fibh->sbh); 485 brelse(fibh->sbh);
489 fibh->sbh = fibh->ebh; 486 fibh->sbh = fibh->ebh;
490 fi = (struct fileIdentDesc *)(fibh->sbh->b_data); 487 fi = (struct fileIdentDesc *)(fibh->sbh->b_data);
@@ -562,10 +559,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
562 int err; 559 int err;
563 struct udf_inode_info *iinfo; 560 struct udf_inode_info *iinfo;
564 561
565 lock_kernel();
566 inode = udf_new_inode(dir, mode, &err); 562 inode = udf_new_inode(dir, mode, &err);
567 if (!inode) { 563 if (!inode) {
568 unlock_kernel();
569 return err; 564 return err;
570 } 565 }
571 566
@@ -583,7 +578,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
583 inode->i_nlink--; 578 inode->i_nlink--;
584 mark_inode_dirty(inode); 579 mark_inode_dirty(inode);
585 iput(inode); 580 iput(inode);
586 unlock_kernel();
587 return err; 581 return err;
588 } 582 }
589 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 583 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -596,7 +590,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
596 if (fibh.sbh != fibh.ebh) 590 if (fibh.sbh != fibh.ebh)
597 brelse(fibh.ebh); 591 brelse(fibh.ebh);
598 brelse(fibh.sbh); 592 brelse(fibh.sbh);
599 unlock_kernel();
600 d_instantiate(dentry, inode); 593 d_instantiate(dentry, inode);
601 594
602 return 0; 595 return 0;
@@ -614,7 +607,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
614 if (!old_valid_dev(rdev)) 607 if (!old_valid_dev(rdev))
615 return -EINVAL; 608 return -EINVAL;
616 609
617 lock_kernel();
618 err = -EIO; 610 err = -EIO;
619 inode = udf_new_inode(dir, mode, &err); 611 inode = udf_new_inode(dir, mode, &err);
620 if (!inode) 612 if (!inode)
@@ -627,7 +619,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
627 inode->i_nlink--; 619 inode->i_nlink--;
628 mark_inode_dirty(inode); 620 mark_inode_dirty(inode);
629 iput(inode); 621 iput(inode);
630 unlock_kernel();
631 return err; 622 return err;
632 } 623 }
633 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 624 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -646,7 +637,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
646 err = 0; 637 err = 0;
647 638
648out: 639out:
649 unlock_kernel();
650 return err; 640 return err;
651} 641}
652 642
@@ -659,7 +649,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
659 struct udf_inode_info *dinfo = UDF_I(dir); 649 struct udf_inode_info *dinfo = UDF_I(dir);
660 struct udf_inode_info *iinfo; 650 struct udf_inode_info *iinfo;
661 651
662 lock_kernel();
663 err = -EMLINK; 652 err = -EMLINK;
664 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 653 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
665 goto out; 654 goto out;
@@ -712,7 +701,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
712 err = 0; 701 err = 0;
713 702
714out: 703out:
715 unlock_kernel();
716 return err; 704 return err;
717} 705}
718 706
@@ -794,7 +782,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
794 struct kernel_lb_addr tloc; 782 struct kernel_lb_addr tloc;
795 783
796 retval = -ENOENT; 784 retval = -ENOENT;
797 lock_kernel();
798 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 785 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
799 if (!fi) 786 if (!fi)
800 goto out; 787 goto out;
@@ -826,7 +813,6 @@ end_rmdir:
826 brelse(fibh.sbh); 813 brelse(fibh.sbh);
827 814
828out: 815out:
829 unlock_kernel();
830 return retval; 816 return retval;
831} 817}
832 818
@@ -840,7 +826,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
840 struct kernel_lb_addr tloc; 826 struct kernel_lb_addr tloc;
841 827
842 retval = -ENOENT; 828 retval = -ENOENT;
843 lock_kernel();
844 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 829 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
845 if (!fi) 830 if (!fi)
846 goto out; 831 goto out;
@@ -870,7 +855,6 @@ end_unlink:
870 brelse(fibh.sbh); 855 brelse(fibh.sbh);
871 856
872out: 857out:
873 unlock_kernel();
874 return retval; 858 return retval;
875} 859}
876 860
@@ -890,21 +874,21 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
890 int block; 874 int block;
891 unsigned char *name = NULL; 875 unsigned char *name = NULL;
892 int namelen; 876 int namelen;
893 struct buffer_head *bh;
894 struct udf_inode_info *iinfo; 877 struct udf_inode_info *iinfo;
878 struct super_block *sb = dir->i_sb;
895 879
896 lock_kernel();
897 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); 880 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
898 if (!inode) 881 if (!inode)
899 goto out; 882 goto out;
900 883
884 iinfo = UDF_I(inode);
885 down_write(&iinfo->i_data_sem);
901 name = kmalloc(UDF_NAME_LEN, GFP_NOFS); 886 name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
902 if (!name) { 887 if (!name) {
903 err = -ENOMEM; 888 err = -ENOMEM;
904 goto out_no_entry; 889 goto out_no_entry;
905 } 890 }
906 891
907 iinfo = UDF_I(inode);
908 inode->i_data.a_ops = &udf_symlink_aops; 892 inode->i_data.a_ops = &udf_symlink_aops;
909 inode->i_op = &udf_symlink_inode_operations; 893 inode->i_op = &udf_symlink_inode_operations;
910 894
@@ -912,7 +896,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
912 struct kernel_lb_addr eloc; 896 struct kernel_lb_addr eloc;
913 uint32_t bsize; 897 uint32_t bsize;
914 898
915 block = udf_new_block(inode->i_sb, inode, 899 block = udf_new_block(sb, inode,
916 iinfo->i_location.partitionReferenceNum, 900 iinfo->i_location.partitionReferenceNum,
917 iinfo->i_location.logicalBlockNum, &err); 901 iinfo->i_location.logicalBlockNum, &err);
918 if (!block) 902 if (!block)
@@ -923,17 +907,17 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
923 eloc.logicalBlockNum = block; 907 eloc.logicalBlockNum = block;
924 eloc.partitionReferenceNum = 908 eloc.partitionReferenceNum =
925 iinfo->i_location.partitionReferenceNum; 909 iinfo->i_location.partitionReferenceNum;
926 bsize = inode->i_sb->s_blocksize; 910 bsize = sb->s_blocksize;
927 iinfo->i_lenExtents = bsize; 911 iinfo->i_lenExtents = bsize;
928 udf_add_aext(inode, &epos, &eloc, bsize, 0); 912 udf_add_aext(inode, &epos, &eloc, bsize, 0);
929 brelse(epos.bh); 913 brelse(epos.bh);
930 914
931 block = udf_get_pblock(inode->i_sb, block, 915 block = udf_get_pblock(sb, block,
932 iinfo->i_location.partitionReferenceNum, 916 iinfo->i_location.partitionReferenceNum,
933 0); 917 0);
934 epos.bh = udf_tgetblk(inode->i_sb, block); 918 epos.bh = udf_tgetblk(sb, block);
935 lock_buffer(epos.bh); 919 lock_buffer(epos.bh);
936 memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize); 920 memset(epos.bh->b_data, 0x00, bsize);
937 set_buffer_uptodate(epos.bh); 921 set_buffer_uptodate(epos.bh);
938 unlock_buffer(epos.bh); 922 unlock_buffer(epos.bh);
939 mark_buffer_dirty_inode(epos.bh, inode); 923 mark_buffer_dirty_inode(epos.bh, inode);
@@ -941,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
941 } else 925 } else
942 ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr; 926 ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
943 927
944 eoffset = inode->i_sb->s_blocksize - udf_ext0_offset(inode); 928 eoffset = sb->s_blocksize - udf_ext0_offset(inode);
945 pc = (struct pathComponent *)ea; 929 pc = (struct pathComponent *)ea;
946 930
947 if (*symname == '/') { 931 if (*symname == '/') {
@@ -981,7 +965,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
981 } 965 }
982 966
983 if (pc->componentType == 5) { 967 if (pc->componentType == 5) {
984 namelen = udf_put_filename(inode->i_sb, compstart, name, 968 namelen = udf_put_filename(sb, compstart, name,
985 symname - compstart); 969 symname - compstart);
986 if (!namelen) 970 if (!namelen)
987 goto out_no_entry; 971 goto out_no_entry;
@@ -1015,27 +999,16 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1015 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 999 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1016 if (!fi) 1000 if (!fi)
1017 goto out_no_entry; 1001 goto out_no_entry;
1018 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 1002 cfi.icb.extLength = cpu_to_le32(sb->s_blocksize);
1019 cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); 1003 cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
1020 bh = UDF_SB(inode->i_sb)->s_lvid_bh; 1004 if (UDF_SB(inode->i_sb)->s_lvid_bh) {
1021 if (bh) {
1022 struct logicalVolIntegrityDesc *lvid =
1023 (struct logicalVolIntegrityDesc *)bh->b_data;
1024 struct logicalVolHeaderDesc *lvhd;
1025 uint64_t uniqueID;
1026 lvhd = (struct logicalVolHeaderDesc *)
1027 lvid->logicalVolContentsUse;
1028 uniqueID = le64_to_cpu(lvhd->uniqueID);
1029 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = 1005 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
1030 cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); 1006 cpu_to_le32(lvid_get_unique_id(sb));
1031 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
1032 uniqueID += 16;
1033 lvhd->uniqueID = cpu_to_le64(uniqueID);
1034 mark_buffer_dirty(bh);
1035 } 1007 }
1036 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); 1008 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
1037 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1009 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
1038 mark_inode_dirty(dir); 1010 mark_inode_dirty(dir);
1011 up_write(&iinfo->i_data_sem);
1039 if (fibh.sbh != fibh.ebh) 1012 if (fibh.sbh != fibh.ebh)
1040 brelse(fibh.ebh); 1013 brelse(fibh.ebh);
1041 brelse(fibh.sbh); 1014 brelse(fibh.sbh);
@@ -1044,10 +1017,10 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1044 1017
1045out: 1018out:
1046 kfree(name); 1019 kfree(name);
1047 unlock_kernel();
1048 return err; 1020 return err;
1049 1021
1050out_no_entry: 1022out_no_entry:
1023 up_write(&iinfo->i_data_sem);
1051 inode_dec_link_count(inode); 1024 inode_dec_link_count(inode);
1052 iput(inode); 1025 iput(inode);
1053 goto out; 1026 goto out;
@@ -1060,36 +1033,20 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1060 struct udf_fileident_bh fibh; 1033 struct udf_fileident_bh fibh;
1061 struct fileIdentDesc cfi, *fi; 1034 struct fileIdentDesc cfi, *fi;
1062 int err; 1035 int err;
1063 struct buffer_head *bh;
1064 1036
1065 lock_kernel();
1066 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1037 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1067 unlock_kernel();
1068 return -EMLINK; 1038 return -EMLINK;
1069 } 1039 }
1070 1040
1071 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 1041 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1072 if (!fi) { 1042 if (!fi) {
1073 unlock_kernel();
1074 return err; 1043 return err;
1075 } 1044 }
1076 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 1045 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
1077 cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); 1046 cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location);
1078 bh = UDF_SB(inode->i_sb)->s_lvid_bh; 1047 if (UDF_SB(inode->i_sb)->s_lvid_bh) {
1079 if (bh) {
1080 struct logicalVolIntegrityDesc *lvid =
1081 (struct logicalVolIntegrityDesc *)bh->b_data;
1082 struct logicalVolHeaderDesc *lvhd;
1083 uint64_t uniqueID;
1084 lvhd = (struct logicalVolHeaderDesc *)
1085 (lvid->logicalVolContentsUse);
1086 uniqueID = le64_to_cpu(lvhd->uniqueID);
1087 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = 1048 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
1088 cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); 1049 cpu_to_le32(lvid_get_unique_id(inode->i_sb));
1089 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
1090 uniqueID += 16;
1091 lvhd->uniqueID = cpu_to_le64(uniqueID);
1092 mark_buffer_dirty(bh);
1093 } 1050 }
1094 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); 1051 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
1095 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1052 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
@@ -1103,7 +1060,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1103 mark_inode_dirty(inode); 1060 mark_inode_dirty(inode);
1104 ihold(inode); 1061 ihold(inode);
1105 d_instantiate(dentry, inode); 1062 d_instantiate(dentry, inode);
1106 unlock_kernel();
1107 1063
1108 return 0; 1064 return 0;
1109} 1065}
@@ -1124,7 +1080,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1124 struct kernel_lb_addr tloc; 1080 struct kernel_lb_addr tloc;
1125 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1081 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1126 1082
1127 lock_kernel();
1128 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1083 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1129 if (ofi) { 1084 if (ofi) {
1130 if (ofibh.sbh != ofibh.ebh) 1085 if (ofibh.sbh != ofibh.ebh)
@@ -1248,7 +1203,6 @@ end_rename:
1248 brelse(nfibh.ebh); 1203 brelse(nfibh.ebh);
1249 brelse(nfibh.sbh); 1204 brelse(nfibh.sbh);
1250 } 1205 }
1251 unlock_kernel();
1252 1206
1253 return retval; 1207 return retval;
1254} 1208}
@@ -1261,7 +1215,6 @@ static struct dentry *udf_get_parent(struct dentry *child)
1261 struct fileIdentDesc cfi; 1215 struct fileIdentDesc cfi;
1262 struct udf_fileident_bh fibh; 1216 struct udf_fileident_bh fibh;
1263 1217
1264 lock_kernel();
1265 if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) 1218 if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
1266 goto out_unlock; 1219 goto out_unlock;
1267 1220
@@ -1273,11 +1226,9 @@ static struct dentry *udf_get_parent(struct dentry *child)
1273 inode = udf_iget(child->d_inode->i_sb, &tloc); 1226 inode = udf_iget(child->d_inode->i_sb, &tloc);
1274 if (!inode) 1227 if (!inode)
1275 goto out_unlock; 1228 goto out_unlock;
1276 unlock_kernel();
1277 1229
1278 return d_obtain_alias(inode); 1230 return d_obtain_alias(inode);
1279out_unlock: 1231out_unlock:
1280 unlock_kernel();
1281 return ERR_PTR(-EACCES); 1232 return ERR_PTR(-EACCES);
1282} 1233}
1283 1234
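The hunks above replace udf_link()'s open-coded uniqueID bumping with a call to lvid_get_unique_id() (added in the super.c hunk further below). The allocation rule itself is unchanged: only the low 32 bits of the ID are handed out, and when they wrap to zero the reserved values 1..15 are skipped. A minimal, runnable user-space sketch of just that arithmetic (next_unique_id is an illustrative name, not a kernel function):

#include <stdint.h>
#include <stdio.h>

/* Mirrors the rule visible in the hunk: after incrementing, if the low
 * 32 bits wrapped to 0, skip the reserved IDs 1..15 by adding 16. */
static uint64_t next_unique_id(uint64_t id)
{
	if (!(++id & 0xFFFFFFFFULL))
		id += 16;
	return id;
}

int main(void)
{
	uint64_t id = 0xFFFFFFFFULL;	/* low 32 bits about to wrap */

	/* prints 0x100000010: the wrap skipped the reserved IDs 1..15 */
	printf("next id: 0x%llx\n", (unsigned long long)next_unique_id(id));
	return 0;
}

Note that lvid_get_unique_id() itself returns the pre-increment value; the sketch only demonstrates the wraparound step.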
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 745eb209be0c..a71090ea0e07 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -25,6 +25,7 @@
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/mutex.h>
28 29
29uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, 30uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
30 uint16_t partition, uint32_t offset) 31 uint16_t partition, uint32_t offset)
@@ -159,7 +160,9 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
159 struct udf_sb_info *sbi = UDF_SB(sb); 160 struct udf_sb_info *sbi = UDF_SB(sb);
160 u16 reallocationTableLen; 161 u16 reallocationTableLen;
161 struct buffer_head *bh; 162 struct buffer_head *bh;
163 int ret = 0;
162 164
165 mutex_lock(&sbi->s_alloc_mutex);
163 for (i = 0; i < sbi->s_partitions; i++) { 166 for (i = 0; i < sbi->s_partitions; i++) {
164 struct udf_part_map *map = &sbi->s_partmaps[i]; 167 struct udf_part_map *map = &sbi->s_partmaps[i];
165 if (old_block > map->s_partition_root && 168 if (old_block > map->s_partition_root &&
@@ -175,8 +178,10 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
175 break; 178 break;
176 } 179 }
177 180
178 if (!st) 181 if (!st) {
179 return 1; 182 ret = 1;
183 goto out;
184 }
180 185
181 reallocationTableLen = 186 reallocationTableLen =
182 le16_to_cpu(st->reallocationTableLen); 187 le16_to_cpu(st->reallocationTableLen);
@@ -207,14 +212,16 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
207 ((old_block - 212 ((old_block -
208 map->s_partition_root) & 213 map->s_partition_root) &
209 (sdata->s_packet_len - 1)); 214 (sdata->s_packet_len - 1));
210 return 0; 215 ret = 0;
216 goto out;
211 } else if (origLoc == packet) { 217 } else if (origLoc == packet) {
212 *new_block = le32_to_cpu( 218 *new_block = le32_to_cpu(
213 entry->mappedLocation) + 219 entry->mappedLocation) +
214 ((old_block - 220 ((old_block -
215 map->s_partition_root) & 221 map->s_partition_root) &
216 (sdata->s_packet_len - 1)); 222 (sdata->s_packet_len - 1));
217 return 0; 223 ret = 0;
224 goto out;
218 } else if (origLoc > packet) 225 } else if (origLoc > packet)
219 break; 226 break;
220 } 227 }
@@ -251,20 +258,24 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
251 st->mapEntry[k].mappedLocation) + 258 st->mapEntry[k].mappedLocation) +
252 ((old_block - map->s_partition_root) & 259 ((old_block - map->s_partition_root) &
253 (sdata->s_packet_len - 1)); 260 (sdata->s_packet_len - 1));
254 return 0; 261 ret = 0;
262 goto out;
255 } 263 }
256 264
257 return 1; 265 ret = 1;
266 goto out;
258 } /* if old_block */ 267 } /* if old_block */
259 } 268 }
260 269
261 if (i == sbi->s_partitions) { 270 if (i == sbi->s_partitions) {
262 /* outside of partitions */ 271 /* outside of partitions */
263 /* for now, fail =) */ 272 /* for now, fail =) */
264 return 1; 273 ret = 1;
265 } 274 }
266 275
267 return 0; 276out:
277 mutex_unlock(&sbi->s_alloc_mutex);
278 return ret;
268} 279}
269 280
270static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block, 281static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
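udf_relocate_blocks() previously returned from half a dozen places inside the sparing-table walk; once that walk runs under s_alloc_mutex, every early return has to be funneled through a single unlock site. The pattern the hunks above adopt, reduced to a sketch (the table-walk details are elided):

	int ret = 0;

	mutex_lock(&sbi->s_alloc_mutex);
	/* ... walk the sparing tables ... */
	if (!st) {			/* no table found: was "return 1;" */
		ret = 1;
		goto out;
	}
	/* ... remap old_block, setting ret = 0 on success ... */
out:
	mutex_unlock(&sbi->s_alloc_mutex);
	return ret;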
diff --git a/fs/udf/super.c b/fs/udf/super.c
index b539d53320fb..7b27b063ff6d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -48,7 +48,6 @@
48#include <linux/stat.h> 48#include <linux/stat.h>
49#include <linux/cdrom.h> 49#include <linux/cdrom.h>
50#include <linux/nls.h> 50#include <linux/nls.h>
51#include <linux/smp_lock.h>
52#include <linux/buffer_head.h> 51#include <linux/buffer_head.h>
53#include <linux/vfs.h> 52#include <linux/vfs.h>
54#include <linux/vmalloc.h> 53#include <linux/vmalloc.h>
@@ -135,6 +134,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
135 ei->i_next_alloc_block = 0; 134 ei->i_next_alloc_block = 0;
136 ei->i_next_alloc_goal = 0; 135 ei->i_next_alloc_goal = 0;
137 ei->i_strat4096 = 0; 136 ei->i_strat4096 = 0;
137 init_rwsem(&ei->i_data_sem);
138 138
139 return &ei->vfs_inode; 139 return &ei->vfs_inode;
140} 140}
@@ -574,13 +574,14 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
574 if (!udf_parse_options(options, &uopt, true)) 574 if (!udf_parse_options(options, &uopt, true))
575 return -EINVAL; 575 return -EINVAL;
576 576
577 lock_kernel(); 577 write_lock(&sbi->s_cred_lock);
578 sbi->s_flags = uopt.flags; 578 sbi->s_flags = uopt.flags;
579 sbi->s_uid = uopt.uid; 579 sbi->s_uid = uopt.uid;
580 sbi->s_gid = uopt.gid; 580 sbi->s_gid = uopt.gid;
581 sbi->s_umask = uopt.umask; 581 sbi->s_umask = uopt.umask;
582 sbi->s_fmode = uopt.fmode; 582 sbi->s_fmode = uopt.fmode;
583 sbi->s_dmode = uopt.dmode; 583 sbi->s_dmode = uopt.dmode;
584 write_unlock(&sbi->s_cred_lock);
584 585
585 if (sbi->s_lvid_bh) { 586 if (sbi->s_lvid_bh) {
586 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); 587 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -597,7 +598,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
597 udf_open_lvid(sb); 598 udf_open_lvid(sb);
598 599
599out_unlock: 600out_unlock:
600 unlock_kernel();
601 return error; 601 return error;
602} 602}
603 603
@@ -966,9 +966,9 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
966 (sizeof(struct buffer_head *) * nr_groups); 966 (sizeof(struct buffer_head *) * nr_groups);
967 967
968 if (size <= PAGE_SIZE) 968 if (size <= PAGE_SIZE)
969 bitmap = kmalloc(size, GFP_KERNEL); 969 bitmap = kzalloc(size, GFP_KERNEL);
970 else 970 else
971 bitmap = vmalloc(size); /* TODO: get rid of vmalloc */ 971 bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
972 972
973 if (bitmap == NULL) { 973 if (bitmap == NULL) {
974 udf_error(sb, __func__, 974 udf_error(sb, __func__,
@@ -977,7 +977,6 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
977 return NULL; 977 return NULL;
978 } 978 }
979 979
980 memset(bitmap, 0x00, size);
981 bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1); 980 bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);
982 bitmap->s_nr_groups = nr_groups; 981 bitmap->s_nr_groups = nr_groups;
983 return bitmap; 982 return bitmap;
@@ -1781,6 +1780,8 @@ static void udf_open_lvid(struct super_block *sb)
1781 1780
1782 if (!bh) 1781 if (!bh)
1783 return; 1782 return;
1783
1784 mutex_lock(&sbi->s_alloc_mutex);
1784 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1785 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1785 lvidiu = udf_sb_lvidiu(sbi); 1786 lvidiu = udf_sb_lvidiu(sbi);
1786 1787
@@ -1797,6 +1798,7 @@ static void udf_open_lvid(struct super_block *sb)
1797 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1798 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1798 mark_buffer_dirty(bh); 1799 mark_buffer_dirty(bh);
1799 sbi->s_lvid_dirty = 0; 1800 sbi->s_lvid_dirty = 0;
1801 mutex_unlock(&sbi->s_alloc_mutex);
1800} 1802}
1801 1803
1802static void udf_close_lvid(struct super_block *sb) 1804static void udf_close_lvid(struct super_block *sb)
@@ -1809,6 +1811,7 @@ static void udf_close_lvid(struct super_block *sb)
1809 if (!bh) 1811 if (!bh)
1810 return; 1812 return;
1811 1813
1814 mutex_lock(&sbi->s_alloc_mutex);
1812 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1815 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1813 lvidiu = udf_sb_lvidiu(sbi); 1816 lvidiu = udf_sb_lvidiu(sbi);
1814 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1817 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
@@ -1829,6 +1832,34 @@ static void udf_close_lvid(struct super_block *sb)
1829 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1832 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1830 mark_buffer_dirty(bh); 1833 mark_buffer_dirty(bh);
1831 sbi->s_lvid_dirty = 0; 1834 sbi->s_lvid_dirty = 0;
1835 mutex_unlock(&sbi->s_alloc_mutex);
1836}
1837
1838u64 lvid_get_unique_id(struct super_block *sb)
1839{
1840 struct buffer_head *bh;
1841 struct udf_sb_info *sbi = UDF_SB(sb);
1842 struct logicalVolIntegrityDesc *lvid;
1843 struct logicalVolHeaderDesc *lvhd;
1844 u64 uniqueID;
1845 u64 ret;
1846
1847 bh = sbi->s_lvid_bh;
1848 if (!bh)
1849 return 0;
1850
1851 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1852 lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse;
1853
1854 mutex_lock(&sbi->s_alloc_mutex);
1855 ret = uniqueID = le64_to_cpu(lvhd->uniqueID);
1856 if (!(++uniqueID & 0xFFFFFFFF))
1857 uniqueID += 16;
1858 lvhd->uniqueID = cpu_to_le64(uniqueID);
1859 mutex_unlock(&sbi->s_alloc_mutex);
1860 mark_buffer_dirty(bh);
1861
1862 return ret;
1832} 1863}
1833 1864
1834static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) 1865static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1886,8 +1917,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1886 struct kernel_lb_addr rootdir, fileset; 1917 struct kernel_lb_addr rootdir, fileset;
1887 struct udf_sb_info *sbi; 1918 struct udf_sb_info *sbi;
1888 1919
1889 lock_kernel();
1890
1891 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); 1920 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
1892 uopt.uid = -1; 1921 uopt.uid = -1;
1893 uopt.gid = -1; 1922 uopt.gid = -1;
@@ -1896,10 +1925,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1896 uopt.dmode = UDF_INVALID_MODE; 1925 uopt.dmode = UDF_INVALID_MODE;
1897 1926
1898 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); 1927 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
1899 if (!sbi) { 1928 if (!sbi)
1900 unlock_kernel();
1901 return -ENOMEM; 1929 return -ENOMEM;
1902 }
1903 1930
1904 sb->s_fs_info = sbi; 1931 sb->s_fs_info = sbi;
1905 1932
@@ -1936,6 +1963,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1936 sbi->s_fmode = uopt.fmode; 1963 sbi->s_fmode = uopt.fmode;
1937 sbi->s_dmode = uopt.dmode; 1964 sbi->s_dmode = uopt.dmode;
1938 sbi->s_nls_map = uopt.nls_map; 1965 sbi->s_nls_map = uopt.nls_map;
1966 rwlock_init(&sbi->s_cred_lock);
1939 1967
1940 if (uopt.session == 0xFFFFFFFF) 1968 if (uopt.session == 0xFFFFFFFF)
1941 sbi->s_session = udf_get_last_session(sb); 1969 sbi->s_session = udf_get_last_session(sb);
@@ -2045,7 +2073,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2045 goto error_out; 2073 goto error_out;
2046 } 2074 }
2047 sb->s_maxbytes = MAX_LFS_FILESIZE; 2075 sb->s_maxbytes = MAX_LFS_FILESIZE;
2048 unlock_kernel();
2049 return 0; 2076 return 0;
2050 2077
2051error_out: 2078error_out:
@@ -2066,7 +2093,6 @@ error_out:
2066 kfree(sbi); 2093 kfree(sbi);
2067 sb->s_fs_info = NULL; 2094 sb->s_fs_info = NULL;
2068 2095
2069 unlock_kernel();
2070 return -EINVAL; 2096 return -EINVAL;
2071} 2097}
2072 2098
@@ -2105,8 +2131,6 @@ static void udf_put_super(struct super_block *sb)
2105 2131
2106 sbi = UDF_SB(sb); 2132 sbi = UDF_SB(sb);
2107 2133
2108 lock_kernel();
2109
2110 if (sbi->s_vat_inode) 2134 if (sbi->s_vat_inode)
2111 iput(sbi->s_vat_inode); 2135 iput(sbi->s_vat_inode);
2112 if (sbi->s_partitions) 2136 if (sbi->s_partitions)
@@ -2122,8 +2146,6 @@ static void udf_put_super(struct super_block *sb)
2122 kfree(sbi->s_partmaps); 2146 kfree(sbi->s_partmaps);
2123 kfree(sb->s_fs_info); 2147 kfree(sb->s_fs_info);
2124 sb->s_fs_info = NULL; 2148 sb->s_fs_info = NULL;
2125
2126 unlock_kernel();
2127} 2149}
2128 2150
2129static int udf_sync_fs(struct super_block *sb, int wait) 2151static int udf_sync_fs(struct super_block *sb, int wait)
@@ -2186,8 +2208,6 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2186 uint16_t ident; 2208 uint16_t ident;
2187 struct spaceBitmapDesc *bm; 2209 struct spaceBitmapDesc *bm;
2188 2210
2189 lock_kernel();
2190
2191 loc.logicalBlockNum = bitmap->s_extPosition; 2211 loc.logicalBlockNum = bitmap->s_extPosition;
2192 loc.partitionReferenceNum = UDF_SB(sb)->s_partition; 2212 loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
2193 bh = udf_read_ptagged(sb, &loc, 0, &ident); 2213 bh = udf_read_ptagged(sb, &loc, 0, &ident);
@@ -2224,10 +2244,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2224 } 2244 }
2225 } 2245 }
2226 brelse(bh); 2246 brelse(bh);
2227
2228out: 2247out:
2229 unlock_kernel();
2230
2231 return accum; 2248 return accum;
2232} 2249}
2233 2250
@@ -2240,8 +2257,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2240 int8_t etype; 2257 int8_t etype;
2241 struct extent_position epos; 2258 struct extent_position epos;
2242 2259
2243 lock_kernel(); 2260 mutex_lock(&UDF_SB(sb)->s_alloc_mutex);
2244
2245 epos.block = UDF_I(table)->i_location; 2261 epos.block = UDF_I(table)->i_location;
2246 epos.offset = sizeof(struct unallocSpaceEntry); 2262 epos.offset = sizeof(struct unallocSpaceEntry);
2247 epos.bh = NULL; 2263 epos.bh = NULL;
@@ -2250,8 +2266,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2250 accum += (elen >> table->i_sb->s_blocksize_bits); 2266 accum += (elen >> table->i_sb->s_blocksize_bits);
2251 2267
2252 brelse(epos.bh); 2268 brelse(epos.bh);
2253 2269 mutex_unlock(&UDF_SB(sb)->s_alloc_mutex);
2254 unlock_kernel();
2255 2270
2256 return accum; 2271 return accum;
2257} 2272}
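Two finer-grained locks replace the BKL in super.c: s_alloc_mutex serializes LVID and allocation updates, while the new s_cred_lock rwlock protects the default uid/gid/mode settings that remount can change. Remount takes the write side (see the hunk above); a reader applying the defaults would pair it with the read side. A hedged sketch, assuming an illustrative helper name:

static mode_t udf_default_fmode(struct super_block *sb)
{
	struct udf_sb_info *sbi = UDF_SB(sb);
	mode_t mode;

	read_lock(&sbi->s_cred_lock);	/* pairs with write_lock in remount */
	mode = sbi->s_fmode;
	read_unlock(&sbi->s_cred_lock);

	return mode;
}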
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 16064787d2b7..b1d4488b0f14 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -27,7 +27,6 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/pagemap.h> 29#include <linux/pagemap.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include "udf_i.h" 31#include "udf_i.h"
33 32
@@ -78,13 +77,16 @@ static int udf_symlink_filler(struct file *file, struct page *page)
78 int err = -EIO; 77 int err = -EIO;
79 unsigned char *p = kmap(page); 78 unsigned char *p = kmap(page);
80 struct udf_inode_info *iinfo; 79 struct udf_inode_info *iinfo;
80 uint32_t pos;
81 81
82 lock_kernel();
83 iinfo = UDF_I(inode); 82 iinfo = UDF_I(inode);
83 pos = udf_block_map(inode, 0);
84
85 down_read(&iinfo->i_data_sem);
84 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 86 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
85 symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr; 87 symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
86 } else { 88 } else {
87 bh = sb_bread(inode->i_sb, udf_block_map(inode, 0)); 89 bh = sb_bread(inode->i_sb, pos);
88 90
89 if (!bh) 91 if (!bh)
90 goto out; 92 goto out;
@@ -95,14 +97,14 @@ static int udf_symlink_filler(struct file *file, struct page *page)
95 udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p); 97 udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p);
96 brelse(bh); 98 brelse(bh);
97 99
98 unlock_kernel(); 100 up_read(&iinfo->i_data_sem);
99 SetPageUptodate(page); 101 SetPageUptodate(page);
100 kunmap(page); 102 kunmap(page);
101 unlock_page(page); 103 unlock_page(page);
102 return 0; 104 return 0;
103 105
104out: 106out:
105 unlock_kernel(); 107 up_read(&iinfo->i_data_sem);
106 SetPageError(page); 108 SetPageError(page);
107 kunmap(page); 109 kunmap(page);
108 unlock_page(page); 110 unlock_page(page);
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index e58d1de41073..d1bd31ea724e 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -1,6 +1,18 @@
1#ifndef _UDF_I_H 1#ifndef _UDF_I_H
2#define _UDF_I_H 2#define _UDF_I_H
3 3
4/*
5 * The i_data_sem and i_mutex serve for protection of allocation information
6 * of regular files and symlinks. This includes all extents belonging to
7 * the file/symlink, whether the data is stored in the inode or in external
8 * data blocks, preallocation, goal block information... When extents are
9 * read, i_mutex or i_data_sem must be held (holding i_data_sem for reading
10 * is enough). When extents are changed, i_data_sem must be held for
11 * writing and i_mutex must be held as well.
12 *
13 * For directories, i_mutex provides all the necessary protection.
14 */
15
4struct udf_inode_info { 16struct udf_inode_info {
5 struct timespec i_crtime; 17 struct timespec i_crtime;
6 /* Physical address of inode */ 18 /* Physical address of inode */
@@ -21,6 +33,7 @@ struct udf_inode_info {
21 struct long_ad *i_lad; 33 struct long_ad *i_lad;
22 __u8 *i_data; 34 __u8 *i_data;
23 } i_ext; 35 } i_ext;
36 struct rw_semaphore i_data_sem;
24 struct inode vfs_inode; 37 struct inode vfs_inode;
25}; 38};
26 39
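The comment added above fixes the locking rules; in code they come out as two distinct patterns. A sketch under those rules (both function names are illustrative):

/* Reading extents: i_data_sem held for reading is sufficient. */
static void udf_inspect_extents(struct inode *inode)
{
	struct udf_inode_info *iinfo = UDF_I(inode);

	down_read(&iinfo->i_data_sem);
	/* ... walk iinfo->i_ext safely against concurrent changes ... */
	up_read(&iinfo->i_data_sem);
}

/* Changing extents: i_mutex *and* i_data_sem held for writing. */
static void udf_modify_extents(struct inode *inode)
{
	struct udf_inode_info *iinfo = UDF_I(inode);

	mutex_lock(&inode->i_mutex);
	down_write(&iinfo->i_data_sem);
	/* ... change allocation information ... */
	up_write(&iinfo->i_data_sem);
	mutex_unlock(&inode->i_mutex);
}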
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index d113b72c2768..4858c191242b 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -2,6 +2,7 @@
2#define __LINUX_UDF_SB_H 2#define __LINUX_UDF_SB_H
3 3
4#include <linux/mutex.h> 4#include <linux/mutex.h>
5#include <linux/bitops.h>
5 6
6/* Since UDF 2.01 is ISO 13346 based... */ 7/* Since UDF 2.01 is ISO 13346 based... */
7#define UDF_SUPER_MAGIC 0x15013346 8#define UDF_SUPER_MAGIC 0x15013346
@@ -128,6 +129,8 @@ struct udf_sb_info {
128 uid_t s_uid; 129 uid_t s_uid;
129 mode_t s_fmode; 130 mode_t s_fmode;
130 mode_t s_dmode; 131 mode_t s_dmode;
132 /* Lock protecting consistency of above permission settings */
133 rwlock_t s_cred_lock;
131 134
132 /* Root Info */ 135 /* Root Info */
133 struct timespec s_record_time; 136 struct timespec s_record_time;
@@ -139,7 +142,7 @@ struct udf_sb_info {
139 __u16 s_udfrev; 142 __u16 s_udfrev;
140 143
141 /* Miscellaneous flags */ 144 /* Miscellaneous flags */
142 __u32 s_flags; 145 unsigned long s_flags;
143 146
144 /* Encoding info */ 147 /* Encoding info */
145 struct nls_table *s_nls_map; 148 struct nls_table *s_nls_map;
@@ -161,8 +164,19 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi);
161 164
162int udf_compute_nr_groups(struct super_block *sb, u32 partition); 165int udf_compute_nr_groups(struct super_block *sb, u32 partition);
163 166
164#define UDF_QUERY_FLAG(X,Y) ( UDF_SB(X)->s_flags & ( 1 << (Y) ) ) 167static inline int UDF_QUERY_FLAG(struct super_block *sb, int flag)
165#define UDF_SET_FLAG(X,Y) ( UDF_SB(X)->s_flags |= ( 1 << (Y) ) ) 168{
166#define UDF_CLEAR_FLAG(X,Y) ( UDF_SB(X)->s_flags &= ~( 1 << (Y) ) ) 169 return test_bit(flag, &UDF_SB(sb)->s_flags);
170}
171
172static inline void UDF_SET_FLAG(struct super_block *sb, int flag)
173{
174 set_bit(flag, &UDF_SB(sb)->s_flags);
175}
176
177static inline void UDF_CLEAR_FLAG(struct super_block *sb, int flag)
178{
179 clear_bit(flag, &UDF_SB(sb)->s_flags);
180}
167 181
168#endif /* __LINUX_UDF_SB_H */ 182#endif /* __LINUX_UDF_SB_H */
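The switch from macro bit-twiddling to test_bit()/set_bit()/clear_bit() is why s_flags changes type to unsigned long above: the kernel's atomic bitops operate on unsigned long words, so individual flags can now be flipped concurrently without an extra lock. A representative call site keeps its old shape (the exact lines are illustrative, not from this patch):

	UDF_SET_FLAG(sb, UDF_FLAG_USE_AD_IN_ICB);
	if (UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
		/* strict conformance checking stays enabled */;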
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 6995ab1f4305..eba48209f9f3 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -111,6 +111,8 @@ struct extent_position {
111}; 111};
112 112
113/* super.c */ 113/* super.c */
114
115__attribute__((format(printf, 3, 4)))
114extern void udf_warning(struct super_block *, const char *, const char *, ...); 116extern void udf_warning(struct super_block *, const char *, const char *, ...);
115static inline void udf_updated_lvid(struct super_block *sb) 117static inline void udf_updated_lvid(struct super_block *sb)
116{ 118{
@@ -123,6 +125,7 @@ static inline void udf_updated_lvid(struct super_block *sb)
123 sb->s_dirt = 1; 125 sb->s_dirt = 1;
124 UDF_SB(sb)->s_lvid_dirty = 1; 126 UDF_SB(sb)->s_lvid_dirty = 1;
125} 127}
128extern u64 lvid_get_unique_id(struct super_block *sb);
126 129
127/* namei.c */ 130/* namei.c */
128extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, 131extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -133,7 +136,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
133extern long udf_ioctl(struct file *, unsigned int, unsigned long); 136extern long udf_ioctl(struct file *, unsigned int, unsigned long);
134/* inode.c */ 137/* inode.c */
135extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
136extern int udf_sync_inode(struct inode *);
137extern void udf_expand_file_adinicb(struct inode *, int, int *); 139extern void udf_expand_file_adinicb(struct inode *, int, int *);
138extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
139extern struct buffer_head *udf_bread(struct inode *, int, int, int *); 141extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c370819..000000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_SV_H__
19#define __XFS_SUPPORT_SV_H__
20
21#include <linux/wait.h>
22#include <linux/sched.h>
23#include <linux/spinlock.h>
24
25/*
26 * Synchronisation variables.
27 *
28 * (Parameters "pri", "svf" and "rts" are not implemented)
29 */
30
31typedef struct sv_s {
32 wait_queue_head_t waiters;
33} sv_t;
34
35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36{
37 DECLARE_WAITQUEUE(wait, current);
38
39 add_wait_queue_exclusive(&sv->waiters, &wait);
40 __set_current_state(TASK_UNINTERRUPTIBLE);
41 spin_unlock(lock);
42
43 schedule();
44
45 remove_wait_queue(&sv->waiters, &wait);
46}
47
48#define sv_init(sv,flag,name) \
49 init_waitqueue_head(&(sv)->waiters)
50#define sv_destroy(sv) \
51 /*NOTHING*/
52#define sv_wait(sv, pri, lock, s) \
53 _sv_wait(sv, lock)
54#define sv_signal(sv) \
55 wake_up(&(sv)->waiters)
56#define sv_broadcast(sv) \
57 wake_up_all(&(sv)->waiters)
58
59#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 691f61223ed6..ec7bbb5645b6 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
38#include <linux/pagevec.h> 38#include <linux/pagevec.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40 40
41/*
42 * Types of I/O for bmap clustering and I/O completion tracking.
43 */
44enum {
45 IO_READ, /* mapping for a read */
46 IO_DELAY, /* mapping covers delalloc region */
47 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
48 IO_NEW /* just allocated */
49};
50 41
51/* 42/*
52 * Prime number of hash buckets since address is used as the key. 43 * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
182 xfs_inode_t *ip = XFS_I(ioend->io_inode); 173 xfs_inode_t *ip = XFS_I(ioend->io_inode);
183 xfs_fsize_t isize; 174 xfs_fsize_t isize;
184 175
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IO_READ);
187
188 if (unlikely(ioend->io_error)) 176 if (unlikely(ioend->io_error))
189 return 0; 177 return 0;
190 178
@@ -244,10 +232,8 @@ xfs_end_io(
244 * We might have to update the on-disk file size after extending 232 * We might have to update the on-disk file size after extending
245 * writes. 233 * writes.
246 */ 234 */
247 if (ioend->io_type != IO_READ) { 235 error = xfs_setfilesize(ioend);
248 error = xfs_setfilesize(ioend); 236 ASSERT(!error || error == EAGAIN);
249 ASSERT(!error || error == EAGAIN);
250 }
251 237
252 /* 238 /*
253 * If we didn't complete processing of the ioend, requeue it to the 239 * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
318xfs_map_blocks( 304xfs_map_blocks(
319 struct inode *inode, 305 struct inode *inode,
320 loff_t offset, 306 loff_t offset,
321 ssize_t count,
322 struct xfs_bmbt_irec *imap, 307 struct xfs_bmbt_irec *imap,
323 int flags) 308 int type,
309 int nonblocking)
324{ 310{
325 int nmaps = 1; 311 struct xfs_inode *ip = XFS_I(inode);
326 int new = 0; 312 struct xfs_mount *mp = ip->i_mount;
313 ssize_t count = 1 << inode->i_blkbits;
314 xfs_fileoff_t offset_fsb, end_fsb;
315 int error = 0;
316 int bmapi_flags = XFS_BMAPI_ENTIRE;
317 int nimaps = 1;
318
319 if (XFS_FORCED_SHUTDOWN(mp))
320 return -XFS_ERROR(EIO);
321
322 if (type == IO_UNWRITTEN)
323 bmapi_flags |= XFS_BMAPI_IGSTATE;
324
325 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
326 if (nonblocking)
327 return -XFS_ERROR(EAGAIN);
328 xfs_ilock(ip, XFS_ILOCK_SHARED);
329 }
327 330
328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); 331 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
332 (ip->i_df.if_flags & XFS_IFEXTENTS));
333 ASSERT(offset <= mp->m_maxioffset);
334
335 if (offset + count > mp->m_maxioffset)
336 count = mp->m_maxioffset - offset;
337 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
338 offset_fsb = XFS_B_TO_FSBT(mp, offset);
339 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
340 bmapi_flags, NULL, 0, imap, &nimaps, NULL);
341 xfs_iunlock(ip, XFS_ILOCK_SHARED);
342
343 if (error)
344 return -XFS_ERROR(error);
345
346 if (type == IO_DELALLOC &&
347 (!nimaps || isnullstartblock(imap->br_startblock))) {
348 error = xfs_iomap_write_allocate(ip, offset, count, imap);
349 if (!error)
350 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
351 return -XFS_ERROR(error);
352 }
353
354#ifdef DEBUG
355 if (type == IO_UNWRITTEN) {
356 ASSERT(nimaps);
357 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
358 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
359 }
360#endif
361 if (nimaps)
362 trace_xfs_map_blocks_found(ip, offset, count, type, imap);
363 return 0;
329} 364}
330 365
331STATIC int 366STATIC int
@@ -380,26 +415,18 @@ xfs_submit_ioend_bio(
380 415
381 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
382 WRITE_SYNC_PLUG : WRITE, bio); 417 WRITE_SYNC_PLUG : WRITE, bio);
383 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
384 bio_put(bio);
385} 418}
386 419
387STATIC struct bio * 420STATIC struct bio *
388xfs_alloc_ioend_bio( 421xfs_alloc_ioend_bio(
389 struct buffer_head *bh) 422 struct buffer_head *bh)
390{ 423{
391 struct bio *bio;
392 int nvecs = bio_get_nr_vecs(bh->b_bdev); 424 int nvecs = bio_get_nr_vecs(bh->b_bdev);
393 425 struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
394 do {
395 bio = bio_alloc(GFP_NOIO, nvecs);
396 nvecs >>= 1;
397 } while (!bio);
398 426
399 ASSERT(bio->bi_private == NULL); 427 ASSERT(bio->bi_private == NULL);
400 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 428 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
401 bio->bi_bdev = bh->b_bdev; 429 bio->bi_bdev = bh->b_bdev;
402 bio_get(bio);
403 return bio; 430 return bio;
404} 431}
405 432
@@ -470,9 +497,8 @@ xfs_submit_ioend(
470 /* Pass 1 - start writeback */ 497 /* Pass 1 - start writeback */
471 do { 498 do {
472 next = ioend->io_list; 499 next = ioend->io_list;
473 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 500 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
474 xfs_start_buffer_writeback(bh); 501 xfs_start_buffer_writeback(bh);
475 }
476 } while ((ioend = next) != NULL); 502 } while ((ioend = next) != NULL);
477 503
478 /* Pass 2 - submit I/O */ 504 /* Pass 2 - submit I/O */
@@ -600,117 +626,13 @@ xfs_map_at_offset(
600 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 626 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 627 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
602 628
603 lock_buffer(bh);
604 xfs_map_buffer(inode, bh, imap, offset); 629 xfs_map_buffer(inode, bh, imap, offset);
605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
606 set_buffer_mapped(bh); 630 set_buffer_mapped(bh);
607 clear_buffer_delay(bh); 631 clear_buffer_delay(bh);
608 clear_buffer_unwritten(bh); 632 clear_buffer_unwritten(bh);
609} 633}
610 634
611/* 635/*
612 * Look for a page at index that is suitable for clustering.
613 */
614STATIC unsigned int
615xfs_probe_page(
616 struct page *page,
617 unsigned int pg_offset)
618{
619 struct buffer_head *bh, *head;
620 int ret = 0;
621
622 if (PageWriteback(page))
623 return 0;
624 if (!PageDirty(page))
625 return 0;
626 if (!page->mapping)
627 return 0;
628 if (!page_has_buffers(page))
629 return 0;
630
631 bh = head = page_buffers(page);
632 do {
633 if (!buffer_uptodate(bh))
634 break;
635 if (!buffer_mapped(bh))
636 break;
637 ret += bh->b_size;
638 if (ret >= pg_offset)
639 break;
640 } while ((bh = bh->b_this_page) != head);
641
642 return ret;
643}
644
645STATIC size_t
646xfs_probe_cluster(
647 struct inode *inode,
648 struct page *startpage,
649 struct buffer_head *bh,
650 struct buffer_head *head)
651{
652 struct pagevec pvec;
653 pgoff_t tindex, tlast, tloff;
654 size_t total = 0;
655 int done = 0, i;
656
657 /* First sum forwards in this page */
658 do {
659 if (!buffer_uptodate(bh) || !buffer_mapped(bh))
660 return total;
661 total += bh->b_size;
662 } while ((bh = bh->b_this_page) != head);
663
664 /* if we reached the end of the page, sum forwards in following pages */
665 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
666 tindex = startpage->index + 1;
667
668 /* Prune this back to avoid pathological behavior */
669 tloff = min(tlast, startpage->index + 64);
670
671 pagevec_init(&pvec, 0);
672 while (!done && tindex <= tloff) {
673 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
674
675 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
676 break;
677
678 for (i = 0; i < pagevec_count(&pvec); i++) {
679 struct page *page = pvec.pages[i];
680 size_t pg_offset, pg_len = 0;
681
682 if (tindex == tlast) {
683 pg_offset =
684 i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
685 if (!pg_offset) {
686 done = 1;
687 break;
688 }
689 } else
690 pg_offset = PAGE_CACHE_SIZE;
691
692 if (page->index == tindex && trylock_page(page)) {
693 pg_len = xfs_probe_page(page, pg_offset);
694 unlock_page(page);
695 }
696
697 if (!pg_len) {
698 done = 1;
699 break;
700 }
701
702 total += pg_len;
703 tindex++;
704 }
705
706 pagevec_release(&pvec);
707 cond_resched();
708 }
709
710 return total;
711}
712
713/*
714 * Test if a given page is suitable for writing as part of an unwritten 636 * Test if a given page is suitable for writing as part of an unwritten
715 * or delayed allocate extent. 637 * or delayed allocate extent.
716 */ 638 */
@@ -731,9 +653,9 @@ xfs_is_delayed_page(
731 if (buffer_unwritten(bh)) 653 if (buffer_unwritten(bh))
732 acceptable = (type == IO_UNWRITTEN); 654 acceptable = (type == IO_UNWRITTEN);
733 else if (buffer_delay(bh)) 655 else if (buffer_delay(bh))
734 acceptable = (type == IO_DELAY); 656 acceptable = (type == IO_DELALLOC);
735 else if (buffer_dirty(bh) && buffer_mapped(bh)) 657 else if (buffer_dirty(bh) && buffer_mapped(bh))
736 acceptable = (type == IO_NEW); 658 acceptable = (type == IO_OVERWRITE);
737 else 659 else
738 break; 660 break;
739 } while ((bh = bh->b_this_page) != head); 661 } while ((bh = bh->b_this_page) != head);
@@ -758,8 +680,7 @@ xfs_convert_page(
758 loff_t tindex, 680 loff_t tindex,
759 struct xfs_bmbt_irec *imap, 681 struct xfs_bmbt_irec *imap,
760 xfs_ioend_t **ioendp, 682 xfs_ioend_t **ioendp,
761 struct writeback_control *wbc, 683 struct writeback_control *wbc)
762 int all_bh)
763{ 684{
764 struct buffer_head *bh, *head; 685 struct buffer_head *bh, *head;
765 xfs_off_t end_offset; 686 xfs_off_t end_offset;
@@ -814,37 +735,30 @@ xfs_convert_page(
814 continue; 735 continue;
815 } 736 }
816 737
817 if (buffer_unwritten(bh) || buffer_delay(bh)) { 738 if (buffer_unwritten(bh) || buffer_delay(bh) ||
739 buffer_mapped(bh)) {
818 if (buffer_unwritten(bh)) 740 if (buffer_unwritten(bh))
819 type = IO_UNWRITTEN; 741 type = IO_UNWRITTEN;
742 else if (buffer_delay(bh))
743 type = IO_DELALLOC;
820 else 744 else
821 type = IO_DELAY; 745 type = IO_OVERWRITE;
822 746
823 if (!xfs_imap_valid(inode, imap, offset)) { 747 if (!xfs_imap_valid(inode, imap, offset)) {
824 done = 1; 748 done = 1;
825 continue; 749 continue;
826 } 750 }
827 751
828 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 752 lock_buffer(bh);
829 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 753 if (type != IO_OVERWRITE)
830 754 xfs_map_at_offset(inode, bh, imap, offset);
831 xfs_map_at_offset(inode, bh, imap, offset);
832 xfs_add_to_ioend(inode, bh, offset, type, 755 xfs_add_to_ioend(inode, bh, offset, type,
833 ioendp, done); 756 ioendp, done);
834 757
835 page_dirty--; 758 page_dirty--;
836 count++; 759 count++;
837 } else { 760 } else {
838 type = IO_NEW; 761 done = 1;
839 if (buffer_mapped(bh) && all_bh) {
840 lock_buffer(bh);
841 xfs_add_to_ioend(inode, bh, offset,
842 type, ioendp, done);
843 count++;
844 page_dirty--;
845 } else {
846 done = 1;
847 }
848 } 762 }
849 } while (offset += len, (bh = bh->b_this_page) != head); 763 } while (offset += len, (bh = bh->b_this_page) != head);
850 764
@@ -876,7 +790,6 @@ xfs_cluster_write(
876 struct xfs_bmbt_irec *imap, 790 struct xfs_bmbt_irec *imap,
877 xfs_ioend_t **ioendp, 791 xfs_ioend_t **ioendp,
878 struct writeback_control *wbc, 792 struct writeback_control *wbc,
879 int all_bh,
880 pgoff_t tlast) 793 pgoff_t tlast)
881{ 794{
882 struct pagevec pvec; 795 struct pagevec pvec;
@@ -891,7 +804,7 @@ xfs_cluster_write(
891 804
892 for (i = 0; i < pagevec_count(&pvec); i++) { 805 for (i = 0; i < pagevec_count(&pvec); i++) {
893 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 806 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
894 imap, ioendp, wbc, all_bh); 807 imap, ioendp, wbc);
895 if (done) 808 if (done)
896 break; 809 break;
897 } 810 }
@@ -935,7 +848,7 @@ xfs_aops_discard_page(
935 struct buffer_head *bh, *head; 848 struct buffer_head *bh, *head;
936 loff_t offset = page_offset(page); 849 loff_t offset = page_offset(page);
937 850
938 if (!xfs_is_delayed_page(page, IO_DELAY)) 851 if (!xfs_is_delayed_page(page, IO_DELALLOC))
939 goto out_invalidate; 852 goto out_invalidate;
940 853
941 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 854 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1002,10 +915,10 @@ xfs_vm_writepage(
1002 unsigned int type; 915 unsigned int type;
1003 __uint64_t end_offset; 916 __uint64_t end_offset;
1004 pgoff_t end_index, last_index; 917 pgoff_t end_index, last_index;
1005 ssize_t size, len; 918 ssize_t len;
1006 int flags, err, imap_valid = 0, uptodate = 1; 919 int err, imap_valid = 0, uptodate = 1;
1007 int count = 0; 920 int count = 0;
1008 int all_bh = 0; 921 int nonblocking = 0;
1009 922
1010 trace_xfs_writepage(inode, page, 0); 923 trace_xfs_writepage(inode, page, 0);
1011 924
@@ -1056,10 +969,14 @@ xfs_vm_writepage(
1056 969
1057 bh = head = page_buffers(page); 970 bh = head = page_buffers(page);
1058 offset = page_offset(page); 971 offset = page_offset(page);
1059 flags = BMAPI_READ; 972 type = IO_OVERWRITE;
1060 type = IO_NEW; 973
974 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
975 nonblocking = 1;
1061 976
1062 do { 977 do {
978 int new_ioend = 0;
979
1063 if (offset >= end_offset) 980 if (offset >= end_offset)
1064 break; 981 break;
1065 if (!buffer_uptodate(bh)) 982 if (!buffer_uptodate(bh))
@@ -1076,90 +993,54 @@ xfs_vm_writepage(
1076 continue; 993 continue;
1077 } 994 }
1078 995
1079 if (imap_valid) 996 if (buffer_unwritten(bh)) {
1080 imap_valid = xfs_imap_valid(inode, &imap, offset); 997 if (type != IO_UNWRITTEN) {
1081
1082 if (buffer_unwritten(bh) || buffer_delay(bh)) {
1083 int new_ioend = 0;
1084
1085 /*
1086 * Make sure we don't use a read-only iomap
1087 */
1088 if (flags == BMAPI_READ)
1089 imap_valid = 0;
1090
1091 if (buffer_unwritten(bh)) {
1092 type = IO_UNWRITTEN; 998 type = IO_UNWRITTEN;
1093 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 999 imap_valid = 0;
1094 } else if (buffer_delay(bh)) {
1095 type = IO_DELAY;
1096 flags = BMAPI_ALLOCATE;
1097
1098 if (wbc->sync_mode == WB_SYNC_NONE)
1099 flags |= BMAPI_TRYLOCK;
1100 }
1101
1102 if (!imap_valid) {
1103 /*
1104 * If we didn't have a valid mapping then we
1105 * need to ensure that we put the new mapping
1106 * in a new ioend structure. This needs to be
1107 * done to ensure that the ioends correctly
1108 * reflect the block mappings at io completion
1109 * for unwritten extent conversion.
1110 */
1111 new_ioend = 1;
1112 err = xfs_map_blocks(inode, offset, len,
1113 &imap, flags);
1114 if (err)
1115 goto error;
1116 imap_valid = xfs_imap_valid(inode, &imap,
1117 offset);
1118 } 1000 }
1119 if (imap_valid) { 1001 } else if (buffer_delay(bh)) {
1120 xfs_map_at_offset(inode, bh, &imap, offset); 1002 if (type != IO_DELALLOC) {
1121 xfs_add_to_ioend(inode, bh, offset, type, 1003 type = IO_DELALLOC;
1122 &ioend, new_ioend); 1004 imap_valid = 0;
1123 count++;
1124 } 1005 }
1125 } else if (buffer_uptodate(bh)) { 1006 } else if (buffer_uptodate(bh)) {
1126 /* 1007 if (type != IO_OVERWRITE) {
1127 * we got here because the buffer is already mapped. 1008 type = IO_OVERWRITE;
1128 * That means it must already have extents allocated 1009 imap_valid = 0;
1129 * underneath it. Map the extent by reading it.
1130 */
1131 if (!imap_valid || flags != BMAPI_READ) {
1132 flags = BMAPI_READ;
1133 size = xfs_probe_cluster(inode, page, bh, head);
1134 err = xfs_map_blocks(inode, offset, size,
1135 &imap, flags);
1136 if (err)
1137 goto error;
1138 imap_valid = xfs_imap_valid(inode, &imap,
1139 offset);
1140 } 1010 }
1011 } else {
1012 if (PageUptodate(page)) {
1013 ASSERT(buffer_mapped(bh));
1014 imap_valid = 0;
1015 }
1016 continue;
1017 }
1141 1018
1019 if (imap_valid)
1020 imap_valid = xfs_imap_valid(inode, &imap, offset);
1021 if (!imap_valid) {
1142 /* 1022 /*
1143 * We set the type to IO_NEW in case we are doing a 1023 * If we didn't have a valid mapping then we need to
1144 * small write at EOF that is extending the file but 1024 * put the new mapping into a separate ioend structure.
1145 * without needing an allocation. We need to update the 1025 * This ensures non-contiguous extents always have
1146 * file size on I/O completion in this case so it is 1026 * separate ioends, which is particularly important
1147 * the same case as having just allocated a new extent 1027 * for unwritten extent conversion at I/O completion
1148 * that we are writing into for the first time. 1028 * time.
1149 */ 1029 */
1150 type = IO_NEW; 1030 new_ioend = 1;
1151 if (trylock_buffer(bh)) { 1031 err = xfs_map_blocks(inode, offset, &imap, type,
1152 if (imap_valid) 1032 nonblocking);
1153 all_bh = 1; 1033 if (err)
1154 xfs_add_to_ioend(inode, bh, offset, type, 1034 goto error;
1155 &ioend, !imap_valid); 1035 imap_valid = xfs_imap_valid(inode, &imap, offset);
1156 count++; 1036 }
1157 } else { 1037 if (imap_valid) {
1158 imap_valid = 0; 1038 lock_buffer(bh);
1159 } 1039 if (type != IO_OVERWRITE)
1160 } else if (PageUptodate(page)) { 1040 xfs_map_at_offset(inode, bh, &imap, offset);
1161 ASSERT(buffer_mapped(bh)); 1041 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1162 imap_valid = 0; 1042 new_ioend);
1043 count++;
1163 } 1044 }
1164 1045
1165 if (!iohead) 1046 if (!iohead)
@@ -1188,7 +1069,7 @@ xfs_vm_writepage(
1188 end_index = last_index; 1069 end_index = last_index;
1189 1070
1190 xfs_cluster_write(inode, page->index + 1, &imap, &ioend, 1071 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1191 wbc, all_bh, end_index); 1072 wbc, end_index);
1192 } 1073 }
1193 1074
1194 if (iohead) 1075 if (iohead)
@@ -1257,13 +1138,19 @@ __xfs_get_blocks(
1257 int create, 1138 int create,
1258 int direct) 1139 int direct)
1259{ 1140{
1260 int flags = create ? BMAPI_WRITE : BMAPI_READ; 1141 struct xfs_inode *ip = XFS_I(inode);
1142 struct xfs_mount *mp = ip->i_mount;
1143 xfs_fileoff_t offset_fsb, end_fsb;
1144 int error = 0;
1145 int lockmode = 0;
1261 struct xfs_bmbt_irec imap; 1146 struct xfs_bmbt_irec imap;
1147 int nimaps = 1;
1262 xfs_off_t offset; 1148 xfs_off_t offset;
1263 ssize_t size; 1149 ssize_t size;
1264 int nimap = 1;
1265 int new = 0; 1150 int new = 0;
1266 int error; 1151
1152 if (XFS_FORCED_SHUTDOWN(mp))
1153 return -XFS_ERROR(EIO);
1267 1154
1268 offset = (xfs_off_t)iblock << inode->i_blkbits; 1155 offset = (xfs_off_t)iblock << inode->i_blkbits;
1269 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1156 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1272,15 +1159,45 @@ __xfs_get_blocks(
1272 if (!create && direct && offset >= i_size_read(inode)) 1159 if (!create && direct && offset >= i_size_read(inode))
1273 return 0; 1160 return 0;
1274 1161
1275 if (direct && create) 1162 if (create) {
1276 flags |= BMAPI_DIRECT; 1163 lockmode = XFS_ILOCK_EXCL;
1164 xfs_ilock(ip, lockmode);
1165 } else {
1166 lockmode = xfs_ilock_map_shared(ip);
1167 }
1168
1169 ASSERT(offset <= mp->m_maxioffset);
1170 if (offset + size > mp->m_maxioffset)
1171 size = mp->m_maxioffset - offset;
1172 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1173 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1277 1174
1278 error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, 1175 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
1279 &new); 1176 XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL);
1280 if (error) 1177 if (error)
1281 return -error; 1178 goto out_unlock;
1282 if (nimap == 0) 1179
1283 return 0; 1180 if (create &&
1181 (!nimaps ||
1182 (imap.br_startblock == HOLESTARTBLOCK ||
1183 imap.br_startblock == DELAYSTARTBLOCK))) {
1184 if (direct) {
1185 error = xfs_iomap_write_direct(ip, offset, size,
1186 &imap, nimaps);
1187 } else {
1188 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1189 }
1190 if (error)
1191 goto out_unlock;
1192
1193 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1194 } else if (nimaps) {
1195 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1196 } else {
1197 trace_xfs_get_blocks_notfound(ip, offset, size);
1198 goto out_unlock;
1199 }
1200 xfs_iunlock(ip, lockmode);
1284 1201
1285 if (imap.br_startblock != HOLESTARTBLOCK && 1202 if (imap.br_startblock != HOLESTARTBLOCK &&
1286 imap.br_startblock != DELAYSTARTBLOCK) { 1203 imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1347,6 +1264,10 @@ __xfs_get_blocks(
1347 } 1264 }
1348 1265
1349 return 0; 1266 return 0;
1267
1268out_unlock:
1269 xfs_iunlock(ip, lockmode);
1270 return -error;
1350} 1271}
1351 1272
1352int 1273int
@@ -1434,7 +1355,7 @@ xfs_vm_direct_IO(
1434 ssize_t ret; 1355 ssize_t ret;
1435 1356
1436 if (rw & WRITE) { 1357 if (rw & WRITE) {
1437 iocb->private = xfs_alloc_ioend(inode, IO_NEW); 1358 iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
1438 1359
1439 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1360 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1440 offset, nr_segs, 1361 offset, nr_segs,
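The rewritten xfs_vm_writepage() loop drives a small state machine: each buffer is classified as IO_UNWRITTEN, IO_DELALLOC, or IO_OVERWRITE, and any change of type invalidates the cached mapping so that one ioend never mixes extent kinds. The classification step from the hunk above, reduced to a sketch:

	/* per buffer_head, inside the writepage loop */
	if (buffer_unwritten(bh)) {
		if (type != IO_UNWRITTEN) {
			type = IO_UNWRITTEN;
			imap_valid = 0;	/* force a fresh xfs_map_blocks() */
		}
	} else if (buffer_delay(bh)) {
		if (type != IO_DELALLOC) {
			type = IO_DELALLOC;
			imap_valid = 0;
		}
	} else if (buffer_uptodate(bh)) {
		if (type != IO_OVERWRITE) {
			type = IO_OVERWRITE;
			imap_valid = 0;
		}
	} else {
		continue;	/* buffer not part of this writeback */
	}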
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb6237a..71f721e1a71f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
23extern mempool_t *xfs_ioend_pool; 23extern mempool_t *xfs_ioend_pool;
24 24
25/* 25/*
26 * Types of I/O for bmap clustering and I/O completion tracking.
27 */
28enum {
29 IO_DIRECT = 0, /* special case for direct I/O ioends */
30 IO_DELALLOC, /* mapping covers delalloc region */
31 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
32 IO_OVERWRITE, /* mapping covers already allocated extent */
33};
34
35#define XFS_IO_TYPES \
36 { 0, "" }, \
37 { IO_DELALLOC, "delalloc" }, \
38 { IO_UNWRITTEN, "unwritten" }, \
39 { IO_OVERWRITE, "overwrite" }
40
41/*
26 * xfs_ioend struct manages large extent writes for XFS. 42 * xfs_ioend struct manages large extent writes for XFS.
27 * It can manage several multi-page bio's at once. 43 * It can manage several multi-page bio's at once.
28 */ 44 */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 4c5deb6e9e31..92f1f2acc6ab 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -44,12 +44,7 @@
44 44
45static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
46STATIC int xfsbufd(void *); 46STATIC int xfsbufd(void *);
47STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 47STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
49static struct shrinker xfs_buf_shake = {
50 .shrink = xfsbufd_wakeup,
51 .seeks = DEFAULT_SEEKS,
52};
53 48
54static struct workqueue_struct *xfslogd_workqueue; 49static struct workqueue_struct *xfslogd_workqueue;
55struct workqueue_struct *xfsdatad_workqueue; 50struct workqueue_struct *xfsdatad_workqueue;
@@ -168,8 +163,79 @@ test_page_region(
168} 163}
169 164
170/* 165/*
171 * Internal xfs_buf_t object manipulation 166 * xfs_buf_lru_add - add a buffer to the LRU.
167 *
168 * The LRU takes a new reference to the buffer so that it will only be freed
169 * once the shrinker takes the buffer off the LRU.
172 */ 170 */
171STATIC void
172xfs_buf_lru_add(
173 struct xfs_buf *bp)
174{
175 struct xfs_buftarg *btp = bp->b_target;
176
177 spin_lock(&btp->bt_lru_lock);
178 if (list_empty(&bp->b_lru)) {
179 atomic_inc(&bp->b_hold);
180 list_add_tail(&bp->b_lru, &btp->bt_lru);
181 btp->bt_lru_nr++;
182 }
183 spin_unlock(&btp->bt_lru_lock);
184}
185
186/*
187 * xfs_buf_lru_del - remove a buffer from the LRU
188 *
189 * The unlocked check is safe here because it only occurs when there are not
190 * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
191 * to optimise the shrinker removing the buffer from the LRU and calling
192 * xfs_buf_free(), i.e. it removes an unnecessary round trip on the
193 * bt_lru_lock.
194 */
195STATIC void
196xfs_buf_lru_del(
197 struct xfs_buf *bp)
198{
199 struct xfs_buftarg *btp = bp->b_target;
200
201 if (list_empty(&bp->b_lru))
202 return;
203
204 spin_lock(&btp->bt_lru_lock);
205 if (!list_empty(&bp->b_lru)) {
206 list_del_init(&bp->b_lru);
207 btp->bt_lru_nr--;
208 }
209 spin_unlock(&btp->bt_lru_lock);
210}
211
212/*
213 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
214 * b_lru_ref count so that the buffer is freed immediately when the buffer
215 * reference count falls to zero. If the buffer is already on the LRU, we need
216 * to remove the reference that LRU holds on the buffer.
217 *
218 * This prevents build-up of stale buffers on the LRU.
219 */
220void
221xfs_buf_stale(
222 struct xfs_buf *bp)
223{
224 bp->b_flags |= XBF_STALE;
225 atomic_set(&(bp)->b_lru_ref, 0);
226 if (!list_empty(&bp->b_lru)) {
227 struct xfs_buftarg *btp = bp->b_target;
228
229 spin_lock(&btp->bt_lru_lock);
230 if (!list_empty(&bp->b_lru)) {
231 list_del_init(&bp->b_lru);
232 btp->bt_lru_nr--;
233 atomic_dec(&bp->b_hold);
234 }
235 spin_unlock(&btp->bt_lru_lock);
236 }
237 ASSERT(atomic_read(&bp->b_hold) >= 1);
238}
173 239
174STATIC void 240STATIC void
175_xfs_buf_initialize( 241_xfs_buf_initialize(
@@ -186,7 +252,9 @@ _xfs_buf_initialize(
186 252
187 memset(bp, 0, sizeof(xfs_buf_t)); 253 memset(bp, 0, sizeof(xfs_buf_t));
188 atomic_set(&bp->b_hold, 1); 254 atomic_set(&bp->b_hold, 1);
255 atomic_set(&bp->b_lru_ref, 1);
189 init_completion(&bp->b_iowait); 256 init_completion(&bp->b_iowait);
257 INIT_LIST_HEAD(&bp->b_lru);
190 INIT_LIST_HEAD(&bp->b_list); 258 INIT_LIST_HEAD(&bp->b_list);
191 RB_CLEAR_NODE(&bp->b_rbnode); 259 RB_CLEAR_NODE(&bp->b_rbnode);
192 sema_init(&bp->b_sema, 0); /* held, no waiters */ 260 sema_init(&bp->b_sema, 0); /* held, no waiters */
@@ -262,6 +330,8 @@ xfs_buf_free(
262{ 330{
263 trace_xfs_buf_free(bp, _RET_IP_); 331 trace_xfs_buf_free(bp, _RET_IP_);
264 332
333 ASSERT(list_empty(&bp->b_lru));
334
265 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 335 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
266 uint i; 336 uint i;
267 337
@@ -337,7 +407,6 @@ _xfs_buf_lookup_pages(
337 __func__, gfp_mask); 407 __func__, gfp_mask);
338 408
339 XFS_STATS_INC(xb_page_retries); 409 XFS_STATS_INC(xb_page_retries);
340 xfsbufd_wakeup(NULL, 0, gfp_mask);
341 congestion_wait(BLK_RW_ASYNC, HZ/50); 410 congestion_wait(BLK_RW_ASYNC, HZ/50);
342 goto retry; 411 goto retry;
343 } 412 }
@@ -828,6 +897,7 @@ xfs_buf_rele(
828 897
829 if (!pag) { 898 if (!pag) {
830 ASSERT(!bp->b_relse); 899 ASSERT(!bp->b_relse);
900 ASSERT(list_empty(&bp->b_lru));
831 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); 901 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
832 if (atomic_dec_and_test(&bp->b_hold)) 902 if (atomic_dec_and_test(&bp->b_hold))
833 xfs_buf_free(bp); 903 xfs_buf_free(bp);
@@ -835,13 +905,19 @@ xfs_buf_rele(
835 } 905 }
836 906
837 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); 907 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
908
838 ASSERT(atomic_read(&bp->b_hold) > 0); 909 ASSERT(atomic_read(&bp->b_hold) > 0);
839 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { 910 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
840 if (bp->b_relse) { 911 if (bp->b_relse) {
841 atomic_inc(&bp->b_hold); 912 atomic_inc(&bp->b_hold);
842 spin_unlock(&pag->pag_buf_lock); 913 spin_unlock(&pag->pag_buf_lock);
843 bp->b_relse(bp); 914 bp->b_relse(bp);
915 } else if (!(bp->b_flags & XBF_STALE) &&
916 atomic_read(&bp->b_lru_ref)) {
917 xfs_buf_lru_add(bp);
918 spin_unlock(&pag->pag_buf_lock);
844 } else { 919 } else {
920 xfs_buf_lru_del(bp);
845 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 921 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
846 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); 922 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
847 spin_unlock(&pag->pag_buf_lock); 923 spin_unlock(&pag->pag_buf_lock);
@@ -1438,51 +1514,84 @@ xfs_buf_iomove(
1438 */ 1514 */
1439 1515
1440/* 1516/*
1441 * Wait for any bufs with callbacks that have been submitted but 1517 * Wait for any bufs with callbacks that have been submitted but have not yet
1442 * have not yet returned... walk the hash list for the target. 1518 * returned. These buffers will have an elevated hold count, so wait on those
1519 * while freeing all the buffers only held by the LRU.
1443 */ 1520 */
1444void 1521void
1445xfs_wait_buftarg( 1522xfs_wait_buftarg(
1446 struct xfs_buftarg *btp) 1523 struct xfs_buftarg *btp)
1447{ 1524{
1448 struct xfs_perag *pag; 1525 struct xfs_buf *bp;
1449 uint i;
1450 1526
1451 for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) { 1527restart:
1452 pag = xfs_perag_get(btp->bt_mount, i); 1528 spin_lock(&btp->bt_lru_lock);
1453 spin_lock(&pag->pag_buf_lock); 1529 while (!list_empty(&btp->bt_lru)) {
1454 while (rb_first(&pag->pag_buf_tree)) { 1530 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1455 spin_unlock(&pag->pag_buf_lock); 1531 if (atomic_read(&bp->b_hold) > 1) {
1532 spin_unlock(&btp->bt_lru_lock);
1456 delay(100); 1533 delay(100);
1457 spin_lock(&pag->pag_buf_lock); 1534 goto restart;
1458 } 1535 }
1459 spin_unlock(&pag->pag_buf_lock); 1536 /*
1460 xfs_perag_put(pag); 1537 * clear the LRU reference count so the buffer doesn't get
1538 * ignored in xfs_buf_rele().
1539 */
1540 atomic_set(&bp->b_lru_ref, 0);
1541 spin_unlock(&btp->bt_lru_lock);
1542 xfs_buf_rele(bp);
1543 spin_lock(&btp->bt_lru_lock);
1461 } 1544 }
1545 spin_unlock(&btp->bt_lru_lock);
1462} 1546}
1463 1547
1464/* 1548int
1465 * buftarg list for delwrite queue processing 1549xfs_buftarg_shrink(
1466 */ 1550 struct shrinker *shrink,
1467static LIST_HEAD(xfs_buftarg_list); 1551 int nr_to_scan,
1468static DEFINE_SPINLOCK(xfs_buftarg_lock); 1552 gfp_t mask)
1469
1470STATIC void
1471xfs_register_buftarg(
1472 xfs_buftarg_t *btp)
1473{ 1553{
1474 spin_lock(&xfs_buftarg_lock); 1554 struct xfs_buftarg *btp = container_of(shrink,
1475 list_add(&btp->bt_list, &xfs_buftarg_list); 1555 struct xfs_buftarg, bt_shrinker);
1476 spin_unlock(&xfs_buftarg_lock); 1556 struct xfs_buf *bp;
1477} 1557 LIST_HEAD(dispose);
1478 1558
1479STATIC void 1559 if (!nr_to_scan)
1480xfs_unregister_buftarg( 1560 return btp->bt_lru_nr;
1481 xfs_buftarg_t *btp) 1561
1482{ 1562 spin_lock(&btp->bt_lru_lock);
1483 spin_lock(&xfs_buftarg_lock); 1563 while (!list_empty(&btp->bt_lru)) {
1484 list_del(&btp->bt_list); 1564 if (nr_to_scan-- <= 0)
1485 spin_unlock(&xfs_buftarg_lock); 1565 break;
1566
1567 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1568
1569 /*
1570 * Decrement the b_lru_ref count unless the value is already
1571 * zero. If the value is already zero, we need to reclaim the
1572 * buffer, otherwise it gets another trip through the LRU.
1573 */
1574 if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1575 list_move_tail(&bp->b_lru, &btp->bt_lru);
1576 continue;
1577 }
1578
1579 /*
1580 * remove the buffer from the LRU now to avoid needing another
1581 * lock round trip inside xfs_buf_rele().
1582 */
1583 list_move(&bp->b_lru, &dispose);
1584 btp->bt_lru_nr--;
1585 }
1586 spin_unlock(&btp->bt_lru_lock);
1587
1588 while (!list_empty(&dispose)) {
1589 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1590 list_del_init(&bp->b_lru);
1591 xfs_buf_rele(bp);
1592 }
1593
1594 return btp->bt_lru_nr;
1486} 1595}
1487 1596
1488void 1597void
@@ -1490,17 +1599,14 @@ xfs_free_buftarg(
1490 struct xfs_mount *mp, 1599 struct xfs_mount *mp,
1491 struct xfs_buftarg *btp) 1600 struct xfs_buftarg *btp)
1492{ 1601{
1602 unregister_shrinker(&btp->bt_shrinker);
1603
1493 xfs_flush_buftarg(btp, 1); 1604 xfs_flush_buftarg(btp, 1);
1494 if (mp->m_flags & XFS_MOUNT_BARRIER) 1605 if (mp->m_flags & XFS_MOUNT_BARRIER)
1495 xfs_blkdev_issue_flush(btp); 1606 xfs_blkdev_issue_flush(btp);
1496 iput(btp->bt_mapping->host); 1607 iput(btp->bt_mapping->host);
1497 1608
1498 /* Unregister the buftarg first so that we don't get a
1499 * wakeup finding a non-existent task
1500 */
1501 xfs_unregister_buftarg(btp);
1502 kthread_stop(btp->bt_task); 1609 kthread_stop(btp->bt_task);
1503
1504 kmem_free(btp); 1610 kmem_free(btp);
1505} 1611}
1506 1612
@@ -1597,20 +1703,13 @@ xfs_alloc_delwrite_queue(
1597 xfs_buftarg_t *btp, 1703 xfs_buftarg_t *btp,
1598 const char *fsname) 1704 const char *fsname)
1599{ 1705{
1600 int error = 0;
1601
1602 INIT_LIST_HEAD(&btp->bt_list);
1603 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1706 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1604 spin_lock_init(&btp->bt_delwrite_lock); 1707 spin_lock_init(&btp->bt_delwrite_lock);
1605 btp->bt_flags = 0; 1708 btp->bt_flags = 0;
1606 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); 1709 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1607 if (IS_ERR(btp->bt_task)) { 1710 if (IS_ERR(btp->bt_task))
1608 error = PTR_ERR(btp->bt_task); 1711 return PTR_ERR(btp->bt_task);
1609 goto out_error; 1712 return 0;
1610 }
1611 xfs_register_buftarg(btp);
1612out_error:
1613 return error;
1614} 1713}
1615 1714
1616xfs_buftarg_t * 1715xfs_buftarg_t *
@@ -1627,12 +1726,17 @@ xfs_alloc_buftarg(
1627 btp->bt_mount = mp; 1726 btp->bt_mount = mp;
1628 btp->bt_dev = bdev->bd_dev; 1727 btp->bt_dev = bdev->bd_dev;
1629 btp->bt_bdev = bdev; 1728 btp->bt_bdev = bdev;
1729 INIT_LIST_HEAD(&btp->bt_lru);
1730 spin_lock_init(&btp->bt_lru_lock);
1630 if (xfs_setsize_buftarg_early(btp, bdev)) 1731 if (xfs_setsize_buftarg_early(btp, bdev))
1631 goto error; 1732 goto error;
1632 if (xfs_mapping_buftarg(btp, bdev)) 1733 if (xfs_mapping_buftarg(btp, bdev))
1633 goto error; 1734 goto error;
1634 if (xfs_alloc_delwrite_queue(btp, fsname)) 1735 if (xfs_alloc_delwrite_queue(btp, fsname))
1635 goto error; 1736 goto error;
1737 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1738 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1739 register_shrinker(&btp->bt_shrinker);
1636 return btp; 1740 return btp;
1637 1741
1638error: 1742error:
@@ -1737,27 +1841,6 @@ xfs_buf_runall_queues(
1737 flush_workqueue(queue); 1841 flush_workqueue(queue);
1738} 1842}
1739 1843
1740STATIC int
1741xfsbufd_wakeup(
1742 struct shrinker *shrink,
1743 int priority,
1744 gfp_t mask)
1745{
1746 xfs_buftarg_t *btp;
1747
1748 spin_lock(&xfs_buftarg_lock);
1749 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1750 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1751 continue;
1752 if (list_empty(&btp->bt_delwrite_queue))
1753 continue;
1754 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1755 wake_up_process(btp->bt_task);
1756 }
1757 spin_unlock(&xfs_buftarg_lock);
1758 return 0;
1759}
1760
1761/* 1844/*
1762 * Move as many buffers as specified to the supplied list 1845 * Move as many buffers as specified to the supplied list
1763 * indicating if we skipped any buffers to prevent deadlocks. 1846 * indicating if we skipped any buffers to prevent deadlocks.
@@ -1952,7 +2035,6 @@ xfs_buf_init(void)
1952 if (!xfsconvertd_workqueue) 2035 if (!xfsconvertd_workqueue)
1953 goto out_destroy_xfsdatad_workqueue; 2036 goto out_destroy_xfsdatad_workqueue;
1954 2037
1955 register_shrinker(&xfs_buf_shake);
1956 return 0; 2038 return 0;
1957 2039
1958 out_destroy_xfsdatad_workqueue: 2040 out_destroy_xfsdatad_workqueue:
@@ -1968,7 +2050,6 @@ xfs_buf_init(void)
1968void 2050void
1969xfs_buf_terminate(void) 2051xfs_buf_terminate(void)
1970{ 2052{
1971 unregister_shrinker(&xfs_buf_shake);
1972 destroy_workqueue(xfsconvertd_workqueue); 2053 destroy_workqueue(xfsconvertd_workqueue);
1973 destroy_workqueue(xfsdatad_workqueue); 2054 destroy_workqueue(xfsdatad_workqueue);
1974 destroy_workqueue(xfslogd_workqueue); 2055 destroy_workqueue(xfslogd_workqueue);
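The hunks above replace the global xfsbufd_wakeup shrinker (and the xfs_buftarg_list it had to walk under xfs_buftarg_lock) with a shrinker embedded in each buftarg. A minimal sketch of how such a per-target callback can recover its buftarg, assuming the 2.6.37-era shrink() signature used by the removed xfsbufd_wakeup (the real xfs_buftarg_shrink is registered above, but its body is outside these hunks):

	STATIC int
	xfs_buftarg_shrink_sketch(
		struct shrinker	*shrink,
		int		nr_to_scan,
		gfp_t		mask)
	{
		/* container_of() recovers the buftarg: no global list or lock */
		struct xfs_buftarg *btp = container_of(shrink,
						struct xfs_buftarg, bt_shrinker);

		if (!nr_to_scan)
			return btp->bt_lru_nr;	/* just report the LRU population */

		/* walk btp->bt_lru under bt_lru_lock, disposing of buffers
		 * whose b_lru_ref has decayed to zero (see the next sketch) */
		return btp->bt_lru_nr;
	}

Embedding the shrinker in the buftarg also removes the wakeup-after-free window the old code had to guard against by unregistering the buftarg before stopping bt_task.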
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 383a3f37cf98..a76c2428faff 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -128,10 +128,15 @@ typedef struct xfs_buftarg {
128 128
129 /* per device delwri queue */ 129 /* per device delwri queue */
130 struct task_struct *bt_task; 130 struct task_struct *bt_task;
131 struct list_head bt_list;
132 struct list_head bt_delwrite_queue; 131 struct list_head bt_delwrite_queue;
133 spinlock_t bt_delwrite_lock; 132 spinlock_t bt_delwrite_lock;
134 unsigned long bt_flags; 133 unsigned long bt_flags;
134
135 /* LRU control structures */
136 struct shrinker bt_shrinker;
137 struct list_head bt_lru;
138 spinlock_t bt_lru_lock;
139 unsigned int bt_lru_nr;
135} xfs_buftarg_t; 140} xfs_buftarg_t;
136 141
137/* 142/*
@@ -164,9 +169,11 @@ typedef struct xfs_buf {
164 xfs_off_t b_file_offset; /* offset in file */ 169 xfs_off_t b_file_offset; /* offset in file */
165 size_t b_buffer_length;/* size of buffer in bytes */ 170 size_t b_buffer_length;/* size of buffer in bytes */
166 atomic_t b_hold; /* reference count */ 171 atomic_t b_hold; /* reference count */
172 atomic_t b_lru_ref; /* lru reclaim ref count */
167 xfs_buf_flags_t b_flags; /* status flags */ 173 xfs_buf_flags_t b_flags; /* status flags */
168 struct semaphore b_sema; /* semaphore for lockables */ 174 struct semaphore b_sema; /* semaphore for lockables */
169 175
176 struct list_head b_lru; /* lru list */
170 wait_queue_head_t b_waiters; /* unpin waiters */ 177 wait_queue_head_t b_waiters; /* unpin waiters */
171 struct list_head b_list; 178 struct list_head b_list;
172 struct xfs_perag *b_pag; /* contains rbtree root */ 179 struct xfs_perag *b_pag; /* contains rbtree root */
@@ -264,7 +271,8 @@ extern void xfs_buf_terminate(void);
264#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 271#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
265 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 272 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
266 273
267#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) 274void xfs_buf_stale(struct xfs_buf *bp);
275#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
268#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 276#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
269#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 277#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
270#define XFS_BUF_SUPER_STALE(bp) do { \ 278#define XFS_BUF_SUPER_STALE(bp) do { \
@@ -328,9 +336,15 @@ extern void xfs_buf_terminate(void);
328#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 336#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
329#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 337#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
330 338
331#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) 339static inline void
340xfs_buf_set_ref(
341 struct xfs_buf *bp,
342 int lru_ref)
343{
344 atomic_set(&bp->b_lru_ref, lru_ref);
345}
346#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
332#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 347#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
333#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
334 348
335#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) 349#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
336 350
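The per-buffer b_lru_ref added above lets hot metadata buffers survive several shrinker passes: XFS_BUF_SET_VTYPE_REF seeds the count (e.g. XFS_ALLOC_BTREE_REF) and the shrinker decays it. A plausible decay step, assuming the shrinker only frees a buffer once the count reaches zero (the shrinker body is not part of these hunks):

	if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
		/* still holds lru references: give it another pass */
		list_move_tail(&bp->b_lru, &btp->bt_lru);
	} else {
		/* reference count has decayed to zero: dispose of it */
		list_del_init(&bp->b_lru);
		btp->bt_lru_nr--;
	}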
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74790ec..fc0114da7fdd 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
70 else 70 else
71 fileid_type = FILEID_INO32_GEN_PARENT; 71 fileid_type = FILEID_INO32_GEN_PARENT;
72 72
73 /* filesystem may contain 64bit inode numbers */ 73 /*
74 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) 74 * If the filesystem may contain 64bit inode numbers, we need
75 * to use larger file handles that can represent them.
76 *
77 * While we may only allocate inodes that fit into 32 bits, any
78 * large enough filesystem may already contain 64-bit ones, thus the slightly
79 * confusing looking conditional below.
80 */
81 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
82 (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
75 fileid_type |= XFS_FILEID_TYPE_64FLAG; 83 fileid_type |= XFS_FILEID_TYPE_64FLAG;
76 84
77 /* 85 /*
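Reduced to a truth table, the fixed conditional selects the larger handle format whenever 64-bit inode numbers are either allocatable now or may already exist on disk. A sketch using the flag names from this hunk (need64 is an illustrative local):

	/* need 64-bit capable handles unless the fs can only ever
	 * hold 32-bit inode numbers */
	bool need64 =
		!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
		 (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES);

	if (need64)
		fileid_type |= XFS_FILEID_TYPE_64FLAG;

XFS_MOUNT_32BITINODES is only set once a filesystem is large enough that inode numbers could exceed 32 bits, which is why its presence implies 64-bit handles even though new allocations are being restricted.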
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 214ddd71ff79..096494997747 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,7 +37,6 @@
37 37
38#include <kmem.h> 38#include <kmem.h>
39#include <mrlock.h> 39#include <mrlock.h>
40#include <sv.h>
41#include <time.h> 40#include <time.h>
42 41
43#include <support/debug.h> 42#include <support/debug.h>
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 064f964d4f3c..c51faaa5e291 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -834,8 +834,11 @@ xfsaild_wakeup(
834 struct xfs_ail *ailp, 834 struct xfs_ail *ailp,
835 xfs_lsn_t threshold_lsn) 835 xfs_lsn_t threshold_lsn)
836{ 836{
837 ailp->xa_target = threshold_lsn; 837 /* only ever move the target forwards */
838 wake_up_process(ailp->xa_task); 838 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
839 ailp->xa_target = threshold_lsn;
840 wake_up_process(ailp->xa_task);
841 }
839} 842}
840 843
841STATIC int 844STATIC int
@@ -847,8 +850,17 @@ xfsaild(
847 long tout = 0; /* milliseconds */ 850 long tout = 0; /* milliseconds */
848 851
849 while (!kthread_should_stop()) { 852 while (!kthread_should_stop()) {
850 schedule_timeout_interruptible(tout ? 853 /*
851 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); 854 * for short sleeps indicating congestion, don't allow us to
855 * get woken early. Otherwise all we do is bang on the AIL lock
856 * without making progress.
857 */
858 if (tout && tout <= 20)
859 __set_current_state(TASK_KILLABLE);
860 else
861 __set_current_state(TASK_INTERRUPTIBLE);
862 schedule_timeout(tout ?
863 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
852 864
853 /* swsusp */ 865 /* swsusp */
854 try_to_freeze(); 866 try_to_freeze();
@@ -1118,6 +1130,8 @@ xfs_fs_evict_inode(
1118 */ 1130 */
1119 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 1131 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
1120 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 1132 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
1133 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
1134 &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
1121 1135
1122 xfs_inactive(ip); 1136 xfs_inactive(ip);
1123} 1137}
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index afb0d7cfad1c..a02480de9759 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
53{ 53{
54 struct inode *inode = VFS_I(ip); 54 struct inode *inode = VFS_I(ip);
55 55
56 ASSERT(rcu_read_lock_held());
57
58 /*
59 * check for stale RCU freed inode
60 *
61 * If the inode has been reallocated, it doesn't matter if it's not in
62 * the AG we are walking - we are walking for writeback, so if it
63 * passes all the "valid inode" checks and is dirty, then we'll write
64 * it back anyway. If it has been reallocated and is still being
65 * initialised, the XFS_INEW check below will catch it.
66 */
67 spin_lock(&ip->i_flags_lock);
68 if (!ip->i_ino)
69 goto out_unlock_noent;
70
71 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
72 if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
73 goto out_unlock_noent;
74 spin_unlock(&ip->i_flags_lock);
75
56 /* nothing to sync during shutdown */ 76 /* nothing to sync during shutdown */
57 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 77 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
58 return EFSCORRUPTED; 78 return EFSCORRUPTED;
59 79
60 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
61 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
62 return ENOENT;
63
64 /* If we can't grab the inode, it must be on its way to reclaim. */ 80
65 if (!igrab(inode)) 81 if (!igrab(inode))
66 return ENOENT; 82 return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
72 88
73 /* inode is valid */ 89 /* inode is valid */
74 return 0; 90 return 0;
91
92out_unlock_noent:
93 spin_unlock(&ip->i_flags_lock);
94 return ENOENT;
75} 95}
76 96
77STATIC int 97STATIC int
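Together with the rcu_read_lock() conversion in the next hunk, the grab helper above yields a lookup pattern along these lines (a simplified sketch: batching, error codes and the per-inode work are elided):

	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
	if (ip && xfs_inode_ag_walk_grab(ip) == 0) {
		/*
		 * The inode was validated under ip->i_flags_lock (non-zero
		 * i_ino, not INEW/RECLAIMABLE/RECLAIM) and pinned via
		 * igrab(), so it stays usable after the read lock drops.
		 */
	}
	rcu_read_unlock();	/* inode frees are RCU-deferred */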
@@ -98,12 +118,12 @@ restart:
98 int error = 0; 118 int error = 0;
99 int i; 119 int i;
100 120
101 read_lock(&pag->pag_ici_lock); 121 rcu_read_lock();
102 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 122 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
103 (void **)batch, first_index, 123 (void **)batch, first_index,
104 XFS_LOOKUP_BATCH); 124 XFS_LOOKUP_BATCH);
105 if (!nr_found) { 125 if (!nr_found) {
106 read_unlock(&pag->pag_ici_lock); 126 rcu_read_unlock();
107 break; 127 break;
108 } 128 }
109 129
@@ -118,18 +138,26 @@ restart:
118 batch[i] = NULL; 138 batch[i] = NULL;
119 139
120 /* 140 /*
121 * Update the index for the next lookup. Catch overflows 141 * Update the index for the next lookup. Catch
122 * into the next AG range which can occur if we have inodes 142 * overflows into the next AG range which can occur if
123 * in the last block of the AG and we are currently 143 * we have inodes in the last block of the AG and we
124 * pointing to the last inode. 144 * are currently pointing to the last inode.
145 *
146 * Because we may see inodes that are from the wrong AG
147 * due to RCU freeing and reallocation, only update the
148 * index if it lies in this AG. It was a race that led
149 * us to see this inode, so another lookup from the
150 * same index will not find it again.
125 */ 151 */
152 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
153 continue;
126 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 154 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
127 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 155 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
128 done = 1; 156 done = 1;
129 } 157 }
130 158
131 /* unlock now we've grabbed the inodes. */ 159 /* unlock now we've grabbed the inodes. */
132 read_unlock(&pag->pag_ici_lock); 160 rcu_read_unlock();
133 161
134 for (i = 0; i < nr_found; i++) { 162 for (i = 0; i < nr_found; i++) {
135 if (!batch[i]) 163 if (!batch[i])
@@ -592,12 +620,12 @@ xfs_inode_set_reclaim_tag(
592 struct xfs_perag *pag; 620 struct xfs_perag *pag;
593 621
594 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 622 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
595 write_lock(&pag->pag_ici_lock); 623 spin_lock(&pag->pag_ici_lock);
596 spin_lock(&ip->i_flags_lock); 624 spin_lock(&ip->i_flags_lock);
597 __xfs_inode_set_reclaim_tag(pag, ip); 625 __xfs_inode_set_reclaim_tag(pag, ip);
598 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 626 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
599 spin_unlock(&ip->i_flags_lock); 627 spin_unlock(&ip->i_flags_lock);
600 write_unlock(&pag->pag_ici_lock); 628 spin_unlock(&pag->pag_ici_lock);
601 xfs_perag_put(pag); 629 xfs_perag_put(pag);
602} 630}
603 631
@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab(
639 struct xfs_inode *ip, 667 struct xfs_inode *ip,
640 int flags) 668 int flags)
641{ 669{
670 ASSERT(rcu_read_lock_held());
671
672 /* quick check for stale RCU freed inode */
673 if (!ip->i_ino)
674 return 1;
642 675
643 /* 676 /*
644 * do some unlocked checks first to avoid unnecceary lock traffic. 677 * do some unlocked checks first to avoid unnecessary lock traffic.
645 * The first is a flush lock check, the second is an already in reclaim 678
646 * check. Only do these checks if we are not going to block on locks. 679 * check. Only do these checks if we are not going to block on locks.
647 */ 680 */
@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab(
654 * The radix tree lock here protects a thread in xfs_iget from racing 687 * The radix tree lock here protects a thread in xfs_iget from racing
655 * with us starting reclaim on the inode. Once we have the 688 * with us starting reclaim on the inode. Once we have the
656 * XFS_IRECLAIM flag set it will not touch us. 689 * XFS_IRECLAIM flag set it will not touch us.
690 *
691 * Due to RCU lookup, we may find inodes that have been freed and only
692 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
693 * aren't candidates for reclaim at all, so we must check that
694 * XFS_IRECLAIMABLE is set before proceeding to reclaim.
657 */ 695 */
658 spin_lock(&ip->i_flags_lock); 696 spin_lock(&ip->i_flags_lock);
659 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); 697 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
660 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { 698 __xfs_iflags_test(ip, XFS_IRECLAIM)) {
661 /* ignore as it is already under reclaim */ 699 /* not a reclaim candidate. */
662 spin_unlock(&ip->i_flags_lock); 700 spin_unlock(&ip->i_flags_lock);
663 return 1; 701 return 1;
664 } 702 }
@@ -795,12 +833,12 @@ reclaim:
795 * added to the tree assert that it's been there before to catch 833 * added to the tree assert that it's been there before to catch
796 * problems with the inode life time early on. 834 * problems with the inode life time early on.
797 */ 835 */
798 write_lock(&pag->pag_ici_lock); 836 spin_lock(&pag->pag_ici_lock);
799 if (!radix_tree_delete(&pag->pag_ici_root, 837 if (!radix_tree_delete(&pag->pag_ici_root,
800 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 838 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
801 ASSERT(0); 839 ASSERT(0);
802 __xfs_inode_clear_reclaim(pag, ip); 840 __xfs_inode_clear_reclaim(pag, ip);
803 write_unlock(&pag->pag_ici_lock); 841 spin_unlock(&pag->pag_ici_lock);
804 842
805 /* 843 /*
806 * Here we do an (almost) spurious inode lock in order to coordinate 844 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -864,14 +902,14 @@ restart:
864 struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 902 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
865 int i; 903 int i;
866 904
867 write_lock(&pag->pag_ici_lock); 905 rcu_read_lock();
868 nr_found = radix_tree_gang_lookup_tag( 906 nr_found = radix_tree_gang_lookup_tag(
869 &pag->pag_ici_root, 907 &pag->pag_ici_root,
870 (void **)batch, first_index, 908 (void **)batch, first_index,
871 XFS_LOOKUP_BATCH, 909 XFS_LOOKUP_BATCH,
872 XFS_ICI_RECLAIM_TAG); 910 XFS_ICI_RECLAIM_TAG);
873 if (!nr_found) { 911 if (!nr_found) {
874 write_unlock(&pag->pag_ici_lock); 912 rcu_read_unlock();
875 break; 913 break;
876 } 914 }
877 915
@@ -891,14 +929,24 @@ restart:
891 * occur if we have inodes in the last block of 929 * occur if we have inodes in the last block of
892 * the AG and we are currently pointing to the 930 * the AG and we are currently pointing to the
893 * last inode. 931 * last inode.
932 *
933 * Because we may see inodes that are from the
934 * wrong AG due to RCU freeing and
935 * reallocation, only update the index if it
936 * lies in this AG. It was a race that led us
937 * to see this inode, so another lookup from
938 * the same index will not find it again.
894 */ 939 */
940 if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
941 pag->pag_agno)
942 continue;
895 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 943 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
896 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 944 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
897 done = 1; 945 done = 1;
898 } 946 }
899 947
900 /* unlock now we've grabbed the inodes. */ 948 /* unlock now we've grabbed the inodes. */
901 write_unlock(&pag->pag_ici_lock); 949 rcu_read_unlock();
902 950
903 for (i = 0; i < nr_found; i++) { 951 for (i = 0; i < nr_found; i++) {
904 if (!batch[i]) 952 if (!batch[i])
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index acef2e98c594..647af2a2e7aa 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
766 __field(int, curr_res) 766 __field(int, curr_res)
767 __field(int, unit_res) 767 __field(int, unit_res)
768 __field(unsigned int, flags) 768 __field(unsigned int, flags)
769 __field(void *, reserve_headq) 769 __field(int, reserveq)
770 __field(void *, write_headq) 770 __field(int, writeq)
771 __field(int, grant_reserve_cycle) 771 __field(int, grant_reserve_cycle)
772 __field(int, grant_reserve_bytes) 772 __field(int, grant_reserve_bytes)
773 __field(int, grant_write_cycle) 773 __field(int, grant_write_cycle)
@@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
784 __entry->curr_res = tic->t_curr_res; 784 __entry->curr_res = tic->t_curr_res;
785 __entry->unit_res = tic->t_unit_res; 785 __entry->unit_res = tic->t_unit_res;
786 __entry->flags = tic->t_flags; 786 __entry->flags = tic->t_flags;
787 __entry->reserve_headq = log->l_reserve_headq; 787 __entry->reserveq = list_empty(&log->l_reserveq);
788 __entry->write_headq = log->l_write_headq; 788 __entry->writeq = list_empty(&log->l_writeq);
789 __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; 789 xlog_crack_grant_head(&log->l_grant_reserve_head,
790 __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; 790 &__entry->grant_reserve_cycle,
791 __entry->grant_write_cycle = log->l_grant_write_cycle; 791 &__entry->grant_reserve_bytes);
792 __entry->grant_write_bytes = log->l_grant_write_bytes; 792 xlog_crack_grant_head(&log->l_grant_write_head,
793 &__entry->grant_write_cycle,
794 &__entry->grant_write_bytes);
793 __entry->curr_cycle = log->l_curr_cycle; 795 __entry->curr_cycle = log->l_curr_cycle;
794 __entry->curr_block = log->l_curr_block; 796 __entry->curr_block = log->l_curr_block;
795 __entry->tail_lsn = log->l_tail_lsn; 797 __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
796 ), 798 ),
797 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " 799 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
798 "t_unit_res %u t_flags %s reserve_headq 0x%p " 800 "t_unit_res %u t_flags %s reserveq %s "
799 "write_headq 0x%p grant_reserve_cycle %d " 801 "writeq %s grant_reserve_cycle %d "
800 "grant_reserve_bytes %d grant_write_cycle %d " 802 "grant_reserve_bytes %d grant_write_cycle %d "
801 "grant_write_bytes %d curr_cycle %d curr_block %d " 803 "grant_write_bytes %d curr_cycle %d curr_block %d "
802 "tail_cycle %d tail_block %d", 804 "tail_cycle %d tail_block %d",
@@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
807 __entry->curr_res, 809 __entry->curr_res,
808 __entry->unit_res, 810 __entry->unit_res,
809 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), 811 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
810 __entry->reserve_headq, 812 __entry->reserveq ? "empty" : "active",
811 __entry->write_headq, 813 __entry->writeq ? "empty" : "active",
812 __entry->grant_reserve_cycle, 814 __entry->grant_reserve_cycle,
813 __entry->grant_reserve_bytes, 815 __entry->grant_reserve_bytes,
814 __entry->grant_write_cycle, 816 __entry->grant_write_cycle,
@@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
835DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); 837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
836DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); 838DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); 839DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
840DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
838DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); 841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
839DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); 842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
840DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); 843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); 845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); 846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
844DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); 847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); 849DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); 850DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); 851DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
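The xlog_crack_grant_head() accessor used in the event class above reads a grant head that now packs the cycle number and byte offset into a single atomic64_t, so it can be sampled without taking the grant lock. A sketch of the presumed encoding (the authoritative helpers live in xfs_log_priv.h):

	static inline void
	xlog_crack_grant_head_sketch(atomic64_t *head, int *cycle, int *space)
	{
		int64_t	val = atomic64_read(head);

		*cycle = val >> 32;		/* high 32 bits: cycle number */
		*space = val & 0xffffffff;	/* low 32 bits: bytes into cycle */
	}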
@@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
935DEFINE_PAGE_EVENT(xfs_releasepage); 939DEFINE_PAGE_EVENT(xfs_releasepage);
936DEFINE_PAGE_EVENT(xfs_invalidatepage); 940DEFINE_PAGE_EVENT(xfs_invalidatepage);
937 941
938DECLARE_EVENT_CLASS(xfs_iomap_class, 942DECLARE_EVENT_CLASS(xfs_imap_class,
939 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 943 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
940 int flags, struct xfs_bmbt_irec *irec), 944 int type, struct xfs_bmbt_irec *irec),
941 TP_ARGS(ip, offset, count, flags, irec), 945 TP_ARGS(ip, offset, count, type, irec),
942 TP_STRUCT__entry( 946 TP_STRUCT__entry(
943 __field(dev_t, dev) 947 __field(dev_t, dev)
944 __field(xfs_ino_t, ino) 948 __field(xfs_ino_t, ino)
@@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
946 __field(loff_t, new_size) 950 __field(loff_t, new_size)
947 __field(loff_t, offset) 951 __field(loff_t, offset)
948 __field(size_t, count) 952 __field(size_t, count)
949 __field(int, flags) 953 __field(int, type)
950 __field(xfs_fileoff_t, startoff) 954 __field(xfs_fileoff_t, startoff)
951 __field(xfs_fsblock_t, startblock) 955 __field(xfs_fsblock_t, startblock)
952 __field(xfs_filblks_t, blockcount) 956 __field(xfs_filblks_t, blockcount)
@@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
958 __entry->new_size = ip->i_new_size; 962 __entry->new_size = ip->i_new_size;
959 __entry->offset = offset; 963 __entry->offset = offset;
960 __entry->count = count; 964 __entry->count = count;
961 __entry->flags = flags; 965 __entry->type = type;
962 __entry->startoff = irec ? irec->br_startoff : 0; 966 __entry->startoff = irec ? irec->br_startoff : 0;
963 __entry->startblock = irec ? irec->br_startblock : 0; 967 __entry->startblock = irec ? irec->br_startblock : 0;
964 __entry->blockcount = irec ? irec->br_blockcount : 0; 968 __entry->blockcount = irec ? irec->br_blockcount : 0;
965 ), 969 ),
966 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 970 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
967 "offset 0x%llx count %zd flags %s " 971 "offset 0x%llx count %zd type %s "
968 "startoff 0x%llx startblock %lld blockcount 0x%llx", 972 "startoff 0x%llx startblock %lld blockcount 0x%llx",
969 MAJOR(__entry->dev), MINOR(__entry->dev), 973 MAJOR(__entry->dev), MINOR(__entry->dev),
970 __entry->ino, 974 __entry->ino,
@@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
972 __entry->new_size, 976 __entry->new_size,
973 __entry->offset, 977 __entry->offset,
974 __entry->count, 978 __entry->count,
975 __print_flags(__entry->flags, "|", BMAPI_FLAGS), 979 __print_symbolic(__entry->type, XFS_IO_TYPES),
976 __entry->startoff, 980 __entry->startoff,
977 (__int64_t)__entry->startblock, 981 (__int64_t)__entry->startblock,
978 __entry->blockcount) 982 __entry->blockcount)
979) 983)
980 984
981#define DEFINE_IOMAP_EVENT(name) \ 985#define DEFINE_IOMAP_EVENT(name) \
982DEFINE_EVENT(xfs_iomap_class, name, \ 986DEFINE_EVENT(xfs_imap_class, name, \
983 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 987 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
984 int flags, struct xfs_bmbt_irec *irec), \ 988 int type, struct xfs_bmbt_irec *irec), \
985 TP_ARGS(ip, offset, count, flags, irec)) 989 TP_ARGS(ip, offset, count, type, irec))
986DEFINE_IOMAP_EVENT(xfs_iomap_enter); 990DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
987DEFINE_IOMAP_EVENT(xfs_iomap_found); 991DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
988DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 992DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
993DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
989 994
990DECLARE_EVENT_CLASS(xfs_simple_io_class, 995DECLARE_EVENT_CLASS(xfs_simple_io_class,
991 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 996 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \
1022 TP_ARGS(ip, offset, count)) 1027 TP_ARGS(ip, offset, count))
1023DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); 1028DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
1024DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); 1029DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
1030DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
1025 1031
1026 1032
1027TRACE_EVENT(xfs_itruncate_start, 1033TRACE_EVENT(xfs_itruncate_start,
@@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
1420 TP_PROTO(struct xfs_alloc_arg *args), \ 1426 TP_PROTO(struct xfs_alloc_arg *args), \
1421 TP_ARGS(args)) 1427 TP_ARGS(args))
1422DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); 1428DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
1429DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
1423DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); 1430DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
1424DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); 1431DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
1425DEFINE_ALLOC_EVENT(xfs_alloc_near_first); 1432DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index faf8e1a83a12..d22aa3103106 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
149 ASSERT(list_empty(&dqp->q_freelist)); 149 ASSERT(list_empty(&dqp->q_freelist));
150 150
151 mutex_destroy(&dqp->q_qlock); 151 mutex_destroy(&dqp->q_qlock);
152 sv_destroy(&dqp->q_pinwait);
153 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); 152 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
154 153
155 atomic_dec(&xfs_Gqm->qm_totaldquots); 154 atomic_dec(&xfs_Gqm->qm_totaldquots);
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 63c7a1a6c022..58632cc17f2d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,7 @@ typedef struct xfs_perag {
227 227
228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
229 229
230 rwlock_t pag_ici_lock; /* incore inode lock */ 230 spinlock_t pag_ici_lock; /* incore inode cache lock */
231 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 231 struct radix_tree_root pag_ici_root; /* incore inode cache root */
232 int pag_ici_reclaimable; /* reclaimable inodes */ 232 int pag_ici_reclaimable; /* reclaimable inodes */
233 struct mutex pag_ici_reclaim_lock; /* serialisation point */ 233 struct mutex pag_ici_reclaim_lock; /* serialisation point */
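The rwlock can become a plain spinlock because, after the xfs_sync.c changes above, nothing takes it for reading any more: lookups run under rcu_read_lock() and revalidate each inode under ip->i_flags_lock, while all modifiers serialise on the spinlock. A condensed sketch of the resulting discipline:

	/* lookup side: no pag_ici_lock at all */
	rcu_read_lock();
	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch,
					  first_index, XFS_LOOKUP_BATCH);
	/* validate and igrab each batch entry before dropping the lock */
	rcu_read_unlock();

	/* modification side: a plain spinlock suffices */
	spin_lock(&pag->pag_ici_lock);
	radix_tree_delete(&pag->pag_ici_root, agino);
	spin_unlock(&pag->pag_ici_lock);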
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 112abc439ca5..fa8723f5870a 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -577,61 +577,58 @@ xfs_alloc_ag_vextent_exact(
577 xfs_extlen_t rlen; /* length of returned extent */ 577 xfs_extlen_t rlen; /* length of returned extent */
578 578
579 ASSERT(args->alignment == 1); 579 ASSERT(args->alignment == 1);
580
580 /* 581 /*
581 * Allocate/initialize a cursor for the by-number freespace btree. 582 * Allocate/initialize a cursor for the by-number freespace btree.
582 */ 583 */
583 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 584 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
584 args->agno, XFS_BTNUM_BNO); 585 args->agno, XFS_BTNUM_BNO);
586
585 /* 587 /*
586 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 588 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
587 * Look for the closest free block <= bno, it must contain bno 589 * Look for the closest free block <= bno, it must contain bno
588 * if any free block does. 590 * if any free block does.
589 */ 591 */
590 if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) 592 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
593 if (error)
591 goto error0; 594 goto error0;
592 if (!i) { 595 if (!i)
593 /* 596 goto not_found;
594 * Didn't find it, return null. 597
595 */
596 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
597 args->agbno = NULLAGBLOCK;
598 return 0;
599 }
600 /* 598 /*
601 * Grab the freespace record. 599 * Grab the freespace record.
602 */ 600 */
603 if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) 601 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
602 if (error)
604 goto error0; 603 goto error0;
605 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 604 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
606 ASSERT(fbno <= args->agbno); 605 ASSERT(fbno <= args->agbno);
607 minend = args->agbno + args->minlen; 606 minend = args->agbno + args->minlen;
608 maxend = args->agbno + args->maxlen; 607 maxend = args->agbno + args->maxlen;
609 fend = fbno + flen; 608 fend = fbno + flen;
609
610 /* 610 /*
611 * Give up if the freespace isn't long enough for the minimum request. 611 * Give up if the freespace isn't long enough for the minimum request.
612 */ 612 */
613 if (fend < minend) { 613 if (fend < minend)
614 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 614 goto not_found;
615 args->agbno = NULLAGBLOCK; 615
616 return 0;
617 }
618 /* 616 /*
619 * End of extent will be smaller of the freespace end and the 617 * End of extent will be smaller of the freespace end and the
620 * maximal requested end. 618 * maximal requested end.
621 */ 619 *
622 end = XFS_AGBLOCK_MIN(fend, maxend);
623 /*
624 * Fix the length according to mod and prod if given. 620 * Fix the length according to mod and prod if given.
625 */ 621 */
622 end = XFS_AGBLOCK_MIN(fend, maxend);
626 args->len = end - args->agbno; 623 args->len = end - args->agbno;
627 xfs_alloc_fix_len(args); 624 xfs_alloc_fix_len(args);
628 if (!xfs_alloc_fix_minleft(args)) { 625 if (!xfs_alloc_fix_minleft(args))
629 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 626 goto not_found;
630 return 0; 627
631 }
632 rlen = args->len; 628 rlen = args->len;
633 ASSERT(args->agbno + rlen <= fend); 629 ASSERT(args->agbno + rlen <= fend);
634 end = args->agbno + rlen; 630 end = args->agbno + rlen;
631
635 /* 632 /*
636 * We are allocating agbno for rlen [agbno .. end] 633 * We are allocating agbno for rlen [agbno .. end]
637 * Allocate/initialize a cursor for the by-size btree. 634 * Allocate/initialize a cursor for the by-size btree.
@@ -640,16 +637,25 @@ xfs_alloc_ag_vextent_exact(
640 args->agno, XFS_BTNUM_CNT); 637 args->agno, XFS_BTNUM_CNT);
641 ASSERT(args->agbno + args->len <= 638 ASSERT(args->agbno + args->len <=
642 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 639 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
643 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 640 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
644 args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { 641 args->len, XFSA_FIXUP_BNO_OK);
642 if (error) {
645 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 643 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
646 goto error0; 644 goto error0;
647 } 645 }
646
648 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 647 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
649 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 648 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
650 649
651 trace_xfs_alloc_exact_done(args);
652 args->wasfromfl = 0; 650 args->wasfromfl = 0;
651 trace_xfs_alloc_exact_done(args);
652 return 0;
653
654not_found:
655 /* Didn't find it, return null. */
656 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
657 args->agbno = NULLAGBLOCK;
658 trace_xfs_alloc_exact_notfound(args);
653 return 0; 659 return 0;
654 660
655error0: 661error0:
@@ -659,6 +665,95 @@ error0:
659} 665}
660 666
661/* 667/*
668 * Search the btree in a given direction via the search cursor and compare
669 * the records found against the good extent we've already found.
670 */
671STATIC int
672xfs_alloc_find_best_extent(
673 struct xfs_alloc_arg *args, /* allocation argument structure */
674 struct xfs_btree_cur **gcur, /* good cursor */
675 struct xfs_btree_cur **scur, /* searching cursor */
676 xfs_agblock_t gdiff, /* difference for search comparison */
677 xfs_agblock_t *sbno, /* extent found by search */
678 xfs_extlen_t *slen,
679 xfs_extlen_t *slena, /* aligned length */
680 int dir) /* 0 = search right, 1 = search left */
681{
682 xfs_agblock_t bno;
683 xfs_agblock_t new;
684 xfs_agblock_t sdiff;
685 int error;
686 int i;
687
688 /* The good extent is perfect, no need to search. */
689 if (!gdiff)
690 goto out_use_good;
691
692 /*
693 * Look until we find a better one, run out of space or run off the end.
694 */
695 do {
696 error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
697 if (error)
698 goto error0;
699 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
700 xfs_alloc_compute_aligned(*sbno, *slen, args->alignment,
701 args->minlen, &bno, slena);
702
703 /*
704 * The good extent is closer than this one.
705 */
706 if (!dir) {
707 if (bno >= args->agbno + gdiff)
708 goto out_use_good;
709 } else {
710 if (bno <= args->agbno - gdiff)
711 goto out_use_good;
712 }
713
714 /*
715 * Same distance, compare length and pick the best.
716 */
717 if (*slena >= args->minlen) {
718 args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
719 xfs_alloc_fix_len(args);
720
721 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
722 args->alignment, *sbno,
723 *slen, &new);
724
725 /*
726 * Choose closer size and invalidate other cursor.
727 */
728 if (sdiff < gdiff)
729 goto out_use_search;
730 goto out_use_good;
731 }
732
733 if (!dir)
734 error = xfs_btree_increment(*scur, 0, &i);
735 else
736 error = xfs_btree_decrement(*scur, 0, &i);
737 if (error)
738 goto error0;
739 } while (i);
740
741out_use_good:
742 xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
743 *scur = NULL;
744 return 0;
745
746out_use_search:
747 xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
748 *gcur = NULL;
749 return 0;
750
751error0:
752 /* caller invalidates cursors */
753 return error;
754}
755
756/*
662 * Allocate a variable extent near bno in the allocation group agno. 757 * Allocate a variable extent near bno in the allocation group agno.
663 * Extent's length (returned in len) will be between minlen and maxlen, 758 * Extent's length (returned in len) will be between minlen and maxlen,
664 * and of the form k * prod + mod unless there's nothing that large. 759 * and of the form k * prod + mod unless there's nothing that large.
@@ -925,203 +1020,45 @@ xfs_alloc_ag_vextent_near(
925 } 1020 }
926 } 1021 }
927 } while (bno_cur_lt || bno_cur_gt); 1022 } while (bno_cur_lt || bno_cur_gt);
1023
928 /* 1024 /*
929 * Got both cursors still active, need to find better entry. 1025 * Got both cursors still active, need to find better entry.
930 */ 1026 */
931 if (bno_cur_lt && bno_cur_gt) { 1027 if (bno_cur_lt && bno_cur_gt) {
932 /*
933 * Left side is long enough, look for a right side entry.
934 */
935 if (ltlena >= args->minlen) { 1028 if (ltlena >= args->minlen) {
936 /* 1029 /*
937 * Fix up the length. 1030 * Left side is good, look for a right side entry.
938 */ 1031 */
939 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1032 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
940 xfs_alloc_fix_len(args); 1033 xfs_alloc_fix_len(args);
941 rlen = args->len; 1034 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
942 ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
943 args->alignment, ltbno, ltlen, &ltnew); 1035 args->alignment, ltbno, ltlen, &ltnew);
1036
1037 error = xfs_alloc_find_best_extent(args,
1038 &bno_cur_lt, &bno_cur_gt,
1039 ltdiff, &gtbno, &gtlen, &gtlena,
1040 0 /* search right */);
1041 } else {
1042 ASSERT(gtlena >= args->minlen);
1043
944 /* 1044 /*
945 * Not perfect. 1045 * Right side is good, look for a left side entry.
946 */
947 if (ltdiff) {
948 /*
949 * Look until we find a better one, run out of
950 * space, or run off the end.
951 */
952 while (bno_cur_lt && bno_cur_gt) {
953 if ((error = xfs_alloc_get_rec(
954 bno_cur_gt, &gtbno,
955 &gtlen, &i)))
956 goto error0;
957 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
958 xfs_alloc_compute_aligned(gtbno, gtlen,
959 args->alignment, args->minlen,
960 &gtbnoa, &gtlena);
961 /*
962 * The left one is clearly better.
963 */
964 if (gtbnoa >= args->agbno + ltdiff) {
965 xfs_btree_del_cursor(
966 bno_cur_gt,
967 XFS_BTREE_NOERROR);
968 bno_cur_gt = NULL;
969 break;
970 }
971 /*
972 * If we reach a big enough entry,
973 * compare the two and pick the best.
974 */
975 if (gtlena >= args->minlen) {
976 args->len =
977 XFS_EXTLEN_MIN(gtlena,
978 args->maxlen);
979 xfs_alloc_fix_len(args);
980 rlen = args->len;
981 gtdiff = xfs_alloc_compute_diff(
982 args->agbno, rlen,
983 args->alignment,
984 gtbno, gtlen, &gtnew);
985 /*
986 * Right side is better.
987 */
988 if (gtdiff < ltdiff) {
989 xfs_btree_del_cursor(
990 bno_cur_lt,
991 XFS_BTREE_NOERROR);
992 bno_cur_lt = NULL;
993 }
994 /*
995 * Left side is better.
996 */
997 else {
998 xfs_btree_del_cursor(
999 bno_cur_gt,
1000 XFS_BTREE_NOERROR);
1001 bno_cur_gt = NULL;
1002 }
1003 break;
1004 }
1005 /*
1006 * Fell off the right end.
1007 */
1008 if ((error = xfs_btree_increment(
1009 bno_cur_gt, 0, &i)))
1010 goto error0;
1011 if (!i) {
1012 xfs_btree_del_cursor(
1013 bno_cur_gt,
1014 XFS_BTREE_NOERROR);
1015 bno_cur_gt = NULL;
1016 break;
1017 }
1018 }
1019 }
1020 /*
1021 * The left side is perfect, trash the right side.
1022 */
1023 else {
1024 xfs_btree_del_cursor(bno_cur_gt,
1025 XFS_BTREE_NOERROR);
1026 bno_cur_gt = NULL;
1027 }
1028 }
1029 /*
1030 * It's the right side that was found first, look left.
1031 */
1032 else {
1033 /*
1034 * Fix up the length.
1035 */ 1046 */
1036 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1047 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1037 xfs_alloc_fix_len(args); 1048 xfs_alloc_fix_len(args);
1038 rlen = args->len; 1049 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1039 gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
1040 args->alignment, gtbno, gtlen, &gtnew); 1050 args->alignment, gtbno, gtlen, &gtnew);
1041 /* 1051
1042 * Right side entry isn't perfect. 1052 error = xfs_alloc_find_best_extent(args,
1043 */ 1053 &bno_cur_gt, &bno_cur_lt,
1044 if (gtdiff) { 1054 gtdiff, &ltbno, &ltlen, &ltlena,
1045 /* 1055 1 /* search left */);
1046 * Look until we find a better one, run out of
1047 * space, or run off the end.
1048 */
1049 while (bno_cur_lt && bno_cur_gt) {
1050 if ((error = xfs_alloc_get_rec(
1051 bno_cur_lt, &ltbno,
1052 &ltlen, &i)))
1053 goto error0;
1054 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1055 xfs_alloc_compute_aligned(ltbno, ltlen,
1056 args->alignment, args->minlen,
1057 &ltbnoa, &ltlena);
1058 /*
1059 * The right one is clearly better.
1060 */
1061 if (ltbnoa <= args->agbno - gtdiff) {
1062 xfs_btree_del_cursor(
1063 bno_cur_lt,
1064 XFS_BTREE_NOERROR);
1065 bno_cur_lt = NULL;
1066 break;
1067 }
1068 /*
1069 * If we reach a big enough entry,
1070 * compare the two and pick the best.
1071 */
1072 if (ltlena >= args->minlen) {
1073 args->len = XFS_EXTLEN_MIN(
1074 ltlena, args->maxlen);
1075 xfs_alloc_fix_len(args);
1076 rlen = args->len;
1077 ltdiff = xfs_alloc_compute_diff(
1078 args->agbno, rlen,
1079 args->alignment,
1080 ltbno, ltlen, &ltnew);
1081 /*
1082 * Left side is better.
1083 */
1084 if (ltdiff < gtdiff) {
1085 xfs_btree_del_cursor(
1086 bno_cur_gt,
1087 XFS_BTREE_NOERROR);
1088 bno_cur_gt = NULL;
1089 }
1090 /*
1091 * Right side is better.
1092 */
1093 else {
1094 xfs_btree_del_cursor(
1095 bno_cur_lt,
1096 XFS_BTREE_NOERROR);
1097 bno_cur_lt = NULL;
1098 }
1099 break;
1100 }
1101 /*
1102 * Fell off the left end.
1103 */
1104 if ((error = xfs_btree_decrement(
1105 bno_cur_lt, 0, &i)))
1106 goto error0;
1107 if (!i) {
1108 xfs_btree_del_cursor(bno_cur_lt,
1109 XFS_BTREE_NOERROR);
1110 bno_cur_lt = NULL;
1111 break;
1112 }
1113 }
1114 }
1115 /*
1116 * The right side is perfect, trash the left side.
1117 */
1118 else {
1119 xfs_btree_del_cursor(bno_cur_lt,
1120 XFS_BTREE_NOERROR);
1121 bno_cur_lt = NULL;
1122 }
1123 } 1056 }
1057
1058 if (error)
1059 goto error0;
1124 } 1060 }
1061
1125 /* 1062 /*
1126 * If we couldn't get anything, give up. 1063 * If we couldn't get anything, give up.
1127 */ 1064 */
@@ -1130,6 +1067,7 @@ xfs_alloc_ag_vextent_near(
1130 args->agbno = NULLAGBLOCK; 1067 args->agbno = NULLAGBLOCK;
1131 return 0; 1068 return 0;
1132 } 1069 }
1070
1133 /* 1071 /*
1134 * At this point we have selected a freespace entry, either to the 1072 * At this point we have selected a freespace entry, either to the
1135 * left or to the right. If it's on the right, copy all the 1073 * left or to the right. If it's on the right, copy all the
@@ -1146,6 +1084,7 @@ xfs_alloc_ag_vextent_near(
1146 j = 1; 1084 j = 1;
1147 } else 1085 } else
1148 j = 0; 1086 j = 0;
1087
1149 /* 1088 /*
1150 * Fix up the length and compute the useful address. 1089 * Fix up the length and compute the useful address.
1151 */ 1090 */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a6cff8edcdb6..71e90dc2aeb1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
637 * It didn't all fit, so we have to sort everything on hashval. 637 * It didn't all fit, so we have to sort everything on hashval.
638 */ 638 */
639 sbsize = sf->hdr.count * sizeof(*sbuf); 639 sbsize = sf->hdr.count * sizeof(*sbuf);
640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); 640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
641 641
642 /* 642 /*
643 * Scan the attribute list for the rest of the entries, storing 643 * Scan the attribute list for the rest of the entries, storing
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2386 args.dp = context->dp; 2386 args.dp = context->dp;
2387 args.whichfork = XFS_ATTR_FORK; 2387 args.whichfork = XFS_ATTR_FORK;
2388 args.valuelen = valuelen; 2388 args.valuelen = valuelen;
2389 args.value = kmem_alloc(valuelen, KM_SLEEP); 2389 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk); 2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); 2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
2392 retval = xfs_attr_rmtval_get(&args); 2392 retval = xfs_attr_rmtval_get(&args);
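KM_NOFS matters in both hunks because the allocations happen while attribute buffers are held: a plain KM_SLEEP allocation could enter direct reclaim, re-enter the filesystem and deadlock on those same buffers. A sketch of the presumed flag translation (an assumption; kmem_flags_convert() in kmem.h is the authoritative mapping):

	static inline gfp_t
	kmem_flags_convert_sketch(unsigned int kmflags)
	{
		gfp_t	lflags = GFP_KERNEL;	/* KM_SLEEP behaviour */

		if (kmflags & KM_NOFS)
			lflags &= ~__GFP_FS;	/* reclaim must not re-enter the fs */
		return lflags;
	}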
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 04f9cca8da7e..2f9e97c128a0 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
634 return error; 634 return error;
635 } 635 }
636 ASSERT(!bp || !XFS_BUF_GETERROR(bp)); 636 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
637 if (bp != NULL) { 637 if (bp)
638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); 638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
639 }
640 *bpp = bp; 639 *bpp = bp;
641 return 0; 640 return 0;
642} 641}
@@ -944,13 +943,13 @@ xfs_btree_set_refs(
944 switch (cur->bc_btnum) { 943 switch (cur->bc_btnum) {
945 case XFS_BTNUM_BNO: 944 case XFS_BTNUM_BNO:
946 case XFS_BTNUM_CNT: 945 case XFS_BTNUM_CNT:
947 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF); 946 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
948 break; 947 break;
949 case XFS_BTNUM_INO: 948 case XFS_BTNUM_INO:
950 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF); 949 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
951 break; 950 break;
952 case XFS_BTNUM_BMAP: 951 case XFS_BTNUM_BMAP:
953 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF); 952 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
954 break; 953 break;
955 default: 954 default:
956 ASSERT(0); 955 ASSERT(0);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2686d0d54c5b..ed2b65f3f8b9 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -142,7 +142,7 @@ xfs_buf_item_log_check(
142#endif 142#endif
143 143
144STATIC void xfs_buf_error_relse(xfs_buf_t *bp); 144STATIC void xfs_buf_error_relse(xfs_buf_t *bp);
145STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); 145STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
146 146
147/* 147/*
148 * This returns the number of log iovecs needed to log the 148 * This returns the number of log iovecs needed to log the
@@ -450,7 +450,7 @@ xfs_buf_item_unpin(
450 * xfs_trans_ail_delete() drops the AIL lock. 450 * xfs_trans_ail_delete() drops the AIL lock.
451 */ 451 */
452 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 452 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
453 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 453 xfs_buf_do_callbacks(bp);
454 XFS_BUF_SET_FSPRIVATE(bp, NULL); 454 XFS_BUF_SET_FSPRIVATE(bp, NULL);
455 XFS_BUF_CLR_IODONE_FUNC(bp); 455 XFS_BUF_CLR_IODONE_FUNC(bp);
456 } else { 456 } else {
@@ -918,15 +918,26 @@ xfs_buf_attach_iodone(
918 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 918 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
919} 919}
920 920
921/*
922 * We can have many callbacks on a buffer. Running the callbacks individually
923 * can cause a lot of contention on the AIL lock, so we allow for a single
924 * callback to scan the remaining lip->li_bio_list for other items
925 * of the same type and callback, and process them in the first call.
926 *
927 * As a result, the loop walking the callback list below will also modify the
928 * list. It removes the first item from the list and then runs the callback.
929 * The loop then restarts from the new head of the list. This allows the
930 * callback to scan and modify the list attached to the buffer and we don't
931 * have to care about maintaining a next item pointer.
932 */
921STATIC void 933STATIC void
922xfs_buf_do_callbacks( 934xfs_buf_do_callbacks(
923 xfs_buf_t *bp, 935 struct xfs_buf *bp)
924 xfs_log_item_t *lip)
925{ 936{
926 xfs_log_item_t *nlip; 937 struct xfs_log_item *lip;
927 938
928 while (lip != NULL) { 939 while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
929 nlip = lip->li_bio_list; 940 XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
930 ASSERT(lip->li_cb != NULL); 941 ASSERT(lip->li_cb != NULL);
931 /* 942 /*
932 * Clear the next pointer so we don't have any 943 * Clear the next pointer so we don't have any
@@ -936,7 +947,6 @@ xfs_buf_do_callbacks(
936 */ 947 */
937 lip->li_bio_list = NULL; 948 lip->li_bio_list = NULL;
938 lip->li_cb(bp, lip); 949 lip->li_cb(bp, lip);
939 lip = nlip;
940 } 950 }
941} 951}
942 952
@@ -970,7 +980,7 @@ xfs_buf_iodone_callbacks(
970 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 980 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
971 XFS_BUF_SUPER_STALE(bp); 981 XFS_BUF_SUPER_STALE(bp);
972 trace_xfs_buf_item_iodone(bp, _RET_IP_); 982 trace_xfs_buf_item_iodone(bp, _RET_IP_);
973 xfs_buf_do_callbacks(bp, lip); 983 xfs_buf_do_callbacks(bp);
974 XFS_BUF_SET_FSPRIVATE(bp, NULL); 984 XFS_BUF_SET_FSPRIVATE(bp, NULL);
975 XFS_BUF_CLR_IODONE_FUNC(bp); 985 XFS_BUF_CLR_IODONE_FUNC(bp);
976 xfs_buf_ioend(bp, 0); 986 xfs_buf_ioend(bp, 0);
@@ -1029,7 +1039,7 @@ xfs_buf_iodone_callbacks(
1029 return; 1039 return;
1030 } 1040 }
1031 1041
1032 xfs_buf_do_callbacks(bp, lip); 1042 xfs_buf_do_callbacks(bp);
1033 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1043 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1034 XFS_BUF_CLR_IODONE_FUNC(bp); 1044 XFS_BUF_CLR_IODONE_FUNC(bp);
1035 xfs_buf_ioend(bp, 0); 1045 xfs_buf_ioend(bp, 0);
@@ -1063,7 +1073,7 @@ xfs_buf_error_relse(
1063 * We have to unpin the pinned buffers so do the 1073 * We have to unpin the pinned buffers so do the
1064 * callbacks. 1074 * callbacks.
1065 */ 1075 */
1066 xfs_buf_do_callbacks(bp, lip); 1076 xfs_buf_do_callbacks(bp);
1067 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1077 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1068 XFS_BUF_CLR_IODONE_FUNC(bp); 1078 XFS_BUF_CLR_IODONE_FUNC(bp);
1069 XFS_BUF_SET_BRELSE_FUNC(bp,NULL); 1079 XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 0e2ed43f16c7..b6ecd2061e7c 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
105 xfs_buf_log_format_t bli_format; /* in-log header */ 105 xfs_buf_log_format_t bli_format; /* in-log header */
106} xfs_buf_log_item_t; 106} xfs_buf_log_item_t;
107 107
108/*
109 * This structure is used during recovery to record the buf log
110 * items which have been canceled and should not be replayed.
111 */
112typedef struct xfs_buf_cancel {
113 xfs_daddr_t bc_blkno;
114 uint bc_len;
115 int bc_refcount;
116 struct xfs_buf_cancel *bc_next;
117} xfs_buf_cancel_t;
118
119void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); 108void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
120void xfs_buf_item_relse(struct xfs_buf *); 109void xfs_buf_item_relse(struct xfs_buf *);
121void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); 110void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a55e687bf562..75f2ef60e579 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -48,6 +48,28 @@ xfs_efi_item_free(
48} 48}
49 49
50/* 50/*
51 * Freeing the efi requires that we remove it from the AIL if it has already
52 * been placed there. However, the EFI may not yet have been placed in the AIL
53 * when called by xfs_efi_release() from EFD processing due to the ordering of
54 * committed vs unpin operations in bulk insert operations. Hence the
55 * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
56 * the EFI.
57 */
58STATIC void
59__xfs_efi_release(
60 struct xfs_efi_log_item *efip)
61{
62 struct xfs_ail *ailp = efip->efi_item.li_ailp;
63
64 if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
65 spin_lock(&ailp->xa_lock);
66 /* xfs_trans_ail_delete() drops the AIL lock. */
67 xfs_trans_ail_delete(ailp, &efip->efi_item);
68 xfs_efi_item_free(efip);
69 }
70}
71
72/*
51 * This returns the number of iovecs needed to log the given efi item. 73 * This returns the number of iovecs needed to log the given efi item.
52 * We only need 1 iovec for an efi item. It just logs the efi_log_format 74 * We only need 1 iovec for an efi item. It just logs the efi_log_format
53 * structure. 75 * structure.
@@ -74,7 +96,8 @@ xfs_efi_item_format(
74 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 96 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
75 uint size; 97 uint size;
76 98
77 ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); 99 ASSERT(atomic_read(&efip->efi_next_extent) ==
100 efip->efi_format.efi_nextents);
78 101
79 efip->efi_format.efi_type = XFS_LI_EFI; 102 efip->efi_format.efi_type = XFS_LI_EFI;
80 103
@@ -99,10 +122,12 @@ xfs_efi_item_pin(
99} 122}
100 123
101/* 124/*
102 * While EFIs cannot really be pinned, the unpin operation is the 125 * While EFIs cannot really be pinned, the unpin operation is the last place at
103 * last place at which the EFI is manipulated during a transaction. 126 * which the EFI is manipulated during a transaction. If we are being asked to
104 * Here we coordinate with xfs_efi_cancel() to determine who gets to 127 * remove the EFI it's because the transaction has been cancelled and by
105 * free the EFI. 128 * definition that means the EFI cannot be in the AIL so remove it from the
129 * transaction and free it. Otherwise coordinate with xfs_efi_release() (via
130 * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
106 */ 131 */
107STATIC void 132STATIC void
108xfs_efi_item_unpin( 133xfs_efi_item_unpin(
@@ -110,20 +135,14 @@ xfs_efi_item_unpin(
110 int remove) 135 int remove)
111{ 136{
112 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 137 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
113 struct xfs_ail *ailp = lip->li_ailp;
114
115 spin_lock(&ailp->xa_lock);
116 if (efip->efi_flags & XFS_EFI_CANCELED) {
117 if (remove)
118 xfs_trans_del_item(lip);
119 138
120 /* xfs_trans_ail_delete() drops the AIL lock. */ 139 if (remove) {
121 xfs_trans_ail_delete(ailp, lip); 140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
141 xfs_trans_del_item(lip);
122 xfs_efi_item_free(efip); 142 xfs_efi_item_free(efip);
123 } else { 143 return;
124 efip->efi_flags |= XFS_EFI_COMMITTED;
125 spin_unlock(&ailp->xa_lock);
126 } 144 }
145 __xfs_efi_release(efip);
127} 146}
128 147
129/* 148/*
@@ -152,16 +171,20 @@ xfs_efi_item_unlock(
152} 171}
153 172
154/* 173/*
155 * The EFI is logged only once and cannot be moved in the log, so 174 * The EFI is logged only once and cannot be moved in the log, so simply return
156 * simply return the lsn at which it's been logged. The canceled 175 * the lsn at which it's been logged. For bulk transaction committed
157 * flag is not paid any attention here. Checking for that is delayed 176 * processing, the EFI may be processed but not yet unpinned prior to the EFD
158 * until the EFI is unpinned. 177 * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
178 * when processing the EFD.
159 */ 179 */
160STATIC xfs_lsn_t 180STATIC xfs_lsn_t
161xfs_efi_item_committed( 181xfs_efi_item_committed(
162 struct xfs_log_item *lip, 182 struct xfs_log_item *lip,
163 xfs_lsn_t lsn) 183 xfs_lsn_t lsn)
164{ 184{
185 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
186
187 set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
165 return lsn; 188 return lsn;
166} 189}
167 190
@@ -230,6 +253,7 @@ xfs_efi_init(
230 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); 253 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
231 efip->efi_format.efi_nextents = nextents; 254 efip->efi_format.efi_nextents = nextents;
232 efip->efi_format.efi_id = (__psint_t)(void*)efip; 255 efip->efi_format.efi_id = (__psint_t)(void*)efip;
256 atomic_set(&efip->efi_next_extent, 0);
233 257
234 return efip; 258 return efip;
235} 259}
@@ -289,37 +313,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
289} 313}
290 314
291/* 315/*
292 * This is called by the efd item code below to release references to 316 * This is called by the efd item code below to release references to the given
293 * the given efi item. Each efd calls this with the number of 317 * efi item. Each efd calls this with the number of extents that it has
294 * extents that it has logged, and when the sum of these reaches 318 * logged, and when the sum of these reaches the total number of extents logged
295 * the total number of extents logged by this efi item we can free 319 * by this efi item we can free the efi item.
296 * the efi item.
297 *
298 * Freeing the efi item requires that we remove it from the AIL.
299 * We'll use the AIL lock to protect our counters as well as
300 * the removal from the AIL.
301 */ 320 */
302void 321void
303xfs_efi_release(xfs_efi_log_item_t *efip, 322xfs_efi_release(xfs_efi_log_item_t *efip,
304 uint nextents) 323 uint nextents)
305{ 324{
306 struct xfs_ail *ailp = efip->efi_item.li_ailp; 325 ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
307 int extents_left; 326 if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
308 327 __xfs_efi_release(efip);
309 ASSERT(efip->efi_next_extent > 0);
310 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
311
312 spin_lock(&ailp->xa_lock);
313 ASSERT(efip->efi_next_extent >= nextents);
314 efip->efi_next_extent -= nextents;
315 extents_left = efip->efi_next_extent;
316 if (extents_left == 0) {
317 /* xfs_trans_ail_delete() drops the AIL lock. */
318 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
319 xfs_efi_item_free(efip);
320 } else {
321 spin_unlock(&ailp->xa_lock);
322 }
323} 328}
324 329
325static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) 330static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
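The lifetime rules established above compress to three steps; the snippet restates them with the names from this patch (a summary sketch, not additional patch code):

	/* 1. ->iop_committed arms one extra "release token" */
	set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);

	/* 2. the EFD side drops its reference once all extents are in */
	if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
		__xfs_efi_release(efip);

	/* 3. unpin and EFD release can run in either order: inside
	 * __xfs_efi_release() the first caller merely consumes the
	 * token, the second finds it clear and removes the EFI from
	 * the AIL and frees it, so exactly one party tears it down. */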
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0d22c56fdf64..375f68e42531 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
111#define XFS_EFI_MAX_FAST_EXTENTS 16 111#define XFS_EFI_MAX_FAST_EXTENTS 16
112 112
113/* 113/*
114 * Define EFI flags. 114 * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
115 */ 115 */
116#define XFS_EFI_RECOVERED 0x1 116#define XFS_EFI_RECOVERED 1
117#define XFS_EFI_COMMITTED 0x2 117#define XFS_EFI_COMMITTED 2
118#define XFS_EFI_CANCELED 0x4
119 118
120/* 119/*
121 * This is the "extent free intention" log item. It is used 120 * This is the "extent free intention" log item. It is used
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
125 */ 124 */
126typedef struct xfs_efi_log_item { 125typedef struct xfs_efi_log_item {
127 xfs_log_item_t efi_item; 126 xfs_log_item_t efi_item;
128 uint efi_flags; /* misc flags */ 127 atomic_t efi_next_extent;
129 uint efi_next_extent; 128 unsigned long efi_flags; /* misc flags */
130 xfs_efi_log_format_t efi_format; 129 xfs_efi_log_format_t efi_format;
131} xfs_efi_log_item_t; 130} xfs_efi_log_item_t;
132 131
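
The switch from mask values (0x1/0x2) to bit numbers (1/2) above is what allows the lockless flag updates elsewhere in this patch. A minimal sketch, assuming only the generic kernel bitops API:

	#include <linux/bitops.h>

	/* sketch: bit numbers let the atomic bitops replace locked read-modify-write */
	static void efi_flag_demo(unsigned long *flags)
	{
		set_bit(XFS_EFI_COMMITTED, flags);	/* atomic set of bit 2 */
		if (test_bit(XFS_EFI_COMMITTED, flags))	/* atomic single-bit read */
			clear_bit(XFS_EFI_COMMITTED, flags);
	}
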
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a7c116e814af..f56d30e8040c 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -374,6 +374,7 @@ xfs_growfs_data_private(
374 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 374 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
375 } else 375 } else
376 mp->m_maxicount = 0; 376 mp->m_maxicount = 0;
377 xfs_set_low_space_thresholds(mp);
377 378
378 /* update secondary superblocks. */ 379 /* update secondary superblocks. */
379 for (agno = 1; agno < nagcount; agno++) { 380 for (agno = 1; agno < nagcount; agno++) {
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index d7de5a3f7867..cb9b6d1469f7 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,6 +43,17 @@
43 43
44 44
45/* 45/*
46 * Define xfs inode iolock lockdep classes. We need to ensure that all active
47 * inodes are considered the same for lockdep purposes, including inodes that
 48 * are recycled through the XFS_IRECLAIMABLE state. This is the only way to
 49 * guarantee the locks are considered the same when there are multiple lock
 50 * initialisation sites. Also, define a reclaimable inode class so it is
51 * obvious in lockdep reports which class the report is against.
52 */
53static struct lock_class_key xfs_iolock_active;
54struct lock_class_key xfs_iolock_reclaimable;
55
56/*
46 * Allocate and initialise an xfs_inode. 57 * Allocate and initialise an xfs_inode.
47 */ 58 */
48STATIC struct xfs_inode * 59STATIC struct xfs_inode *
@@ -69,8 +80,11 @@ xfs_inode_alloc(
69 ASSERT(atomic_read(&ip->i_pincount) == 0); 80 ASSERT(atomic_read(&ip->i_pincount) == 0);
70 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 81 ASSERT(!spin_is_locked(&ip->i_flags_lock));
71 ASSERT(completion_done(&ip->i_flush)); 82 ASSERT(completion_done(&ip->i_flush));
83 ASSERT(ip->i_ino == 0);
72 84
73 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 85 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
86 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
87 &xfs_iolock_active, "xfs_iolock_active");
74 88
75 /* initialise the xfs inode */ 89 /* initialise the xfs inode */
76 ip->i_ino = ino; 90 ip->i_ino = ino;
@@ -85,9 +99,6 @@ xfs_inode_alloc(
85 ip->i_size = 0; 99 ip->i_size = 0;
86 ip->i_new_size = 0; 100 ip->i_new_size = 0;
87 101
88 /* prevent anyone from using this yet */
89 VFS_I(ip)->i_state = I_NEW;
90
91 return ip; 102 return ip;
92} 103}
93 104
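
The lockdep pattern above generalises: a single static key shared by every initialisation site makes all instances one lockdep class. A sketch with generic names, assuming only the stock lockdep API:

	static struct lock_class_key demo_lock_class;

	static void demo_lock_init(struct rw_semaphore *sem)
	{
		init_rwsem(sem);
		/* all locks set up through here compare equal for lockdep */
		lockdep_set_class_and_name(sem, &demo_lock_class,
					   "demo_lock_class");
	}
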
@@ -145,7 +156,18 @@ xfs_inode_free(
145 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 156 ASSERT(!spin_is_locked(&ip->i_flags_lock));
146 ASSERT(completion_done(&ip->i_flush)); 157 ASSERT(completion_done(&ip->i_flush));
147 158
148 call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback); 159 /*
160 * Because we use RCU freeing we need to ensure the inode always
161 * appears to be reclaimed with an invalid inode number when in the
162 * free state. The ip->i_flags_lock provides the barrier against lookup
163 * races.
164 */
165 spin_lock(&ip->i_flags_lock);
166 ip->i_flags = XFS_IRECLAIM;
167 ip->i_ino = 0;
168 spin_unlock(&ip->i_flags_lock);
169
170 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
149} 171}
150 172
151/* 173/*
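
The free-side protocol above can be sketched generically (hypothetical structure and names, not the patch itself): zero the identity under the object's spinlock before call_rcu(), so an RCU-walking lookup that still sees the stale radix tree entry can detect the freed object.

	struct demo_obj {
		spinlock_t	lock;
		unsigned long	id;
		struct rcu_head	rcu_head;
	};

	static void demo_free_cb(struct rcu_head *head);	/* hypothetical */

	static void demo_free(struct demo_obj *obj)
	{
		spin_lock(&obj->lock);
		obj->id = 0;		/* lookups revalidate id under obj->lock */
		spin_unlock(&obj->lock);
		call_rcu(&obj->rcu_head, demo_free_cb);
	}
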
@@ -155,14 +177,29 @@ static int
155xfs_iget_cache_hit( 177xfs_iget_cache_hit(
156 struct xfs_perag *pag, 178 struct xfs_perag *pag,
157 struct xfs_inode *ip, 179 struct xfs_inode *ip,
180 xfs_ino_t ino,
158 int flags, 181 int flags,
159 int lock_flags) __releases(pag->pag_ici_lock) 182 int lock_flags) __releases(RCU)
160{ 183{
161 struct inode *inode = VFS_I(ip); 184 struct inode *inode = VFS_I(ip);
162 struct xfs_mount *mp = ip->i_mount; 185 struct xfs_mount *mp = ip->i_mount;
163 int error; 186 int error;
164 187
188 /*
189 * check for re-use of an inode within an RCU grace period due to the
190 * radix tree nodes not being updated yet. We monitor for this by
191 * setting the inode number to zero before freeing the inode structure.
192 * If the inode has been reallocated and set up, then the inode number
193 * will not match, so check for that, too.
194 */
165 spin_lock(&ip->i_flags_lock); 195 spin_lock(&ip->i_flags_lock);
196 if (ip->i_ino != ino) {
197 trace_xfs_iget_skip(ip);
198 XFS_STATS_INC(xs_ig_frecycle);
199 error = EAGAIN;
200 goto out_error;
201 }
202
166 203
167 /* 204 /*
168 * If we are racing with another cache hit that is currently 205 * If we are racing with another cache hit that is currently
@@ -205,7 +242,7 @@ xfs_iget_cache_hit(
205 ip->i_flags |= XFS_IRECLAIM; 242 ip->i_flags |= XFS_IRECLAIM;
206 243
207 spin_unlock(&ip->i_flags_lock); 244 spin_unlock(&ip->i_flags_lock);
208 read_unlock(&pag->pag_ici_lock); 245 rcu_read_unlock();
209 246
210 error = -inode_init_always(mp->m_super, inode); 247 error = -inode_init_always(mp->m_super, inode);
211 if (error) { 248 if (error) {
@@ -213,7 +250,7 @@ xfs_iget_cache_hit(
213 * Re-initializing the inode failed, and we are in deep 250 * Re-initializing the inode failed, and we are in deep
214 * trouble. Try to re-add it to the reclaim list. 251 * trouble. Try to re-add it to the reclaim list.
215 */ 252 */
216 read_lock(&pag->pag_ici_lock); 253 rcu_read_lock();
217 spin_lock(&ip->i_flags_lock); 254 spin_lock(&ip->i_flags_lock);
218 255
219 ip->i_flags &= ~XFS_INEW; 256 ip->i_flags &= ~XFS_INEW;
@@ -223,14 +260,20 @@ xfs_iget_cache_hit(
223 goto out_error; 260 goto out_error;
224 } 261 }
225 262
226 write_lock(&pag->pag_ici_lock); 263 spin_lock(&pag->pag_ici_lock);
227 spin_lock(&ip->i_flags_lock); 264 spin_lock(&ip->i_flags_lock);
228 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); 265 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
229 ip->i_flags |= XFS_INEW; 266 ip->i_flags |= XFS_INEW;
230 __xfs_inode_clear_reclaim_tag(mp, pag, ip); 267 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
231 inode->i_state = I_NEW; 268 inode->i_state = I_NEW;
269
270 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
271 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
272 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
273 &xfs_iolock_active, "xfs_iolock_active");
274
232 spin_unlock(&ip->i_flags_lock); 275 spin_unlock(&ip->i_flags_lock);
233 write_unlock(&pag->pag_ici_lock); 276 spin_unlock(&pag->pag_ici_lock);
234 } else { 277 } else {
235 /* If the VFS inode is being torn down, pause and try again. */ 278 /* If the VFS inode is being torn down, pause and try again. */
236 if (!igrab(inode)) { 279 if (!igrab(inode)) {
@@ -241,7 +284,7 @@ xfs_iget_cache_hit(
241 284
242 /* We've got a live one. */ 285 /* We've got a live one. */
243 spin_unlock(&ip->i_flags_lock); 286 spin_unlock(&ip->i_flags_lock);
244 read_unlock(&pag->pag_ici_lock); 287 rcu_read_unlock();
245 trace_xfs_iget_hit(ip); 288 trace_xfs_iget_hit(ip);
246 } 289 }
247 290
@@ -255,7 +298,7 @@ xfs_iget_cache_hit(
255 298
256out_error: 299out_error:
257 spin_unlock(&ip->i_flags_lock); 300 spin_unlock(&ip->i_flags_lock);
258 read_unlock(&pag->pag_ici_lock); 301 rcu_read_unlock();
259 return error; 302 return error;
260} 303}
261 304
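
The matching lookup side of that protocol, again as a sketch with the same hypothetical names: revalidate the identity under the spinlock after the RCU lookup and retry on a mismatch, which is exactly the shape of the EAGAIN path added above.

	static struct demo_obj *demo_lookup(struct radix_tree_root *tree,
					    unsigned long key)
	{
		struct demo_obj *obj;
	again:
		rcu_read_lock();
		obj = radix_tree_lookup(tree, key);
		if (obj) {
			spin_lock(&obj->lock);
			if (obj->id != key) {	/* freed or recycled: retry */
				spin_unlock(&obj->lock);
				rcu_read_unlock();
				goto again;
			}
			spin_unlock(&obj->lock); /* valid: caller takes a ref */
		}
		rcu_read_unlock();
		return obj;
	}
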
@@ -308,7 +351,7 @@ xfs_iget_cache_miss(
308 BUG(); 351 BUG();
309 } 352 }
310 353
311 write_lock(&pag->pag_ici_lock); 354 spin_lock(&pag->pag_ici_lock);
312 355
313 /* insert the new inode */ 356 /* insert the new inode */
314 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 357 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
@@ -323,14 +366,14 @@ xfs_iget_cache_miss(
323 ip->i_udquot = ip->i_gdquot = NULL; 366 ip->i_udquot = ip->i_gdquot = NULL;
324 xfs_iflags_set(ip, XFS_INEW); 367 xfs_iflags_set(ip, XFS_INEW);
325 368
326 write_unlock(&pag->pag_ici_lock); 369 spin_unlock(&pag->pag_ici_lock);
327 radix_tree_preload_end(); 370 radix_tree_preload_end();
328 371
329 *ipp = ip; 372 *ipp = ip;
330 return 0; 373 return 0;
331 374
332out_preload_end: 375out_preload_end:
333 write_unlock(&pag->pag_ici_lock); 376 spin_unlock(&pag->pag_ici_lock);
334 radix_tree_preload_end(); 377 radix_tree_preload_end();
335 if (lock_flags) 378 if (lock_flags)
336 xfs_iunlock(ip, lock_flags); 379 xfs_iunlock(ip, lock_flags);
@@ -377,7 +420,7 @@ xfs_iget(
377 xfs_agino_t agino; 420 xfs_agino_t agino;
378 421
379 /* reject inode numbers outside existing AGs */ 422 /* reject inode numbers outside existing AGs */
380 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) 423 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
381 return EINVAL; 424 return EINVAL;
382 425
383 /* get the perag structure and ensure that it's inode capable */ 426 /* get the perag structure and ensure that it's inode capable */
@@ -386,15 +429,15 @@ xfs_iget(
386 429
387again: 430again:
388 error = 0; 431 error = 0;
389 read_lock(&pag->pag_ici_lock); 432 rcu_read_lock();
390 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 433 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
391 434
392 if (ip) { 435 if (ip) {
393 error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); 436 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
394 if (error) 437 if (error)
395 goto out_error_or_again; 438 goto out_error_or_again;
396 } else { 439 } else {
397 read_unlock(&pag->pag_ici_lock); 440 rcu_read_unlock();
398 XFS_STATS_INC(xs_ig_missed); 441 XFS_STATS_INC(xs_ig_missed);
399 442
400 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 443 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 108c7a085f94..be7cf625421f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -887,7 +887,7 @@ xfs_iread(
887 * around for a while. This helps to keep recently accessed 887 * around for a while. This helps to keep recently accessed
888 * meta-data in-core longer. 888 * meta-data in-core longer.
889 */ 889 */
890 XFS_BUF_SET_REF(bp, XFS_INO_REF); 890 xfs_buf_set_ref(bp, XFS_INO_REF);
891 891
892 /* 892 /*
893 * Use xfs_trans_brelse() to release the buffer containing the 893 * Use xfs_trans_brelse() to release the buffer containing the
@@ -2000,17 +2000,33 @@ xfs_ifree_cluster(
2000 */ 2000 */
2001 for (i = 0; i < ninodes; i++) { 2001 for (i = 0; i < ninodes; i++) {
2002retry: 2002retry:
2003 read_lock(&pag->pag_ici_lock); 2003 rcu_read_lock();
2004 ip = radix_tree_lookup(&pag->pag_ici_root, 2004 ip = radix_tree_lookup(&pag->pag_ici_root,
2005 XFS_INO_TO_AGINO(mp, (inum + i))); 2005 XFS_INO_TO_AGINO(mp, (inum + i)));
2006 2006
2007 /* Inode not in memory or stale, nothing to do */ 2007 /* Inode not in memory, nothing to do */
2008 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 2008 if (!ip) {
2009 read_unlock(&pag->pag_ici_lock); 2009 rcu_read_unlock();
2010 continue; 2010 continue;
2011 } 2011 }
2012 2012
2013 /* 2013 /*
2014 * because this is an RCU protected lookup, we could
2015 * find a recently freed or even reallocated inode
2016 * during the lookup. We need to check under the
2017 * i_flags_lock for a valid inode here. Skip it if it
2018 * is not valid, the wrong inode or stale.
2019 */
2020 spin_lock(&ip->i_flags_lock);
2021 if (ip->i_ino != inum + i ||
2022 __xfs_iflags_test(ip, XFS_ISTALE)) {
2023 spin_unlock(&ip->i_flags_lock);
2024 rcu_read_unlock();
2025 continue;
2026 }
2027 spin_unlock(&ip->i_flags_lock);
2028
2029 /*
2014 * Don't try to lock/unlock the current inode, but we 2030 * Don't try to lock/unlock the current inode, but we
2015 * _cannot_ skip the other inodes that we did not find 2031 * _cannot_ skip the other inodes that we did not find
2016 * in the list attached to the buffer and are not 2032 * in the list attached to the buffer and are not
@@ -2019,11 +2035,11 @@ retry:
2019 */ 2035 */
2020 if (ip != free_ip && 2036 if (ip != free_ip &&
2021 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2037 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2022 read_unlock(&pag->pag_ici_lock); 2038 rcu_read_unlock();
2023 delay(1); 2039 delay(1);
2024 goto retry; 2040 goto retry;
2025 } 2041 }
2026 read_unlock(&pag->pag_ici_lock); 2042 rcu_read_unlock();
2027 2043
2028 xfs_iflock(ip); 2044 xfs_iflock(ip);
2029 xfs_iflags_set(ip, XFS_ISTALE); 2045 xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
2629 2645
2630 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2646 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2631 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2647 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2632 read_lock(&pag->pag_ici_lock); 2648 rcu_read_lock();
2633 /* really need a gang lookup range call here */ 2649 /* really need a gang lookup range call here */
2634 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2650 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2635 first_index, inodes_per_cluster); 2651 first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
2640 iq = ilist[i]; 2656 iq = ilist[i];
2641 if (iq == ip) 2657 if (iq == ip)
2642 continue; 2658 continue;
2643 /* if the inode lies outside this cluster, we're done. */ 2659
2644 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) 2660 /*
2645 break; 2661 * because this is an RCU protected lookup, we could find a
2662 * recently freed or even reallocated inode during the lookup.
2663 * We need to check under the i_flags_lock for a valid inode
2664 * here. Skip it if it is not valid or the wrong inode.
2665 */
 2666 	spin_lock(&iq->i_flags_lock);
 2667 	if (!iq->i_ino ||
 2668 	    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
 2669 		spin_unlock(&iq->i_flags_lock);
 2670 		continue;
 2671 	}
 2672 	spin_unlock(&iq->i_flags_lock);
2673
2646 /* 2674 /*
2647 * Do an un-protected check to see if the inode is dirty and 2675 * Do an un-protected check to see if the inode is dirty and
2648 * is a candidate for flushing. These checks will be repeated 2676 * is a candidate for flushing. These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
2692 } 2720 }
2693 2721
2694out_free: 2722out_free:
2695 read_unlock(&pag->pag_ici_lock); 2723 rcu_read_unlock();
2696 kmem_free(ilist); 2724 kmem_free(ilist);
2697out_put: 2725out_put:
2698 xfs_perag_put(pag); 2726 xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
2704 * Corruption detected in the clustering loop. Invalidate the 2732 * Corruption detected in the clustering loop. Invalidate the
2705 * inode buffer and shut down the filesystem. 2733 * inode buffer and shut down the filesystem.
2706 */ 2734 */
2707 read_unlock(&pag->pag_ici_lock); 2735 rcu_read_unlock();
2708 /* 2736 /*
2709 * Clean up the buffer. If it was B_DELWRI, just release it -- 2737 * Clean up the buffer. If it was B_DELWRI, just release it --
2710 * brelse can handle it with no problems. If not, shut down the 2738 * brelse can handle it with no problems. If not, shut down the
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fb2ca2e4cdc9..5c95fa8ec11d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
376/* 376/*
377 * In-core inode flags. 377 * In-core inode flags.
378 */ 378 */
379#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ 379#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
380#define XFS_ISTALE 0x0002 /* inode has been staled */ 380#define XFS_ISTALE 0x0002 /* inode has been staled */
381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ 381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
382#define XFS_INEW 0x0008 /* inode has just been allocated */ 382#define XFS_INEW 0x0008 /* inode has just been allocated */
383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ 383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ 384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
385#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
385 386
386/* 387/*
387 * Flags for inode locking. 388 * Flags for inode locking.
@@ -438,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
438#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) 439#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
439#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 440#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
440 441
442extern struct lock_class_key xfs_iolock_reclaimable;
443
441/* 444/*
442 * Flags for xfs_itruncate_start(). 445 * Flags for xfs_itruncate_start().
443 */ 446 */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7c8d30c453c3..fd4f398bd6f1 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -842,15 +842,64 @@ xfs_inode_item_destroy(
842 * flushed to disk. It is responsible for removing the inode item 842 * flushed to disk. It is responsible for removing the inode item
843 * from the AIL if it has not been re-logged, and unlocking the inode's 843 * from the AIL if it has not been re-logged, and unlocking the inode's
844 * flush lock. 844 * flush lock.
845 *
846 * To reduce AIL lock traffic as much as possible, we scan the buffer log item
 847 * list for other inode items that will run this function. We remove them from the
848 * buffer list so we can process all the inode IO completions in one AIL lock
849 * traversal.
845 */ 850 */
846void 851void
847xfs_iflush_done( 852xfs_iflush_done(
848 struct xfs_buf *bp, 853 struct xfs_buf *bp,
849 struct xfs_log_item *lip) 854 struct xfs_log_item *lip)
850{ 855{
851 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 856 struct xfs_inode_log_item *iip;
852 xfs_inode_t *ip = iip->ili_inode; 857 struct xfs_log_item *blip;
858 struct xfs_log_item *next;
859 struct xfs_log_item *prev;
853 struct xfs_ail *ailp = lip->li_ailp; 860 struct xfs_ail *ailp = lip->li_ailp;
861 int need_ail = 0;
862
863 /*
864 * Scan the buffer IO completions for other inodes being completed and
865 * attach them to the current inode log item.
866 */
867 blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
868 prev = NULL;
869 while (blip != NULL) {
 870 		if (blip->li_cb != xfs_iflush_done) {
871 prev = blip;
872 blip = blip->li_bio_list;
873 continue;
874 }
875
876 /* remove from list */
877 next = blip->li_bio_list;
878 if (!prev) {
879 XFS_BUF_SET_FSPRIVATE(bp, next);
880 } else {
881 prev->li_bio_list = next;
882 }
883
884 /* add to current list */
885 blip->li_bio_list = lip->li_bio_list;
886 lip->li_bio_list = blip;
887
888 /*
889 * while we have the item, do the unlocked check for needing
890 * the AIL lock.
891 */
892 iip = INODE_ITEM(blip);
893 if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
894 need_ail++;
895
896 blip = next;
897 }
898
899 /* make sure we capture the state of the initial inode. */
900 iip = INODE_ITEM(lip);
901 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
902 need_ail++;
854 903
855 /* 904 /*
856 * We only want to pull the item from the AIL if it is 905 * We only want to pull the item from the AIL if it is
@@ -861,28 +910,37 @@ xfs_iflush_done(
861 * the lock since it's cheaper, and then we recheck while 910 * the lock since it's cheaper, and then we recheck while
862 * holding the lock before removing the inode from the AIL. 911 * holding the lock before removing the inode from the AIL.
863 */ 912 */
864 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { 913 if (need_ail) {
914 struct xfs_log_item *log_items[need_ail];
915 int i = 0;
865 spin_lock(&ailp->xa_lock); 916 spin_lock(&ailp->xa_lock);
866 if (lip->li_lsn == iip->ili_flush_lsn) { 917 for (blip = lip; blip; blip = blip->li_bio_list) {
867 /* xfs_trans_ail_delete() drops the AIL lock. */ 918 iip = INODE_ITEM(blip);
868 xfs_trans_ail_delete(ailp, lip); 919 if (iip->ili_logged &&
869 } else { 920 blip->li_lsn == iip->ili_flush_lsn) {
870 spin_unlock(&ailp->xa_lock); 921 log_items[i++] = blip;
922 }
923 ASSERT(i <= need_ail);
871 } 924 }
925 /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
926 xfs_trans_ail_delete_bulk(ailp, log_items, i);
872 } 927 }
873 928
874 iip->ili_logged = 0;
875 929
876 /* 930 /*
 877 	 * Clear the ili_last_fields bits now that we know that the 931 	 * Clean up and unlock the flush lock now that we are done. We can clear the
878 * data corresponding to them is safely on disk. 932 * ili_last_fields bits now that we know that the data corresponding to
933 * them is safely on disk.
879 */ 934 */
880 iip->ili_last_fields = 0; 935 for (blip = lip; blip; blip = next) {
936 next = blip->li_bio_list;
937 blip->li_bio_list = NULL;
881 938
882 /* 939 iip = INODE_ITEM(blip);
883 * Release the inode's flush lock since we're done with it. 940 iip->ili_logged = 0;
884 */ 941 iip->ili_last_fields = 0;
885 xfs_ifunlock(ip); 942 xfs_ifunlock(iip->ili_inode);
943 }
886} 944}
887 945
888/* 946/*
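
Structurally, xfs_iflush_done() now follows a two-pass batching idiom; a condensed sketch with hypothetical types (demo_item, demo_bulk_remove) rather than the real log item lists:

	struct demo_item {
		struct demo_item	*next;
		int			needs_removal;
	};

	/* hypothetical: removes all items, dropping *lock before it returns */
	void demo_bulk_remove(struct demo_item **items, int count);

	static void demo_bulk_complete(struct demo_item *head, spinlock_t *lock)
	{
		struct demo_item *it;
		int need = 0, i = 0;

		for (it = head; it; it = it->next)
			if (it->needs_removal)		/* cheap unlocked pre-check */
				need++;
		if (!need)
			return;

		spin_lock(lock);
		{
			struct demo_item *batch[need];	/* VLA, as the patch uses */

			for (it = head; it; it = it->next)
				if (it->needs_removal)	/* recheck under the lock */
					batch[i++] = it;
			demo_bulk_remove(batch, i);
		}
	}
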
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 20576146369f..55582bd66659 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,127 +47,8 @@
47 47
48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
49 << mp->m_writeio_log) 49 << mp->m_writeio_log)
50#define XFS_STRAT_WRITE_IMAPS 2
51#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 50#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
52 51
53STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
54 int, struct xfs_bmbt_irec *, int *);
55STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
56 struct xfs_bmbt_irec *, int *);
57STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
58 struct xfs_bmbt_irec *, int *);
59
60int
61xfs_iomap(
62 struct xfs_inode *ip,
63 xfs_off_t offset,
64 ssize_t count,
65 int flags,
66 struct xfs_bmbt_irec *imap,
67 int *nimaps,
68 int *new)
69{
70 struct xfs_mount *mp = ip->i_mount;
71 xfs_fileoff_t offset_fsb, end_fsb;
72 int error = 0;
73 int lockmode = 0;
74 int bmapi_flags = 0;
75
76 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
77
78 *new = 0;
79
80 if (XFS_FORCED_SHUTDOWN(mp))
81 return XFS_ERROR(EIO);
82
83 trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
84
85 switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
86 case BMAPI_READ:
87 lockmode = xfs_ilock_map_shared(ip);
88 bmapi_flags = XFS_BMAPI_ENTIRE;
89 break;
90 case BMAPI_WRITE:
91 lockmode = XFS_ILOCK_EXCL;
92 if (flags & BMAPI_IGNSTATE)
93 bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
94 xfs_ilock(ip, lockmode);
95 break;
96 case BMAPI_ALLOCATE:
97 lockmode = XFS_ILOCK_SHARED;
98 bmapi_flags = XFS_BMAPI_ENTIRE;
99
100 /* Attempt non-blocking lock */
101 if (flags & BMAPI_TRYLOCK) {
102 if (!xfs_ilock_nowait(ip, lockmode))
103 return XFS_ERROR(EAGAIN);
104 } else {
105 xfs_ilock(ip, lockmode);
106 }
107 break;
108 default:
109 BUG();
110 }
111
112 ASSERT(offset <= mp->m_maxioffset);
113 if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
114 count = mp->m_maxioffset - offset;
115 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
116 offset_fsb = XFS_B_TO_FSBT(mp, offset);
117
118 error = xfs_bmapi(NULL, ip, offset_fsb,
119 (xfs_filblks_t)(end_fsb - offset_fsb),
120 bmapi_flags, NULL, 0, imap,
121 nimaps, NULL);
122
123 if (error)
124 goto out;
125
126 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
127 case BMAPI_WRITE:
128 /* If we found an extent, return it */
129 if (*nimaps &&
130 (imap->br_startblock != HOLESTARTBLOCK) &&
131 (imap->br_startblock != DELAYSTARTBLOCK)) {
132 trace_xfs_iomap_found(ip, offset, count, flags, imap);
133 break;
134 }
135
136 if (flags & BMAPI_DIRECT) {
137 error = xfs_iomap_write_direct(ip, offset, count, flags,
138 imap, nimaps);
139 } else {
140 error = xfs_iomap_write_delay(ip, offset, count, flags,
141 imap, nimaps);
142 }
143 if (!error) {
144 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
145 }
146 *new = 1;
147 break;
148 case BMAPI_ALLOCATE:
149 /* If we found an extent, return it */
150 xfs_iunlock(ip, lockmode);
151 lockmode = 0;
152
153 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
154 trace_xfs_iomap_found(ip, offset, count, flags, imap);
155 break;
156 }
157
158 error = xfs_iomap_write_allocate(ip, offset, count,
159 imap, nimaps);
160 break;
161 }
162
163 ASSERT(*nimaps <= 1);
164
165out:
166 if (lockmode)
167 xfs_iunlock(ip, lockmode);
168 return XFS_ERROR(error);
169}
170
171STATIC int 52STATIC int
172xfs_iomap_eof_align_last_fsb( 53xfs_iomap_eof_align_last_fsb(
173 xfs_mount_t *mp, 54 xfs_mount_t *mp,
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
236 return EFSCORRUPTED; 117 return EFSCORRUPTED;
237} 118}
238 119
239STATIC int 120int
240xfs_iomap_write_direct( 121xfs_iomap_write_direct(
241 xfs_inode_t *ip, 122 xfs_inode_t *ip,
242 xfs_off_t offset, 123 xfs_off_t offset,
243 size_t count, 124 size_t count,
244 int flags,
245 xfs_bmbt_irec_t *imap, 125 xfs_bmbt_irec_t *imap,
246 int *nmaps) 126 int nmaps)
247{ 127{
248 xfs_mount_t *mp = ip->i_mount; 128 xfs_mount_t *mp = ip->i_mount;
249 xfs_fileoff_t offset_fsb; 129 xfs_fileoff_t offset_fsb;
@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
279 if (error) 159 if (error)
280 goto error_out; 160 goto error_out;
281 } else { 161 } else {
282 if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) 162 if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
283 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 163 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
284 imap->br_blockcount + 164 imap->br_blockcount +
285 imap->br_startoff); 165 imap->br_startoff);
@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
331 xfs_trans_ijoin(tp, ip); 211 xfs_trans_ijoin(tp, ip);
332 212
333 bmapi_flag = XFS_BMAPI_WRITE; 213 bmapi_flag = XFS_BMAPI_WRITE;
334 if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) 214 if (offset < ip->i_size || extsz)
335 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
336 216
337 /* 217 /*
@@ -370,7 +250,6 @@ xfs_iomap_write_direct(
370 goto error_out; 250 goto error_out;
371 } 251 }
372 252
373 *nmaps = 1;
374 return 0; 253 return 0;
375 254
376error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ 255error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -379,7 +258,6 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
379 258
380error1: /* Just cancel transaction */ 259error1: /* Just cancel transaction */
381 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 260 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
382 *nmaps = 0; /* nothing set-up here */
383 261
384error_out: 262error_out:
385 return XFS_ERROR(error); 263 return XFS_ERROR(error);
@@ -389,6 +267,9 @@ error_out:
389 * If the caller is doing a write at the end of the file, then extend the 267 * If the caller is doing a write at the end of the file, then extend the
390 * allocation out to the file system's write iosize. We clean up any extra 268 * allocation out to the file system's write iosize. We clean up any extra
391 * space left over when the file is closed in xfs_inactive(). 269 * space left over when the file is closed in xfs_inactive().
270 *
271 * If we find we already have delalloc preallocation beyond EOF, don't do more
 272 * preallocation as it is not needed.
392 */ 273 */
393STATIC int 274STATIC int
394xfs_iomap_eof_want_preallocate( 275xfs_iomap_eof_want_preallocate(
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
396 xfs_inode_t *ip, 277 xfs_inode_t *ip,
397 xfs_off_t offset, 278 xfs_off_t offset,
398 size_t count, 279 size_t count,
399 int ioflag,
400 xfs_bmbt_irec_t *imap, 280 xfs_bmbt_irec_t *imap,
401 int nimaps, 281 int nimaps,
402 int *prealloc) 282 int *prealloc)
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
405 xfs_filblks_t count_fsb; 285 xfs_filblks_t count_fsb;
406 xfs_fsblock_t firstblock; 286 xfs_fsblock_t firstblock;
407 int n, error, imaps; 287 int n, error, imaps;
288 int found_delalloc = 0;
408 289
409 *prealloc = 0; 290 *prealloc = 0;
410 if ((offset + count) <= ip->i_size) 291 if ((offset + count) <= ip->i_size)
@@ -429,20 +310,66 @@ xfs_iomap_eof_want_preallocate(
429 return 0; 310 return 0;
430 start_fsb += imap[n].br_blockcount; 311 start_fsb += imap[n].br_blockcount;
431 count_fsb -= imap[n].br_blockcount; 312 count_fsb -= imap[n].br_blockcount;
313
314 if (imap[n].br_startblock == DELAYSTARTBLOCK)
315 found_delalloc = 1;
432 } 316 }
433 } 317 }
434 *prealloc = 1; 318 if (!found_delalloc)
319 *prealloc = 1;
435 return 0; 320 return 0;
436} 321}
437 322
438STATIC int 323/*
 324 * If we don't have a user-specified preallocation size, dynamically increase
 325 * the preallocation size as the size of the file grows. Cap the maximum size
 326 * at a single extent or less if the filesystem is near full. The closer the
 327 * filesystem is to full, the smaller the maximum preallocation.
328 */
329STATIC xfs_fsblock_t
330xfs_iomap_prealloc_size(
331 struct xfs_mount *mp,
332 struct xfs_inode *ip)
333{
334 xfs_fsblock_t alloc_blocks = 0;
335
336 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
337 int shift = 0;
338 int64_t freesp;
339
340 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size);
341 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
342 rounddown_pow_of_two(alloc_blocks));
343
344 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
345 freesp = mp->m_sb.sb_fdblocks;
346 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
347 shift = 2;
348 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
349 shift++;
350 if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
351 shift++;
352 if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
353 shift++;
354 if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
355 shift++;
356 }
357 if (shift)
358 alloc_blocks >>= shift;
359 }
360
361 if (alloc_blocks < mp->m_writeio_blocks)
362 alloc_blocks = mp->m_writeio_blocks;
363
364 return alloc_blocks;
365}
366
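
The low-space throttling in xfs_iomap_prealloc_size() is easiest to see in isolation. A plain-C arithmetic sketch, assuming a hypothetical five-entry threshold array ordered from the 5% mark down to 1% (thresh[] stands in for mp->m_low_space[]; the MAXEXTLEN and m_writeio_blocks clamps are omitted):

	/* each crossed threshold halves the preallocation again */
	static long demo_prealloc_blocks(long size_blocks, long freesp,
					 const long thresh[5])
	{
		long alloc = rounddown_pow_of_two(size_blocks);
		int shift = 0, i;

		for (i = 0; i < 5; i++)
			if (freesp < thresh[i])
				shift++;
		if (shift)
			shift++;	/* first crossing starts at shift = 2 */
		return alloc >> shift;
	}

So below 5% free space the preallocation is quartered, and below 1% it is divided by 64.
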
367int
439xfs_iomap_write_delay( 368xfs_iomap_write_delay(
440 xfs_inode_t *ip, 369 xfs_inode_t *ip,
441 xfs_off_t offset, 370 xfs_off_t offset,
442 size_t count, 371 size_t count,
443 int ioflag, 372 xfs_bmbt_irec_t *ret_imap)
444 xfs_bmbt_irec_t *ret_imap,
445 int *nmaps)
446{ 373{
447 xfs_mount_t *mp = ip->i_mount; 374 xfs_mount_t *mp = ip->i_mount;
448 xfs_fileoff_t offset_fsb; 375 xfs_fileoff_t offset_fsb;
@@ -469,16 +396,19 @@ xfs_iomap_write_delay(
469 extsz = xfs_get_extsz_hint(ip); 396 extsz = xfs_get_extsz_hint(ip);
470 offset_fsb = XFS_B_TO_FSBT(mp, offset); 397 offset_fsb = XFS_B_TO_FSBT(mp, offset);
471 398
472 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, 400 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
473 ioflag, imap, XFS_WRITE_IMAPS, &prealloc); 401 imap, XFS_WRITE_IMAPS, &prealloc);
474 if (error) 402 if (error)
475 return error; 403 return error;
476 404
477retry: 405retry:
478 if (prealloc) { 406 if (prealloc) {
407 xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
408
479 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); 409 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
480 ioalign = XFS_B_TO_FSBT(mp, aligned_offset); 410 ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
481 last_fsb = ioalign + mp->m_writeio_blocks; 411 last_fsb = ioalign + alloc_blocks;
482 } else { 412 } else {
483 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 413 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
484 } 414 }
@@ -496,22 +426,31 @@ retry:
496 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | 426 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
497 XFS_BMAPI_ENTIRE, &firstblock, 1, imap, 427 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
498 &nimaps, NULL); 428 &nimaps, NULL);
499 if (error && (error != ENOSPC)) 429 switch (error) {
430 case 0:
431 case ENOSPC:
432 case EDQUOT:
433 break;
434 default:
500 return XFS_ERROR(error); 435 return XFS_ERROR(error);
436 }
501 437
502 /* 438 /*
503 * If bmapi returned us nothing, and if we didn't get back EDQUOT, 439 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
504 * then we must have run out of space - flush all other inodes with 440 * ENOSPC, * flush all other inodes with delalloc blocks to free up
505 * delalloc blocks and retry without EOF preallocation. 441 * some of the excess reserved metadata space. For both cases, retry
442 * without EOF preallocation.
506 */ 443 */
507 if (nimaps == 0) { 444 if (nimaps == 0) {
508 trace_xfs_delalloc_enospc(ip, offset, count); 445 trace_xfs_delalloc_enospc(ip, offset, count);
509 if (flushed) 446 if (flushed)
510 return XFS_ERROR(ENOSPC); 447 return XFS_ERROR(error ? error : ENOSPC);
511 448
512 xfs_iunlock(ip, XFS_ILOCK_EXCL); 449 if (error == ENOSPC) {
513 xfs_flush_inodes(ip); 450 xfs_iunlock(ip, XFS_ILOCK_EXCL);
514 xfs_ilock(ip, XFS_ILOCK_EXCL); 451 xfs_flush_inodes(ip);
452 xfs_ilock(ip, XFS_ILOCK_EXCL);
453 }
515 454
516 flushed = 1; 455 flushed = 1;
517 error = 0; 456 error = 0;
@@ -523,8 +462,6 @@ retry:
523 return xfs_cmn_err_fsblock_zero(ip, &imap[0]); 462 return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
524 463
525 *ret_imap = imap[0]; 464 *ret_imap = imap[0];
526 *nmaps = 1;
527
528 return 0; 465 return 0;
529} 466}
530 467
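
The retry logic above amounts to: allow one retry, drop EOF preallocation on it, and flush delalloc inodes only for ENOSPC, since EDQUOT is a per-quota shortage that flushing other inodes cannot fix. A control-flow sketch with hypothetical helpers standing in for xfs_bmapi() and xfs_flush_inodes():

	int demo_bmapi_delay(struct xfs_inode *ip, int prealloc, int *nimaps);
	void demo_flush_delalloc(struct xfs_inode *ip);

	static int demo_delay_alloc(struct xfs_inode *ip, int prealloc)
	{
		int flushed = 0, error, nimaps;

	retry:
		error = demo_bmapi_delay(ip, prealloc, &nimaps);
		if (error && error != ENOSPC && error != EDQUOT)
			return error;
		if (nimaps == 0) {
			if (flushed)	/* second failure: give up */
				return error ? error : ENOSPC;
			if (error == ENOSPC)
				demo_flush_delalloc(ip);
			flushed = 1;
			prealloc = 0;	/* retry without EOF preallocation */
			error = 0;
			goto retry;
		}
		return 0;
	}
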
@@ -538,13 +475,12 @@ retry:
538 * We no longer bother to look at the incoming map - all we have to 475 * We no longer bother to look at the incoming map - all we have to
539 * guarantee is that whatever we allocate fills the required range. 476 * guarantee is that whatever we allocate fills the required range.
540 */ 477 */
541STATIC int 478int
542xfs_iomap_write_allocate( 479xfs_iomap_write_allocate(
543 xfs_inode_t *ip, 480 xfs_inode_t *ip,
544 xfs_off_t offset, 481 xfs_off_t offset,
545 size_t count, 482 size_t count,
546 xfs_bmbt_irec_t *imap, 483 xfs_bmbt_irec_t *imap)
547 int *retmap)
548{ 484{
549 xfs_mount_t *mp = ip->i_mount; 485 xfs_mount_t *mp = ip->i_mount;
550 xfs_fileoff_t offset_fsb, last_block; 486 xfs_fileoff_t offset_fsb, last_block;
@@ -557,8 +493,6 @@ xfs_iomap_write_allocate(
557 int error = 0; 493 int error = 0;
558 int nres; 494 int nres;
559 495
560 *retmap = 0;
561
562 /* 496 /*
563 * Make sure that the dquots are there. 497 * Make sure that the dquots are there.
564 */ 498 */
@@ -680,7 +614,6 @@ xfs_iomap_write_allocate(
680 if ((offset_fsb >= imap->br_startoff) && 614 if ((offset_fsb >= imap->br_startoff) &&
681 (offset_fsb < (imap->br_startoff + 615 (offset_fsb < (imap->br_startoff +
682 imap->br_blockcount))) { 616 imap->br_blockcount))) {
683 *retmap = 1;
684 XFS_STATS_INC(xs_xstrat_quick); 617 XFS_STATS_INC(xs_xstrat_quick);
685 return 0; 618 return 0;
686 } 619 }
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7748a430f50d..80615760959a 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,30 +18,15 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21/* base extent manipulation calls */
22#define BMAPI_READ (1 << 0) /* read extents */
23#define BMAPI_WRITE (1 << 1) /* create extents */
24#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */
25
26/* modifiers */
27#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */
28#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */
29#define BMAPI_MMA (1 << 6) /* allocate for mmap write */
30#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */
31
32#define BMAPI_FLAGS \
33 { BMAPI_READ, "READ" }, \
34 { BMAPI_WRITE, "WRITE" }, \
35 { BMAPI_ALLOCATE, "ALLOCATE" }, \
36 { BMAPI_IGNSTATE, "IGNSTATE" }, \
37 { BMAPI_DIRECT, "DIRECT" }, \
38 { BMAPI_TRYLOCK, "TRYLOCK" }
39
40struct xfs_inode; 21struct xfs_inode;
41struct xfs_bmbt_irec; 22struct xfs_bmbt_irec;
42 23
43extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 24extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
44 struct xfs_bmbt_irec *, int *, int *); 25 struct xfs_bmbt_irec *, int);
26extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
27 struct xfs_bmbt_irec *);
28extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
29 struct xfs_bmbt_irec *);
45extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 30extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
46 31
47#endif /* __XFS_IOMAP_H__*/ 32#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index cee4ab9f8a9e..0bf24b11d0c4 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
47 xfs_buftarg_t *log_target, 47 xfs_buftarg_t *log_target,
48 xfs_daddr_t blk_offset, 48 xfs_daddr_t blk_offset,
49 int num_bblks); 49 int num_bblks);
50STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 50STATIC int xlog_space_left(struct log *log, atomic64_t *head);
51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
52STATIC void xlog_dealloc_log(xlog_t *log); 52STATIC void xlog_dealloc_log(xlog_t *log);
53 53
@@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
70/* local functions to manipulate grant head */ 70/* local functions to manipulate grant head */
71STATIC int xlog_grant_log_space(xlog_t *log, 71STATIC int xlog_grant_log_space(xlog_t *log,
72 xlog_ticket_t *xtic); 72 xlog_ticket_t *xtic);
73STATIC void xlog_grant_push_ail(xfs_mount_t *mp, 73STATIC void xlog_grant_push_ail(struct log *log,
74 int need_bytes); 74 int need_bytes);
75STATIC void xlog_regrant_reserve_log_space(xlog_t *log, 75STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
76 xlog_ticket_t *ticket); 76 xlog_ticket_t *ticket);
@@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
81 81
82#if defined(DEBUG) 82#if defined(DEBUG)
83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); 83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
84STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 84STATIC void xlog_verify_grant_tail(struct log *log);
85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
86 int count, boolean_t syncing); 86 int count, boolean_t syncing);
87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, 87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
88 xfs_lsn_t tail_lsn); 88 xfs_lsn_t tail_lsn);
89#else 89#else
90#define xlog_verify_dest_ptr(a,b) 90#define xlog_verify_dest_ptr(a,b)
91#define xlog_verify_grant_head(a,b) 91#define xlog_verify_grant_tail(a)
92#define xlog_verify_iclog(a,b,c,d) 92#define xlog_verify_iclog(a,b,c,d)
93#define xlog_verify_tail_lsn(a,b,c) 93#define xlog_verify_tail_lsn(a,b,c)
94#endif 94#endif
95 95
96STATIC int xlog_iclogs_empty(xlog_t *log); 96STATIC int xlog_iclogs_empty(xlog_t *log);
97 97
98
99static void 98static void
100xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) 99xlog_grant_sub_space(
100 struct log *log,
101 atomic64_t *head,
102 int bytes)
101{ 103{
102 if (*qp) { 104 int64_t head_val = atomic64_read(head);
103 tic->t_next = (*qp); 105 int64_t new, old;
104 tic->t_prev = (*qp)->t_prev;
105 (*qp)->t_prev->t_next = tic;
106 (*qp)->t_prev = tic;
107 } else {
108 tic->t_prev = tic->t_next = tic;
109 *qp = tic;
110 }
111 106
112 tic->t_flags |= XLOG_TIC_IN_Q; 107 do {
113} 108 int cycle, space;
114 109
115static void 110 xlog_crack_grant_head_val(head_val, &cycle, &space);
116xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
117{
118 if (tic == tic->t_next) {
119 *qp = NULL;
120 } else {
121 *qp = tic->t_next;
122 tic->t_next->t_prev = tic->t_prev;
123 tic->t_prev->t_next = tic->t_next;
124 }
125 111
126 tic->t_next = tic->t_prev = NULL; 112 space -= bytes;
127 tic->t_flags &= ~XLOG_TIC_IN_Q; 113 if (space < 0) {
114 space += log->l_logsize;
115 cycle--;
116 }
117
118 old = head_val;
119 new = xlog_assign_grant_head_val(cycle, space);
120 head_val = atomic64_cmpxchg(head, old, new);
121 } while (head_val != old);
128} 122}
129 123
130static void 124static void
131xlog_grant_sub_space(struct log *log, int bytes) 125xlog_grant_add_space(
126 struct log *log,
127 atomic64_t *head,
128 int bytes)
132{ 129{
133 log->l_grant_write_bytes -= bytes; 130 int64_t head_val = atomic64_read(head);
134 if (log->l_grant_write_bytes < 0) { 131 int64_t new, old;
135 log->l_grant_write_bytes += log->l_logsize;
136 log->l_grant_write_cycle--;
137 }
138
139 log->l_grant_reserve_bytes -= bytes;
140 if ((log)->l_grant_reserve_bytes < 0) {
141 log->l_grant_reserve_bytes += log->l_logsize;
142 log->l_grant_reserve_cycle--;
143 }
144 132
145} 133 do {
134 int tmp;
135 int cycle, space;
146 136
147static void 137 xlog_crack_grant_head_val(head_val, &cycle, &space);
148xlog_grant_add_space_write(struct log *log, int bytes)
149{
150 int tmp = log->l_logsize - log->l_grant_write_bytes;
151 if (tmp > bytes)
152 log->l_grant_write_bytes += bytes;
153 else {
154 log->l_grant_write_cycle++;
155 log->l_grant_write_bytes = bytes - tmp;
156 }
157}
158 138
159static void 139 tmp = log->l_logsize - space;
160xlog_grant_add_space_reserve(struct log *log, int bytes) 140 if (tmp > bytes)
161{ 141 space += bytes;
162 int tmp = log->l_logsize - log->l_grant_reserve_bytes; 142 else {
163 if (tmp > bytes) 143 space = bytes - tmp;
164 log->l_grant_reserve_bytes += bytes; 144 cycle++;
165 else { 145 }
166 log->l_grant_reserve_cycle++;
167 log->l_grant_reserve_bytes = bytes - tmp;
168 }
169}
170 146
171static inline void 147 old = head_val;
172xlog_grant_add_space(struct log *log, int bytes) 148 new = xlog_assign_grant_head_val(cycle, space);
173{ 149 head_val = atomic64_cmpxchg(head, old, new);
174 xlog_grant_add_space_write(log, bytes); 150 } while (head_val != old);
175 xlog_grant_add_space_reserve(log, bytes);
176} 151}
177 152
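
Both grant head updates above use the same lockless idiom: pack (cycle, space) into one 64-bit word and retry with cmpxchg until no other CPU raced. A self-contained sketch, assuming the obvious high/low packing that xlog_crack_grant_head_val()/xlog_assign_grant_head_val() imply:

	static void demo_grant_sub(atomic64_t *head, int bytes, int logsize)
	{
		int64_t old, new, val = atomic64_read(head);

		do {
			int cycle = val >> 32;
			int space = val & 0xffffffff;

			space -= bytes;
			if (space < 0) {	/* wrapped: borrow a cycle */
				space += logsize;
				cycle--;
			}
			old = val;
			new = ((int64_t)cycle << 32) | space;
			val = atomic64_cmpxchg(head, old, new);
		} while (val != old);	/* lost a race: retry with the new value */
	}
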
178static void 153static void
@@ -355,7 +330,7 @@ xfs_log_reserve(
355 330
356 trace_xfs_log_reserve(log, internal_ticket); 331 trace_xfs_log_reserve(log, internal_ticket);
357 332
358 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 333 xlog_grant_push_ail(log, internal_ticket->t_unit_res);
359 retval = xlog_regrant_write_log_space(log, internal_ticket); 334 retval = xlog_regrant_write_log_space(log, internal_ticket);
360 } else { 335 } else {
361 /* may sleep if need to allocate more tickets */ 336 /* may sleep if need to allocate more tickets */
@@ -369,7 +344,7 @@ xfs_log_reserve(
369 344
370 trace_xfs_log_reserve(log, internal_ticket); 345 trace_xfs_log_reserve(log, internal_ticket);
371 346
372 xlog_grant_push_ail(mp, 347 xlog_grant_push_ail(log,
373 (internal_ticket->t_unit_res * 348 (internal_ticket->t_unit_res *
374 internal_ticket->t_cnt)); 349 internal_ticket->t_cnt));
375 retval = xlog_grant_log_space(log, internal_ticket); 350 retval = xlog_grant_log_space(log, internal_ticket);
@@ -584,8 +559,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
584 if (!(iclog->ic_state == XLOG_STATE_ACTIVE || 559 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
585 iclog->ic_state == XLOG_STATE_DIRTY)) { 560 iclog->ic_state == XLOG_STATE_DIRTY)) {
586 if (!XLOG_FORCED_SHUTDOWN(log)) { 561 if (!XLOG_FORCED_SHUTDOWN(log)) {
587 sv_wait(&iclog->ic_force_wait, PMEM, 562 xlog_wait(&iclog->ic_force_wait,
588 &log->l_icloglock, s); 563 &log->l_icloglock);
589 } else { 564 } else {
590 spin_unlock(&log->l_icloglock); 565 spin_unlock(&log->l_icloglock);
591 } 566 }
@@ -625,8 +600,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
625 || iclog->ic_state == XLOG_STATE_DIRTY 600 || iclog->ic_state == XLOG_STATE_DIRTY
626 || iclog->ic_state == XLOG_STATE_IOERROR) ) { 601 || iclog->ic_state == XLOG_STATE_IOERROR) ) {
627 602
628 sv_wait(&iclog->ic_force_wait, PMEM, 603 xlog_wait(&iclog->ic_force_wait,
629 &log->l_icloglock, s); 604 &log->l_icloglock);
630 } else { 605 } else {
631 spin_unlock(&log->l_icloglock); 606 spin_unlock(&log->l_icloglock);
632 } 607 }
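
xlog_wait() replaces sv_wait() here; its body is not shown in this hunk, but the idiom it must implement is sleeping on a waitqueue while atomically giving up a held spinlock. A sketch of such a helper:

	static inline void demo_xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
	{
		DECLARE_WAITQUEUE(wait, current);

		add_wait_queue_exclusive(wq, &wait);
		__set_current_state(TASK_UNINTERRUPTIBLE);
		spin_unlock(lock);	/* state already set: no lost wakeups */
		schedule();
		remove_wait_queue(wq, &wait);
	}
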
@@ -703,55 +678,46 @@ xfs_log_move_tail(xfs_mount_t *mp,
703{ 678{
704 xlog_ticket_t *tic; 679 xlog_ticket_t *tic;
705 xlog_t *log = mp->m_log; 680 xlog_t *log = mp->m_log;
706 int need_bytes, free_bytes, cycle, bytes; 681 int need_bytes, free_bytes;
707 682
708 if (XLOG_FORCED_SHUTDOWN(log)) 683 if (XLOG_FORCED_SHUTDOWN(log))
709 return; 684 return;
710 685
711 if (tail_lsn == 0) { 686 if (tail_lsn == 0)
712 /* needed since sync_lsn is 64 bits */ 687 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
713 spin_lock(&log->l_icloglock);
714 tail_lsn = log->l_last_sync_lsn;
715 spin_unlock(&log->l_icloglock);
716 }
717
718 spin_lock(&log->l_grant_lock);
719 688
720 /* Also an invalid lsn. 1 implies that we aren't passing in a valid 689 /* tail_lsn == 1 implies that we weren't passed a valid value. */
721 * tail_lsn. 690 if (tail_lsn != 1)
722 */ 691 atomic64_set(&log->l_tail_lsn, tail_lsn);
723 if (tail_lsn != 1) {
724 log->l_tail_lsn = tail_lsn;
725 }
726 692
727 if ((tic = log->l_write_headq)) { 693 if (!list_empty_careful(&log->l_writeq)) {
728#ifdef DEBUG 694#ifdef DEBUG
729 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 695 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
730 panic("Recovery problem"); 696 panic("Recovery problem");
731#endif 697#endif
732 cycle = log->l_grant_write_cycle; 698 spin_lock(&log->l_grant_write_lock);
733 bytes = log->l_grant_write_bytes; 699 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
734 free_bytes = xlog_space_left(log, cycle, bytes); 700 list_for_each_entry(tic, &log->l_writeq, t_queue) {
735 do {
736 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); 701 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
737 702
738 if (free_bytes < tic->t_unit_res && tail_lsn != 1) 703 if (free_bytes < tic->t_unit_res && tail_lsn != 1)
739 break; 704 break;
740 tail_lsn = 0; 705 tail_lsn = 0;
741 free_bytes -= tic->t_unit_res; 706 free_bytes -= tic->t_unit_res;
742 sv_signal(&tic->t_wait); 707 trace_xfs_log_regrant_write_wake_up(log, tic);
743 tic = tic->t_next; 708 wake_up(&tic->t_wait);
744 } while (tic != log->l_write_headq); 709 }
710 spin_unlock(&log->l_grant_write_lock);
745 } 711 }
746 if ((tic = log->l_reserve_headq)) { 712
713 if (!list_empty_careful(&log->l_reserveq)) {
747#ifdef DEBUG 714#ifdef DEBUG
748 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 715 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
749 panic("Recovery problem"); 716 panic("Recovery problem");
750#endif 717#endif
751 cycle = log->l_grant_reserve_cycle; 718 spin_lock(&log->l_grant_reserve_lock);
752 bytes = log->l_grant_reserve_bytes; 719 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
753 free_bytes = xlog_space_left(log, cycle, bytes); 720 list_for_each_entry(tic, &log->l_reserveq, t_queue) {
754 do {
755 if (tic->t_flags & XLOG_TIC_PERM_RESERV) 721 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
756 need_bytes = tic->t_unit_res*tic->t_cnt; 722 need_bytes = tic->t_unit_res*tic->t_cnt;
757 else 723 else
@@ -760,12 +726,12 @@ xfs_log_move_tail(xfs_mount_t *mp,
760 break; 726 break;
761 tail_lsn = 0; 727 tail_lsn = 0;
762 free_bytes -= need_bytes; 728 free_bytes -= need_bytes;
763 sv_signal(&tic->t_wait); 729 trace_xfs_log_grant_wake_up(log, tic);
764 tic = tic->t_next; 730 wake_up(&tic->t_wait);
765 } while (tic != log->l_reserve_headq); 731 }
732 spin_unlock(&log->l_grant_reserve_lock);
766 } 733 }
767 spin_unlock(&log->l_grant_lock); 734}
768} /* xfs_log_move_tail */
769 735
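
The wakeup loops above pair with a waiter side that queues a ticket and sleeps until woken with enough space. A sketch of that counterpart using the list and lock names this patch introduces (the exact grant function bodies are outside this hunk, so the shape here is assumed):

	static void demo_wait_for_space(struct log *log, struct xlog_ticket *tic,
					int need_bytes)
	{
		int free_bytes;

		spin_lock(&log->l_grant_reserve_lock);
		list_add_tail(&tic->t_queue, &log->l_reserveq);
		do {
			/* xlog_wait() drops the lock while sleeping */
			xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
			spin_lock(&log->l_grant_reserve_lock);
			free_bytes = xlog_space_left(log,
						&log->l_grant_reserve_head);
		} while (free_bytes < need_bytes);
		list_del_init(&tic->t_queue);
		spin_unlock(&log->l_grant_reserve_lock);
	}
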
770/* 736/*
771 * Determine if we have a transaction that has gone to disk 737 * Determine if we have a transaction that has gone to disk
@@ -831,23 +797,19 @@ xfs_log_need_covered(xfs_mount_t *mp)
831 * We may be holding the log iclog lock upon entering this routine. 797 * We may be holding the log iclog lock upon entering this routine.
832 */ 798 */
833xfs_lsn_t 799xfs_lsn_t
834xlog_assign_tail_lsn(xfs_mount_t *mp) 800xlog_assign_tail_lsn(
801 struct xfs_mount *mp)
835{ 802{
836 xfs_lsn_t tail_lsn; 803 xfs_lsn_t tail_lsn;
837 xlog_t *log = mp->m_log; 804 struct log *log = mp->m_log;
838 805
839 tail_lsn = xfs_trans_ail_tail(mp->m_ail); 806 tail_lsn = xfs_trans_ail_tail(mp->m_ail);
840 spin_lock(&log->l_grant_lock); 807 if (!tail_lsn)
841 if (tail_lsn != 0) { 808 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
842 log->l_tail_lsn = tail_lsn;
843 } else {
844 tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
845 }
846 spin_unlock(&log->l_grant_lock);
847 809
810 atomic64_set(&log->l_tail_lsn, tail_lsn);
848 return tail_lsn; 811 return tail_lsn;
849} /* xlog_assign_tail_lsn */ 812}
850
851 813
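
The atomic64 LSN accessors used above (xlog_crack_atomic_lsn, xlog_assign_atomic_lsn) are defined elsewhere in the series; their assumed shape is simply a (cycle, block) pair packed into 64 bits:

	/* sketch of the assumed helpers: cycle in the high 32 bits */
	static inline void demo_crack_lsn(atomic64_t *lsn, uint *cycle,
					  uint *block)
	{
		xfs_lsn_t val = atomic64_read(lsn);

		*cycle = val >> 32;
		*block = val & 0xffffffff;
	}

	static inline void demo_assign_lsn(atomic64_t *lsn, uint cycle,
					   uint block)
	{
		atomic64_set(lsn, ((xfs_lsn_t)cycle << 32) | block);
	}
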
852/* 814/*
853 * Return the space in the log between the tail and the head. The head 815 * Return the space in the log between the tail and the head. The head
@@ -864,21 +826,26 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
864 * result is that we return the size of the log as the amount of space left. 826 * result is that we return the size of the log as the amount of space left.
865 */ 827 */
866STATIC int 828STATIC int
867xlog_space_left(xlog_t *log, int cycle, int bytes) 829xlog_space_left(
868{ 830 struct log *log,
869 int free_bytes; 831 atomic64_t *head)
870 int tail_bytes; 832{
871 int tail_cycle; 833 int free_bytes;
872 834 int tail_bytes;
873 tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); 835 int tail_cycle;
874 tail_cycle = CYCLE_LSN(log->l_tail_lsn); 836 int head_cycle;
875 if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { 837 int head_bytes;
876 free_bytes = log->l_logsize - (bytes - tail_bytes); 838
877 } else if ((tail_cycle + 1) < cycle) { 839 xlog_crack_grant_head(head, &head_cycle, &head_bytes);
840 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
841 tail_bytes = BBTOB(tail_bytes);
842 if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
843 free_bytes = log->l_logsize - (head_bytes - tail_bytes);
844 else if (tail_cycle + 1 < head_cycle)
878 return 0; 845 return 0;
879 } else if (tail_cycle < cycle) { 846 else if (tail_cycle < head_cycle) {
880 ASSERT(tail_cycle == (cycle - 1)); 847 ASSERT(tail_cycle == (head_cycle - 1));
881 free_bytes = tail_bytes - bytes; 848 free_bytes = tail_bytes - head_bytes;
882 } else { 849 } else {
883 /* 850 /*
884 * The reservation head is behind the tail. 851 * The reservation head is behind the tail.
@@ -889,12 +856,12 @@ xlog_space_left(xlog_t *log, int cycle, int bytes)
889 "xlog_space_left: head behind tail\n" 856 "xlog_space_left: head behind tail\n"
890 " tail_cycle = %d, tail_bytes = %d\n" 857 " tail_cycle = %d, tail_bytes = %d\n"
891 " GH cycle = %d, GH bytes = %d", 858 " GH cycle = %d, GH bytes = %d",
892 tail_cycle, tail_bytes, cycle, bytes); 859 tail_cycle, tail_bytes, head_cycle, head_bytes);
893 ASSERT(0); 860 ASSERT(0);
894 free_bytes = log->l_logsize; 861 free_bytes = log->l_logsize;
895 } 862 }
896 return free_bytes; 863 return free_bytes;
897} /* xlog_space_left */ 864}
898 865
899 866
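
Stripped of the atomics, the free-space computation above is circular-buffer arithmetic on (cycle, bytes) positions. A standalone sketch of the three regular cases (the head-behind-tail error path is omitted):

	static int demo_space_left(int logsize, int tail_cycle, int tail_bytes,
				   int head_cycle, int head_bytes)
	{
		if (tail_cycle == head_cycle)
			return logsize - (head_bytes - tail_bytes);
		if (tail_cycle + 1 == head_cycle)
			return tail_bytes - head_bytes;
		return 0;	/* head a full cycle or more ahead: no space */
	}
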
900/* 867/*
@@ -1047,12 +1014,16 @@ xlog_alloc_log(xfs_mount_t *mp,
1047 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1014 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1048 1015
1049 log->l_prev_block = -1; 1016 log->l_prev_block = -1;
1050 log->l_tail_lsn = xlog_assign_lsn(1, 0);
1051 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1017 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
1052 log->l_last_sync_lsn = log->l_tail_lsn; 1018 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
1019 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
1053 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ 1020 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
1054 log->l_grant_reserve_cycle = 1; 1021 xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
1055 log->l_grant_write_cycle = 1; 1022 xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
1023 INIT_LIST_HEAD(&log->l_reserveq);
1024 INIT_LIST_HEAD(&log->l_writeq);
1025 spin_lock_init(&log->l_grant_reserve_lock);
1026 spin_lock_init(&log->l_grant_write_lock);
1056 1027
1057 error = EFSCORRUPTED; 1028 error = EFSCORRUPTED;
1058 if (xfs_sb_version_hassector(&mp->m_sb)) { 1029 if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1094,8 +1065,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1094 log->l_xbuf = bp; 1065 log->l_xbuf = bp;
1095 1066
1096 spin_lock_init(&log->l_icloglock); 1067 spin_lock_init(&log->l_icloglock);
1097 spin_lock_init(&log->l_grant_lock); 1068 init_waitqueue_head(&log->l_flush_wait);
1098 sv_init(&log->l_flush_wait, 0, "flush_wait");
1099 1069
1100 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1070 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1101 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1071 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1151,8 +1121,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1151 1121
1152 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1122 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1153 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1123 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
1154 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1124 init_waitqueue_head(&iclog->ic_force_wait);
1155 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1125 init_waitqueue_head(&iclog->ic_write_wait);
1156 1126
1157 iclogp = &iclog->ic_next; 1127 iclogp = &iclog->ic_next;
1158 } 1128 }
@@ -1167,15 +1137,11 @@ xlog_alloc_log(xfs_mount_t *mp,
1167out_free_iclog: 1137out_free_iclog:
1168 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { 1138 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
1169 prev_iclog = iclog->ic_next; 1139 prev_iclog = iclog->ic_next;
1170 if (iclog->ic_bp) { 1140 if (iclog->ic_bp)
1171 sv_destroy(&iclog->ic_force_wait);
1172 sv_destroy(&iclog->ic_write_wait);
1173 xfs_buf_free(iclog->ic_bp); 1141 xfs_buf_free(iclog->ic_bp);
1174 }
1175 kmem_free(iclog); 1142 kmem_free(iclog);
1176 } 1143 }
1177 spinlock_destroy(&log->l_icloglock); 1144 spinlock_destroy(&log->l_icloglock);
1178 spinlock_destroy(&log->l_grant_lock);
1179 xfs_buf_free(log->l_xbuf); 1145 xfs_buf_free(log->l_xbuf);
1180out_free_log: 1146out_free_log:
1181 kmem_free(log); 1147 kmem_free(log);
@@ -1223,61 +1189,60 @@ xlog_commit_record(
1223 * water mark. In this manner, we would be creating a low water mark. 1189 * water mark. In this manner, we would be creating a low water mark.
1224 */ 1190 */
1225STATIC void 1191STATIC void
1226xlog_grant_push_ail(xfs_mount_t *mp, 1192xlog_grant_push_ail(
1227 int need_bytes) 1193 struct log *log,
1194 int need_bytes)
1228{ 1195{
1229 xlog_t *log = mp->m_log; /* pointer to the log */ 1196 xfs_lsn_t threshold_lsn = 0;
1230 xfs_lsn_t tail_lsn; /* lsn of the log tail */ 1197 xfs_lsn_t last_sync_lsn;
1231 xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ 1198 int free_blocks;
1232 int free_blocks; /* free blocks left to write to */ 1199 int free_bytes;
1233 int free_bytes; /* free bytes left to write to */ 1200 int threshold_block;
1234 int threshold_block; /* block in lsn we'd like to be at */ 1201 int threshold_cycle;
1235 int threshold_cycle; /* lsn cycle we'd like to be at */ 1202 int free_threshold;
1236 int free_threshold; 1203
1237 1204 ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1238 ASSERT(BTOBB(need_bytes) < log->l_logBBsize); 1205
1239 1206 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
1240 spin_lock(&log->l_grant_lock); 1207 free_blocks = BTOBBT(free_bytes);
1241 free_bytes = xlog_space_left(log, 1208
1242 log->l_grant_reserve_cycle, 1209 /*
1243 log->l_grant_reserve_bytes); 1210 * Set the threshold for the minimum number of free blocks in the
1244 tail_lsn = log->l_tail_lsn; 1211 * log to the maximum of what the caller needs, one quarter of the
1245 free_blocks = BTOBBT(free_bytes); 1212 * log, and 256 blocks.
1246 1213 */
1247 /* 1214 free_threshold = BTOBB(need_bytes);
1248 * Set the threshold for the minimum number of free blocks in the 1215 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
1249 * log to the maximum of what the caller needs, one quarter of the 1216 free_threshold = MAX(free_threshold, 256);
1250 * log, and 256 blocks. 1217 if (free_blocks >= free_threshold)
1251 */ 1218 return;
1252 free_threshold = BTOBB(need_bytes); 1219
1253 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); 1220 xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
1254 free_threshold = MAX(free_threshold, 256); 1221 &threshold_block);
1255 if (free_blocks < free_threshold) { 1222 threshold_block += free_threshold;
1256 threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
1257 threshold_cycle = CYCLE_LSN(tail_lsn);
1258 if (threshold_block >= log->l_logBBsize) { 1223 if (threshold_block >= log->l_logBBsize) {
1259 threshold_block -= log->l_logBBsize; 1224 threshold_block -= log->l_logBBsize;
1260 threshold_cycle += 1; 1225 threshold_cycle += 1;
1261 } 1226 }
1262 threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); 1227 threshold_lsn = xlog_assign_lsn(threshold_cycle,
1228 threshold_block);
1229 /*
1230 * Don't pass in an lsn greater than the lsn of the last
1231 * log record known to be on disk. Use a snapshot of the last sync lsn
1232 * so that it doesn't change between the compare and the set.
1233 */
1234 last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
1235 if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
1236 threshold_lsn = last_sync_lsn;
1263 1237
1264 /* Don't pass in an lsn greater than the lsn of the last 1238 /*
1265 * log record known to be on disk. 1239 * Get the transaction layer to kick the dirty buffers out to
1240 * disk asynchronously. No point in trying to do this if
1241 * the filesystem is shutting down.
1266 */ 1242 */
1267 if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) 1243 if (!XLOG_FORCED_SHUTDOWN(log))
1268 threshold_lsn = log->l_last_sync_lsn; 1244 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1269 } 1245}
1270 spin_unlock(&log->l_grant_lock);
1271
1272 /*
1273 * Get the transaction layer to kick the dirty buffers out to
1274 * disk asynchronously. No point in trying to do this if
1275 * the filesystem is shutting down.
1276 */
1277 if (threshold_lsn &&
1278 !XLOG_FORCED_SHUTDOWN(log))
1279 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1280} /* xlog_grant_push_ail */
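
With l_grant_lock gone, the rewritten xlog_grant_push_ail() above derives its push target from a cracked snapshot of l_tail_lsn using plain modular arithmetic: add the free-space threshold to the tail block and, if that walks off the physical end of the log, wrap the block and bump the cycle (the result is then clamped to the last LSN known to be on disk). A runnable user-space model of the wraparound step, with all sizes invented and basic-block unit conversions elided:

    #include <stdint.h>
    #include <stdio.h>

    /* model: an LSN packs (cycle << 32) | block; log size is in blocks */
    static uint64_t assign_lsn(uint32_t cycle, uint32_t block)
    {
        return ((uint64_t)cycle << 32) | block;
    }

    int main(void)
    {
        uint32_t log_size = 8192;          /* stands in for l_logBBsize */
        uint32_t tail_cycle = 7, tail_block = 8000;
        uint32_t free_threshold = 512;     /* blocks we want free */

        uint32_t cycle = tail_cycle;
        uint32_t block = tail_block + free_threshold;

        if (block >= log_size) {           /* wrapped past the log's end */
            block -= log_size;
            cycle += 1;
        }
        printf("push target: cycle %u block %u (lsn 0x%016llx)\n",
               cycle, block, (unsigned long long)assign_lsn(cycle, block));
        return 0;
    }
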
1281 1246
1282/* 1247/*
1283 * The bdstrat callback function for log bufs. This gives us a central 1248 * The bdstrat callback function for log bufs. This gives us a central
@@ -1372,9 +1337,8 @@ xlog_sync(xlog_t *log,
1372 roundoff < BBTOB(1))); 1337 roundoff < BBTOB(1)));
1373 1338
1374 /* move grant heads by roundoff in sync */ 1339 /* move grant heads by roundoff in sync */
1375 spin_lock(&log->l_grant_lock); 1340 xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
1376 xlog_grant_add_space(log, roundoff); 1341 xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
1377 spin_unlock(&log->l_grant_lock);
1378 1342
1379 /* put cycle number in every block */ 1343 /* put cycle number in every block */
1380 xlog_pack_data(log, iclog, roundoff); 1344 xlog_pack_data(log, iclog, roundoff);
@@ -1489,15 +1453,12 @@ xlog_dealloc_log(xlog_t *log)
1489 1453
1490 iclog = log->l_iclog; 1454 iclog = log->l_iclog;
1491 for (i=0; i<log->l_iclog_bufs; i++) { 1455 for (i=0; i<log->l_iclog_bufs; i++) {
1492 sv_destroy(&iclog->ic_force_wait);
1493 sv_destroy(&iclog->ic_write_wait);
1494 xfs_buf_free(iclog->ic_bp); 1456 xfs_buf_free(iclog->ic_bp);
1495 next_iclog = iclog->ic_next; 1457 next_iclog = iclog->ic_next;
1496 kmem_free(iclog); 1458 kmem_free(iclog);
1497 iclog = next_iclog; 1459 iclog = next_iclog;
1498 } 1460 }
1499 spinlock_destroy(&log->l_icloglock); 1461 spinlock_destroy(&log->l_icloglock);
1500 spinlock_destroy(&log->l_grant_lock);
1501 1462
1502 xfs_buf_free(log->l_xbuf); 1463 xfs_buf_free(log->l_xbuf);
1503 log->l_mp->m_log = NULL; 1464 log->l_mp->m_log = NULL;
@@ -2232,7 +2193,7 @@ xlog_state_do_callback(
2232 lowest_lsn = xlog_get_lowest_lsn(log); 2193 lowest_lsn = xlog_get_lowest_lsn(log);
2233 if (lowest_lsn && 2194 if (lowest_lsn &&
2234 XFS_LSN_CMP(lowest_lsn, 2195 XFS_LSN_CMP(lowest_lsn,
2235 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { 2196 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2236 iclog = iclog->ic_next; 2197 iclog = iclog->ic_next;
2237 continue; /* Leave this iclog for 2198 continue; /* Leave this iclog for
2238 * another thread */ 2199 * another thread */
@@ -2240,23 +2201,21 @@ xlog_state_do_callback(
2240 2201
2241 iclog->ic_state = XLOG_STATE_CALLBACK; 2202 iclog->ic_state = XLOG_STATE_CALLBACK;
2242 2203
2243 spin_unlock(&log->l_icloglock);
2244 2204
2245 /* l_last_sync_lsn field protected by 2205 /*
2246 * l_grant_lock. Don't worry about iclog's lsn. 2206 * update the last_sync_lsn before we drop the
2247 * No one else can be here except us. 2207 * icloglock to ensure we are the only one that
2208 * can update it.
2248 */ 2209 */
2249 spin_lock(&log->l_grant_lock); 2210 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2250 ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, 2211 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2251 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2212 atomic64_set(&log->l_last_sync_lsn,
2252 log->l_last_sync_lsn = 2213 be64_to_cpu(iclog->ic_header.h_lsn));
2253 be64_to_cpu(iclog->ic_header.h_lsn);
2254 spin_unlock(&log->l_grant_lock);
2255 2214
2256 } else { 2215 } else
2257 spin_unlock(&log->l_icloglock);
2258 ioerrors++; 2216 ioerrors++;
2259 } 2217
2218 spin_unlock(&log->l_icloglock);
2260 2219
2261 /* 2220 /*
2262 * Keep processing entries in the callback list until 2221 * Keep processing entries in the callback list until
@@ -2297,7 +2256,7 @@ xlog_state_do_callback(
2297 xlog_state_clean_log(log); 2256 xlog_state_clean_log(log);
2298 2257
2299 /* wake up threads waiting in xfs_log_force() */ 2258 /* wake up threads waiting in xfs_log_force() */
2300 sv_broadcast(&iclog->ic_force_wait); 2259 wake_up_all(&iclog->ic_force_wait);
2301 2260
2302 iclog = iclog->ic_next; 2261 iclog = iclog->ic_next;
2303 } while (first_iclog != iclog); 2262 } while (first_iclog != iclog);
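
Although l_last_sync_lsn is now an atomic64, the update above still happens before icloglock is dropped: the lock serialises the single writer, while readers such as xlog_grant_push_ail() can sample the value with one atomic64_read() and no lock at all. A kernel-style sketch of that writer/reader split (demo names invented; the header providing atomic64_t varies across kernel versions):

    #include <linux/atomic.h>
    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct demo_log {
        spinlock_t icloglock;
        atomic64_t last_sync_lsn;
    };

    /* writer: caller holds icloglock, so stores are fully serialised */
    static void demo_record_sync(struct demo_log *log, u64 lsn)
    {
        atomic64_set(&log->last_sync_lsn, lsn);
    }

    /* reader: lock-free; one atomic64_read() yields a consistent LSN */
    static u64 demo_read_sync(struct demo_log *log)
    {
        return atomic64_read(&log->last_sync_lsn);
    }
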
@@ -2344,7 +2303,7 @@ xlog_state_do_callback(
2344 spin_unlock(&log->l_icloglock); 2303 spin_unlock(&log->l_icloglock);
2345 2304
2346 if (wake) 2305 if (wake)
2347 sv_broadcast(&log->l_flush_wait); 2306 wake_up_all(&log->l_flush_wait);
2348} 2307}
2349 2308
2350 2309
@@ -2395,7 +2354,7 @@ xlog_state_done_syncing(
2395 * iclog buffer, we wake them all, one will get to do the 2354 * iclog buffer, we wake them all, one will get to do the
2396 * I/O, the others get to wait for the result. 2355 * I/O, the others get to wait for the result.
2397 */ 2356 */
2398 sv_broadcast(&iclog->ic_write_wait); 2357 wake_up_all(&iclog->ic_write_wait);
2399 spin_unlock(&log->l_icloglock); 2358 spin_unlock(&log->l_icloglock);
2400 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ 2359 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
2401} /* xlog_state_done_syncing */ 2360} /* xlog_state_done_syncing */
@@ -2444,7 +2403,7 @@ restart:
2444 XFS_STATS_INC(xs_log_noiclogs); 2403 XFS_STATS_INC(xs_log_noiclogs);
2445 2404
2446 /* Wait for log writes to have flushed */ 2405 /* Wait for log writes to have flushed */
2447 sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); 2406 xlog_wait(&log->l_flush_wait, &log->l_icloglock);
2448 goto restart; 2407 goto restart;
2449 } 2408 }
2450 2409
@@ -2527,6 +2486,18 @@ restart:
2527 * 2486 *
2528 * Once a ticket gets put onto the reserveq, it will only return after 2487 * Once a ticket gets put onto the reserveq, it will only return after
2529 * the needed reservation is satisfied. 2488 * the needed reservation is satisfied.
2489 *
2490 * This function is structured so that it has a lock free fast path. This is
2491 * necessary because every new transaction reservation will come through this
2492 * path. Hence any lock will be globally hot if we take it unconditionally on
2493 * every pass.
2494 *
2495 * As tickets are only ever moved on and off the reserveq under the
2496 * l_grant_reserve_lock, we only need to take that lock if we are going
2497 * to add the ticket to the queue and sleep. We can avoid taking the lock if the
2498 * ticket was never added to the reserveq because the t_queue list head will be
2499 * empty and we hold the only reference to it so it can safely be checked
2500 * unlocked.
2530 */ 2501 */
2531STATIC int 2502STATIC int
2532xlog_grant_log_space(xlog_t *log, 2503xlog_grant_log_space(xlog_t *log,
@@ -2534,24 +2505,27 @@ xlog_grant_log_space(xlog_t *log,
2534{ 2505{
2535 int free_bytes; 2506 int free_bytes;
2536 int need_bytes; 2507 int need_bytes;
2537#ifdef DEBUG
2538 xfs_lsn_t tail_lsn;
2539#endif
2540
2541 2508
2542#ifdef DEBUG 2509#ifdef DEBUG
2543 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 2510 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2544 panic("grant Recovery problem"); 2511 panic("grant Recovery problem");
2545#endif 2512#endif
2546 2513
2547 /* Is there space or do we need to sleep? */
2548 spin_lock(&log->l_grant_lock);
2549
2550 trace_xfs_log_grant_enter(log, tic); 2514 trace_xfs_log_grant_enter(log, tic);
2551 2515
2516 need_bytes = tic->t_unit_res;
2517 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2518 need_bytes *= tic->t_ocnt;
2519
2552 /* something is already sleeping; insert new transaction at end */ 2520 /* something is already sleeping; insert new transaction at end */
2553 if (log->l_reserve_headq) { 2521 if (!list_empty_careful(&log->l_reserveq)) {
2554 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2522 spin_lock(&log->l_grant_reserve_lock);
2523 /* recheck the queue now we are locked */
2524 if (list_empty(&log->l_reserveq)) {
2525 spin_unlock(&log->l_grant_reserve_lock);
2526 goto redo;
2527 }
2528 list_add_tail(&tic->t_queue, &log->l_reserveq);
2555 2529
2556 trace_xfs_log_grant_sleep1(log, tic); 2530 trace_xfs_log_grant_sleep1(log, tic);
2557 2531
@@ -2563,72 +2537,57 @@ xlog_grant_log_space(xlog_t *log,
2563 goto error_return; 2537 goto error_return;
2564 2538
2565 XFS_STATS_INC(xs_sleep_logspace); 2539 XFS_STATS_INC(xs_sleep_logspace);
2566 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); 2540 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2541
2567 /* 2542 /*
2568 * If we got an error, and the filesystem is shutting down, 2543 * If we got an error, and the filesystem is shutting down,
2569 * we'll catch it down below. So just continue... 2544 * we'll catch it down below. So just continue...
2570 */ 2545 */
2571 trace_xfs_log_grant_wake1(log, tic); 2546 trace_xfs_log_grant_wake1(log, tic);
2572 spin_lock(&log->l_grant_lock);
2573 } 2547 }
2574 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2575 need_bytes = tic->t_unit_res*tic->t_ocnt;
2576 else
2577 need_bytes = tic->t_unit_res;
2578 2548
2579redo: 2549redo:
2580 if (XLOG_FORCED_SHUTDOWN(log)) 2550 if (XLOG_FORCED_SHUTDOWN(log))
2581 goto error_return; 2551 goto error_return_unlocked;
2582 2552
2583 free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, 2553 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
2584 log->l_grant_reserve_bytes);
2585 if (free_bytes < need_bytes) { 2554 if (free_bytes < need_bytes) {
2586 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2555 spin_lock(&log->l_grant_reserve_lock);
2587 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2556 if (list_empty(&tic->t_queue))
2557 list_add_tail(&tic->t_queue, &log->l_reserveq);
2588 2558
2589 trace_xfs_log_grant_sleep2(log, tic); 2559 trace_xfs_log_grant_sleep2(log, tic);
2590 2560
2591 spin_unlock(&log->l_grant_lock);
2592 xlog_grant_push_ail(log->l_mp, need_bytes);
2593 spin_lock(&log->l_grant_lock);
2594
2595 XFS_STATS_INC(xs_sleep_logspace);
2596 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2597
2598 spin_lock(&log->l_grant_lock);
2599 if (XLOG_FORCED_SHUTDOWN(log)) 2561 if (XLOG_FORCED_SHUTDOWN(log))
2600 goto error_return; 2562 goto error_return;
2601 2563
2602 trace_xfs_log_grant_wake2(log, tic); 2564 xlog_grant_push_ail(log, need_bytes);
2565
2566 XFS_STATS_INC(xs_sleep_logspace);
2567 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2603 2568
2569 trace_xfs_log_grant_wake2(log, tic);
2604 goto redo; 2570 goto redo;
2605 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2571 }
2606 xlog_del_ticketq(&log->l_reserve_headq, tic);
2607 2572
2608 /* we've got enough space */ 2573 if (!list_empty(&tic->t_queue)) {
2609 xlog_grant_add_space(log, need_bytes); 2574 spin_lock(&log->l_grant_reserve_lock);
2610#ifdef DEBUG 2575 list_del_init(&tic->t_queue);
2611 tail_lsn = log->l_tail_lsn; 2576 spin_unlock(&log->l_grant_reserve_lock);
2612 /*
2613 * Check to make sure the grant write head didn't just overlap the
2614 * tail. If the cycles are the same, we can't be overlapping.
2615 * Otherwise, make sure that the cycles differ by exactly one and
2616 * check the byte count.
2617 */
2618 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2619 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2620 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2621 } 2577 }
2622#endif 2578
2579 /* we've got enough space */
2580 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
2581 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2623 trace_xfs_log_grant_exit(log, tic); 2582 trace_xfs_log_grant_exit(log, tic);
2624 xlog_verify_grant_head(log, 1); 2583 xlog_verify_grant_tail(log);
2625 spin_unlock(&log->l_grant_lock);
2626 return 0; 2584 return 0;
2627 2585
2628 error_return: 2586error_return_unlocked:
2629 if (tic->t_flags & XLOG_TIC_IN_Q) 2587 spin_lock(&log->l_grant_reserve_lock);
2630 xlog_del_ticketq(&log->l_reserve_headq, tic); 2588error_return:
2631 2589 list_del_init(&tic->t_queue);
2590 spin_unlock(&log->l_grant_reserve_lock);
2632 trace_xfs_log_grant_error(log, tic); 2591 trace_xfs_log_grant_error(log, tic);
2633 2592
2634 /* 2593 /*
@@ -2638,7 +2597,6 @@ redo:
2638 */ 2597 */
2639 tic->t_curr_res = 0; 2598 tic->t_curr_res = 0;
2640 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2599 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2641 spin_unlock(&log->l_grant_lock);
2642 return XFS_ERROR(EIO); 2600 return XFS_ERROR(EIO);
2643} /* xlog_grant_log_space */ 2601} /* xlog_grant_log_space */
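
The fast path promised by the comment block above this function rests on a double-checked queue test: peek with list_empty_careful() outside the lock, and only when the queue looks busy take the lock and recheck before adding the ticket. A condensed kernel-style sketch of that shape (demo names invented; the real slow path sleeps via xlog_wait() rather than unlocking and returning):

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct demo_ticket { struct list_head t_queue; };

    struct demo_log {
        spinlock_t reserve_lock;
        struct list_head reserveq;
    };

    static void demo_maybe_queue(struct demo_log *log, struct demo_ticket *tic)
    {
        /* unlocked peek; only ever used as a hint, so staleness is fine */
        if (list_empty_careful(&log->reserveq))
            return;                /* fast path: no sleepers, no lock taken */

        spin_lock(&log->reserve_lock);
        /* recheck under the lock: the queue may have drained meanwhile */
        if (list_empty(&log->reserveq)) {
            spin_unlock(&log->reserve_lock);
            return;
        }
        list_add_tail(&tic->t_queue, &log->reserveq);
        spin_unlock(&log->reserve_lock);
        /* the real code sleeps via xlog_wait() instead of unlocking here */
    }
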
2644 2602
@@ -2646,17 +2604,14 @@ redo:
2646/* 2604/*
2647 * Replenish the byte reservation required by moving the grant write head. 2605 * Replenish the byte reservation required by moving the grant write head.
2648 * 2606 *
2649 * 2607 * Similar to xlog_grant_log_space, the function is structured to have a lock
2608 * free fast path.
2650 */ 2609 */
2651STATIC int 2610STATIC int
2652xlog_regrant_write_log_space(xlog_t *log, 2611xlog_regrant_write_log_space(xlog_t *log,
2653 xlog_ticket_t *tic) 2612 xlog_ticket_t *tic)
2654{ 2613{
2655 int free_bytes, need_bytes; 2614 int free_bytes, need_bytes;
2656 xlog_ticket_t *ntic;
2657#ifdef DEBUG
2658 xfs_lsn_t tail_lsn;
2659#endif
2660 2615
2661 tic->t_curr_res = tic->t_unit_res; 2616 tic->t_curr_res = tic->t_unit_res;
2662 xlog_tic_reset_res(tic); 2617 xlog_tic_reset_res(tic);
@@ -2669,12 +2624,9 @@ xlog_regrant_write_log_space(xlog_t *log,
2669 panic("regrant Recovery problem"); 2624 panic("regrant Recovery problem");
2670#endif 2625#endif
2671 2626
2672 spin_lock(&log->l_grant_lock);
2673
2674 trace_xfs_log_regrant_write_enter(log, tic); 2627 trace_xfs_log_regrant_write_enter(log, tic);
2675
2676 if (XLOG_FORCED_SHUTDOWN(log)) 2628 if (XLOG_FORCED_SHUTDOWN(log))
2677 goto error_return; 2629 goto error_return_unlocked;
2678 2630
2679 /* If there are other waiters on the queue then give them a 2631 /* If there are other waiters on the queue then give them a
2680 * chance at logspace before us. Wake up the first waiters, 2632 * chance at logspace before us. Wake up the first waiters,
@@ -2683,92 +2635,76 @@ xlog_regrant_write_log_space(xlog_t *log,
2683 * this transaction. 2635 * this transaction.
2684 */ 2636 */
2685 need_bytes = tic->t_unit_res; 2637 need_bytes = tic->t_unit_res;
2686 if ((ntic = log->l_write_headq)) { 2638 if (!list_empty_careful(&log->l_writeq)) {
2687 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2639 struct xlog_ticket *ntic;
2688 log->l_grant_write_bytes); 2640
2689 do { 2641 spin_lock(&log->l_grant_write_lock);
2642 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2643 list_for_each_entry(ntic, &log->l_writeq, t_queue) {
2690 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); 2644 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
2691 2645
2692 if (free_bytes < ntic->t_unit_res) 2646 if (free_bytes < ntic->t_unit_res)
2693 break; 2647 break;
2694 free_bytes -= ntic->t_unit_res; 2648 free_bytes -= ntic->t_unit_res;
2695 sv_signal(&ntic->t_wait); 2649 wake_up(&ntic->t_wait);
2696 ntic = ntic->t_next; 2650 }
2697 } while (ntic != log->l_write_headq);
2698
2699 if (ntic != log->l_write_headq) {
2700 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2701 xlog_ins_ticketq(&log->l_write_headq, tic);
2702 2651
2652 if (ntic != list_first_entry(&log->l_writeq,
2653 struct xlog_ticket, t_queue)) {
2654 if (list_empty(&tic->t_queue))
2655 list_add_tail(&tic->t_queue, &log->l_writeq);
2703 trace_xfs_log_regrant_write_sleep1(log, tic); 2656 trace_xfs_log_regrant_write_sleep1(log, tic);
2704 2657
2705 spin_unlock(&log->l_grant_lock); 2658 xlog_grant_push_ail(log, need_bytes);
2706 xlog_grant_push_ail(log->l_mp, need_bytes);
2707 spin_lock(&log->l_grant_lock);
2708 2659
2709 XFS_STATS_INC(xs_sleep_logspace); 2660 XFS_STATS_INC(xs_sleep_logspace);
2710 sv_wait(&tic->t_wait, PINOD|PLTWAIT, 2661 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2711 &log->l_grant_lock, s);
2712
2713 /* If we're shutting down, this tic is already
2714 * off the queue */
2715 spin_lock(&log->l_grant_lock);
2716 if (XLOG_FORCED_SHUTDOWN(log))
2717 goto error_return;
2718
2719 trace_xfs_log_regrant_write_wake1(log, tic); 2662 trace_xfs_log_regrant_write_wake1(log, tic);
2720 } 2663 } else
2664 spin_unlock(&log->l_grant_write_lock);
2721 } 2665 }
2722 2666
2723redo: 2667redo:
2724 if (XLOG_FORCED_SHUTDOWN(log)) 2668 if (XLOG_FORCED_SHUTDOWN(log))
2725 goto error_return; 2669 goto error_return_unlocked;
2726 2670
2727 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2671 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2728 log->l_grant_write_bytes);
2729 if (free_bytes < need_bytes) { 2672 if (free_bytes < need_bytes) {
2730 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2673 spin_lock(&log->l_grant_write_lock);
2731 xlog_ins_ticketq(&log->l_write_headq, tic); 2674 if (list_empty(&tic->t_queue))
2732 spin_unlock(&log->l_grant_lock); 2675 list_add_tail(&tic->t_queue, &log->l_writeq);
2733 xlog_grant_push_ail(log->l_mp, need_bytes);
2734 spin_lock(&log->l_grant_lock);
2735
2736 XFS_STATS_INC(xs_sleep_logspace);
2737 trace_xfs_log_regrant_write_sleep2(log, tic);
2738
2739 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2740 2676
2741 /* If we're shutting down, this tic is already off the queue */
2742 spin_lock(&log->l_grant_lock);
2743 if (XLOG_FORCED_SHUTDOWN(log)) 2677 if (XLOG_FORCED_SHUTDOWN(log))
2744 goto error_return; 2678 goto error_return;
2745 2679
2680 xlog_grant_push_ail(log, need_bytes);
2681
2682 XFS_STATS_INC(xs_sleep_logspace);
2683 trace_xfs_log_regrant_write_sleep2(log, tic);
2684 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2685
2746 trace_xfs_log_regrant_write_wake2(log, tic); 2686 trace_xfs_log_regrant_write_wake2(log, tic);
2747 goto redo; 2687 goto redo;
2748 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2688 }
2749 xlog_del_ticketq(&log->l_write_headq, tic);
2750 2689
2751 /* we've got enough space */ 2690 if (!list_empty(&tic->t_queue)) {
2752 xlog_grant_add_space_write(log, need_bytes); 2691 spin_lock(&log->l_grant_write_lock);
2753#ifdef DEBUG 2692 list_del_init(&tic->t_queue);
2754 tail_lsn = log->l_tail_lsn; 2693 spin_unlock(&log->l_grant_write_lock);
2755 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2756 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2757 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2758 } 2694 }
2759#endif
2760 2695
2696 /* we've got enough space */
2697 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2761 trace_xfs_log_regrant_write_exit(log, tic); 2698 trace_xfs_log_regrant_write_exit(log, tic);
2762 2699 xlog_verify_grant_tail(log);
2763 xlog_verify_grant_head(log, 1);
2764 spin_unlock(&log->l_grant_lock);
2765 return 0; 2700 return 0;
2766 2701
2767 2702
2703 error_return_unlocked:
2704 spin_lock(&log->l_grant_write_lock);
2768 error_return: 2705 error_return:
2769 if (tic->t_flags & XLOG_TIC_IN_Q) 2706 list_del_init(&tic->t_queue);
2770 xlog_del_ticketq(&log->l_reserve_headq, tic); 2707 spin_unlock(&log->l_grant_write_lock);
2771
2772 trace_xfs_log_regrant_write_error(log, tic); 2708 trace_xfs_log_regrant_write_error(log, tic);
2773 2709
2774 /* 2710 /*
@@ -2778,7 +2714,6 @@ redo:
2778 */ 2714 */
2779 tic->t_curr_res = 0; 2715 tic->t_curr_res = 0;
2780 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2716 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2781 spin_unlock(&log->l_grant_lock);
2782 return XFS_ERROR(EIO); 2717 return XFS_ERROR(EIO);
2783} /* xlog_regrant_write_log_space */ 2718} /* xlog_regrant_write_log_space */
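
The wakeup loop above hands out space in strict FIFO order: walk l_writeq from the head and wake each ticket only while the remaining free bytes cover its unit reservation, so a large waiter at the front cannot be starved by smaller ones behind it. A kernel-style sketch of the budgeted walk (demo names invented; the caller is assumed to hold the queue's lock):

    #include <linux/list.h>
    #include <linux/wait.h>

    struct demo_ticket {
        struct list_head t_queue;
        wait_queue_head_t t_wait;
        int t_unit_res;
    };

    /* called with the write queue's spinlock held */
    static void demo_wake_in_order(struct list_head *writeq, int free_bytes)
    {
        struct demo_ticket *ntic;

        list_for_each_entry(ntic, writeq, t_queue) {
            if (free_bytes < ntic->t_unit_res)
                break;    /* head doesn't fit: stop, preserving FIFO order */
            free_bytes -= ntic->t_unit_res;
            wake_up(&ntic->t_wait);
        }
    }
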
2784 2719
@@ -2799,27 +2734,24 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2799 if (ticket->t_cnt > 0) 2734 if (ticket->t_cnt > 0)
2800 ticket->t_cnt--; 2735 ticket->t_cnt--;
2801 2736
2802 spin_lock(&log->l_grant_lock); 2737 xlog_grant_sub_space(log, &log->l_grant_reserve_head,
2803 xlog_grant_sub_space(log, ticket->t_curr_res); 2738 ticket->t_curr_res);
2739 xlog_grant_sub_space(log, &log->l_grant_write_head,
2740 ticket->t_curr_res);
2804 ticket->t_curr_res = ticket->t_unit_res; 2741 ticket->t_curr_res = ticket->t_unit_res;
2805 xlog_tic_reset_res(ticket); 2742 xlog_tic_reset_res(ticket);
2806 2743
2807 trace_xfs_log_regrant_reserve_sub(log, ticket); 2744 trace_xfs_log_regrant_reserve_sub(log, ticket);
2808 2745
2809 xlog_verify_grant_head(log, 1);
2810
2811 /* just return if we still have some of the pre-reserved space */ 2746 /* just return if we still have some of the pre-reserved space */
2812 if (ticket->t_cnt > 0) { 2747 if (ticket->t_cnt > 0)
2813 spin_unlock(&log->l_grant_lock);
2814 return; 2748 return;
2815 }
2816 2749
2817 xlog_grant_add_space_reserve(log, ticket->t_unit_res); 2750 xlog_grant_add_space(log, &log->l_grant_reserve_head,
2751 ticket->t_unit_res);
2818 2752
2819 trace_xfs_log_regrant_reserve_exit(log, ticket); 2753 trace_xfs_log_regrant_reserve_exit(log, ticket);
2820 2754
2821 xlog_verify_grant_head(log, 0);
2822 spin_unlock(&log->l_grant_lock);
2823 ticket->t_curr_res = ticket->t_unit_res; 2755 ticket->t_curr_res = ticket->t_unit_res;
2824 xlog_tic_reset_res(ticket); 2756 xlog_tic_reset_res(ticket);
2825} /* xlog_regrant_reserve_log_space */ 2757} /* xlog_regrant_reserve_log_space */
@@ -2843,28 +2775,29 @@ STATIC void
2843xlog_ungrant_log_space(xlog_t *log, 2775xlog_ungrant_log_space(xlog_t *log,
2844 xlog_ticket_t *ticket) 2776 xlog_ticket_t *ticket)
2845{ 2777{
2778 int bytes;
2779
2846 if (ticket->t_cnt > 0) 2780 if (ticket->t_cnt > 0)
2847 ticket->t_cnt--; 2781 ticket->t_cnt--;
2848 2782
2849 spin_lock(&log->l_grant_lock);
2850 trace_xfs_log_ungrant_enter(log, ticket); 2783 trace_xfs_log_ungrant_enter(log, ticket);
2851
2852 xlog_grant_sub_space(log, ticket->t_curr_res);
2853
2854 trace_xfs_log_ungrant_sub(log, ticket); 2784 trace_xfs_log_ungrant_sub(log, ticket);
2855 2785
2856 /* If this is a permanent reservation ticket, we may be able to free 2786 /*
2787 * If this is a permanent reservation ticket, we may be able to free
2857 * up more space based on the remaining count. 2788 * up more space based on the remaining count.
2858 */ 2789 */
2790 bytes = ticket->t_curr_res;
2859 if (ticket->t_cnt > 0) { 2791 if (ticket->t_cnt > 0) {
2860 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); 2792 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
2861 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); 2793 bytes += ticket->t_unit_res*ticket->t_cnt;
2862 } 2794 }
2863 2795
2796 xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
2797 xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
2798
2864 trace_xfs_log_ungrant_exit(log, ticket); 2799 trace_xfs_log_ungrant_exit(log, ticket);
2865 2800
2866 xlog_verify_grant_head(log, 1);
2867 spin_unlock(&log->l_grant_lock);
2868 xfs_log_move_tail(log->l_mp, 1); 2801 xfs_log_move_tail(log->l_mp, 1);
2869} /* xlog_ungrant_log_space */ 2802} /* xlog_ungrant_log_space */
2870 2803
@@ -2901,11 +2834,11 @@ xlog_state_release_iclog(
2901 2834
2902 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { 2835 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
2903 /* update tail before writing to iclog */ 2836 /* update tail before writing to iclog */
2904 xlog_assign_tail_lsn(log->l_mp); 2837 xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
2905 sync++; 2838 sync++;
2906 iclog->ic_state = XLOG_STATE_SYNCING; 2839 iclog->ic_state = XLOG_STATE_SYNCING;
2907 iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); 2840 iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
2908 xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); 2841 xlog_verify_tail_lsn(log, iclog, tail_lsn);
2909 /* cycle incremented when incrementing curr_block */ 2842 /* cycle incremented when incrementing curr_block */
2910 } 2843 }
2911 spin_unlock(&log->l_icloglock); 2844 spin_unlock(&log->l_icloglock);
@@ -3088,7 +3021,7 @@ maybe_sleep:
3088 return XFS_ERROR(EIO); 3021 return XFS_ERROR(EIO);
3089 } 3022 }
3090 XFS_STATS_INC(xs_log_force_sleep); 3023 XFS_STATS_INC(xs_log_force_sleep);
3091 sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); 3024 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3092 /* 3025 /*
3093 * No need to grab the log lock here since we're 3026 * No need to grab the log lock here since we're
3094 * only deciding whether or not to return EIO 3027 * only deciding whether or not to return EIO
@@ -3206,8 +3139,8 @@ try_again:
3206 3139
3207 XFS_STATS_INC(xs_log_force_sleep); 3140 XFS_STATS_INC(xs_log_force_sleep);
3208 3141
3209 sv_wait(&iclog->ic_prev->ic_write_wait, 3142 xlog_wait(&iclog->ic_prev->ic_write_wait,
3210 PSWP, &log->l_icloglock, s); 3143 &log->l_icloglock);
3211 if (log_flushed) 3144 if (log_flushed)
3212 *log_flushed = 1; 3145 *log_flushed = 1;
3213 already_slept = 1; 3146 already_slept = 1;
@@ -3235,7 +3168,7 @@ try_again:
3235 return XFS_ERROR(EIO); 3168 return XFS_ERROR(EIO);
3236 } 3169 }
3237 XFS_STATS_INC(xs_log_force_sleep); 3170 XFS_STATS_INC(xs_log_force_sleep);
3238 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); 3171 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3239 /* 3172 /*
3240 * No need to grab the log lock here since we're 3173 * No need to grab the log lock here since we're
3241 * only deciding whether or not to return EIO 3174 * only deciding whether or not to return EIO
@@ -3310,10 +3243,8 @@ xfs_log_ticket_put(
3310 xlog_ticket_t *ticket) 3243 xlog_ticket_t *ticket)
3311{ 3244{
3312 ASSERT(atomic_read(&ticket->t_ref) > 0); 3245 ASSERT(atomic_read(&ticket->t_ref) > 0);
3313 if (atomic_dec_and_test(&ticket->t_ref)) { 3246 if (atomic_dec_and_test(&ticket->t_ref))
3314 sv_destroy(&ticket->t_wait);
3315 kmem_zone_free(xfs_log_ticket_zone, ticket); 3247 kmem_zone_free(xfs_log_ticket_zone, ticket);
3316 }
3317} 3248}
3318 3249
3319xlog_ticket_t * 3250xlog_ticket_t *
@@ -3435,6 +3366,7 @@ xlog_ticket_alloc(
3435 } 3366 }
3436 3367
3437 atomic_set(&tic->t_ref, 1); 3368 atomic_set(&tic->t_ref, 1);
3369 INIT_LIST_HEAD(&tic->t_queue);
3438 tic->t_unit_res = unit_bytes; 3370 tic->t_unit_res = unit_bytes;
3439 tic->t_curr_res = unit_bytes; 3371 tic->t_curr_res = unit_bytes;
3440 tic->t_cnt = cnt; 3372 tic->t_cnt = cnt;
@@ -3445,7 +3377,7 @@ xlog_ticket_alloc(
3445 tic->t_trans_type = 0; 3377 tic->t_trans_type = 0;
3446 if (xflags & XFS_LOG_PERM_RESERV) 3378 if (xflags & XFS_LOG_PERM_RESERV)
3447 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3379 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3448 sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); 3380 init_waitqueue_head(&tic->t_wait);
3449 3381
3450 xlog_tic_reset_res(tic); 3382 xlog_tic_reset_res(tic);
3451 3383
@@ -3484,18 +3416,25 @@ xlog_verify_dest_ptr(
3484} 3416}
3485 3417
3486STATIC void 3418STATIC void
3487xlog_verify_grant_head(xlog_t *log, int equals) 3419xlog_verify_grant_tail(
3420 struct log *log)
3488{ 3421{
3489 if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { 3422 int tail_cycle, tail_blocks;
3490 if (equals) 3423 int cycle, space;
3491 ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); 3424
3492 else 3425 /*
3493 ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); 3426 * Check to make sure the grant write head didn't just overlap the
3494 } else { 3427 * tail. If the cycles are the same, we can't be overlapping.
3495 ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); 3428 * Otherwise, make sure that the cycles differ by exactly one and
3496 ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); 3429 * check the byte count.
3497 } 3430 */
3498} /* xlog_verify_grant_head */ 3431 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
3432 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3433 if (tail_cycle != cycle) {
3434 ASSERT(cycle - 1 == tail_cycle);
3435 ASSERT(space <= BBTOB(tail_blocks));
3436 }
3437}
3499 3438
3500/* check if it will fit */ 3439/* check if it will fit */
3501STATIC void 3440STATIC void
@@ -3716,12 +3655,10 @@ xfs_log_force_umount(
3716 xlog_cil_force(log); 3655 xlog_cil_force(log);
3717 3656
3718 /* 3657 /*
3719 * We must hold both the GRANT lock and the LOG lock, 3658 * mark the filesystem and the log as in a shutdown state and wake
3720 * before we mark the filesystem SHUTDOWN and wake 3659 * everybody up to tell them the bad news.
3721 * everybody up to tell the bad news.
3722 */ 3660 */
3723 spin_lock(&log->l_icloglock); 3661 spin_lock(&log->l_icloglock);
3724 spin_lock(&log->l_grant_lock);
3725 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3662 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3726 if (mp->m_sb_bp) 3663 if (mp->m_sb_bp)
3727 XFS_BUF_DONE(mp->m_sb_bp); 3664 XFS_BUF_DONE(mp->m_sb_bp);
@@ -3742,27 +3679,21 @@ xfs_log_force_umount(
3742 spin_unlock(&log->l_icloglock); 3679 spin_unlock(&log->l_icloglock);
3743 3680
3744 /* 3681 /*
3745 * We don't want anybody waiting for log reservations 3682 * We don't want anybody waiting for log reservations after this. That
3746 * after this. That means we have to wake up everybody 3683 * means we have to wake up everybody queued up on reserveq as well as
3747 * queued up on reserve_headq as well as write_headq. 3684 * writeq. In addition, we make sure in xlog_{re}grant_log_space that
3748 * In addition, we make sure in xlog_{re}grant_log_space 3685 * we don't enqueue anything once the SHUTDOWN flag is set, and this
3749 * that we don't enqueue anything once the SHUTDOWN flag 3686 * action is protected by the grant locks.
3750 * is set, and this action is protected by the GRANTLOCK.
3751 */ 3687 */
3752 if ((tic = log->l_reserve_headq)) { 3688 spin_lock(&log->l_grant_reserve_lock);
3753 do { 3689 list_for_each_entry(tic, &log->l_reserveq, t_queue)
3754 sv_signal(&tic->t_wait); 3690 wake_up(&tic->t_wait);
3755 tic = tic->t_next; 3691 spin_unlock(&log->l_grant_reserve_lock);
3756 } while (tic != log->l_reserve_headq); 3692
3757 } 3693 spin_lock(&log->l_grant_write_lock);
3758 3694 list_for_each_entry(tic, &log->l_writeq, t_queue)
3759 if ((tic = log->l_write_headq)) { 3695 wake_up(&tic->t_wait);
3760 do { 3696 spin_unlock(&log->l_grant_write_lock);
3761 sv_signal(&tic->t_wait);
3762 tic = tic->t_next;
3763 } while (tic != log->l_write_headq);
3764 }
3765 spin_unlock(&log->l_grant_lock);
3766 3697
3767 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3698 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3768 ASSERT(!logerror); 3699 ASSERT(!logerror);
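
The wake-all sweep above is only race-free because the grant paths recheck XLOG_FORCED_SHUTDOWN() after taking the same grant lock and queueing; a ticket that tested the flag only before the lock could enqueue itself just after the sweep and sleep forever. A sketch of that ordering, with a plain flag standing in for XLOG_FORCED_SHUTDOWN() and all names invented:

    #include <linux/errno.h>
    #include <linux/list.h>
    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct demo_log {
        spinlock_t grant_lock;
        struct list_head waitq;
        bool shutdown;            /* stand-in for XLOG_FORCED_SHUTDOWN() */
    };

    struct demo_ticket { struct list_head t_queue; };

    static int demo_enqueue(struct demo_log *log, struct demo_ticket *tic)
    {
        spin_lock(&log->grant_lock);
        list_add_tail(&tic->t_queue, &log->waitq);
        if (log->shutdown) {                /* recheck under the lock */
            list_del_init(&tic->t_queue);
            spin_unlock(&log->grant_lock);
            return -EIO;                    /* the wake-all already ran */
        }
        spin_unlock(&log->grant_lock);      /* real code sleeps via xlog_wait() */
        return 0;
    }
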
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 23d6ceb5e97b..9dc8125d04e5 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -61,7 +61,7 @@ xlog_cil_init(
61 INIT_LIST_HEAD(&cil->xc_committing); 61 INIT_LIST_HEAD(&cil->xc_committing);
62 spin_lock_init(&cil->xc_cil_lock); 62 spin_lock_init(&cil->xc_cil_lock);
63 init_rwsem(&cil->xc_ctx_lock); 63 init_rwsem(&cil->xc_ctx_lock);
64 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); 64 init_waitqueue_head(&cil->xc_commit_wait);
65 65
66 INIT_LIST_HEAD(&ctx->committing); 66 INIT_LIST_HEAD(&ctx->committing);
67 INIT_LIST_HEAD(&ctx->busy_extents); 67 INIT_LIST_HEAD(&ctx->busy_extents);
@@ -361,15 +361,10 @@ xlog_cil_committed(
361 int abort) 361 int abort)
362{ 362{
363 struct xfs_cil_ctx *ctx = args; 363 struct xfs_cil_ctx *ctx = args;
364 struct xfs_log_vec *lv;
365 int abortflag = abort ? XFS_LI_ABORTED : 0;
366 struct xfs_busy_extent *busyp, *n; 364 struct xfs_busy_extent *busyp, *n;
367 365
368 /* unpin all the log items */ 366 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
369 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { 367 ctx->start_lsn, abort);
370 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
371 abortflag);
372 }
373 368
374 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) 369 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
375 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); 370 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
@@ -568,7 +563,7 @@ restart:
568 * It is still being pushed! Wait for the push to 563 * It is still being pushed! Wait for the push to
569 * complete, then start again from the beginning. 564 * complete, then start again from the beginning.
570 */ 565 */
571 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 566 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
572 goto restart; 567 goto restart;
573 } 568 }
574 } 569 }
@@ -592,7 +587,7 @@ restart:
592 */ 587 */
593 spin_lock(&cil->xc_cil_lock); 588 spin_lock(&cil->xc_cil_lock);
594 ctx->commit_lsn = commit_lsn; 589 ctx->commit_lsn = commit_lsn;
595 sv_broadcast(&cil->xc_commit_wait); 590 wake_up_all(&cil->xc_commit_wait);
596 spin_unlock(&cil->xc_cil_lock); 591 spin_unlock(&cil->xc_cil_lock);
597 592
598 /* release the hounds! */ 593 /* release the hounds! */
@@ -757,7 +752,7 @@ restart:
757 * It is still being pushed! Wait for the push to 752 * It is still being pushed! Wait for the push to
758 * complete, then start again from the beginning. 753 * complete, then start again from the beginning.
759 */ 754 */
760 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 755 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
761 goto restart; 756 goto restart;
762 } 757 }
763 if (ctx->sequence != sequence) 758 if (ctx->sequence != sequence)
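
Both xfs_log_cil.c call sites above use the same wait-and-restart idiom: scan xc_committing under xc_cil_lock, and if a context is still being pushed, sleep on xc_commit_wait (the sleep drops the lock) and restart the scan because the list may have changed. A kernel-style sketch with the xlog_wait() body from the xfs_log_priv.h hunk below open-coded, and all demo names invented:

    #include <linux/list.h>
    #include <linux/sched.h>
    #include <linux/spinlock.h>
    #include <linux/types.h>
    #include <linux/wait.h>

    struct demo_ctx {
        struct list_head entry;
        bool pushing;
    };

    static void demo_wait_for_commits(spinlock_t *lock,
                                      struct list_head *committing,
                                      wait_queue_head_t *commit_wait)
    {
        struct demo_ctx *ctx;
    restart:
        spin_lock(lock);
        list_for_each_entry(ctx, committing, entry) {
            if (!ctx->pushing)
                continue;
            {    /* open-coded xlog_wait(): queue, drop the lock, sleep */
                DECLARE_WAITQUEUE(wait, current);

                add_wait_queue_exclusive(commit_wait, &wait);
                __set_current_state(TASK_UNINTERRUPTIBLE);
                spin_unlock(lock);
                schedule();
                remove_wait_queue(commit_wait, &wait);
            }
            goto restart;    /* the list may have changed while we slept */
        }
        spin_unlock(lock);
    }
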
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index edcdfe01617f..d5f8be8f4bf6 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -21,7 +21,6 @@
21struct xfs_buf; 21struct xfs_buf;
22struct log; 22struct log;
23struct xlog_ticket; 23struct xlog_ticket;
24struct xfs_buf_cancel;
25struct xfs_mount; 24struct xfs_mount;
26 25
27/* 26/*
@@ -54,7 +53,6 @@ struct xfs_mount;
54 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ 53 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
55 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) 54 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
56 55
57
58static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) 56static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
59{ 57{
60 return ((xfs_lsn_t)cycle << 32) | block; 58 return ((xfs_lsn_t)cycle << 32) | block;
@@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i)
133 */ 131 */
134#define XLOG_TIC_INITED 0x1 /* has been initialized */ 132#define XLOG_TIC_INITED 0x1 /* has been initialized */
135#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ 133#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
136#define XLOG_TIC_IN_Q 0x4
137 134
138#define XLOG_TIC_FLAGS \ 135#define XLOG_TIC_FLAGS \
139 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ 136 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
140 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ 137 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
141 { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" }
142 138
143#endif /* __KERNEL__ */ 139#endif /* __KERNEL__ */
144 140
@@ -244,9 +240,8 @@ typedef struct xlog_res {
244} xlog_res_t; 240} xlog_res_t;
245 241
246typedef struct xlog_ticket { 242typedef struct xlog_ticket {
247 sv_t t_wait; /* ticket wait queue : 20 */ 243 wait_queue_head_t t_wait; /* ticket wait queue */
248 struct xlog_ticket *t_next; /* :4|8 */ 244 struct list_head t_queue; /* reserve/write queue */
249 struct xlog_ticket *t_prev; /* :4|8 */
250 xlog_tid_t t_tid; /* transaction identifier : 4 */ 245 xlog_tid_t t_tid; /* transaction identifier : 4 */
251 atomic_t t_ref; /* ticket reference count : 4 */ 246 atomic_t t_ref; /* ticket reference count : 4 */
252 int t_curr_res; /* current reservation in bytes : 4 */ 247 int t_curr_res; /* current reservation in bytes : 4 */
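
Swapping the hand-rolled t_next/t_prev ring for a list_head is what enables the unlocked "am I queued?" checks in the grant paths: provided removal always uses list_del_init() rather than list_del(), list_empty(&tic->t_queue) on a privately held ticket reliably says whether it sits on a queue, replacing the old XLOG_TIC_IN_Q flag. A small kernel-style sketch of the idiom (demo names invented):

    #include <linux/list.h>
    #include <linux/types.h>

    struct demo_ticket {
        struct list_head t_queue;
    };

    static void demo_ticket_init(struct demo_ticket *tic)
    {
        INIT_LIST_HEAD(&tic->t_queue);    /* empty == not queued anywhere */
    }

    static void demo_dequeue(struct demo_ticket *tic)
    {
        /* list_del_init() re-initialises the node, keeping the test valid */
        list_del_init(&tic->t_queue);
    }

    static bool demo_is_queued(const struct demo_ticket *tic)
    {
        return !list_empty(&tic->t_queue);
    }
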
@@ -353,8 +348,8 @@ typedef union xlog_in_core2 {
353 * and move everything else out to subsequent cachelines. 348 * and move everything else out to subsequent cachelines.
354 */ 349 */
355typedef struct xlog_in_core { 350typedef struct xlog_in_core {
356 sv_t ic_force_wait; 351 wait_queue_head_t ic_force_wait;
357 sv_t ic_write_wait; 352 wait_queue_head_t ic_write_wait;
358 struct xlog_in_core *ic_next; 353 struct xlog_in_core *ic_next;
359 struct xlog_in_core *ic_prev; 354 struct xlog_in_core *ic_prev;
360 struct xfs_buf *ic_bp; 355 struct xfs_buf *ic_bp;
@@ -421,7 +416,7 @@ struct xfs_cil {
421 struct xfs_cil_ctx *xc_ctx; 416 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock; 417 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing; 418 struct list_head xc_committing;
424 sv_t xc_commit_wait; 419 wait_queue_head_t xc_commit_wait;
425 xfs_lsn_t xc_current_sequence; 420 xfs_lsn_t xc_current_sequence;
426}; 421};
427 422
@@ -491,7 +486,7 @@ typedef struct log {
491 struct xfs_buftarg *l_targ; /* buftarg of log */ 486 struct xfs_buftarg *l_targ; /* buftarg of log */
492 uint l_flags; 487 uint l_flags;
493 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 488 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
494 struct xfs_buf_cancel **l_buf_cancel_table; 489 struct list_head *l_buf_cancel_table;
495 int l_iclog_hsize; /* size of iclog header */ 490 int l_iclog_hsize; /* size of iclog header */
496 int l_iclog_heads; /* # of iclog header sectors */ 491 int l_iclog_heads; /* # of iclog header sectors */
497 uint l_sectBBsize; /* sector size in BBs (2^n) */ 492 uint l_sectBBsize; /* sector size in BBs (2^n) */
@@ -503,29 +498,40 @@ typedef struct log {
503 int l_logBBsize; /* size of log in BB chunks */ 498 int l_logBBsize; /* size of log in BB chunks */
504 499
505 /* The following block of fields are changed while holding icloglock */ 500 /* The following block of fields are changed while holding icloglock */
506 sv_t l_flush_wait ____cacheline_aligned_in_smp; 501 wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp;
507 /* waiting for iclog flush */ 502 /* waiting for iclog flush */
508 int l_covered_state;/* state of "covering disk 503 int l_covered_state;/* state of "covering disk
509 * log entries" */ 504 * log entries" */
510 xlog_in_core_t *l_iclog; /* head log queue */ 505 xlog_in_core_t *l_iclog; /* head log queue */
511 spinlock_t l_icloglock; /* grab to change iclog state */ 506 spinlock_t l_icloglock; /* grab to change iclog state */
512 xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
513 * buffers */
514 xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
515 int l_curr_cycle; /* Cycle number of log writes */ 507 int l_curr_cycle; /* Cycle number of log writes */
516 int l_prev_cycle; /* Cycle number before last 508 int l_prev_cycle; /* Cycle number before last
517 * block increment */ 509 * block increment */
518 int l_curr_block; /* current logical log block */ 510 int l_curr_block; /* current logical log block */
519 int l_prev_block; /* previous logical log block */ 511 int l_prev_block; /* previous logical log block */
520 512
521 /* The following block of fields are changed while holding grant_lock */ 513 /*
522 spinlock_t l_grant_lock ____cacheline_aligned_in_smp; 514 * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
523 xlog_ticket_t *l_reserve_headq; 515 * read without needing to hold specific locks. To avoid operations
524 xlog_ticket_t *l_write_headq; 516 * contending with other hot objects, place each of them on a separate
525 int l_grant_reserve_cycle; 517 * cacheline.
526 int l_grant_reserve_bytes; 518 */
527 int l_grant_write_cycle; 519 /* lsn of last LR on disk */
528 int l_grant_write_bytes; 520 atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
521 /* lsn of 1st LR with unflushed buffers */
522 atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
523
524 /*
525 * ticket grant locks, queues and accounting have their own cachelines
526 * as these are quite hot and can be operated on concurrently.
527 */
528 spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
529 struct list_head l_reserveq;
530 atomic64_t l_grant_reserve_head;
531
532 spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
533 struct list_head l_writeq;
534 atomic64_t l_grant_write_head;
529 535
530 /* The following field are used for debugging; need to hold icloglock */ 536 /* The following field are used for debugging; need to hold icloglock */
531#ifdef DEBUG 537#ifdef DEBUG
@@ -534,6 +540,9 @@ typedef struct log {
534 540
535} xlog_t; 541} xlog_t;
536 542
543#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
544 ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
545
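
The reorganised struct log above leans on ____cacheline_aligned_in_smp to give each hot object (the two grant heads with their locks and queues, the tail and last-sync LSNs, the iclog state) a cacheline of its own, so concurrent updaters do not false-share. A toy sketch of the annotation (fields invented; header locations vary by kernel version):

    #include <linux/atomic.h>
    #include <linux/cache.h>
    #include <linux/spinlock.h>

    struct demo_hot {
        int size;    /* cold, read-mostly configuration */

        /* each hot group below starts on its own cacheline under SMP */
        spinlock_t reserve_lock ____cacheline_aligned_in_smp;
        atomic64_t reserve_head;

        spinlock_t write_lock ____cacheline_aligned_in_smp;
        atomic64_t write_head;
    };
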
537#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 546#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
538 547
539/* common routines */ 548/* common routines */
@@ -562,6 +571,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
562 xlog_in_core_t **commit_iclog, uint flags); 571 xlog_in_core_t **commit_iclog, uint flags);
563 572
564/* 573/*
574 * When we crack an atomic LSN, we sample it first so that the value will not
575 * change while we are cracking it into the component values. This means we
576 * will always get consistent component values to work from. This should always
577 * be used to sample and crack LSNs that are stored and updated in atomic
578 * variables.
579 */
580static inline void
581xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
582{
583 xfs_lsn_t val = atomic64_read(lsn);
584
585 *cycle = CYCLE_LSN(val);
586 *block = BLOCK_LSN(val);
587}
588
589/*
590 * Calculate and assign a value to an atomic LSN variable from component pieces.
591 */
592static inline void
593xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
594{
595 atomic64_set(lsn, xlog_assign_lsn(cycle, block));
596}
597
598/*
599 * When we crack the grant head, we sample it first so that the value will not
600 * change while we are cracking it into the component values. This means we
601 * will always get consistent component values to work from.
602 */
603static inline void
604xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
605{
606 *cycle = val >> 32;
607 *space = val & 0xffffffff;
608}
609
610static inline void
611xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
612{
613 xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
614}
615
616static inline int64_t
617xlog_assign_grant_head_val(int cycle, int space)
618{
619 return ((int64_t)cycle << 32) | space;
620}
621
622static inline void
623xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
624{
625 atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
626}
627
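
The crack/assign helpers above are pure bit manipulation over the packed 64-bit value, so their round-trip property can be checked in isolation. A runnable user-space model, with plain stdint types standing in for the kernel's atomic64_t:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t assign_lsn(uint32_t cycle, uint32_t block)
    {
        return ((uint64_t)cycle << 32) | block;
    }

    static void crack_lsn(uint64_t lsn, uint32_t *cycle, uint32_t *block)
    {
        *cycle = lsn >> 32;           /* models CYCLE_LSN() */
        *block = lsn & 0xffffffff;    /* models BLOCK_LSN() */
    }

    int main(void)
    {
        uint64_t lsn = assign_lsn(42, 12345);
        uint32_t cycle, block;

        crack_lsn(lsn, &cycle, &block);
        assert(cycle == 42 && block == 12345);
        printf("0x%016llx -> cycle %u, block %u\n",
               (unsigned long long)lsn, cycle, block);
        return 0;
    }
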
628/*
565 * Committed Item List interfaces 629 * Committed Item List interfaces
566 */ 630 */
567int xlog_cil_init(struct log *log); 631int xlog_cil_init(struct log *log);
@@ -585,6 +649,21 @@ xlog_cil_force(struct log *log)
585 */ 649 */
586#define XLOG_UNMOUNT_REC_TYPE (-1U) 650#define XLOG_UNMOUNT_REC_TYPE (-1U)
587 651
652/*
653 * Wrapper function for waiting on a wait queue serialised against wakeups
654 * by a spinlock. This matches the semantics of all the wait queues used in the
655 * log code.
656 */
657static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
658{
659 DECLARE_WAITQUEUE(wait, current);
660
661 add_wait_queue_exclusive(wq, &wait);
662 __set_current_state(TASK_UNINTERRUPTIBLE);
663 spin_unlock(lock);
664 schedule();
665 remove_wait_queue(wq, &wait);
666}
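
xlog_wait() above performs no condition re-test of its own, so correctness depends entirely on the waker's discipline: the condition must be changed and the wakeup issued under the same spinlock the sleeper held when it queued itself, leaving no window between the waiter's test and its schedule(). A kernel-style sketch of the matching waker side (demo names invented):

    #include <linux/spinlock.h>
    #include <linux/types.h>
    #include <linux/wait.h>

    struct demo_state {
        spinlock_t lock;
        bool done;                /* condition waiters test under 'lock' */
        wait_queue_head_t wq;
    };

    static void demo_complete(struct demo_state *s)
    {
        spin_lock(&s->lock);
        s->done = true;           /* update the condition... */
        wake_up_all(&s->wq);      /* ...and wake under the same lock */
        spin_unlock(&s->lock);
    }
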
588#endif /* __KERNEL__ */ 667#endif /* __KERNEL__ */
589 668
590#endif /* __XFS_LOG_PRIV_H__ */ 669#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 966d3f97458c..204d8e5fa7fa 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -53,6 +53,17 @@ STATIC void xlog_recover_check_summary(xlog_t *);
53#endif 53#endif
54 54
55/* 55/*
56 * This structure is used during recovery to record the buf log items which
57 * have been canceled and should not be replayed.
58 */
59struct xfs_buf_cancel {
60 xfs_daddr_t bc_blkno;
61 uint bc_len;
62 int bc_refcount;
63 struct list_head bc_list;
64};
65
66/*
56 * Sector aligned buffer routines for buffer create/read/write/access 67 * Sector aligned buffer routines for buffer create/read/write/access
57 */ 68 */
58 69
@@ -925,12 +936,12 @@ xlog_find_tail(
925 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 936 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
926 if (found == 2) 937 if (found == 2)
927 log->l_curr_cycle++; 938 log->l_curr_cycle++;
928 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); 939 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
929 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); 940 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
930 log->l_grant_reserve_cycle = log->l_curr_cycle; 941 xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
931 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); 942 BBTOB(log->l_curr_block));
932 log->l_grant_write_cycle = log->l_curr_cycle; 943 xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
933 log->l_grant_write_bytes = BBTOB(log->l_curr_block); 944 BBTOB(log->l_curr_block));
934 945
935 /* 946 /*
936 * Look for unmount record. If we find it, then we know there 947 * Look for unmount record. If we find it, then we know there
@@ -960,7 +971,7 @@ xlog_find_tail(
960 } 971 }
961 after_umount_blk = (i + hblks + (int) 972 after_umount_blk = (i + hblks + (int)
962 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 973 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
963 tail_lsn = log->l_tail_lsn; 974 tail_lsn = atomic64_read(&log->l_tail_lsn);
964 if (*head_blk == after_umount_blk && 975 if (*head_blk == after_umount_blk &&
965 be32_to_cpu(rhead->h_num_logops) == 1) { 976 be32_to_cpu(rhead->h_num_logops) == 1) {
966 umount_data_blk = (i + hblks) % log->l_logBBsize; 977 umount_data_blk = (i + hblks) % log->l_logBBsize;
@@ -975,12 +986,10 @@ xlog_find_tail(
975 * log records will point recovery to after the 986 * log records will point recovery to after the
976 * current unmount record. 987 * current unmount record.
977 */ 988 */
978 log->l_tail_lsn = 989 xlog_assign_atomic_lsn(&log->l_tail_lsn,
979 xlog_assign_lsn(log->l_curr_cycle, 990 log->l_curr_cycle, after_umount_blk);
980 after_umount_blk); 991 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
981 log->l_last_sync_lsn = 992 log->l_curr_cycle, after_umount_blk);
982 xlog_assign_lsn(log->l_curr_cycle,
983 after_umount_blk);
984 *tail_blk = after_umount_blk; 993 *tail_blk = after_umount_blk;
985 994
986 /* 995 /*
@@ -1605,82 +1614,45 @@ xlog_recover_reorder_trans(
1605 * record in the table to tell us how many times we expect to see this 1614 * record in the table to tell us how many times we expect to see this
1606 * record during the second pass. 1615 * record during the second pass.
1607 */ 1616 */
1608STATIC void 1617STATIC int
1609xlog_recover_do_buffer_pass1( 1618xlog_recover_buffer_pass1(
1610 xlog_t *log, 1619 struct log *log,
1611 xfs_buf_log_format_t *buf_f) 1620 xlog_recover_item_t *item)
1612{ 1621{
1613 xfs_buf_cancel_t *bcp; 1622 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1614 xfs_buf_cancel_t *nextp; 1623 struct list_head *bucket;
1615 xfs_buf_cancel_t *prevp; 1624 struct xfs_buf_cancel *bcp;
1616 xfs_buf_cancel_t **bucket;
1617 xfs_daddr_t blkno = 0;
1618 uint len = 0;
1619 ushort flags = 0;
1620
1621 switch (buf_f->blf_type) {
1622 case XFS_LI_BUF:
1623 blkno = buf_f->blf_blkno;
1624 len = buf_f->blf_len;
1625 flags = buf_f->blf_flags;
1626 break;
1627 }
1628 1625
1629 /* 1626 /*
1630 * If this isn't a cancel buffer item, then just return. 1627 * If this isn't a cancel buffer item, then just return.
1631 */ 1628 */
1632 if (!(flags & XFS_BLF_CANCEL)) { 1629 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1633 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1630 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1634 return; 1631 return 0;
1635 }
1636
1637 /*
1638 * Insert an xfs_buf_cancel record into the hash table of
1639 * them. If there is already an identical record, bump
1640 * its reference count.
1641 */
1642 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1643 XLOG_BC_TABLE_SIZE];
1644 /*
1645 * If the hash bucket is empty then just insert a new record into
1646 * the bucket.
1647 */
1648 if (*bucket == NULL) {
1649 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1650 KM_SLEEP);
1651 bcp->bc_blkno = blkno;
1652 bcp->bc_len = len;
1653 bcp->bc_refcount = 1;
1654 bcp->bc_next = NULL;
1655 *bucket = bcp;
1656 return;
1657 } 1632 }
1658 1633
1659 /* 1634 /*
1660 * The hash bucket is not empty, so search for duplicates of our 1635 * Insert an xfs_buf_cancel record into the hash table.
1661 * record. If we find one them just bump its refcount. If not 1636 * If there is already an identical record, bump its reference count.
1662 * then add us at the end of the list.
1663 */ 1637 */
1664 prevp = NULL; 1638 bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1665 nextp = *bucket; 1639 list_for_each_entry(bcp, bucket, bc_list) {
1666 while (nextp != NULL) { 1640 if (bcp->bc_blkno == buf_f->blf_blkno &&
1667 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1641 bcp->bc_len == buf_f->blf_len) {
1668 nextp->bc_refcount++; 1642 bcp->bc_refcount++;
1669 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); 1643 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1670 return; 1644 return 0;
1671 } 1645 }
1672 prevp = nextp; 1646 }
1673 nextp = nextp->bc_next; 1647
1674 } 1648 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1675 ASSERT(prevp != NULL); 1649 bcp->bc_blkno = buf_f->blf_blkno;
1676 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1650 bcp->bc_len = buf_f->blf_len;
1677 KM_SLEEP);
1678 bcp->bc_blkno = blkno;
1679 bcp->bc_len = len;
1680 bcp->bc_refcount = 1; 1651 bcp->bc_refcount = 1;
1681 bcp->bc_next = NULL; 1652 list_add_tail(&bcp->bc_list, bucket);
1682 prevp->bc_next = bcp; 1653
1683 trace_xfs_log_recover_buf_cancel_add(log, buf_f); 1654 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1655 return 0;
1684} 1656}
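
The cancel table that pass 1 fills above is a fixed array of list_head buckets indexed by block number modulo the table size (the XLOG_BUF_CANCEL_BUCKET() macro from the header hunk earlier), so duplicate records for one (blkno, len) pair are found with a single bucket walk. A runnable user-space model of the bucket selection, with the table size invented:

    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_BC_TABLE_SIZE 64    /* stand-in for XLOG_BC_TABLE_SIZE */

    /* models XLOG_BUF_CANCEL_BUCKET(): 64-bit block number, modulo hash */
    static unsigned int demo_cancel_bucket(uint64_t blkno)
    {
        return blkno % DEMO_BC_TABLE_SIZE;
    }

    int main(void)
    {
        /* 8 and 72 collide into bucket 8: this is why the bucket walk
         * still compares blkno and len before bumping a refcount */
        uint64_t blknos[] = { 8, 72, 1000003 };
        unsigned int i;

        for (i = 0; i < 3; i++)
            printf("blkno %llu -> bucket %u\n",
                   (unsigned long long)blknos[i],
                   demo_cancel_bucket(blknos[i]));
        return 0;
    }
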
1685 1657
1686/* 1658/*
@@ -1698,14 +1670,13 @@ xlog_recover_do_buffer_pass1(
1698 */ 1670 */
1699STATIC int 1671STATIC int
1700xlog_check_buffer_cancelled( 1672xlog_check_buffer_cancelled(
1701 xlog_t *log, 1673 struct log *log,
1702 xfs_daddr_t blkno, 1674 xfs_daddr_t blkno,
1703 uint len, 1675 uint len,
1704 ushort flags) 1676 ushort flags)
1705{ 1677{
1706 xfs_buf_cancel_t *bcp; 1678 struct list_head *bucket;
1707 xfs_buf_cancel_t *prevp; 1679 struct xfs_buf_cancel *bcp;
1708 xfs_buf_cancel_t **bucket;
1709 1680
1710 if (log->l_buf_cancel_table == NULL) { 1681 if (log->l_buf_cancel_table == NULL) {
1711 /* 1682 /*
@@ -1716,128 +1687,70 @@ xlog_check_buffer_cancelled(
1716 return 0; 1687 return 0;
1717 } 1688 }
1718 1689
1719 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1720 XLOG_BC_TABLE_SIZE];
1721 bcp = *bucket;
1722 if (bcp == NULL) {
1723 /*
1724 * There is no corresponding entry in the table built
1725 * in pass one, so this buffer has not been cancelled.
1726 */
1727 ASSERT(!(flags & XFS_BLF_CANCEL));
1728 return 0;
1729 }
1730
1731 /* 1690 /*
1732 * Search for an entry in the buffer cancel table that 1691 * Search for an entry in the cancel table that matches our buffer.
1733 * matches our buffer.
1734 */ 1692 */
1735 prevp = NULL; 1693 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1736 while (bcp != NULL) { 1694 list_for_each_entry(bcp, bucket, bc_list) {
1737 if (bcp->bc_blkno == blkno && bcp->bc_len == len) { 1695 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1738 /* 1696 goto found;
1739 * We've got a match, so return 1 so that the
1740 * recovery of this buffer is cancelled.
1741 * If this buffer is actually a buffer cancel
1742 * log item, then decrement the refcount on the
1743 * one in the table and remove it if this is the
1744 * last reference.
1745 */
1746 if (flags & XFS_BLF_CANCEL) {
1747 bcp->bc_refcount--;
1748 if (bcp->bc_refcount == 0) {
1749 if (prevp == NULL) {
1750 *bucket = bcp->bc_next;
1751 } else {
1752 prevp->bc_next = bcp->bc_next;
1753 }
1754 kmem_free(bcp);
1755 }
1756 }
1757 return 1;
1758 }
1759 prevp = bcp;
1760 bcp = bcp->bc_next;
1761 } 1697 }
1698
1762 /* 1699 /*
1763 * We didn't find a corresponding entry in the table, so 1700 * We didn't find a corresponding entry in the table, so return 0 so
1764 * return 0 so that the buffer is NOT cancelled. 1701 * that the buffer is NOT cancelled.
1765 */ 1702 */
1766 ASSERT(!(flags & XFS_BLF_CANCEL)); 1703 ASSERT(!(flags & XFS_BLF_CANCEL));
1767 return 0; 1704 return 0;
1768}
1769 1705
1770STATIC int 1706found:
1771xlog_recover_do_buffer_pass2( 1707 /*
1772 xlog_t *log, 1708 * We've got a match, so return 1 so that the recovery of this buffer
1773 xfs_buf_log_format_t *buf_f) 1709 * is cancelled. If this buffer is actually a buffer cancel log
1774{ 1710 * item, then decrement the refcount on the one in the table and
1775 xfs_daddr_t blkno = 0; 1711 * remove it if this is the last reference.
1776 ushort flags = 0; 1712 */
1777 uint len = 0; 1713 if (flags & XFS_BLF_CANCEL) {
1778 1714 if (--bcp->bc_refcount == 0) {
1779 switch (buf_f->blf_type) { 1715 list_del(&bcp->bc_list);
1780 case XFS_LI_BUF: 1716 kmem_free(bcp);
1781 blkno = buf_f->blf_blkno; 1717 }
1782 flags = buf_f->blf_flags;
1783 len = buf_f->blf_len;
1784 break;
1785 } 1718 }
1786 1719 return 1;
1787 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1788} 1720}
1789 1721
1790/* 1722/*
1791 * Perform recovery for a buffer full of inodes. In these buffers, 1723 * Perform recovery for a buffer full of inodes. In these buffers, the only
1792 * the only data which should be recovered is that which corresponds 1724 * data which should be recovered is that which corresponds to the
1793 * to the di_next_unlinked pointers in the on disk inode structures. 1725 * di_next_unlinked pointers in the on disk inode structures. The rest of the
1794 * The rest of the data for the inodes is always logged through the 1726 * data for the inodes is always logged through the inodes themselves rather
1795 * inodes themselves rather than the inode buffer and is recovered 1727 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1796 * in xlog_recover_do_inode_trans().
1797 * 1728 *
1798 * The only time when buffers full of inodes are fully recovered is 1729 * The only time when buffers full of inodes are fully recovered is when the
1799 * when the buffer is full of newly allocated inodes. In this case 1730 * buffer is full of newly allocated inodes. In this case the buffer will
1800 * the buffer will not be marked as an inode buffer and so will be 1731 * not be marked as an inode buffer and so will be sent to
1801 * sent to xlog_recover_do_reg_buffer() below during recovery. 1732 * xlog_recover_do_reg_buffer() below during recovery.
1802 */ 1733 */
1803STATIC int 1734STATIC int
1804xlog_recover_do_inode_buffer( 1735xlog_recover_do_inode_buffer(
1805 xfs_mount_t *mp, 1736 struct xfs_mount *mp,
1806 xlog_recover_item_t *item, 1737 xlog_recover_item_t *item,
1807 xfs_buf_t *bp, 1738 struct xfs_buf *bp,
1808 xfs_buf_log_format_t *buf_f) 1739 xfs_buf_log_format_t *buf_f)
1809{ 1740{
1810 int i; 1741 int i;
1811 int item_index; 1742 int item_index = 0;
1812 int bit; 1743 int bit = 0;
1813 int nbits; 1744 int nbits = 0;
1814 int reg_buf_offset; 1745 int reg_buf_offset = 0;
1815 int reg_buf_bytes; 1746 int reg_buf_bytes = 0;
1816 int next_unlinked_offset; 1747 int next_unlinked_offset;
1817 int inodes_per_buf; 1748 int inodes_per_buf;
1818 xfs_agino_t *logged_nextp; 1749 xfs_agino_t *logged_nextp;
1819 xfs_agino_t *buffer_nextp; 1750 xfs_agino_t *buffer_nextp;
1820 unsigned int *data_map = NULL;
1821 unsigned int map_size = 0;
1822 1751
1823 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 1752 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1824 1753
1825 switch (buf_f->blf_type) {
1826 case XFS_LI_BUF:
1827 data_map = buf_f->blf_data_map;
1828 map_size = buf_f->blf_map_size;
1829 break;
1830 }
1831 /*
1832 * Set the variables corresponding to the current region to
1833 * 0 so that we'll initialize them on the first pass through
1834 * the loop.
1835 */
1836 reg_buf_offset = 0;
1837 reg_buf_bytes = 0;
1838 bit = 0;
1839 nbits = 0;
1840 item_index = 0;
1841 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; 1754 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1842 for (i = 0; i < inodes_per_buf; i++) { 1755 for (i = 0; i < inodes_per_buf; i++) {
1843 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 1756 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1852,18 +1765,18 @@ xlog_recover_do_inode_buffer(
1852 * the current di_next_unlinked field. 1765 * the current di_next_unlinked field.
1853 */ 1766 */
1854 bit += nbits; 1767 bit += nbits;
1855 bit = xfs_next_bit(data_map, map_size, bit); 1768 bit = xfs_next_bit(buf_f->blf_data_map,
1769 buf_f->blf_map_size, bit);
1856 1770
1857 /* 1771 /*
1858 * If there are no more logged regions in the 1772 * If there are no more logged regions in the
1859 * buffer, then we're done. 1773 * buffer, then we're done.
1860 */ 1774 */
1861 if (bit == -1) { 1775 if (bit == -1)
1862 return 0; 1776 return 0;
1863 }
1864 1777
1865 nbits = xfs_contig_bits(data_map, map_size, 1778 nbits = xfs_contig_bits(buf_f->blf_data_map,
1866 bit); 1779 buf_f->blf_map_size, bit);
1867 ASSERT(nbits > 0); 1780 ASSERT(nbits > 0);
1868 reg_buf_offset = bit << XFS_BLF_SHIFT; 1781 reg_buf_offset = bit << XFS_BLF_SHIFT;
1869 reg_buf_bytes = nbits << XFS_BLF_SHIFT; 1782 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
@@ -1875,9 +1788,8 @@ xlog_recover_do_inode_buffer(
1875 * di_next_unlinked field, then move on to the next 1788 * di_next_unlinked field, then move on to the next
1876 * di_next_unlinked field. 1789 * di_next_unlinked field.
1877 */ 1790 */
1878 if (next_unlinked_offset < reg_buf_offset) { 1791 if (next_unlinked_offset < reg_buf_offset)
1879 continue; 1792 continue;
1880 }
1881 1793
1882 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1794 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1883 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 1795 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
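For reference, the arithmetic driving this loop: the buffer holds XFS_BUF_COUNT(bp) >> sb_inodelog inodes, each sb_inodesize bytes long, and the only byte range worth recovering in inode i is its di_next_unlinked field. A hedged restatement of the offset computation, using offsetof on a trimmed-down stand-in for the on-disk inode:

#include <stddef.h>

struct disk_inode {			/* heavily trimmed stand-in */
	unsigned short	di_magic;
	unsigned char	di_version;
	/* ... many fields elided ... */
	unsigned int	di_next_unlinked;	/* unlinked-list pointer */
};

/* Byte offset of inode i's di_next_unlinked field within the buffer. */
static size_t next_unlinked_offset(int i, size_t inodesize)
{
	return (size_t)i * inodesize +
	       offsetof(struct disk_inode, di_next_unlinked);
}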
@@ -1913,36 +1825,29 @@ xlog_recover_do_inode_buffer(
1913 * given buffer. The bitmap in the buf log format structure indicates 1825 * given buffer. The bitmap in the buf log format structure indicates
1914 * where to place the logged data. 1826 * where to place the logged data.
1915 */ 1827 */
1916/*ARGSUSED*/
1917STATIC void 1828STATIC void
1918xlog_recover_do_reg_buffer( 1829xlog_recover_do_reg_buffer(
1919 struct xfs_mount *mp, 1830 struct xfs_mount *mp,
1920 xlog_recover_item_t *item, 1831 xlog_recover_item_t *item,
1921 xfs_buf_t *bp, 1832 struct xfs_buf *bp,
1922 xfs_buf_log_format_t *buf_f) 1833 xfs_buf_log_format_t *buf_f)
1923{ 1834{
1924 int i; 1835 int i;
1925 int bit; 1836 int bit;
1926 int nbits; 1837 int nbits;
1927 unsigned int *data_map = NULL;
1928 unsigned int map_size = 0;
1929 int error; 1838 int error;
1930 1839
1931 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 1840 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1932 1841
1933 switch (buf_f->blf_type) {
1934 case XFS_LI_BUF:
1935 data_map = buf_f->blf_data_map;
1936 map_size = buf_f->blf_map_size;
1937 break;
1938 }
1939 bit = 0; 1842 bit = 0;
1940 i = 1; /* 0 is the buf format structure */ 1843 i = 1; /* 0 is the buf format structure */
1941 while (1) { 1844 while (1) {
1942 bit = xfs_next_bit(data_map, map_size, bit); 1845 bit = xfs_next_bit(buf_f->blf_data_map,
1846 buf_f->blf_map_size, bit);
1943 if (bit == -1) 1847 if (bit == -1)
1944 break; 1848 break;
1945 nbits = xfs_contig_bits(data_map, map_size, bit); 1849 nbits = xfs_contig_bits(buf_f->blf_data_map,
1850 buf_f->blf_map_size, bit);
1946 ASSERT(nbits > 0); 1851 ASSERT(nbits > 0);
1947 ASSERT(item->ri_buf[i].i_addr != NULL); 1852 ASSERT(item->ri_buf[i].i_addr != NULL);
1948 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 1853 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
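Both the inode-buffer path above and this regular-buffer loop walk the dirty map the same way: find the next set bit, measure the contiguous run of set bits, and shift by XFS_BLF_SHIFT to turn chunk counts into byte offsets and lengths (each bit covers one 128-byte chunk). A sketch of that iteration using the generic kernel bitmap helpers in place of the XFS-private xfs_next_bit()/xfs_contig_bits():

#include <linux/bitops.h>

#define CHUNK_SHIFT	7	/* 128-byte chunks, as XFS_BLF_SHIFT */

static void walk_dirty_regions(const unsigned long *map, unsigned long nbits)
{
	unsigned long bit = 0;

	while ((bit = find_next_bit(map, nbits, bit)) < nbits) {
		unsigned long end = find_next_zero_bit(map, nbits, bit);
		unsigned long offset = bit << CHUNK_SHIFT;	/* bytes */
		unsigned long bytes = (end - bit) << CHUNK_SHIFT;

		/* copy 'bytes' of logged data into the buffer at 'offset' */
		bit = end;
	}
}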
@@ -2176,77 +2081,46 @@ xlog_recover_do_dquot_buffer(
2176 * for more details on the implementation of the table of cancel records. 2081 * for more details on the implementation of the table of cancel records.
2177 */ 2082 */
2178STATIC int 2083STATIC int
2179xlog_recover_do_buffer_trans( 2084xlog_recover_buffer_pass2(
2180 xlog_t *log, 2085 xlog_t *log,
2181 xlog_recover_item_t *item, 2086 xlog_recover_item_t *item)
2182 int pass)
2183{ 2087{
2184 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2088 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2185 xfs_mount_t *mp; 2089 xfs_mount_t *mp = log->l_mp;
2186 xfs_buf_t *bp; 2090 xfs_buf_t *bp;
2187 int error; 2091 int error;
2188 int cancel;
2189 xfs_daddr_t blkno;
2190 int len;
2191 ushort flags;
2192 uint buf_flags; 2092 uint buf_flags;
2193 2093
2194 if (pass == XLOG_RECOVER_PASS1) { 2094 /*
2195 /* 2095 * In this pass we only want to recover all the buffers which have
2196 * In this pass we're only looking for buf items 2096 * not been cancelled and are not cancellation buffers themselves.
2197 * with the XFS_BLF_CANCEL bit set. 2097 */
2198 */ 2098 if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2199 xlog_recover_do_buffer_pass1(log, buf_f); 2099 buf_f->blf_len, buf_f->blf_flags)) {
2100 trace_xfs_log_recover_buf_cancel(log, buf_f);
2200 return 0; 2101 return 0;
2201 } else {
2202 /*
2203 * In this pass we want to recover all the buffers
2204 * which have not been cancelled and are not
2205 * cancellation buffers themselves. The routine
2206 * we call here will tell us whether or not to
2207 * continue with the replay of this buffer.
2208 */
2209 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2210 if (cancel) {
2211 trace_xfs_log_recover_buf_cancel(log, buf_f);
2212 return 0;
2213 }
2214 } 2102 }
2103
2215 trace_xfs_log_recover_buf_recover(log, buf_f); 2104 trace_xfs_log_recover_buf_recover(log, buf_f);
2216 switch (buf_f->blf_type) {
2217 case XFS_LI_BUF:
2218 blkno = buf_f->blf_blkno;
2219 len = buf_f->blf_len;
2220 flags = buf_f->blf_flags;
2221 break;
2222 default:
2223 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2224 "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2225 buf_f->blf_type, log->l_mp->m_logname ?
2226 log->l_mp->m_logname : "internal");
2227 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2228 XFS_ERRLEVEL_LOW, log->l_mp);
2229 return XFS_ERROR(EFSCORRUPTED);
2230 }
2231 2105
2232 mp = log->l_mp;
2233 buf_flags = XBF_LOCK; 2106 buf_flags = XBF_LOCK;
2234 if (!(flags & XFS_BLF_INODE_BUF)) 2107 if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
2235 buf_flags |= XBF_MAPPED; 2108 buf_flags |= XBF_MAPPED;
2236 2109
2237 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2110 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2111 buf_flags);
2238 if (XFS_BUF_ISERROR(bp)) { 2112 if (XFS_BUF_ISERROR(bp)) {
2239 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, 2113 xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
2240 bp, blkno); 2114 bp, buf_f->blf_blkno);
2241 error = XFS_BUF_GETERROR(bp); 2115 error = XFS_BUF_GETERROR(bp);
2242 xfs_buf_relse(bp); 2116 xfs_buf_relse(bp);
2243 return error; 2117 return error;
2244 } 2118 }
2245 2119
2246 error = 0; 2120 error = 0;
2247 if (flags & XFS_BLF_INODE_BUF) { 2121 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2248 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2122 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2249 } else if (flags & 2123 } else if (buf_f->blf_flags &
2250 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2124 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2251 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2125 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2252 } else { 2126 } else {
@@ -2286,16 +2160,14 @@ xlog_recover_do_buffer_trans(
2286} 2160}
2287 2161
2288STATIC int 2162STATIC int
2289xlog_recover_do_inode_trans( 2163xlog_recover_inode_pass2(
2290 xlog_t *log, 2164 xlog_t *log,
2291 xlog_recover_item_t *item, 2165 xlog_recover_item_t *item)
2292 int pass)
2293{ 2166{
2294 xfs_inode_log_format_t *in_f; 2167 xfs_inode_log_format_t *in_f;
2295 xfs_mount_t *mp; 2168 xfs_mount_t *mp = log->l_mp;
2296 xfs_buf_t *bp; 2169 xfs_buf_t *bp;
2297 xfs_dinode_t *dip; 2170 xfs_dinode_t *dip;
2298 xfs_ino_t ino;
2299 int len; 2171 int len;
2300 xfs_caddr_t src; 2172 xfs_caddr_t src;
2301 xfs_caddr_t dest; 2173 xfs_caddr_t dest;
@@ -2305,10 +2177,6 @@ xlog_recover_do_inode_trans(
2305 xfs_icdinode_t *dicp; 2177 xfs_icdinode_t *dicp;
2306 int need_free = 0; 2178 int need_free = 0;
2307 2179
2308 if (pass == XLOG_RECOVER_PASS1) {
2309 return 0;
2310 }
2311
2312 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2180 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2313 in_f = item->ri_buf[0].i_addr; 2181 in_f = item->ri_buf[0].i_addr;
2314 } else { 2182 } else {
@@ -2318,8 +2186,6 @@ xlog_recover_do_inode_trans(
2318 if (error) 2186 if (error)
2319 goto error; 2187 goto error;
2320 } 2188 }
2321 ino = in_f->ilf_ino;
2322 mp = log->l_mp;
2323 2189
2324 /* 2190 /*
2325 * Inode buffers can be freed, look out for it, 2191 * Inode buffers can be freed, look out for it,
@@ -2354,8 +2220,8 @@ xlog_recover_do_inode_trans(
2354 xfs_buf_relse(bp); 2220 xfs_buf_relse(bp);
2355 xfs_fs_cmn_err(CE_ALERT, mp, 2221 xfs_fs_cmn_err(CE_ALERT, mp,
2356 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2222 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2357 dip, bp, ino); 2223 dip, bp, in_f->ilf_ino);
2358 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", 2224 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2359 XFS_ERRLEVEL_LOW, mp); 2225 XFS_ERRLEVEL_LOW, mp);
2360 error = EFSCORRUPTED; 2226 error = EFSCORRUPTED;
2361 goto error; 2227 goto error;
@@ -2365,8 +2231,8 @@ xlog_recover_do_inode_trans(
2365 xfs_buf_relse(bp); 2231 xfs_buf_relse(bp);
2366 xfs_fs_cmn_err(CE_ALERT, mp, 2232 xfs_fs_cmn_err(CE_ALERT, mp,
2367 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2233 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2368 item, ino); 2234 item, in_f->ilf_ino);
2369 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", 2235 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2370 XFS_ERRLEVEL_LOW, mp); 2236 XFS_ERRLEVEL_LOW, mp);
2371 error = EFSCORRUPTED; 2237 error = EFSCORRUPTED;
2372 goto error; 2238 goto error;
@@ -2394,12 +2260,12 @@ xlog_recover_do_inode_trans(
2394 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { 2260 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2395 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2261 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2396 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2262 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2397 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", 2263 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2398 XFS_ERRLEVEL_LOW, mp, dicp); 2264 XFS_ERRLEVEL_LOW, mp, dicp);
2399 xfs_buf_relse(bp); 2265 xfs_buf_relse(bp);
2400 xfs_fs_cmn_err(CE_ALERT, mp, 2266 xfs_fs_cmn_err(CE_ALERT, mp,
2401 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2267 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2402 item, dip, bp, ino); 2268 item, dip, bp, in_f->ilf_ino);
2403 error = EFSCORRUPTED; 2269 error = EFSCORRUPTED;
2404 goto error; 2270 goto error;
2405 } 2271 }
@@ -2407,40 +2273,40 @@ xlog_recover_do_inode_trans(
2407 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2273 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2408 (dicp->di_format != XFS_DINODE_FMT_BTREE) && 2274 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2409 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2275 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2410 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", 2276 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2411 XFS_ERRLEVEL_LOW, mp, dicp); 2277 XFS_ERRLEVEL_LOW, mp, dicp);
2412 xfs_buf_relse(bp); 2278 xfs_buf_relse(bp);
2413 xfs_fs_cmn_err(CE_ALERT, mp, 2279 xfs_fs_cmn_err(CE_ALERT, mp,
2414 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2280 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2415 item, dip, bp, ino); 2281 item, dip, bp, in_f->ilf_ino);
2416 error = EFSCORRUPTED; 2282 error = EFSCORRUPTED;
2417 goto error; 2283 goto error;
2418 } 2284 }
2419 } 2285 }
2420 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2286 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2421 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", 2287 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2422 XFS_ERRLEVEL_LOW, mp, dicp); 2288 XFS_ERRLEVEL_LOW, mp, dicp);
2423 xfs_buf_relse(bp); 2289 xfs_buf_relse(bp);
2424 xfs_fs_cmn_err(CE_ALERT, mp, 2290 xfs_fs_cmn_err(CE_ALERT, mp,
2425 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2291 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2426 item, dip, bp, ino, 2292 item, dip, bp, in_f->ilf_ino,
2427 dicp->di_nextents + dicp->di_anextents, 2293 dicp->di_nextents + dicp->di_anextents,
2428 dicp->di_nblocks); 2294 dicp->di_nblocks);
2429 error = EFSCORRUPTED; 2295 error = EFSCORRUPTED;
2430 goto error; 2296 goto error;
2431 } 2297 }
2432 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2298 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2433 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", 2299 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2434 XFS_ERRLEVEL_LOW, mp, dicp); 2300 XFS_ERRLEVEL_LOW, mp, dicp);
2435 xfs_buf_relse(bp); 2301 xfs_buf_relse(bp);
2436 xfs_fs_cmn_err(CE_ALERT, mp, 2302 xfs_fs_cmn_err(CE_ALERT, mp,
2437 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2303 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2438 item, dip, bp, ino, dicp->di_forkoff); 2304 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2439 error = EFSCORRUPTED; 2305 error = EFSCORRUPTED;
2440 goto error; 2306 goto error;
2441 } 2307 }
2442 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { 2308 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2443 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2309 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2444 XFS_ERRLEVEL_LOW, mp, dicp); 2310 XFS_ERRLEVEL_LOW, mp, dicp);
2445 xfs_buf_relse(bp); 2311 xfs_buf_relse(bp);
2446 xfs_fs_cmn_err(CE_ALERT, mp, 2312 xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2532,7 +2398,7 @@ xlog_recover_do_inode_trans(
2532 break; 2398 break;
2533 2399
2534 default: 2400 default:
2535 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); 2401 xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag");
2536 ASSERT(0); 2402 ASSERT(0);
2537 xfs_buf_relse(bp); 2403 xfs_buf_relse(bp);
2538 error = EIO; 2404 error = EIO;
@@ -2556,18 +2422,11 @@ error:
2556 * of that type. 2422 * of that type.
2557 */ 2423 */
2558STATIC int 2424STATIC int
2559xlog_recover_do_quotaoff_trans( 2425xlog_recover_quotaoff_pass1(
2560 xlog_t *log, 2426 xlog_t *log,
2561 xlog_recover_item_t *item, 2427 xlog_recover_item_t *item)
2562 int pass)
2563{ 2428{
2564 xfs_qoff_logformat_t *qoff_f; 2429 xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
2565
2566 if (pass == XLOG_RECOVER_PASS2) {
2567 return (0);
2568 }
2569
2570 qoff_f = item->ri_buf[0].i_addr;
2571 ASSERT(qoff_f); 2430 ASSERT(qoff_f);
2572 2431
2573 /* 2432 /*
@@ -2588,22 +2447,17 @@ xlog_recover_do_quotaoff_trans(
2588 * Recover a dquot record 2447 * Recover a dquot record
2589 */ 2448 */
2590STATIC int 2449STATIC int
2591xlog_recover_do_dquot_trans( 2450xlog_recover_dquot_pass2(
2592 xlog_t *log, 2451 xlog_t *log,
2593 xlog_recover_item_t *item, 2452 xlog_recover_item_t *item)
2594 int pass)
2595{ 2453{
2596 xfs_mount_t *mp; 2454 xfs_mount_t *mp = log->l_mp;
2597 xfs_buf_t *bp; 2455 xfs_buf_t *bp;
2598 struct xfs_disk_dquot *ddq, *recddq; 2456 struct xfs_disk_dquot *ddq, *recddq;
2599 int error; 2457 int error;
2600 xfs_dq_logformat_t *dq_f; 2458 xfs_dq_logformat_t *dq_f;
2601 uint type; 2459 uint type;
2602 2460
2603 if (pass == XLOG_RECOVER_PASS1) {
2604 return 0;
2605 }
2606 mp = log->l_mp;
2607 2461
2608 /* 2462 /*
2609 * Filesystems are required to send in quota flags at mount time. 2463 * Filesystems are required to send in quota flags at mount time.
@@ -2647,7 +2501,7 @@ xlog_recover_do_dquot_trans(
2647 if ((error = xfs_qm_dqcheck(recddq, 2501 if ((error = xfs_qm_dqcheck(recddq,
2648 dq_f->qlf_id, 2502 dq_f->qlf_id,
2649 0, XFS_QMOPT_DOWARN, 2503 0, XFS_QMOPT_DOWARN,
2650 "xlog_recover_do_dquot_trans (log copy)"))) { 2504 "xlog_recover_dquot_pass2 (log copy)"))) {
2651 return XFS_ERROR(EIO); 2505 return XFS_ERROR(EIO);
2652 } 2506 }
2653 ASSERT(dq_f->qlf_len == 1); 2507 ASSERT(dq_f->qlf_len == 1);
@@ -2670,7 +2524,7 @@ xlog_recover_do_dquot_trans(
2670 * minimal initialization then. 2524 * minimal initialization then.
2671 */ 2525 */
2672 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2526 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2673 "xlog_recover_do_dquot_trans")) { 2527 "xlog_recover_dquot_pass2")) {
2674 xfs_buf_relse(bp); 2528 xfs_buf_relse(bp);
2675 return XFS_ERROR(EIO); 2529 return XFS_ERROR(EIO);
2676 } 2530 }
@@ -2693,38 +2547,31 @@ xlog_recover_do_dquot_trans(
2693 * LSN. 2547 * LSN.
2694 */ 2548 */
2695STATIC int 2549STATIC int
2696xlog_recover_do_efi_trans( 2550xlog_recover_efi_pass2(
2697 xlog_t *log, 2551 xlog_t *log,
2698 xlog_recover_item_t *item, 2552 xlog_recover_item_t *item,
2699 xfs_lsn_t lsn, 2553 xfs_lsn_t lsn)
2700 int pass)
2701{ 2554{
2702 int error; 2555 int error;
2703 xfs_mount_t *mp; 2556 xfs_mount_t *mp = log->l_mp;
2704 xfs_efi_log_item_t *efip; 2557 xfs_efi_log_item_t *efip;
2705 xfs_efi_log_format_t *efi_formatp; 2558 xfs_efi_log_format_t *efi_formatp;
2706 2559
2707 if (pass == XLOG_RECOVER_PASS1) {
2708 return 0;
2709 }
2710
2711 efi_formatp = item->ri_buf[0].i_addr; 2560 efi_formatp = item->ri_buf[0].i_addr;
2712 2561
2713 mp = log->l_mp;
2714 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2562 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2715 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), 2563 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2716 &(efip->efi_format)))) { 2564 &(efip->efi_format)))) {
2717 xfs_efi_item_free(efip); 2565 xfs_efi_item_free(efip);
2718 return error; 2566 return error;
2719 } 2567 }
2720 efip->efi_next_extent = efi_formatp->efi_nextents; 2568 atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2721 efip->efi_flags |= XFS_EFI_COMMITTED;
2722 2569
2723 spin_lock(&log->l_ailp->xa_lock); 2570 spin_lock(&log->l_ailp->xa_lock);
2724 /* 2571 /*
2725 * xfs_trans_ail_update() drops the AIL lock. 2572 * xfs_trans_ail_update() drops the AIL lock.
2726 */ 2573 */
2727 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); 2574 xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2728 return 0; 2575 return 0;
2729} 2576}
2730 2577
@@ -2737,11 +2584,10 @@ xlog_recover_do_efi_trans(
2737 * efd format structure. If we find it, we remove the efi from the 2584 * efd format structure. If we find it, we remove the efi from the
2738 * AIL and free it. 2585 * AIL and free it.
2739 */ 2586 */
2740STATIC void 2587STATIC int
2741xlog_recover_do_efd_trans( 2588xlog_recover_efd_pass2(
2742 xlog_t *log, 2589 xlog_t *log,
2743 xlog_recover_item_t *item, 2590 xlog_recover_item_t *item)
2744 int pass)
2745{ 2591{
2746 xfs_efd_log_format_t *efd_formatp; 2592 xfs_efd_log_format_t *efd_formatp;
2747 xfs_efi_log_item_t *efip = NULL; 2593 xfs_efi_log_item_t *efip = NULL;
@@ -2750,10 +2596,6 @@ xlog_recover_do_efd_trans(
2750 struct xfs_ail_cursor cur; 2596 struct xfs_ail_cursor cur;
2751 struct xfs_ail *ailp = log->l_ailp; 2597 struct xfs_ail *ailp = log->l_ailp;
2752 2598
2753 if (pass == XLOG_RECOVER_PASS1) {
2754 return;
2755 }
2756
2757 efd_formatp = item->ri_buf[0].i_addr; 2599 efd_formatp = item->ri_buf[0].i_addr;
2758 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2600 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2759 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2601 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
@@ -2785,62 +2627,6 @@ xlog_recover_do_efd_trans(
2785 } 2627 }
2786 xfs_trans_ail_cursor_done(ailp, &cur); 2628 xfs_trans_ail_cursor_done(ailp, &cur);
2787 spin_unlock(&ailp->xa_lock); 2629 spin_unlock(&ailp->xa_lock);
2788}
2789
2790/*
2791 * Perform the transaction
2792 *
2793 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2794 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2795 */
2796STATIC int
2797xlog_recover_do_trans(
2798 xlog_t *log,
2799 xlog_recover_t *trans,
2800 int pass)
2801{
2802 int error = 0;
2803 xlog_recover_item_t *item;
2804
2805 error = xlog_recover_reorder_trans(log, trans, pass);
2806 if (error)
2807 return error;
2808
2809 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2810 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2811 switch (ITEM_TYPE(item)) {
2812 case XFS_LI_BUF:
2813 error = xlog_recover_do_buffer_trans(log, item, pass);
2814 break;
2815 case XFS_LI_INODE:
2816 error = xlog_recover_do_inode_trans(log, item, pass);
2817 break;
2818 case XFS_LI_EFI:
2819 error = xlog_recover_do_efi_trans(log, item,
2820 trans->r_lsn, pass);
2821 break;
2822 case XFS_LI_EFD:
2823 xlog_recover_do_efd_trans(log, item, pass);
2824 error = 0;
2825 break;
2826 case XFS_LI_DQUOT:
2827 error = xlog_recover_do_dquot_trans(log, item, pass);
2828 break;
2829 case XFS_LI_QUOTAOFF:
2830 error = xlog_recover_do_quotaoff_trans(log, item,
2831 pass);
2832 break;
2833 default:
2834 xlog_warn(
2835 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2836 ASSERT(0);
2837 error = XFS_ERROR(EIO);
2838 break;
2839 }
2840
2841 if (error)
2842 return error;
2843 }
2844 2630
2845 return 0; 2631 return 0;
2846} 2632}
@@ -2852,7 +2638,7 @@ xlog_recover_do_trans(
2852 */ 2638 */
2853STATIC void 2639STATIC void
2854xlog_recover_free_trans( 2640xlog_recover_free_trans(
2855 xlog_recover_t *trans) 2641 struct xlog_recover *trans)
2856{ 2642{
2857 xlog_recover_item_t *item, *n; 2643 xlog_recover_item_t *item, *n;
2858 int i; 2644 int i;
@@ -2871,17 +2657,95 @@ xlog_recover_free_trans(
2871} 2657}
2872 2658
2873STATIC int 2659STATIC int
2660xlog_recover_commit_pass1(
2661 struct log *log,
2662 struct xlog_recover *trans,
2663 xlog_recover_item_t *item)
2664{
2665 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
2666
2667 switch (ITEM_TYPE(item)) {
2668 case XFS_LI_BUF:
2669 return xlog_recover_buffer_pass1(log, item);
2670 case XFS_LI_QUOTAOFF:
2671 return xlog_recover_quotaoff_pass1(log, item);
2672 case XFS_LI_INODE:
2673 case XFS_LI_EFI:
2674 case XFS_LI_EFD:
2675 case XFS_LI_DQUOT:
2676 /* nothing to do in pass 1 */
2677 return 0;
2678 default:
2679 xlog_warn(
2680 "XFS: invalid item type (%d) xlog_recover_commit_pass1",
2681 ITEM_TYPE(item));
2682 ASSERT(0);
2683 return XFS_ERROR(EIO);
2684 }
2685}
2686
2687STATIC int
2688xlog_recover_commit_pass2(
2689 struct log *log,
2690 struct xlog_recover *trans,
2691 xlog_recover_item_t *item)
2692{
2693 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2694
2695 switch (ITEM_TYPE(item)) {
2696 case XFS_LI_BUF:
2697 return xlog_recover_buffer_pass2(log, item);
2698 case XFS_LI_INODE:
2699 return xlog_recover_inode_pass2(log, item);
2700 case XFS_LI_EFI:
2701 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2702 case XFS_LI_EFD:
2703 return xlog_recover_efd_pass2(log, item);
2704 case XFS_LI_DQUOT:
2705 return xlog_recover_dquot_pass2(log, item);
2706 case XFS_LI_QUOTAOFF:
2707 /* nothing to do in pass2 */
2708 return 0;
2709 default:
2710 xlog_warn(
2711 "XFS: invalid item type (%d) xlog_recover_commit_pass2",
2712 ITEM_TYPE(item));
2713 ASSERT(0);
2714 return XFS_ERROR(EIO);
2715 }
2716}
2717
2718/*
2719 * Perform the transaction.
2720 *
2721 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2722 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2723 */
2724STATIC int
2874xlog_recover_commit_trans( 2725xlog_recover_commit_trans(
2875 xlog_t *log, 2726 struct log *log,
2876 xlog_recover_t *trans, 2727 struct xlog_recover *trans,
2877 int pass) 2728 int pass)
2878{ 2729{
2879 int error; 2730 int error = 0;
2731 xlog_recover_item_t *item;
2880 2732
2881 hlist_del(&trans->r_list); 2733 hlist_del(&trans->r_list);
2882 if ((error = xlog_recover_do_trans(log, trans, pass))) 2734
2735 error = xlog_recover_reorder_trans(log, trans, pass);
2736 if (error)
2883 return error; 2737 return error;
2884 xlog_recover_free_trans(trans); /* no error */ 2738
2739 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2740 if (pass == XLOG_RECOVER_PASS1)
2741 error = xlog_recover_commit_pass1(log, trans, item);
2742 else
2743 error = xlog_recover_commit_pass2(log, trans, item);
2744 if (error)
2745 return error;
2746 }
2747
2748 xlog_recover_free_trans(trans);
2885 return 0; 2749 return 0;
2886} 2750}
2887 2751
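The net effect of this restructuring: instead of one xlog_recover_do_trans() that threaded a pass argument into every handler (each of which began with an early return for the wrong pass), there is now one dispatcher per pass and the driver picks which to run. A skeletal restatement of that control flow, with the item list and handlers reduced to stubs:

enum { RECOVER_PASS1, RECOVER_PASS2 };

struct ritem {
	int		type;
	struct ritem	*next;
};

static int commit_pass1(struct ritem *item) { return 0; /* cancel tracking */ }
static int commit_pass2(struct ritem *item) { return 0; /* actual replay */ }

static int commit_trans(struct ritem *items, int pass)
{
	struct ritem *item;
	int error;

	for (item = items; item; item = item->next) {
		error = (pass == RECOVER_PASS1) ? commit_pass1(item)
						: commit_pass2(item);
		if (error)
			return error;	/* stop replay on first failure */
	}
	return 0;
}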
@@ -3011,7 +2875,7 @@ xlog_recover_process_efi(
3011 xfs_extent_t *extp; 2875 xfs_extent_t *extp;
3012 xfs_fsblock_t startblock_fsb; 2876 xfs_fsblock_t startblock_fsb;
3013 2877
3014 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); 2878 ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
3015 2879
3016 /* 2880 /*
3017 * First check the validity of the extents described by the 2881 * First check the validity of the extents described by the
@@ -3050,7 +2914,7 @@ xlog_recover_process_efi(
3050 extp->ext_len); 2914 extp->ext_len);
3051 } 2915 }
3052 2916
3053 efip->efi_flags |= XFS_EFI_RECOVERED; 2917 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3054 error = xfs_trans_commit(tp, 0); 2918 error = xfs_trans_commit(tp, 0);
3055 return error; 2919 return error;
3056 2920
@@ -3107,7 +2971,7 @@ xlog_recover_process_efis(
3107 * Skip EFIs that we've already processed. 2971 * Skip EFIs that we've already processed.
3108 */ 2972 */
3109 efip = (xfs_efi_log_item_t *)lip; 2973 efip = (xfs_efi_log_item_t *)lip;
3110 if (efip->efi_flags & XFS_EFI_RECOVERED) { 2974 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
3111 lip = xfs_trans_ail_cursor_next(ailp, &cur); 2975 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3112 continue; 2976 continue;
3113 } 2977 }
@@ -3724,7 +3588,7 @@ xlog_do_log_recovery(
3724 xfs_daddr_t head_blk, 3588 xfs_daddr_t head_blk,
3725 xfs_daddr_t tail_blk) 3589 xfs_daddr_t tail_blk)
3726{ 3590{
3727 int error; 3591 int error, i;
3728 3592
3729 ASSERT(head_blk != tail_blk); 3593 ASSERT(head_blk != tail_blk);
3730 3594
@@ -3732,10 +3596,12 @@ xlog_do_log_recovery(
3732 * First do a pass to find all of the cancelled buf log items. 3596 * First do a pass to find all of the cancelled buf log items.
3733 * Store them in the buf_cancel_table for use in the second pass. 3597 * Store them in the buf_cancel_table for use in the second pass.
3734 */ 3598 */
3735 log->l_buf_cancel_table = 3599 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3736 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * 3600 sizeof(struct list_head),
3737 sizeof(xfs_buf_cancel_t*),
3738 KM_SLEEP); 3601 KM_SLEEP);
3602 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3603 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3604
3739 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3605 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3740 XLOG_RECOVER_PASS1); 3606 XLOG_RECOVER_PASS1);
3741 if (error != 0) { 3607 if (error != 0) {
@@ -3754,7 +3620,7 @@ xlog_do_log_recovery(
3754 int i; 3620 int i;
3755 3621
3756 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 3622 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3757 ASSERT(log->l_buf_cancel_table[i] == NULL); 3623 ASSERT(list_empty(&log->l_buf_cancel_table[i]));
3758 } 3624 }
3759#endif /* DEBUG */ 3625#endif /* DEBUG */
3760 3626
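A side effect of moving the cancel table from xfs_buf_cancel_t pointer buckets to list_heads, visible in this hunk: the table can no longer rely on kmem_zalloc()'s zero fill, because a zero-filled list_head is not a valid empty list. Every bucket needs INIT_LIST_HEAD() before use, and the DEBUG-only emptiness check becomes list_empty() instead of a NULL comparison. A hedged sketch of that allocate/initialise cycle outside XFS (plain kmalloc in place of the kmem wrappers):

#include <linux/list.h>
#include <linux/slab.h>

#define TABLE_SIZE	64	/* stand-in for XLOG_BC_TABLE_SIZE */

static struct list_head *alloc_bucket_table(void)
{
	struct list_head *table;
	int i;

	table = kmalloc(TABLE_SIZE * sizeof(*table), GFP_KERNEL);
	if (!table)
		return NULL;
	/* a zeroed list_head is invalid; point each bucket at itself */
	for (i = 0; i < TABLE_SIZE; i++)
		INIT_LIST_HEAD(&table[i]);
	return table;
}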
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 19e9dfa1c254..d447aef84bc3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -472,7 +472,7 @@ xfs_initialize_perag(
472 goto out_unwind; 472 goto out_unwind;
473 pag->pag_agno = index; 473 pag->pag_agno = index;
474 pag->pag_mount = mp; 474 pag->pag_mount = mp;
475 rwlock_init(&pag->pag_ici_lock); 475 spin_lock_init(&pag->pag_ici_lock);
476 mutex_init(&pag->pag_ici_reclaim_lock); 476 mutex_init(&pag->pag_ici_reclaim_lock);
477 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); 477 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
478 spin_lock_init(&pag->pag_buf_lock); 478 spin_lock_init(&pag->pag_buf_lock);
@@ -975,6 +975,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
975} 975}
976 976
977/* 977/*
978 * precalculate the low space thresholds for dynamic speculative preallocation.
979 */
980void
981xfs_set_low_space_thresholds(
982 struct xfs_mount *mp)
983{
984 int i;
985
986 for (i = 0; i < XFS_LOWSP_MAX; i++) {
987 __uint64_t space = mp->m_sb.sb_dblocks;
988
989 do_div(space, 100);
990 mp->m_low_space[i] = space * (i + 1);
991 }
992}
993
994
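The loop stores m_low_space[i] = (i+1)% of sb_dblocks; do_div() divides the 64-bit block count in place (returning the remainder), which keeps the arithmetic safe on 32-bit platforms. As a worked example under a hypothetical geometry, a 2,000,000-block filesystem gets thresholds of 20,000 through 100,000 blocks for the 1%-5% levels:

/* Userspace restatement of the threshold precalculation above. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t dblocks = 2000000;	/* hypothetical fs size in blocks */
	int i;

	for (i = 0; i < 5; i++)
		printf("m_low_space[%d] = %llu blocks (%d%%)\n", i,
		       (unsigned long long)(dblocks / 100 * (i + 1)), i + 1);
	return 0;
}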
995/*
978 * Set whether we're using inode alignment. 996 * Set whether we're using inode alignment.
979 */ 997 */
980STATIC void 998STATIC void
@@ -1196,6 +1214,9 @@ xfs_mountfs(
1196 */ 1214 */
1197 xfs_set_rw_sizes(mp); 1215 xfs_set_rw_sizes(mp);
1198 1216
1217 /* set the low space thresholds for dynamic preallocation */
1218 xfs_set_low_space_thresholds(mp);
1219
1199 /* 1220 /*
1200 * Set the inode cluster size. 1221 * Set the inode cluster size.
1201 * This may still be overridden by the file system 1222 * This may still be overridden by the file system
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5861b4980740..a62e8971539d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -103,6 +103,16 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
103 xfs_mod_incore_sb(mp, field, delta, rsvd) 103 xfs_mod_incore_sb(mp, field, delta, rsvd)
104#endif 104#endif
105 105
106/* dynamic preallocation free space thresholds, 5% down to 1% */
107enum {
108 XFS_LOWSP_1_PCNT = 0,
109 XFS_LOWSP_2_PCNT,
110 XFS_LOWSP_3_PCNT,
111 XFS_LOWSP_4_PCNT,
112 XFS_LOWSP_5_PCNT,
113 XFS_LOWSP_MAX,
114};
115
106typedef struct xfs_mount { 116typedef struct xfs_mount {
107 struct super_block *m_super; 117 struct super_block *m_super;
108 xfs_tid_t m_tid; /* next unused tid for fs */ 118 xfs_tid_t m_tid; /* next unused tid for fs */
@@ -202,6 +212,8 @@ typedef struct xfs_mount {
202 __int64_t m_update_flags; /* sb flags we need to update 212 __int64_t m_update_flags; /* sb flags we need to update
203 on the next remount,rw */ 213 on the next remount,rw */
204 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 214 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
215 int64_t m_low_space[XFS_LOWSP_MAX];
216 /* low free space thresholds */
205} xfs_mount_t; 217} xfs_mount_t;
206 218
207/* 219/*
@@ -379,6 +391,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
379 391
380extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 392extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
381 393
394extern void xfs_set_low_space_thresholds(struct xfs_mount *);
395
382#endif /* __KERNEL__ */ 396#endif /* __KERNEL__ */
383 397
384extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 398extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f6d956b7711e..f80a067a4658 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs(
1350 * they could be immediately flushed and we'd have to race with the flusher 1350 * they could be immediately flushed and we'd have to race with the flusher
1351 * trying to pull the item from the AIL as we add it. 1351 * trying to pull the item from the AIL as we add it.
1352 */ 1352 */
1353void 1353static void
1354xfs_trans_item_committed( 1354xfs_trans_item_committed(
1355 struct xfs_log_item *lip, 1355 struct xfs_log_item *lip,
1356 xfs_lsn_t commit_lsn, 1356 xfs_lsn_t commit_lsn,
@@ -1425,6 +1425,83 @@ xfs_trans_committed(
1425 xfs_trans_free(tp); 1425 xfs_trans_free(tp);
1426} 1426}
1427 1427
1428static inline void
1429xfs_log_item_batch_insert(
1430 struct xfs_ail *ailp,
1431 struct xfs_log_item **log_items,
1432 int nr_items,
1433 xfs_lsn_t commit_lsn)
1434{
1435 int i;
1436
1437 spin_lock(&ailp->xa_lock);
1438 /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
1439 xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
1440
1441 for (i = 0; i < nr_items; i++)
1442 IOP_UNPIN(log_items[i], 0);
1443}
1444
1445/*
1446 * Bulk operation version of xfs_trans_committed that takes a log vector of
1447 * items to insert into the AIL. This uses bulk AIL insertion techniques to
1448 * minimise lock traffic.
1449 */
1450void
1451xfs_trans_committed_bulk(
1452 struct xfs_ail *ailp,
1453 struct xfs_log_vec *log_vector,
1454 xfs_lsn_t commit_lsn,
1455 int aborted)
1456{
1457#define LOG_ITEM_BATCH_SIZE 32
1458 struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
1459 struct xfs_log_vec *lv;
1460 int i = 0;
1461
1462 /* unpin all the log items */
1463 for (lv = log_vector; lv; lv = lv->lv_next) {
1464 struct xfs_log_item *lip = lv->lv_item;
1465 xfs_lsn_t item_lsn;
1466
1467 if (aborted)
1468 lip->li_flags |= XFS_LI_ABORTED;
1469 item_lsn = IOP_COMMITTED(lip, commit_lsn);
1470
1471 /* item_lsn of -1 means the item was freed */
1472 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
1473 continue;
1474
1475 if (item_lsn != commit_lsn) {
1476
1477 /*
1478 * Not a bulk update candidate due to an unusual item_lsn.
1479 * Push into AIL immediately, rechecking the lsn once
1480 * we have the ail lock. Then unpin the item.
1481 */
1482 spin_lock(&ailp->xa_lock);
1483 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
1484 xfs_trans_ail_update(ailp, lip, item_lsn);
1485 else
1486 spin_unlock(&ailp->xa_lock);
1487 IOP_UNPIN(lip, 0);
1488 continue;
1489 }
1490
1491 /* Item is a candidate for bulk AIL insert. */
1492 log_items[i++] = lv->lv_item;
1493 if (i >= LOG_ITEM_BATCH_SIZE) {
1494 xfs_log_item_batch_insert(ailp, log_items,
1495 LOG_ITEM_BATCH_SIZE, commit_lsn);
1496 i = 0;
1497 }
1498 }
1499
1500 /* make sure we insert the remainder! */
1501 if (i)
1502 xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
1503}
1504
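xfs_trans_committed_bulk() amortises AIL lock traffic by staging up to LOG_ITEM_BATCH_SIZE items in a stack array and inserting each full batch under one lock hold; the trailing if (i) call is what flushes a final partial batch. The accumulate/flush idiom in isolation, with hypothetical names:

#define BATCH	32

static void flush(void **items, int n)
{
	/* take the shared lock once, process all n items, drop it */
}

static void process_all(void **src, int count)
{
	void *batch[BATCH];
	int i = 0, j;

	for (j = 0; j < count; j++) {
		batch[i++] = src[j];
		if (i == BATCH) {		/* batch full: flush it */
			flush(batch, BATCH);
			i = 0;
		}
	}
	if (i)					/* don't lose the remainder */
		flush(batch, i);
}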
1428/* 1505/*
1429 * Called from the trans_commit code when we notice that 1506 * Called from the trans_commit code when we notice that
1430 * the filesystem is in the middle of a forced shutdown. 1507 * the filesystem is in the middle of a forced shutdown.
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 246286b77a86..c2042b736b81 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
294#define XFS_ALLOC_BTREE_REF 2 294#define XFS_ALLOC_BTREE_REF 2
295#define XFS_BMAP_BTREE_REF 2 295#define XFS_BMAP_BTREE_REF 2
296#define XFS_DIR_BTREE_REF 2 296#define XFS_DIR_BTREE_REF 2
297#define XFS_INO_REF 2
297#define XFS_ATTR_BTREE_REF 1 298#define XFS_ATTR_BTREE_REF 1
298#define XFS_INO_REF 1
299#define XFS_DQUOT_REF 1 299#define XFS_DQUOT_REF 1
300 300
301#ifdef __KERNEL__ 301#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index dc9069568ff7..c5bbbc45db91 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,8 +28,8 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); 31STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t);
32STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); 32STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *); 33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *); 34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 35
@@ -449,129 +449,152 @@ xfs_trans_unlocked_item(
449 xfs_log_move_tail(ailp->xa_mount, 1); 449 xfs_log_move_tail(ailp->xa_mount, 1);
450} /* xfs_trans_unlocked_item */ 450} /* xfs_trans_unlocked_item */
451 451
452
453/* 452/*
454 * Update the position of the item in the AIL with the new 453 * xfs_trans_ail_update_bulk - bulk AIL insertion operation.
455 * lsn. If it is not yet in the AIL, add it. Otherwise, move 454 *
456 * it to its new position by removing it and re-adding it. 455 * @xfs_trans_ail_update_bulk takes an array of log items that all need to be
456 * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
457 * be added. Otherwise, it will be repositioned by removing it and re-adding
458 * it to the AIL. If we move the first item in the AIL, update the log tail to
459 * match the new minimum LSN in the AIL.
457 * 460 *
458 * Wakeup anyone with an lsn less than the item's lsn. If the item 461 * This function takes the AIL lock once to execute the update operations on
459 * we move in the AIL is the minimum one, update the tail lsn in the 462 * all the items in the array rather than once per item. Once we have the AIL
460 * log manager. 463 * lock, we need to check each log item LSN to confirm it actually needs to
464 * be moved forward in the AIL.
461 * 465 *
462 * This function must be called with the AIL lock held. The lock 466 * To optimise the insert operation, we delete all the items from the AIL in
463 * is dropped before returning. 467 * the first pass, moving them into a temporary list, then splice the temporary
468 * list into the correct position in the AIL. This avoids needing to do an
469 * insert operation on every item.
470 *
471 * This function must be called with the AIL lock held. The lock is dropped
472 * before returning.
464 */ 473 */
465void 474void
466xfs_trans_ail_update( 475xfs_trans_ail_update_bulk(
467 struct xfs_ail *ailp, 476 struct xfs_ail *ailp,
468 xfs_log_item_t *lip, 477 struct xfs_log_item **log_items,
469 xfs_lsn_t lsn) __releases(ailp->xa_lock) 478 int nr_items,
479 xfs_lsn_t lsn) __releases(ailp->xa_lock)
470{ 480{
471 xfs_log_item_t *dlip = NULL; 481 xfs_log_item_t *mlip;
472 xfs_log_item_t *mlip; /* ptr to minimum lip */
473 xfs_lsn_t tail_lsn; 482 xfs_lsn_t tail_lsn;
483 int mlip_changed = 0;
484 int i;
485 LIST_HEAD(tmp);
474 486
475 mlip = xfs_ail_min(ailp); 487 mlip = xfs_ail_min(ailp);
476 488
477 if (lip->li_flags & XFS_LI_IN_AIL) { 489 for (i = 0; i < nr_items; i++) {
478 dlip = xfs_ail_delete(ailp, lip); 490 struct xfs_log_item *lip = log_items[i];
479 ASSERT(dlip == lip); 491 if (lip->li_flags & XFS_LI_IN_AIL) {
480 xfs_trans_ail_cursor_clear(ailp, dlip); 492 /* check if we really need to move the item */
481 } else { 493 if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
482 lip->li_flags |= XFS_LI_IN_AIL; 494 continue;
495
496 xfs_ail_delete(ailp, lip);
497 if (mlip == lip)
498 mlip_changed = 1;
499 } else {
500 lip->li_flags |= XFS_LI_IN_AIL;
501 }
502 lip->li_lsn = lsn;
503 list_add(&lip->li_ail, &tmp);
483 } 504 }
484 505
485 lip->li_lsn = lsn; 506 xfs_ail_splice(ailp, &tmp, lsn);
486 xfs_ail_insert(ailp, lip);
487 507
488 if (mlip == dlip) { 508 if (!mlip_changed) {
489 mlip = xfs_ail_min(ailp);
490 /*
491 * It is not safe to access mlip after the AIL lock is
492 * dropped, so we must get a copy of li_lsn before we do
493 * so. This is especially important on 32-bit platforms
494 * where accessing and updating 64-bit values like li_lsn
495 * is not atomic.
496 */
497 tail_lsn = mlip->li_lsn;
498 spin_unlock(&ailp->xa_lock);
499 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
500 } else {
501 spin_unlock(&ailp->xa_lock); 509 spin_unlock(&ailp->xa_lock);
510 return;
502 } 511 }
503 512
504 513 /*
505} /* xfs_trans_update_ail */ 514 * It is not safe to access mlip after the AIL lock is dropped, so we
515 * must get a copy of li_lsn before we do so. This is especially
516 * important on 32-bit platforms where accessing and updating 64-bit
517 * values like li_lsn is not atomic.
518 */
519 mlip = xfs_ail_min(ailp);
520 tail_lsn = mlip->li_lsn;
521 spin_unlock(&ailp->xa_lock);
522 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
523}
506 524
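As the rewritten comment explains, the bulk update is two-phase: unlink every item that actually has to move onto a private list, then splice that list into the AIL at the target LSN, so the reverse sorted-position search runs once per call rather than once per item. A hedged sketch of the idea with generic list primitives, assuming every node is already on the sorted list:

#include <linux/list.h>

struct node {
	struct list_head	link;
	unsigned long long	lsn;
};

/* Move n nodes to 'lsn', keeping 'ail' sorted by ascending lsn. */
static void bulk_move(struct list_head *ail, struct node **nodes, int n,
		      unsigned long long lsn)
{
	LIST_HEAD(tmp);
	struct node *cur;
	int i;

	/* phase 1: collect everything that really needs to move */
	for (i = 0; i < n; i++) {
		if (nodes[i]->lsn >= lsn)
			continue;	/* already at or past the target */
		list_del(&nodes[i]->link);
		nodes[i]->lsn = lsn;
		list_add(&nodes[i]->link, &tmp);
	}

	/* phase 2: one reverse search, one splice */
	list_for_each_entry_reverse(cur, ail, link)
		if (cur->lsn <= lsn)
			break;
	/* if no entry qualifies, &cur->link ends up equal to 'ail' */
	list_splice_init(&tmp, &cur->link);
}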
507/* 525/*
508 * Delete the given item from the AIL. It must already be in 526 * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
509 * the AIL.
510 * 527 *
511 * Wakeup anyone with an lsn less than item's lsn. If the item 528 * @xfs_trans_ail_delete_bulk takes an array of log items that all need to
512 * we delete in the AIL is the minimum one, update the tail lsn in the 529 * removed from the AIL. The caller is already holding the AIL lock, and done
513 * log manager. 530 * all the checks necessary to ensure the items passed in via @log_items are
531 * ready for deletion. This includes checking that the items are in the AIL.
514 * 532 *
515 * Clear the IN_AIL flag from the item, reset its lsn to 0, and 533 * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
516 * bump the AIL's generation count to indicate that the tree 534 * flag from the item and reset the item's lsn to 0. If we remove the first
517 * has changed. 535 * item in the AIL, update the log tail to match the new minimum LSN in the
536 * AIL.
518 * 537 *
519 * This function must be called with the AIL lock held. The lock 538 * This function will not drop the AIL lock until all items are removed from
520 * is dropped before returning. 539 * the AIL to minimise the amount of lock traffic on the AIL. This does not
540 * greatly increase the AIL hold time, but does significantly reduce the amount
541 * of traffic on the lock, especially during IO completion.
542 *
543 * This function must be called with the AIL lock held. The lock is dropped
544 * before returning.
521 */ 545 */
522void 546void
523xfs_trans_ail_delete( 547xfs_trans_ail_delete_bulk(
524 struct xfs_ail *ailp, 548 struct xfs_ail *ailp,
525 xfs_log_item_t *lip) __releases(ailp->xa_lock) 549 struct xfs_log_item **log_items,
550 int nr_items) __releases(ailp->xa_lock)
526{ 551{
527 xfs_log_item_t *dlip;
528 xfs_log_item_t *mlip; 552 xfs_log_item_t *mlip;
529 xfs_lsn_t tail_lsn; 553 xfs_lsn_t tail_lsn;
554 int mlip_changed = 0;
555 int i;
530 556
531 if (lip->li_flags & XFS_LI_IN_AIL) { 557 mlip = xfs_ail_min(ailp);
532 mlip = xfs_ail_min(ailp);
533 dlip = xfs_ail_delete(ailp, lip);
534 ASSERT(dlip == lip);
535 xfs_trans_ail_cursor_clear(ailp, dlip);
536
537 558
538 lip->li_flags &= ~XFS_LI_IN_AIL; 559 for (i = 0; i < nr_items; i++) {
539 lip->li_lsn = 0; 560 struct xfs_log_item *lip = log_items[i];
561 if (!(lip->li_flags & XFS_LI_IN_AIL)) {
562 struct xfs_mount *mp = ailp->xa_mount;
540 563
541 if (mlip == dlip) {
542 mlip = xfs_ail_min(ailp);
543 /*
544 * It is not safe to access mlip after the AIL lock
545 * is dropped, so we must get a copy of li_lsn
546 * before we do so. This is especially important
547 * on 32-bit platforms where accessing and updating
548 * 64-bit values like li_lsn is not atomic.
549 */
550 tail_lsn = mlip ? mlip->li_lsn : 0;
551 spin_unlock(&ailp->xa_lock);
552 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
553 } else {
554 spin_unlock(&ailp->xa_lock); 564 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) {
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 return;
555 } 572 }
573
574 xfs_ail_delete(ailp, lip);
575 lip->li_flags &= ~XFS_LI_IN_AIL;
576 lip->li_lsn = 0;
577 if (mlip == lip)
578 mlip_changed = 1;
556 } 579 }
557 else {
558 /*
559 * If the file system is not being shutdown, we are in
560 * serious trouble if we get to this stage.
561 */
562 struct xfs_mount *mp = ailp->xa_mount;
563 580
581 if (!mlip_changed) {
564 spin_unlock(&ailp->xa_lock); 582 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) { 583 return;
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 } 584 }
572}
573
574 585
586 /*
587 * It is not safe to access mlip after the AIL lock is dropped, so we
588 * must get a copy of li_lsn before we do so. This is especially
589 * important on 32-bit platforms where accessing and updating 64-bit
590 * values like li_lsn is not atomic. It is possible we've emptied the
591 * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
592 */
593 mlip = xfs_ail_min(ailp);
594 tail_lsn = mlip ? mlip->li_lsn : 0;
595 spin_unlock(&ailp->xa_lock);
596 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
597}
575 598
576/* 599/*
577 * The active item list (AIL) is a doubly linked list of log 600 * The active item list (AIL) is a doubly linked list of log
@@ -623,16 +646,13 @@ xfs_trans_ail_destroy(
623} 646}
624 647
625/* 648/*
626 * Insert the given log item into the AIL. 649 * Splice the log item list into the AIL at the given LSN.
627 * We almost always insert at the end of the list, so on inserts
628 * we search from the end of the list to find where the
629 * new item belongs.
630 */ 650 */
631STATIC void 651STATIC void
632xfs_ail_insert( 652xfs_ail_splice(
633 struct xfs_ail *ailp, 653 struct xfs_ail *ailp,
634 xfs_log_item_t *lip) 654 struct list_head *list,
635/* ARGSUSED */ 655 xfs_lsn_t lsn)
636{ 656{
637 xfs_log_item_t *next_lip; 657 xfs_log_item_t *next_lip;
638 658
@@ -640,39 +660,33 @@ xfs_ail_insert(
640 * If the list is empty, just insert the item. 660 * If the list is empty, just insert the item.
641 */ 661 */
642 if (list_empty(&ailp->xa_ail)) { 662 if (list_empty(&ailp->xa_ail)) {
643 list_add(&lip->li_ail, &ailp->xa_ail); 663 list_splice(list, &ailp->xa_ail);
644 return; 664 return;
645 } 665 }
646 666
647 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { 667 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
648 if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0) 668 if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
649 break; 669 break;
650 } 670 }
651 671
652 ASSERT((&next_lip->li_ail == &ailp->xa_ail) || 672 ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
653 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); 673 (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0));
654
655 list_add(&lip->li_ail, &next_lip->li_ail);
656 674
657 xfs_ail_check(ailp, lip); 675 list_splice_init(list, &next_lip->li_ail);
658 return; 676 return;
659} 677}
660 678
661/* 679/*
662 * Delete the given item from the AIL. Return a pointer to the item. 680 * Delete the given item from the AIL. Return a pointer to the item.
663 */ 681 */
664/*ARGSUSED*/ 682STATIC void
665STATIC xfs_log_item_t *
666xfs_ail_delete( 683xfs_ail_delete(
667 struct xfs_ail *ailp, 684 struct xfs_ail *ailp,
668 xfs_log_item_t *lip) 685 xfs_log_item_t *lip)
669/* ARGSUSED */
670{ 686{
671 xfs_ail_check(ailp, lip); 687 xfs_ail_check(ailp, lip);
672
673 list_del(&lip->li_ail); 688 list_del(&lip->li_ail);
674 689 xfs_trans_ail_cursor_clear(ailp, lip);
675 return lip;
676} 690}
677 691
678/* 692/*
@@ -682,7 +696,6 @@ xfs_ail_delete(
682STATIC xfs_log_item_t * 696STATIC xfs_log_item_t *
683xfs_ail_min( 697xfs_ail_min(
684 struct xfs_ail *ailp) 698 struct xfs_ail *ailp)
685/* ARGSUSED */
686{ 699{
687 if (list_empty(&ailp->xa_ail)) 700 if (list_empty(&ailp->xa_ail))
688 return NULL; 701 return NULL;
@@ -699,7 +712,6 @@ STATIC xfs_log_item_t *
699xfs_ail_next( 712xfs_ail_next(
700 struct xfs_ail *ailp, 713 struct xfs_ail *ailp,
701 xfs_log_item_t *lip) 714 xfs_log_item_t *lip)
702/* ARGSUSED */
703{ 715{
704 if (lip->li_ail.next == &ailp->xa_ail) 716 if (lip->li_ail.next == &ailp->xa_ail)
705 return NULL; 717 return NULL;
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f783d5e9fa70..f7590f5badea 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
69 tp->t_flags |= XFS_TRANS_DIRTY; 69 tp->t_flags |= XFS_TRANS_DIRTY;
70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; 70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
71 71
72 next_extent = efip->efi_next_extent; 72 /*
73 * atomic_inc_return gives us the value after the increment;
74 * we want to use it as an array index so we need to subtract 1 from
75 * it.
76 */
77 next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
73 ASSERT(next_extent < efip->efi_format.efi_nextents); 78 ASSERT(next_extent < efip->efi_format.efi_nextents);
74 extp = &(efip->efi_format.efi_extents[next_extent]); 79 extp = &(efip->efi_format.efi_extents[next_extent]);
75 extp->ext_start = start_block; 80 extp->ext_start = start_block;
76 extp->ext_len = ext_len; 81 extp->ext_len = ext_len;
77 efip->efi_next_extent++;
78} 82}
79 83
80 84
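Switching efi_next_extent to an atomic_t and claiming the slot with atomic_inc_return() closes the window where two racing loggers could read the same counter and fill the same extent slot; subtracting 1 converts the post-increment return value back into the index this caller owns. The slot-claiming idiom on its own:

#include <linux/atomic.h>

static atomic_t next_slot = ATOMIC_INIT(0);

static int claim_slot(void)
{
	/*
	 * atomic_inc_return() yields the value *after* the increment,
	 * so this caller's slot is one less. Concurrent callers are
	 * guaranteed distinct indices.
	 */
	return atomic_inc_return(&next_slot) - 1;
}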
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 62da86c90de5..35162c238fa3 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -22,15 +22,17 @@ struct xfs_log_item;
22struct xfs_log_item_desc; 22struct xfs_log_item_desc;
23struct xfs_mount; 23struct xfs_mount;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_ail;
26struct xfs_log_vec;
25 27
26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 28void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
27void xfs_trans_del_item(struct xfs_log_item *); 29void xfs_trans_del_item(struct xfs_log_item *);
28void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, 30void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
29 int flags); 31 int flags);
30void xfs_trans_item_committed(struct xfs_log_item *lip,
31 xfs_lsn_t commit_lsn, int aborted);
32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); 32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
33 33
34void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
35 xfs_lsn_t commit_lsn, int aborted);
34/* 36/*
35 * AIL traversal cursor. 37 * AIL traversal cursor.
36 * 38 *
@@ -73,12 +75,29 @@ struct xfs_ail {
73/* 75/*
74 * From xfs_trans_ail.c 76 * From xfs_trans_ail.c
75 */ 77 */
76void xfs_trans_ail_update(struct xfs_ail *ailp, 78void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
77 struct xfs_log_item *lip, xfs_lsn_t lsn) 79 struct xfs_log_item **log_items, int nr_items,
78 __releases(ailp->xa_lock); 80 xfs_lsn_t lsn) __releases(ailp->xa_lock);
79void xfs_trans_ail_delete(struct xfs_ail *ailp, 81static inline void
80 struct xfs_log_item *lip) 82xfs_trans_ail_update(
81 __releases(ailp->xa_lock); 83 struct xfs_ail *ailp,
84 struct xfs_log_item *lip,
85 xfs_lsn_t lsn) __releases(ailp->xa_lock)
86{
87 xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
88}
89
90void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
91 struct xfs_log_item **log_items, int nr_items)
92 __releases(ailp->xa_lock);
93static inline void
94xfs_trans_ail_delete(
95 struct xfs_ail *ailp,
96 xfs_log_item_t *lip) __releases(ailp->xa_lock)
97{
98 xfs_trans_ail_delete_bulk(ailp, &lip, 1);
99}
100
82void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); 101void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
83void xfs_trans_unlocked_item(struct xfs_ail *, 102void xfs_trans_unlocked_item(struct xfs_ail *,
84 xfs_log_item_t *); 103 xfs_log_item_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 8e4a63c4151a..d8e6f8cd6f0c 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -964,29 +964,48 @@ xfs_release(
964 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); 964 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
965 } 965 }
966 966
967 if (ip->i_d.di_nlink != 0) { 967 if (ip->i_d.di_nlink == 0)
968 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && 968 return 0;
969 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
970 ip->i_delayed_blks > 0)) &&
971 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
972 (!(ip->i_d.di_flags &
973 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
974 969
975 /* 970 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
976 * If we can't get the iolock just skip truncating 971 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
977 * the blocks past EOF because we could deadlock 972 ip->i_delayed_blks > 0)) &&
978 * with the mmap_sem otherwise. We'll get another 973 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
979 * chance to drop them once the last reference to 974 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
980 * the inode is dropped, so we'll never leak blocks
981 * permanently.
982 */
983 error = xfs_free_eofblocks(mp, ip,
984 XFS_FREE_EOF_TRYLOCK);
985 if (error)
986 return error;
987 }
988 }
989 975
976 /*
977 * If we can't get the iolock just skip truncating the blocks
978 * past EOF because we could deadlock with the mmap_sem
979 * otherwise. We'll get another chance to drop them once the
980 * last reference to the inode is dropped, so we'll never leak
981 * blocks permanently.
982 *
983 * Further, check if the inode is being opened, written and
984 * closed frequently and we have delayed allocation blocks
985 * oustanding (e.g. streaming writes from the NFS server),
986 * truncating the blocks past EOF will cause fragmentation to
987 * occur.
988 *
989 * In this case don't do the truncation, either, but we have to
990 * be careful how we detect this case. Blocks beyond EOF show
991 * up as i_delayed_blks even when the inode is clean, so we
992 * need to truncate them away first before checking for a dirty
993 * release. Hence on the first dirty close we will still remove
994 * the speculative allocation, but after that we will leave it
995 * in place.
996 */
997 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
998 return 0;
999
1000 error = xfs_free_eofblocks(mp, ip,
1001 XFS_FREE_EOF_TRYLOCK);
1002 if (error)
1003 return error;
1004
1005 /* delalloc blocks after truncation means it really is dirty */
1006 if (ip->i_delayed_blks)
1007 xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1008 }
990 return 0; 1009 return 0;
991} 1010}
992 1011
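The rewritten xfs_release() turns EOF-block trimming into a once-per-inode decision: the first dirty close still frees the speculative preallocation, but if delalloc blocks survive that trim the inode is flagged XFS_IDIRTY_RELEASE and later closes leave the preallocation alone, which is what prevents fragmentation for repeated open/write/close patterns such as NFS server writes. A condensed, self-contained restatement of that state machine (plain fields in place of the xfs_iflags helpers, and a stub trim function):

/* hypothetical stand-ins for the inode flags and fields used above */
struct inode_state {
	unsigned int	flags;
	long		delayed_blks;	/* outstanding delalloc blocks */
};
#define IDIRTY_RELEASE	0x1

static int free_eofblocks(struct inode_state *ip)
{
	return 0;	/* pretend to trim blocks past EOF */
}

static int release(struct inode_state *ip)
{
	if (ip->flags & IDIRTY_RELEASE)
		return 0;	/* decided earlier: keep the prealloc */

	if (free_eofblocks(ip))
		return -1;

	/* delalloc blocks after the trim mean a genuinely dirty close */
	if (ip->delayed_blks)
		ip->flags |= IDIRTY_RELEASE;
	return 0;
}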